Extracting information from Wikipedia City pages

In [None]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import sqlalchemy

def extract_city_info(city: str):
    """
    Extracts information about a city from Wikipedia. This includes the city's
    country, population, latitude, and longitude.

    The values are scraped from the article HTML: the first element with class
    ``infobox-data`` is used as the country, the first ``<td>`` following the
    first text node containing "Population" is used as the population, and the
    first elements with classes ``latitude`` / ``longitude`` supply the
    coordinates (kept as the strings shown on the page).

    :param city: Name of the city (string), used verbatim as the article title
    :return: Dictionary with the keys "City", "Country", "Population" (int),
             "Latitude" and "Longitude"
    :raises requests.HTTPError: If Wikipedia answers with an error status
    :raises ValueError: If the population cell does not hold a plain integer
    """
    url = f"https://www.wikipedia.org/wiki/{city}"
    # A timeout keeps a stalled connection from hanging the whole run, and the
    # status check stops us from scraping an error page as if it were a city.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    city_soup = BeautifulSoup(response.content, 'html.parser')

    city_data = {"City": city}
    # NOTE(review): assumes the first infobox data cell is the country -- this
    # depends on the layout of the article's infobox.
    city_data["Country"] = city_soup.find(class_="infobox-data").get_text()

    city_population = city_soup.find(string=re.compile("Population")).find_next("td").get_text()
    # Drop citation markers such as "[5]" that Wikipedia appends to figures,
    # then remove the thousands separators before converting.
    city_population_clean = re.sub(r"\[[^\]]*\]", "", city_population).replace(",", "")
    city_data["Population"] = int(city_population_clean)

    city_data["Latitude"] = city_soup.find(class_="latitude").get_text()
    city_data["Longitude"] = city_soup.find(class_="longitude").get_text()

    return city_data

def transform_coordinates(lat_str, lon_str):
    """
    Transforms latitude and longitude from string format (degrees, minutes, seconds)
    to decimal format.

    Accepted forms are degrees only (``74°W``), degrees and minutes
    (``33°52′S``) and degrees, minutes and seconds (``52°31′12″N``). Each
    numeric part may carry a decimal fraction. Southern and western
    coordinates are returned as negative numbers.

    :param lat_str: Latitude in string format (degrees, minutes, seconds)
    :param lon_str: Longitude in string format (degrees, minutes, seconds)
    :return: Tuple containing latitude and longitude in decimal format; an
             element is None (and a message is printed) when its string
             could not be parsed
    """
    # One pattern with optional minute and second groups replaces the two
    # separate patterns. ASCII quote marks are accepted alongside the
    # typographic prime (′) and double prime (″) characters.
    dms_pattern = re.compile(
        r"\s*(\d+(?:\.\d+)?)°"
        r"(?:\s*(\d+(?:\.\d+)?)[′'])?"
        r"(?:\s*(\d+(?:\.\d+)?)[″\"])?"
        r"\s*([NSEW])"
    )

    def dms_to_decimal(dms_str):
        dms_match = dms_pattern.match(dms_str)
        if dms_match is None:
            print(f"Could not parse the coordinate string: {dms_str}")
            return None

        degrees, minutes, seconds, direction = dms_match.groups()
        decimal = float(degrees)
        if minutes is not None:
            decimal += float(minutes) / 60
        if seconds is not None:
            decimal += float(seconds) / 3600

        # South and west are the negative half of each axis.
        if direction in ('S', 'W'):
            decimal = -decimal

        return decimal

    return dms_to_decimal(lat_str), dms_to_decimal(lon_str)

def process_cities(cities):
    """
    Builds a DataFrame of city facts for every name in ``cities``.

    Each city is looked up with ``extract_city_info``; the textual
    "Latitude" and "Longitude" entries of the returned dictionary are then
    overwritten with the decimal values from ``transform_coordinates``.

    :param cities: List of city names (list of strings)
    :return: DataFrame with one row per city, in input order
    """
    records = []

    for name in cities:
        record = extract_city_info(name)
        # Replace the coordinate strings with their decimal counterparts.
        record["Latitude"], record["Longitude"] = transform_coordinates(
            record["Latitude"], record["Longitude"]
        )
        records.append(record)

    return pd.DataFrame(records)

def send_to_sql(df, connection_string, table_name):
    """
    Sends a DataFrame to a SQL database.

    Rows are appended to ``table_name`` and the DataFrame index is not
    written.

    :param df: DataFrame to be sent to the SQL database
    :param connection_string: SQL database connection string
    :param table_name: Name of the table where the data will be inserted
    """
    engine = sqlalchemy.create_engine(connection_string)
    try:
        df.to_sql(
            table_name,
            engine,
            if_exists='append',
            index=False)
    finally:
        # The engine is created per call, so release its connection pool
        # instead of leaving open connections behind after every upload.
        engine.dispose()


# Example usage: build the DataFrame for a few sample cities and display it.
# `cities_df` is reused further down when the data is written to the database.
cities = ["Berlin", "Munich", "Hamburg"]  # Replace with your list of cities
cities_df = process_cities(cities)
print(cities_df)

Before you can store the extracted city information, you have to create a local database with a matching table to hold the data. I am using MySQL Workbench, and I created the database and its table as follows:


```
-- IF NOT EXISTS makes the script safe to re-run: an existing database or
-- table is left untouched instead of raising an error.
CREATE DATABASE IF NOT EXISTS gans;

USE gans;

-- One row per city; the column names match the DataFrame columns
-- (MySQL column names are case-insensitive).
CREATE TABLE IF NOT EXISTS cities (
    city_id INT AUTO_INCREMENT PRIMARY KEY,  -- Surrogate key generated by MySQL
    city VARCHAR(255) NOT NULL,
    country VARCHAR(255),
    population INT,
    latitude DECIMAL(10, 7),  -- Decimal degrees, negative for south
    longitude DECIMAL(10, 7)  -- Decimal degrees, negative for west
);
```



Afterwards, you are ready to send the extracted information as a DataFrame to your MySQL database.

In [None]:
from urllib.parse import quote_plus

# Replace the following values with your specific details
username = 'root'            # Your MySQL username
password = 'password'        # Your MySQL password
host = '127.0.0.1'           # The host where your database is running (e.g., localhost or an IP address)
port = '3306'                # The port number for the database. MySQL typically uses 3306
database = 'gans'            # The specific database name to connect to

# Connection string. The credentials are URL-escaped so that special
# characters such as "@", ":" or "/" in a password do not corrupt the URL.
connection_string = (
    f"mysql+pymysql://{quote_plus(username)}:{quote_plus(password)}"
    f"@{host}:{port}/{database}"
)

# Append the scraped city rows to the "cities" table
send_to_sql(cities_df, connection_string, "cities")