### Creating Dataframes of Cities and Population

In [57]:
# The module imported as `dotenv` is shipped by the PyPI distribution
# `python-dotenv`; `%pip` installs into the environment of the running kernel.
%pip install python-dotenv



In [59]:
import dotenv
import os 
# Load key/value pairs from a local .env file into the process environment so
# that the database credentials can later be read through os.environ.
dotenv.load_dotenv()

True

In [46]:
import requests
from bs4 import BeautifulSoup

In [47]:
import re
!pip install lat-lon-parser
from lat_lon_parser import parse



In [48]:
!pip install pandas
import pandas as pd



In [49]:
#Creating function for city's data

def city_info(cities, timeout=10):
    """Scrape basic facts about cities from their English Wikipedia pages.

    Parameters
    ----------
    cities : iterable of str
        Wikipedia article names, e.g. ``['Berlin', 'Hamburg']``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per successfully loaded page with the columns ``City``,
        ``Country``, ``Latitude`` and ``Longitude`` (coordinates as decimal
        degrees). A field that cannot be found on the page is ``None``/NaN.
        Pages that fail to load are reported with ``print`` and skipped. The
        columns are present even when no page could be loaded.
    """
    columns = ['City', 'Country', 'Latitude', 'Longitude']
    city_data = []  # one dict per scraped city

    for requested_name in cities:
        url = f'https://en.wikipedia.org/wiki/{requested_name}'
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            # Network problems are reported like bad status codes instead of
            # aborting the whole batch.
            print(f"Failed to load page for {requested_name}. Error: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to load page for {requested_name}. Status code: {response.status_code}")
            continue

        city_soup = BeautifulSoup(response.content, 'html.parser')

        # City: prefer the page title, fall back to the requested name.
        title_element = city_soup.find('span', class_="mw-page-title-main")
        city_name = title_element.text if title_element else requested_name

        # Country: the infobox cell following the 'Country' header, if any.
        country = None
        country_header = city_soup.find('th', string='Country')
        country_cell = country_header.find_next('td') if country_header else None
        if country_cell is not None:
            country_match = re.search(r'([A-Za-z\s,\(\)]+)', country_cell.text)
            if country_match:
                country = country_match.group(1).strip()

        # Coordinates: `parse` converts the textual form to decimal degrees;
        # it is only called when the element actually exists.
        lat_element = city_soup.find('span', class_='latitude')
        lon_element = city_soup.find('span', class_='longitude')
        latitude = parse(lat_element.text) if lat_element else None
        longitude = parse(lon_element.text) if lon_element else None

        city_data.append({'City': city_name,
                          'Country': country,
                          'Latitude': latitude,
                          'Longitude': longitude})

    # Explicit columns keep the schema stable even for an empty result.
    return pd.DataFrame(city_data, columns=columns)

In [50]:
# Scrape the Wikipedia pages of the three cities into a DataFrame.
cities_df= city_info(['Berlin','Hamburg', 'Munich'])

In [51]:
#For Timestamp
import datetime

In [52]:
#Creating Function for Population of city/cities

def pop_info(cities, timeout=10):
    """Scrape the population of cities from their English Wikipedia pages.

    The population is taken from the first text node after the string
    'Population' that contains a number with two thousands separators
    (e.g. ``3,596,999``), so only figures of at least one million are found.

    Parameters
    ----------
    cities : iterable of str
        Wikipedia article names, e.g. ``['Berlin', 'Hamburg']``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per city with the columns ``City``, ``Population`` (int) and
        ``Timestamp_data_retrieved`` (``YYYY-MM-DD HH:MM:SS`` string). Cities
        whose page cannot be loaded or whose population cannot be located are
        reported with ``print`` and skipped. The columns are present even
        when no city could be scraped.
    """
    columns = ['City', 'Population', 'Timestamp_data_retrieved']
    pop_pattern = re.compile(r'\d+,\d+,\d+')  # compiled once, reused per city
    city_pop_data = []  # one dict per scraped city

    for city_name in cities:
        url = f'https://en.wikipedia.org/wiki/{city_name}'
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to load page for {city_name}. Error: {exc}")
            continue
        if response.status_code != 200:
            print(f"Failed to load page for {city_name}. Status code: {response.status_code}")
            continue

        city_soup = BeautifulSoup(response.content, 'html.parser')

        population_label = city_soup.find(string='Population')
        pop_string = (population_label.find_next(string=pop_pattern)
                      if population_label is not None else None)
        if pop_string is None:
            print(f"No population figure found for {city_name}.")
            continue

        # The matched text node may hold more than the number itself, so only
        # the matching substring is converted.
        pop = int(pop_pattern.search(pop_string).group().replace(',', ''))
        times = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        city_pop_data.append({'City': city_name,
                              'Population': pop,
                              'Timestamp_data_retrieved': times})

    # Explicit columns keep the schema stable even for an empty result.
    return pd.DataFrame(city_pop_data, columns=columns)

In [53]:
# Scrape the current population figures (with retrieval timestamp) for the same three cities.
population_df= pop_info(['Berlin','Hamburg', 'Munich'])

In [54]:
# To establish a connection to the SQL database, the notebook needs the connection details,
# which are provided through the connection string below.
# Install SQLAlchemy and PyMySQL to enable the connection between Python and MySQL databases managed through MySQL Workbench.
!pip install sqlalchemy
!pip install pymysql



In [60]:
from urllib.parse import quote

# Connection details; the credentials come from environment variables so they
# are never written into the notebook.
schema = "ganz"
host = "127.0.0.1"
user = os.environ['user_name']
password = os.environ['MYSQL_password']
port = 3306

# User name and password are percent-encoded because characters such as
# '@', ':', '/' or '%' would otherwise be read as URL delimiters.
connection_string = (
    f'mysql+pymysql://{quote(user, safe="")}:{quote(password, safe="")}'
    f'@{host}:{port}/{schema}'
)

In [61]:
# Transfer the scraped data of the three cities to the SQL database.
# First for the table 'cities'.
# if_exists='append' adds the rows to an existing table, so running this cell
# again inserts the same cities a second time (unless the table itself
# rejects duplicates -- verify against the schema).

cities_df.to_sql('cities',
                 if_exists= 'append',
                 con= connection_string,
                 index= False)

3

In [62]:
# Read the 'cities' table back from the SQL database. The result carries a
# City_id column that cities_df does not have (presumably generated by the
# database -- confirm against the table definition); it is needed to link the
# population rows to their city.

cities_from_sql= pd.read_sql('cities', con= connection_string)
cities_from_sql

Unnamed: 0,City_id,City,Country,Latitude,Longitude
0,1,Berlin,Germany,52.52,13.405
1,2,Hamburg,Germany,53.55,10.0
2,3,Munich,Germany,48.1375,11.575


In [63]:
# Attach the columns of the SQL 'cities' table to every population row by
# matching on the city name; population rows without a match are kept.
merged_df = pd.merge(
    population_df,
    cities_from_sql,
    how='left',
    on='City',
)

merged_df

Unnamed: 0,City,Population,Timestamp_data_retrieved,City_id,Country,Latitude,Longitude
0,Berlin,3596999,2025-04-02 19:28:19,1,Germany,52.52,13.405
1,Hamburg,1964021,2025-04-02 19:28:21,2,Germany,53.55,10.0
2,Munich,1510378,2025-04-02 19:28:22,3,Germany,48.1375,11.575


In [64]:
# Remove the descriptive city columns before loading into 'population'.
# errors='ignore' keeps the cell re-runnable: merged_df is reassigned from
# itself, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
merged_df = merged_df.drop(columns=['Country', 'Latitude', 'Longitude'],
                           errors='ignore')
merged_df

Unnamed: 0,City,Population,Timestamp_data_retrieved,City_id
0,Berlin,3596999,2025-04-02 19:28:19,1
1,Hamburg,1964021,2025-04-02 19:28:21,2
2,Munich,1510378,2025-04-02 19:28:22,3


In [65]:
# Transfer the population rows to the SQL table 'population'.
# if_exists='append' adds the rows to an existing table, so every run of this
# cell inserts another set of rows.

merged_df.to_sql('population',
                 if_exists= 'append',
                 con= connection_string,
                 index= False)

3