# 1 Imports

In [1]:
import re
from datetime import datetime, timedelta  # creating timestamps
from urllib.parse import quote_plus  # escaping credentials in connection URLs

import pandas as pd
import requests
from bs4 import BeautifulSoup
from lat_lon_parser import parse  # for decimal coordinates
from pytz import timezone  # to select current timezone

In [None]:
# sql root and api keys. Utilizes .env file and config.py to configure password management
from config import SQL_ROOT, OPENWEATHER_API_KEY, RAPID_API_KEY
from con_cloud import con as cloud_con
from con_local import con as local_con

# 2 Functions for data acquisition and handling

## 2.1 Dataframe creation for city data

In [None]:
def cities_dataframe(cities):
    """Scrape Wikipedia for location and population data of the given cities.

    Parameters
    ----------
    cities : iterable of str
        City names; each is inserted verbatim into the article URL.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(population_df, city_df)`` -- population first. ``population_df``
        has the columns city, population, population_timestamp and
        retrieval_timestamp; ``city_df`` has city, country, latitude and
        longitude (coordinates converted with ``lat_lon_parser.parse``).

    Raises
    ------
    requests.HTTPError
        If the article request returns an error status.
    AttributeError
        If an article lacks one of the scraped elements (coordinates,
        infobox data or the "Population" label).

    Notes
    -----
    A population figure or census year that cannot be parsed is stored as
    the sentinel ``-999`` rather than aborting the whole scrape.
    """
    city_data = []
    population_data = []
    # One retrieval date for the whole run keeps the rows of a scrape consistent.
    today = datetime.today().strftime("%Y-%m-%d")

    for city in cities:
        url = f"https://www.wikipedia.org/wiki/{city}"
        response = requests.get(url)
        # Surface HTTP failures directly instead of as an AttributeError on a
        # missing page element further down.
        response.raise_for_status()
        city_soup = BeautifulSoup(response.content, 'html.parser')

        # extract the geo information
        city_latitude = city_soup.find(class_="latitude").get_text()
        city_longitude = city_soup.find(class_="longitude").get_text()
        # First "infobox-data" cell of the page; presumably the country row --
        # verify for newly added cities.
        country = city_soup.find(class_="infobox-data").get_text()

        city_data.append({"city": city,
                          "country": country,
                          "latitude": parse(city_latitude),    # latitude in decimal format
                          "longitude": parse(city_longitude),  # longitude in decimal format
                          })

        # Both population values are located relative to the "Population" label,
        # so look the label up once.
        population_label = city_soup.find(string="Population")

        city_population = population_label.find_next("td").get_text()
        try:
            city_population_clean = int(city_population.replace(",", ""))
        except ValueError:
            # e.g. footnote markers or non-numeric text in the cell
            city_population_clean = -999

        pop_census = population_label.find_next('div').get_text()
        census_years = re.findall(r'[0-9]{4}', pop_census)
        yr_census = int(census_years[0]) if census_years else -999

        population_data.append({"city": city,
                                "population": city_population_clean,
                                "population_timestamp": yr_census,
                                "retrieval_timestamp": today
                                })

    return pd.DataFrame(population_data), pd.DataFrame(city_data)

## 2.2 Dataframe creation for weather data

In [None]:
def get_weather_data(InputCityData):
    """Fetch the OpenWeatherMap forecast (metric units) for every city row.

    Parameters
    ----------
    InputCityData : pandas.DataFrame
        Must provide the columns ``city_id``, ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        One row per city and forecast entry with the columns city_id,
        forecast_time, outlook, temperature, temperature_felt, wind_speed,
        rain_expected_mm and retrieval_time. Both time columns are converted
        with ``pd.to_datetime``; retrieval_time is Europe/Berlin wall-clock
        time.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. an invalid key).
    """
    columns = ["city_id", "forecast_time", "outlook", "temperature",
               "temperature_felt", "wind_speed", "rain_expected_mm", "retrieval_time"]
    records = []

    # Prepping time and source related references
    berlin_timezone = timezone('Europe/Berlin')
    url = "https://api.openweathermap.org/data/2.5/forecast"

    # Walk the rows directly so every request carries exactly one scalar
    # latitude/longitude pair, even if a city_id appears more than once.
    city_rows = zip(InputCityData['city_id'],
                    InputCityData['latitude'],
                    InputCityData['longitude'])
    for city_id, latitude, longitude in city_rows:
        querystring = {"lat": latitude,
                       "lon": longitude,
                       "appid": OPENWEATHER_API_KEY,
                       "units": "metric"
                       }
        retrieval_timestamp = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")
        weather = requests.request("GET", url, params=querystring)
        # An error payload has no 'list' key; raise the HTTP error instead of
        # a less helpful KeyError.
        weather.raise_for_status()
        weather_json = weather.json()

        for element in weather_json['list']:
            records.append({
                "city_id": city_id,
                "forecast_time": element['dt_txt'],
                "outlook": element['weather'][0]['description'],
                "temperature": element['main']['temp'],
                "temperature_felt": element['main']['feels_like'],
                "wind_speed": element['wind']['speed'],
                # entries without a 'rain' section count as 0 mm
                "rain_expected_mm": element.get('rain', {}).get('3h', 0),
                "retrieval_time": retrieval_timestamp,
            })

    # Passing `columns` keeps the schema intact when no forecast was returned.
    weather_df = pd.DataFrame(records, columns=columns)
    weather_df["forecast_time"] = pd.to_datetime(weather_df["forecast_time"])
    weather_df["retrieval_time"] = pd.to_datetime(weather_df["retrieval_time"])
    return weather_df


## 2.3 Dataframe Creation for Airport Info

In [None]:
def get_airport_info(InputCityData):
    """Look up airports near every city via the AeroDataBox location search.

    Each request asks for at most 8 airports within 30 km of the city's
    coordinates, restricted to airports with flight information.

    Parameters
    ----------
    InputCityData : pandas.DataFrame
        Must provide the columns ``city_id``, ``latitude`` and ``longitude``.

    Returns
    -------
    pandas.DataFrame
        The flattened ``items`` of all responses, one row per city/airport
        pair, with ``name`` renamed to ``airport_name``, ``location.lat`` /
        ``location.lon`` renamed to ``latitude`` / ``longitude`` and a
        ``city_id`` column identifying the city the search was run for.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    ValueError
        If ``InputCityData`` has no rows (nothing to concatenate).
    """
    url = "https://aerodatabox.p.rapidapi.com/airports/search/location"
    # The headers are identical for every request, so build them once.
    headers = {
        "x-rapidapi-key": RAPID_API_KEY,
        "x-rapidapi-host": "aerodatabox.p.rapidapi.com"
    }
    cities_airports = []

    # Walk the rows directly so every request carries exactly one scalar
    # latitude/longitude pair, even if a city_id appears more than once.
    city_rows = zip(InputCityData['city_id'],
                    InputCityData['latitude'],
                    InputCityData['longitude'])
    for city_id, latitude, longitude in city_rows:
        querystring = {"lat": latitude,
                       "lon": longitude,
                       "radiusKm": "30",
                       "limit": "8",
                       "withFlightInfoOnly": "True"}

        response = requests.get(url, headers=headers, params=querystring)
        # An error payload has no 'items' key; raise the HTTP error instead of
        # a less helpful KeyError.
        response.raise_for_status()
        airport_geo_json = response.json()

        city_airports = pd.json_normalize(airport_geo_json['items'])
        city_airports['city_id'] = city_id
        cities_airports.append(city_airports)

    cities_airports_df = pd.concat(cities_airports, ignore_index=True)
    return cities_airports_df.rename(columns={'name': 'airport_name',
                                              'location.lat': 'latitude',
                                              'location.lon': 'longitude'})


In [None]:
# Creating the airports_df for unique icao/iata combination. icao is FK to icao in cities_airports
def create_airport_df(cities_airports_df):
    """Build the airports table with one row per distinct ICAO code.

    Parameters
    ----------
    cities_airports_df : pandas.DataFrame
        City/airport pairs; must provide the columns ``icao``, ``iata`` and
        ``airport_name``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``icao``, ``iata`` and ``airport_name`` with a fresh 0..n-1
        index. For an ICAO code that occurs several times (an airport that
        serves more than one city) the values of its first row are kept.
        Rows without an ICAO code are left out.
    """
    return (
        cities_airports_df
        .dropna(subset=["icao"])                        # a missing code cannot identify an airport
        .drop_duplicates(subset="icao", keep="first")   # first occurrence wins
        .loc[:, ["icao", "iata", "airport_name"]]
        .reset_index(drop=True)
    )

## 2.4 Function for flights dataframe creation

In [None]:
def get_flights(InputIcaoList):
    """Fetch tomorrow's scheduled arrivals for the given airports.

    "Tomorrow" is the day after the current date in Europe/Berlin. The day is
    requested in two half-day windows (00:00-11:59 and 12:00-23:59) per
    airport -- presumably because the API limits the time span of a single
    request; confirm against the AeroDataBox documentation.

    Parameters
    ----------
    InputIcaoList : iterable of str
        ICAO codes of the arrival airports.

    Returns
    -------
    pandas.DataFrame
        One row per arriving flight with the columns flight_num,
        departure_icao (``''`` when the response carries none), arrival_icao,
        arrival_time and retrieval_time. Both time columns are converted with
        ``pd.to_datetime``; arrival_time is the local scheduled time with its
        last 6 characters removed first.

    Notes
    -----
    A window whose response body is not valid JSON is skipped.
    """
    columns = ["flight_num", "departure_icao", "arrival_icao", "arrival_time", "retrieval_time"]

    querystring = {"withLeg": "True",
                   "direction": "Arrival",
                   "withCancelled": "False",
                   "withCodeshared": "True",
                   "withCargo": "False",
                   "withPrivate": "False",
                   "withLocation": "False"}

    headers = {"x-rapidapi-key": RAPID_API_KEY,
               "x-rapidapi-host": "aerodatabox.p.rapidapi.com"
               }

    berlin_timezone = timezone('Europe/Berlin')
    # Read the clock once so the requested date and the retrieval timestamp agree.
    now_berlin = datetime.now(berlin_timezone)
    tomorrow = now_berlin.date() + timedelta(days=1)
    retrieval_timestamp = now_berlin.strftime("%Y-%m-%d %H:%M:%S")

    time_windows = [("00:00", "11:59"),
                    ("12:00", "23:59")]

    records = []
    for airport in InputIcaoList:
        for window_start, window_end in time_windows:
            url = f"https://aerodatabox.p.rapidapi.com/flights/airports/icao/{airport}/{tomorrow}T{window_start}/{tomorrow}T{window_end}"

            flights_response = requests.get(url, headers=headers, params=querystring)
            try:
                flights_json = flights_response.json()
            except ValueError:
                # Every JSON decode error raised by requests derives from
                # ValueError, also in versions without requests.JSONDecodeError.
                continue

            for flight in flights_json['arrivals']:
                try:
                    origin_icao = flight['departure']['airport']['icao']
                except (KeyError, TypeError):
                    # departure airport or its ICAO code not present in the response
                    origin_icao = ''
                records.append({
                    "flight_num": flight['number'],
                    "departure_icao": origin_icao,
                    "arrival_icao": airport,
                    "arrival_time": flight['arrival']['scheduledTime']['local'],
                    "retrieval_time": retrieval_timestamp,
                })

    # Passing `columns` keeps the schema intact when no flight was returned.
    flights_df = pd.DataFrame(records, columns=columns)

    # Drop the trailing 6 characters of the local time (presumably the
    # "+hh:mm" UTC offset -- confirm) so a naive local timestamp is parsed.
    flights_df["arrival_time"] = flights_df["arrival_time"].str[:-6]
    flights_df["arrival_time"] = pd.to_datetime(flights_df["arrival_time"])
    flights_df["retrieval_time"] = pd.to_datetime(flights_df["retrieval_time"])
    return flights_df
        

## 2.5 Function that fetches city, population, airports and cities_airports data from a database

In [9]:
def fetch_static_data(connection):
    """Read the four static tables from the database.

    Parameters
    ----------
    connection
        Passed unchanged as ``con`` to ``pandas.read_sql``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(city_df, population_df, airports_df, cities_airports_df)``, read
        from the tables ``city``, ``population``, ``airports`` and
        ``cities_airports`` in that order.
    """
    static_tables = ("city", "population", "airports", "cities_airports")
    city_df, population_df, airports_df, cities_airports_df = (
        pd.read_sql(table_name, con=connection) for table_name in static_tables
    )
    return city_df, population_df, airports_df, cities_airports_df

## 2.6 Function that fetches flights and weather data from a database

In [10]:
def fetch_dynamic_data(connection):
    """Read the two dynamic tables from the database.

    Parameters
    ----------
    connection
        Passed unchanged as ``con`` to ``pandas.read_sql``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(weather_df, flights_df)``, read from the tables ``weather`` and
        ``flights`` in that order.
    """
    def load_table(table_name):
        return pd.read_sql(table_name, con=connection)

    return load_table("weather"), load_table("flights")

## 2.7 Function that submits static databases to SQL

In [11]:
def submit_static_data(city_df_in, population_df_in, airports_df_in, cities_airports_df_in, connection):
    """Append the four static dataframes to their SQL tables.

    The frames are written in the order city, population, airports,
    cities_airports, each with ``if_exists='append'`` and without the
    dataframe index.

    Parameters
    ----------
    city_df_in, population_df_in, airports_df_in, cities_airports_df_in : pandas.DataFrame
        Rows for the tables ``city``, ``population``, ``airports`` and
        ``cities_airports`` respectively.
    connection
        Passed unchanged as ``con`` to ``DataFrame.to_sql``.

    Returns
    -------
    str
        Always ``"success"``; a failed write raises from ``to_sql`` instead.
    """
    exports = (
        ("city", city_df_in),
        ("population", population_df_in),
        ("airports", airports_df_in),
        ("cities_airports", cities_airports_df_in),
    )
    for table_name, frame in exports:
        frame.to_sql(table_name, con=connection, if_exists="append", index=False)
    return "success"
    

## 2.8 Function that submits weather data to SQL

In [12]:
def submit_weather_data(weather_df_in, connection):
    """Append the weather dataframe to the SQL table ``weather``.

    Parameters
    ----------
    weather_df_in : pandas.DataFrame
        Rows to append; the dataframe index is not written.
    connection
        Passed unchanged as ``con`` to ``DataFrame.to_sql``.

    Returns
    -------
    str
        Always ``"success"``; a failed write raises from ``to_sql`` instead.
    """
    export_options = {"con": connection, "if_exists": "append", "index": False}
    weather_df_in.to_sql("weather", **export_options)
    return "success"

## 2.9 Function that submits flights data to SQL

In [13]:
def submit_flights_data(flights_df_in, connection):
    """Append the flights dataframe to the SQL table ``flights``.

    Parameters
    ----------
    flights_df_in : pandas.DataFrame
        Rows to append; the dataframe index is not written.
    connection
        Passed unchanged as ``con`` to ``DataFrame.to_sql``.

    Returns
    -------
    str
        Always ``"success"``; a failed write raises from ``to_sql`` instead.
    """
    export_options = {"con": connection, "if_exists": "append", "index": False}
    flights_df_in.to_sql("flights", **export_options)
    return "success"

## 2.10 Function that automatically parses and updates weather and flights tables

In [14]:
def update_weather_flights(connection):
    """Fetch fresh weather and flight data and append both to the database.

    The cities and airports to query are read from the database itself.
    Both API fetches run before anything is written, so a failing fetch
    leaves the tables untouched.

    Parameters
    ----------
    connection
        Database connection used for reading the static tables and for
        writing the ``weather`` and ``flights`` tables.

    Returns
    -------
    str
        Always ``"success"``; failures raise from the called helpers.
    """
    # Only the city and airports tables are needed; the other two are ignored.
    stored_cities, _, stored_airports, _ = fetch_static_data(connection)

    fresh_weather = get_weather_data(stored_cities)
    fresh_flights = get_flights(stored_airports['icao'])

    submit_weather_data(fresh_weather, connection)
    submit_flights_data(fresh_flights, connection)
    return "success"
    


# 3 Executing Data Acquisition for select places

## 3.1 Filling the dataframe for selected cities

In [None]:
# Setting up the SQL connection locally
schema = "gans_wbsproject"
host = "127.0.0.1"
user = "root"
password = SQL_ROOT
port = 3306

# URL-escape the credentials: characters such as "@", ":" or "/" in a password
# would otherwise be read as URL delimiters and corrupt the connection string.
connection_string = f'mysql+pymysql://{quote_plus(user)}:{quote_plus(password)}@{host}:{port}/{schema}'

In [None]:
# Select the cities for which we want to create a database.
# Each name is inserted as-is into the Wikipedia article URL by cities_dataframe.
cities = ["Berlin", "Hamburg", "Munich", "Cologne", "London"]

# Create the tables for cities and their populations.
# cities_dataframe returns the population frame first and the city frame second.
pop_df, city_df = cities_dataframe(cities)

Unnamed: 0,city,country,latitude,longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575
3,Cologne,Germany,50.936389,6.952778
4,London,United Kingdom,51.507222,-0.1275


In [None]:
# Check for completeness and correctness of city (geo-)data:
# one row per requested city, with its country and decimal coordinates
city_df

Unnamed: 0,city,country,latitude,longitude
0,Berlin,Germany,52.52,13.405
1,Hamburg,Germany,53.55,10.0
2,Munich,Germany,48.1375,11.575
3,Cologne,Germany,50.936389,6.952778
4,London,United Kingdom,51.507222,-0.1275


In [None]:
# Check for completeness and correctness of population data.
# A value of -999 in population or population_timestamp marks a figure that could not be parsed.
pop_df

Unnamed: 0,city,population,population_timestamp,retrieval_timestamp
0,Berlin,3596999,2022,2025-05-20
1,Hamburg,1964021,2023,2025-05-20
2,Munich,1510378,2023,2025-05-20
3,Cologne,1024408,2023,2025-05-20
4,London,8866180,2022,2025-05-20


In [None]:
# Append the scraped city rows to the SQL table `city` (the dataframe index is not written)
city_df.to_sql("city", con=connection_string, if_exists="append", index=False)

5

In [29]:
# Read the city table back from SQL so that the rows carry the primary keys
# assigned by the database; they are needed to prepare the population export.
city_from_sql = pd.read_sql("city", con=connection_string)

In [30]:
# Left-join the SQL city table onto the scraped population data to pick up each city's city_id
population_merged = pop_df.merge(city_from_sql, on="city", how="left")

# Keep only the columns of the SQL population table, in table order.
# Selecting by name (instead of by position) stays correct if the city table gains columns.
population_df = population_merged[["city_id", "population", "population_timestamp", "retrieval_timestamp"]]

# Export to SQL population table
population_df.to_sql('population',
                     if_exists='append',
                     con=connection_string,
                     index=False)

5

## 3.2 Fetch and export weather data (requires section 3.1 being finished)

In [31]:
# Fetch the forecast for every city of the re-imported SQL city table
# (get_weather_data reads its city_id, latitude and longitude columns), then display the result
weather_df = get_weather_data(city_from_sql)
weather_df

Unnamed: 0,city_id,forecast_time,outlook,temperature,temperature_felt,wind_speed,rain_expected_mm,retrieval_time
0,1,2025-05-20 15:00:00,overcast clouds,23.30,22.86,2.81,0.00,2025-05-20 15:37:25
1,1,2025-05-20 18:00:00,overcast clouds,22.24,21.77,2.27,0.00,2025-05-20 15:37:25
2,1,2025-05-20 21:00:00,overcast clouds,20.06,19.64,2.33,0.00,2025-05-20 15:37:25
3,1,2025-05-21 00:00:00,overcast clouds,15.20,14.66,2.74,0.00,2025-05-20 15:37:25
4,1,2025-05-21 03:00:00,light rain,11.40,10.29,3.02,0.37,2025-05-20 15:37:25
...,...,...,...,...,...,...,...,...
195,5,2025-05-25 00:00:00,broken clouds,12.62,12.05,6.77,0.00,2025-05-20 15:37:25
196,5,2025-05-25 03:00:00,overcast clouds,12.56,11.75,5.42,0.00,2025-05-20 15:37:25
197,5,2025-05-25 06:00:00,broken clouds,11.45,10.50,5.04,0.00,2025-05-20 15:37:25
198,5,2025-05-25 09:00:00,broken clouds,15.48,14.34,5.83,0.00,2025-05-20 15:37:25


In [34]:
# Append the weather forecast rows to the SQL table `weather` (the dataframe index is not written)
weather_df.to_sql("weather", con=connection_string, if_exists="append", index=False)

200

## 3.3 Fetch and export airport data (requires section 3.1 being finished)

In [35]:
# Look up the airports around each city of the SQL city table;
# every returned row is tagged with the city_id the search was run for
cities_airports_df = get_airport_info(city_from_sql)
cities_airports_df

Unnamed: 0,icao,iata,airport_name,shortName,municipalityName,countryCode,timeZone,latitude,longitude,city_id
0,EDDT,TXL,Berlin -Tegel,-Tegel,Berlin,DE,Europe/Berlin,52.5597,13.287699,1
1,EDDB,BER,Berlin Brandenburg,Brandenburg,Berlin,DE,Europe/Berlin,52.35139,13.493889,1
2,EDDH,HAM,Hamburg,Hamburg,Hamburg,DE,Europe/Berlin,53.6304,9.988229,2
3,EDDM,MUC,Munich,Munich,Munich,DE,Europe/Berlin,48.3538,11.7861,3
4,EDDK,CGN,Cologne Bonn,Bonn,Cologne,DE,Europe/Berlin,50.8659,7.142739,4
5,EGLC,LCY,London City,City,London,GB,Europe/London,51.5053,0.055277,5
6,EGLL,LHR,London Heathrow,Heathrow,London,GB,Europe/London,51.4706,-0.461941,5


In [36]:
# Reduce the city/airport pairs to one row per airport (icao, iata, airport_name)
airports_df = create_airport_df(cities_airports_df)
airports_df

Unnamed: 0,icao,iata,airport_name
0,EDDT,TXL,Berlin -Tegel
1,EDDB,BER,Berlin Brandenburg
2,EDDH,HAM,Hamburg
3,EDDM,MUC,Munich
4,EDDK,CGN,Cologne Bonn
5,EGLC,LCY,London City
6,EGLL,LHR,London Heathrow


In [37]:
# Append the airport rows to the SQL table `airports` (the dataframe index is not written)
airports_df.to_sql("airports", con=connection_string, if_exists="append", index=False)

7

In [38]:
# Append the city/airport pairs to the SQL table `cities_airports` (the dataframe index is not written)
cities_airports_df.to_sql("cities_airports", con=connection_string, if_exists="append", index=False)

7

## 3.4 Filling the database with flight info

In [None]:
# Fetch flights for the airports: get_flights requests tomorrow's arrivals
# for every ICAO code in airports_df
flights_df = get_flights(airports_df['icao'])
flights_df

Unnamed: 0,flight_num,departure_icao,arrival_icao,arrival_time,retrieval_time
0,XQ 1768,LTFH,EDDB,2025-05-21 06:45:00,2025-05-20 15:42:41
1,HU 489,ZBAA,EDDB,2025-05-21 06:45:00,2025-05-20 15:42:41
2,DL 92,KJFK,EDDB,2025-05-21 07:00:00,2025-05-20 15:42:41
3,VS 3846,KJFK,EDDB,2025-05-21 07:00:00,2025-05-20 15:42:41
4,KL 6153,KJFK,EDDB,2025-05-21 07:00:00,2025-05-20 15:42:41
...,...,...,...,...,...
3816,AI 7914,LPPT,EGLL,2025-05-21 22:55:00,2025-05-20 15:42:41
3817,AC 2708,LPPT,EGLL,2025-05-21 22:55:00,2025-05-20 15:42:41
3818,TP 1366,LPPT,EGLL,2025-05-21 22:55:00,2025-05-20 15:42:41
3819,AD 7174,LPPT,EGLL,2025-05-21 22:55:00,2025-05-20 15:42:41


In [40]:
# Append the flight rows to the SQL table `flights` (the dataframe index is not written)
flights_df.to_sql("flights", con=connection_string, if_exists="append", index=False)

3821

# 4 Updating the weather and flight data on local database

In [None]:
# Fetch an update to the weather and flight data and automatically update the SQL tables.
# The new rows are appended to the existing `weather` and `flights` tables of the local database.
update_weather_flights(local_con)

'success'

# 5 Transferring data from local to cloud

In [None]:
# Importing the dataframes from local SQL: the four static tables first, then the two dynamic ones
city_df, population_df, airports_df, cities_airports_df = fetch_static_data(local_con)
weather_df, flights_df = fetch_dynamic_data(local_con)

In [16]:
# Exporting the dataframes into cloud SQL.
# Every helper writes with if_exists='append', so re-running this cell submits the same rows again.
submit_static_data(city_df, population_df, airports_df, cities_airports_df, cloud_con)
submit_weather_data(weather_df, cloud_con)
submit_flights_data(flights_df, cloud_con)

'success'