In [1]:
!pip install sqlalchemy
!pip install pymysql
import os
import re
from datetime import datetime, timedelta
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup
from pytz import timezone



In [2]:
# Extracting a list of the biggest European cities
def list_cities():
    """Return the city names scraped from Wikipedia's EU-cities-by-population list.

    Downloads the page, takes the first ``<tbody>`` in the document and, for
    every element inside it that carries ``width="23"``, collects the text of
    the closest preceding ``<a>`` tag.

    Returns:
    list[str]: one entry per matched element, in page order.
    """
    page_url = "https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits"
    page = requests.get(page_url)
    soup = BeautifulSoup(page.content, 'html.parser')
    first_table_body = soup.find("tbody")
    # NOTE(review): presumably the width="23" elements are the per-row flag
    # images next to each city link -- confirm against the live page markup.
    row_markers = first_table_body.find_all(width="23")
    return [marker.find_previous("a").get_text() for marker in row_markers]

In [3]:
# Cities: scrape population, latitude, longitude and country from Wikipedia
def cities(example):
    """Scrape basic facts about one city from its English Wikipedia infobox.

    Parameters:
    - example (str): The name of the city to fetch information for. Spaces are
      replaced with underscores to build the Wikipedia URL.

    Returns:
    pandas.DataFrame: a single row with the columns ``city_name``,
    ``population``, ``latitude``, ``longitude`` and ``country``. Any field that
    cannot be found (or the whole row's data, when the page has no settlement
    infobox) is ``None``. ``latitude``/``longitude`` are the raw infobox text,
    not decimal degrees.
    """
    # Wikipedia page titles use underscores instead of spaces.
    page_slug = example.replace(" ", "_")
    response = requests.get(f"https://en.wikipedia.org/wiki/{page_slug}")
    soup = BeautifulSoup(response.content, 'html.parser')

    # Only pages with a settlement infobox carry the fields we need.
    infobox = soup.find("table", class_="infobox ib-settlement vcard")

    population = None
    latitude = None
    longitude = None
    country = None

    if infobox:
        # Population: value cell following the first string mentioning "population".
        population_label = infobox.find(string=re.compile("population", re.IGNORECASE))
        if population_label:
            try:
                raw_population = population_label.find_next("td").get_text()
                # Drop footnote markers such as "[2]" so "3,677,472[2]" still parses
                # instead of being discarded.
                raw_population = re.sub(r"\[[^\]]*\]", "", raw_population)
                population = int(raw_population.replace(",", ""))
            except (ValueError, AttributeError):
                population = None

        latitude_node = infobox.find(class_="latitude")
        if latitude_node is not None:
            latitude = latitude_node.get_text()

        longitude_node = infobox.find(class_="longitude")
        if longitude_node is not None:
            longitude = longitude_node.get_text()

        # Country: value cell following the first string mentioning "country".
        country_label = infobox.find(string=re.compile("country", re.IGNORECASE))
        country_cell = country_label.find_next("td") if country_label else None
        if country_cell is not None:
            country = country_cell.get_text()

    # NOTE(review): the stored name is the underscored URL slug (e.g. "The_Hague"),
    # kept as-is for compatibility with existing data -- confirm whether the
    # human-readable name should be stored instead.
    return pd.DataFrame({
        "city_name": [page_slug],
        "population": [population],
        "latitude": [latitude],
        "longitude": [longitude],
        "country": [country],
    })

In [17]:
def dms_to_dd(coord_str):
    """Convert a degrees/minutes/seconds coordinate string to signed decimal degrees.

    Accepts strings such as ``"52°31′12″N"``, ``"3°42′W"`` or ``"48°"``.
    Minutes, seconds and the hemisphere letter are optional; each numeric part
    may carry a decimal fraction.

    Parameters:
    - coord_str (str): coordinate text starting with the degrees value.

    Returns:
    float: decimal degrees. Southern latitudes and western longitudes
    (hemisphere ``S`` or ``W``) are returned as negative values; a string
    without a hemisphere letter is treated as positive.

    Raises:
    ValueError: if the string does not start with ``<number>°``.
    """
    number = r'(\d+(?:\.\d+)?)'
    pattern = re.compile(
        number + r'°\s*(?:' + number + r'′)?\s*(?:' + number + r'″)?\s*([NSEWnsew])?'
    )
    match = pattern.match(coord_str)

    if not match:
        raise ValueError(f"Invalid coordinate format: {coord_str}")

    degrees, minutes, seconds, hemisphere = match.groups()
    dd = float(degrees) + float(minutes or 0) / 60 + float(seconds or 0) / (60 * 60)

    # Without the sign, every western/southern coordinate would be mirrored
    # onto the eastern/northern hemisphere.
    if hemisphere and hemisphere.upper() in ("S", "W"):
        dd = -dd

    return dd

In [5]:
def country_cars():
    """Scrape country names and vehicle figures from Wikipedia.

    Reads the first ``<tbody>`` of the "List of countries by vehicles per
    capita" page. For every ``<span class="flagicon">`` in it, the country name
    is the text of the next ``<a>`` and the figure is the text of the second
    ``<td>`` that follows the flag.

    Returns:
    pandas.DataFrame: columns ``country`` (str) and ``cars`` (int).

    Raises:
    ValueError: if a figure cell is not an integer once thousands separators
    and footnote markers have been removed.
    """
    url = "https://en.wikipedia.org/wiki/List_of_countries_by_vehicles_per_capita"
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Restrict the search to the body of the first table on the page.
    table_body = soup.find("tbody")
    flag_spans = table_body.find_all("span", class_="flagicon")

    countries = []
    vehicle_counts = []
    for flag in flag_spans:
        countries.append(flag.find_next("a").get_text())

        raw_count = flag.find_next("td").find_next("td").get_text()
        # Remove whole footnote markers ("[12]"), not just the brackets:
        # stripping only "[" and "]" would turn "628[12]" into 62812.
        cleaned = re.sub(r"\[[^\]]*\]", "", raw_count).replace(",", "").strip()
        vehicle_counts.append(int(cleaned))

    return pd.DataFrame({"country": countries, "cars": vehicle_counts})

In [6]:
def weather_request(example, information_df, api_key=None):
    """Fetch the OpenWeatherMap forecast for one city.

    The city's coordinates are looked up in ``information_df``, resolved to an
    OpenWeatherMap city id via the current-weather endpoint, and that id is
    then used to request the forecast list.

    Parameters:
    - example (str): value to match in ``information_df["city_name"]``.
    - information_df (pandas.DataFrame): must contain the columns
      ``city_name``, ``latitude`` and ``longitude``.
    - api_key (str, optional): OpenWeatherMap API key. Defaults to the
      ``OPENWEATHER_API_KEY`` environment variable.

    Returns:
    pandas.DataFrame: one row per forecast entry with the columns
    ``city_name``, ``forecast_time``, ``temperature``, ``forecast``,
    ``rain_in_last_3h``, ``wind_speed`` and ``retrieval_time``. The frame is
    empty (same columns) when a request fails or no entries are returned.

    Raises:
    RuntimeError: if no API key is supplied or configured.
    IndexError: if ``example`` is not present in ``information_df``.
    """
    api_key = api_key or os.environ.get("OPENWEATHER_API_KEY")
    if not api_key:
        raise RuntimeError(
            "OpenWeatherMap API key missing: set the OPENWEATHER_API_KEY "
            "environment variable or pass api_key=..."
        )

    columns = ["city_name", "forecast_time", "temperature", "forecast",
               "rain_in_last_3h", "wind_speed", "retrieval_time"]

    berlin_timezone = timezone('Europe/Berlin')
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")

    city_row = information_df.loc[information_df["city_name"] == example]
    latitude = city_row["latitude"].values[0]
    longitude = city_row["longitude"].values[0]

    try:
        # Step 1: resolve the coordinates to an OpenWeatherMap city id.
        lookup = requests.get(
            "https://api.openweathermap.org/data/2.5/weather",
            params={"lat": latitude, "lon": longitude, "appid": api_key},
            timeout=30,
        )
        lookup.raise_for_status()
        city_id_openweather = lookup.json()["id"]

        # Step 2: request the forecast for that city id.
        forecast_response = requests.get(
            "https://api.openweathermap.org/data/2.5/forecast",
            params={"id": city_id_openweather, "appid": api_key},
            timeout=30,
        )
        forecast_response.raise_for_status()
        forecast_items = forecast_response.json().get("list", [])
    except (requests.exceptions.RequestException, KeyError, ValueError) as e:
        # Only the exception type is printed: the full message of an HTTP error
        # contains the request URL, which includes the API key.
        print(f"Error in request for latitude {latitude}, longitude {longitude}: {type(e).__name__}")
        return pd.DataFrame(columns=columns)

    weather_items = [
        {
            "city_name": example,
            "forecast_time": item.get("dt_txt", None),
            "temperature": (item.get("main") or {}).get("temp", None),
            "forecast": (item.get("weather") or [{}])[0].get("main", None),
            # The "rain" object is absent when no rain is forecast -> 0.
            "rain_in_last_3h": (item.get("rain") or {}).get("3h", 0),
            "wind_speed": (item.get("wind") or {}).get("speed", None),
            "retrieval_time": retrieval_time,
        }
        for item in forecast_items
    ]

    # Built once, with explicit columns, so an empty forecast list still
    # yields a well-formed (empty) frame.
    return pd.DataFrame(weather_items, columns=columns)

In [7]:
def airport_location(latitude_decimal, longitude_decimal, api_key=None):
    """Find airports near a coordinate via the AeroDataBox location search.

    The search uses a 50 km radius, a limit of 10 results and
    ``withFlightInfoOnly=true``.

    Parameters:
    - latitude_decimal: latitude in decimal degrees.
    - longitude_decimal: longitude in decimal degrees.
    - api_key (str, optional): RapidAPI key. Defaults to the ``RAPIDAPI_KEY``
      environment variable.

    Returns:
    pandas.DataFrame: columns ``Airport_IATA`` and ``City`` (the airport's
    ``municipalityName``), one row per returned airport that has an IATA code.
    When the request fails or does not answer with HTTP 200, a single row of
    ``None`` values is returned.

    Raises:
    RuntimeError: if no API key is supplied or configured.
    """
    api_key = api_key or os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "RapidAPI key missing: set the RAPIDAPI_KEY environment variable "
            "or pass api_key=..."
        )

    url = "https://aerodatabox.p.rapidapi.com/airports/search/location"
    querystring = {"lat": str(latitude_decimal), "lon": str(longitude_decimal), "radiusKm": "50", "limit": "10", "withFlightInfoOnly": "true"}
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    try:
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
    except requests.exceptions.RequestException:
        # Network failures are treated like a non-200 answer below.
        response = None

    if response is not None and response.status_code == 200:
        # Airports without an IATA code cannot be queried for schedules later,
        # so they are skipped instead of raising KeyError.
        airports = [a for a in response.json().get("items", []) if a.get("iata")]
        airport_iata = [a["iata"] for a in airports]
        city_names = [a.get("municipalityName", None) for a in airports]
    else:
        airport_iata = [None]
        city_names = [None]

    return pd.DataFrame({"Airport_IATA": airport_iata, "City": city_names})

In [8]:
def airport_schedule(iata_name, api_key=None):
    """Fetch tomorrow's 00:00-11:59 arrivals for one airport from AeroDataBox.

    "Tomorrow" is derived from ``datetime.now()``, i.e. the local clock of the
    machine running the notebook.

    Parameters:
    - iata_name (str): IATA code of the airport, e.g. ``"BER"``.
    - api_key (str, optional): RapidAPI key. Defaults to the ``RAPIDAPI_KEY``
      environment variable.

    Returns:
    pandas.DataFrame: columns ``airport_arrivals`` (scheduled local arrival
    time), ``airlines`` (airline name), ``iata_name`` and ``status``; one row
    per arrival, with ``None`` for any field missing in the response. When the
    request fails or does not answer with HTTP 200, a single row of ``None``
    values is returned.

    Raises:
    RuntimeError: if no API key is supplied or configured.
    """
    api_key = api_key or os.environ.get("RAPIDAPI_KEY")
    if not api_key:
        raise RuntimeError(
            "RapidAPI key missing: set the RAPIDAPI_KEY environment variable "
            "or pass api_key=..."
        )

    columns = ["airport_arrivals", "airlines", "iata_name", "status"]

    tomorrow = (datetime.now().date() + timedelta(days=1)).strftime("%Y-%m-%dT")
    url = f"https://aerodatabox.p.rapidapi.com/flights/airports/iata/{iata_name}/{tomorrow}00:00/{tomorrow}11:59"

    querystring = {"withLeg": "true", "withCancelled": "true", "withCodeshared": "true", "withCargo": "true",
                   "withPrivate": "true", "withLocation": "false"}
    headers = {
        "X-RapidAPI-Key": api_key,
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    try:
        response = requests.get(url, headers=headers, params=querystring, timeout=30)
    except requests.exceptions.RequestException:
        # Network failures are treated like a non-200 answer below.
        response = None

    if response is None or response.status_code != 200:
        return pd.DataFrame([dict.fromkeys(columns, None)], columns=columns)

    rows = []
    for arrival in response.json().get("arrivals", []):
        # Every level is optional in the payload; fall back to None.
        scheduled = (arrival.get("arrival") or {}).get("scheduledTime") or {}
        airline = arrival.get("airline") or {}
        rows.append({
            "airport_arrivals": scheduled.get("local"),
            "airlines": airline.get("name"),
            "iata_name": iata_name,
            "status": arrival.get("status"),
        })

    return pd.DataFrame(rows, columns=columns)

In [9]:
def connection():
    """Build the SQLAlchemy connection URL for the local ``gans`` MySQL schema.

    The password is read from the ``GANS_DB_PASSWORD`` environment variable so
    it is not stored in the notebook, and it is URL-escaped so characters such
    as ``@``, ``/`` or ``:`` cannot break the URL.

    Returns:
    str: ``mysql+pymysql://root:<password>@127.0.0.1:3306/gans``.

    Raises:
    RuntimeError: if ``GANS_DB_PASSWORD`` is not set.
    """
    schema = "gans"
    host = "127.0.0.1"
    user = "root"
    port = 3306

    password = os.environ.get("GANS_DB_PASSWORD")
    if password is None:
        raise RuntimeError(
            "Database password missing: set the GANS_DB_PASSWORD environment variable."
        )

    return f'mysql+pymysql://{user}:{quote_plus(password)}@{host}:{port}/{schema}'

In [10]:
def get_cities_data(connection_string):
    """Load the ``cities`` table into a DataFrame.

    ``connection_string`` is passed to ``pandas.read_sql`` as its ``con``
    argument.
    """
    table_name = "cities"
    cities_table = pd.read_sql(table_name, con=connection_string)
    return cities_table

In [12]:
def get_countries_data(connection_string):
    """Load the ``countries`` table into a DataFrame.

    ``connection_string`` is passed to ``pandas.read_sql`` as its ``con``
    argument.
    """
    table_name = "countries"
    countries_table = pd.read_sql(table_name, con=connection_string)
    return countries_table

In [23]:
def clean_tables_sql():
    """Drop the ``gans`` database and recreate it with empty tables.

    WARNING: destructive -- all existing data in ``gans`` is deleted.

    Tables created: ``countries``, ``cars``, ``cities``, ``information``,
    ``weather``, ``airports`` and ``airport_sheduldes`` (the misspelt name is
    kept because the loading code writes to it).

    All statements run on one connection, so ``USE gans`` is guaranteed to
    apply to the ``CREATE TABLE`` statements that follow. Statements are
    wrapped in ``text()`` and executed through a ``Connection`` because
    ``Engine.execute`` no longer exists in SQLAlchemy 2.0.

    Returns:
    None
    """
    from sqlalchemy import create_engine, text

    statements = [
        # Drop and recreate the database
        "DROP DATABASE IF EXISTS gans;",
        "CREATE DATABASE gans;",
        "USE gans;",
        """
        CREATE TABLE countries (
            country_id INT AUTO_INCREMENT,
            country VARCHAR(225),
            PRIMARY KEY (country_id)
        );
        """,
        """
        CREATE TABLE cars (
            cars INT,
            country_id INT,
            FOREIGN KEY (country_id) REFERENCES countries(country_id)
        );
        """,
        """
        CREATE TABLE cities (
            city_id INT AUTO_INCREMENT,
            city_name VARCHAR(255),
            country_id INT,
            PRIMARY KEY (city_id),
            FOREIGN KEY (country_id) REFERENCES countries(country_id)
        );
        """,
        """
        CREATE TABLE information (
            population INT,
            latitude FLOAT,
            longitude FLOAT,
            city_id INT,
            country_id INT,
            FOREIGN KEY (city_id) REFERENCES cities(city_id)
        );
        """,
        # forecast_time / retrieval_time are DATETIME: the values written carry
        # a time of day ("%Y-%m-%d %H:%M:%S"), which a DATE column would drop.
        """
        CREATE TABLE weather (
            city_id INT,
            country_id INT,
            forecast_time DATETIME,
            temperature FLOAT,
            forecast VARCHAR(255),
            rain_in_last_3h FLOAT,
            wind_speed FLOAT,
            retrieval_time DATETIME,
            FOREIGN KEY (city_id) REFERENCES cities(city_id),
            FOREIGN KEY (country_id) REFERENCES countries(country_id)
        );
        """,
        """
        CREATE TABLE airports (
            airport_id INT AUTO_INCREMENT,
            city_id INT,
            country_id INT,
            Airport_IATA VARCHAR(3),
            PRIMARY KEY (airport_id),
            FOREIGN KEY (city_id) REFERENCES cities(city_id),
            FOREIGN KEY (country_id) REFERENCES countries(country_id)
        );
        """,
        """
        CREATE TABLE airport_sheduldes (
            airport_arrivals DATETIME,
            airlines VARCHAR(255),
            status_ VARCHAR(255),
            airport_id INT,
            city_id INT,
            country_id INT,
            FOREIGN KEY (airport_id) REFERENCES airports(airport_id),
            FOREIGN KEY (city_id) REFERENCES cities(city_id),
            FOREIGN KEY (country_id) REFERENCES countries(country_id)
        );
        """,
    ]

    # Connect to MySQL server
    engine = create_engine(connection())
    try:
        with engine.begin() as conn:
            for statement in statements:
                conn.execute(text(statement))
    finally:
        # Release pooled connections; each call creates its own engine.
        engine.dispose()

In [29]:
def update():
    """Run the full extract / transform / store pipeline once.

    Extract: scrape the city list and per-city facts, convert coordinates to
    decimal degrees, then fetch weather forecasts, nearby airports, airport
    arrival schedules and the per-country vehicle figures.

    Store: call ``clean_tables_sql()`` (which drops and recreates the
    database), then append to the tables ``countries``, ``cars``, ``cities``,
    ``information``, ``weather``, ``airports`` and ``airport_sheduldes``,
    reading back the generated ids after each parent table is written.

    Returns:
    None
    """

    def _combine(frames):
        # Single concat instead of growing a frame inside the loop; an empty
        # input yields an empty frame, as the incremental version did.
        return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    ####################################################---Extract---#####################################################
    information_df = _combine([cities(name) for name in list_cities()])

    # Removing null values. The explicit copy makes the column assignments
    # below operate on an independent frame rather than on a filtered view.
    has_core_fields = (
        information_df["population"].notnull()
        & information_df["latitude"].notnull()
        & information_df["longitude"].notnull()
    )
    information_df = information_df.loc[has_core_fields, :].copy()

    # Converting latitude and longitude into decimal
    information_df["latitude"] = information_df["latitude"].map(dms_to_dd)
    information_df["longitude"] = information_df["longitude"].map(dms_to_dd)

    weather_df_final = _combine(
        [weather_request(city, information_df) for city in information_df["city_name"]]
    )

    airports_final = _combine(
        [
            airport_location(latitude, longitude)
            for latitude, longitude in zip(information_df["latitude"], information_df["longitude"])
        ]
    )

    # Query each airport once: neighbouring cities return the same airports,
    # and failed lookups contribute null codes. Repeating the request would
    # spend API quota and duplicate the schedule rows.
    iata_codes = airports_final["Airport_IATA"].dropna().unique()
    airport_schedule_final = _combine([airport_schedule(code) for code in iata_codes])

    country_final = country_cars()

    ####################################################---Transform & STORE---#####################################################

    clean_tables_sql()
    db_url = connection()

    ##################   COUNTRIES
    countries_for_sql = country_final.drop("cars", axis=1)
    countries_for_sql.to_sql('countries',
                             if_exists='append',
                             con=db_url,
                             index=False,
                             )
    # Read back to obtain the auto-generated country_id values.
    countries_from_sql = pd.read_sql("countries", con=db_url)

    ################   CARS
    cars_for_sql = countries_from_sql.merge(country_final,
                                            on="country",
                                            how="left",
                                            ).drop("country", axis=1)

    cars_for_sql.to_sql('cars',
                        if_exists='append',
                        con=db_url,
                        index=False,
                        )

    ################   CITIES
    # Cities whose scraped country text has no match in `countries` are dropped.
    cities_for_sql = (
        information_df.loc[:, ["city_name", "country"]]
        .merge(countries_from_sql, on="country", how="left")
        .drop("country", axis=1)
        .dropna(subset=["country_id"])
        .astype({"country_id": int})
    )

    cities_for_sql.to_sql('cities',
                          if_exists='append',
                          con=db_url,
                          index=False,
                          )

    # Read back to obtain the auto-generated city_id values.
    cities_from_sql = pd.read_sql("cities", con=db_url)

    ################ INFORMATION
    information_final = information_df.merge(cities_from_sql,
                                             on="city_name",
                                             how="inner").drop(["city_name", "country"], axis=1)

    information_final.to_sql('information',
                             if_exists='append',
                             con=db_url,
                             index=False,
                             )

    ################ WEATHER
    weather_df_final = weather_df_final.merge(cities_from_sql,
                                              on="city_name",
                                              how="inner").drop("city_name", axis=1)

    weather_df_final.to_sql('weather',
                            if_exists='append',
                            con=db_url,
                            index=False,
                            )

    ################ AIRPORTS
    # Airports are attached to a city by matching the airport's municipality
    # name against the stored city name.
    airports_final = airports_final.merge(cities_from_sql,
                                          right_on="city_name",
                                          left_on="City",
                                          how="inner").drop(["City", "city_name"], axis=1)

    airports_final = airports_final.drop_duplicates()

    airports_final.to_sql('airports',
                          if_exists='append',
                          con=db_url,
                          index=False,
                          )

    # Read back to obtain the auto-generated airport_id values.
    airports_from_sql = pd.read_sql("airports", con=db_url)

    ################ AIRPORTS  SHEDULDES
    airport_schedule_final = airport_schedule_final.merge(
        airports_from_sql,
        right_on="Airport_IATA",
        left_on="iata_name",
        how="inner",
    ).drop(["iata_name", "Airport_IATA"], axis=1).rename(columns={"status": "status_"})

    # Arrival times are normalised to UTC, then formatted as plain strings in
    # the layout expected by the DATETIME column.
    airport_schedule_final['airport_arrivals'] = pd.to_datetime(airport_schedule_final['airport_arrivals'], utc=True)
    airport_schedule_final['airport_arrivals'] = airport_schedule_final['airport_arrivals'].dt.strftime(
        '%Y-%m-%d %H:%M:%S')

    airport_schedule_final.to_sql('airport_sheduldes',
                                  if_exists='append',
                                  con=db_url,
                                  index=False,
                                  )

In [30]:
# Run the whole pipeline. This drops and recreates the `gans` database
# (via clean_tables_sql) before reloading every table.
update()

In [32]:
# Spot check: arrivals returned for IATA code "BER" in tomorrow's 00:00-11:59 window.
airport_schedule("BER")

Unnamed: 0,airport_arrivals,airlines,iata_name,status
0,2024-01-16 06:50+01:00,Qatar Airways,BER,Expected
1,2024-01-16 07:55+01:00,Aegean,BER,Expected
2,2024-01-16 07:55+01:00,Eurowings,BER,Expected
3,2024-01-16 07:35+01:00,Ryanair,BER,Expected
4,2024-01-16 07:55+01:00,Air Serbia,BER,Expected
5,2024-01-16 07:55+01:00,Lufthansa,BER,Expected
6,2024-01-16 07:55+01:00,airBaltic,BER,Expected
7,2024-01-16 07:55+01:00,United,BER,Expected
8,2024-01-16 07:45+01:00,Wizz Air,BER,Expected
9,2024-01-16 07:50+01:00,Wizz Air,BER,Expected
