In [1]:
# install all needed libraries
!pip install sqlalchemy
!pip install pymysql
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd
import requests
from datetime import datetime
from pytz import timezone



In [2]:
#Exctracting a list of the biggest european cities

url_loop = "https://en.wikipedia.org/wiki/List_of_cities_in_the_European_Union_by_population_within_city_limits"

response = requests.get(url_loop)

original_soup_cities = BeautifulSoup(response.content, 'html.parser')

#Limiting data to the Y-Axis
side_table_cities = original_soup_cities.find("tbody")
    
list_cities = side_table_cities.find_all(width="23")

#Creating list of cities
list_cities_extract = []
for i in range(len(list_cities)):
    list_cities_extract.append(list_cities[i].find_previous("a").get_text())

In [5]:
#First, I will create all needed functions, second step will be to store information correctly in the DataFrames, 
#and as third step reflecting it into SQL

In [3]:
city_name = []
population = []
latitude = []
longitude = []
country = []
date_forecast = []
temperature = []
forecast = []
rain_in_last_3h = []
wind_speed = []

In [4]:
#Cities : Function for Web Scraping population, latitude, country
def cities(example):
    """
    Scrapes information from Wikipedia about a city and appends the data to global lists.

    Parameters:
    - example (str): The name of the city to fetch information for.

    Returns:
    None

    Global Lists Modified:
    - city_name (list): List of city names.
    - population (list): List of city populations.
    - latitude (list): List of city latitudes.
    - longitude (list): List of city longitudes.
    - country (list): List of city countries.
    """

    # Exit early if the city is already in the list
    if example in city_name:
        return
    
    # Append the city name to the global list
    city_name.append(example)
    
    # Replace spaces with underscores in the city name for constructing the Wikipedia URL
    example = example.replace(" ", "_")
    
    # Construct the Wikipedia URL for the city
    url_loop = f"https://en.wikipedia.org/wiki/{example}"

    # Make a request to the Wikipedia page
    response = requests.get(url_loop)

    # Parse the HTML content of the Wikipedia page
    original_soup = BeautifulSoup(response.content, 'html.parser')

    # Extract information from the infobox if it exists on the Wikipedia page
    side_table = original_soup.find("table", class_="infobox ib-settlement vcard")

    if side_table:
        # Extract population information
        population_value = side_table.find(string=re.compile("population", re.IGNORECASE))
        try:
            population.append(int(population_value.find_next("td").get_text().replace(",", "")) if population_value else None)
        except (ValueError, AttributeError):
            population.append(None)

        # Extract latitude information
        latitude_value = side_table.find(class_="latitude")
        latitude.append(latitude_value.get_text() if latitude_value else None)
        
        # Extract longitude information
        longitude_value = side_table.find(class_="longitude")
        longitude.append(longitude_value.get_text() if longitude_value else None)

        # Extract country information
        country_value = side_table.find(string=re.compile("country", re.IGNORECASE))
        country.append(country_value.find_next("td").get_text() if country_value else None)
    else:
        # If the table is not found, set NaN values for all fields
        population.append(None)
        latitude.append(None)
        longitude.append(None)
        country.append(None)

In [5]:
#Applying function "Cities" to this new list
for i in list_cities_extract:
    cities(i)

In [7]:
#Changing coordinates to decimals
import re

"""
Convert coordinates in degrees, minutes, and seconds (DMS) format to decimal degrees (DD).

Parameters:
- coord_str (str): String representation of coordinates in DMS format, e.g., '34°25′12″N'.

Returns:
float: Decimal degrees representation of the coordinates.

Raises:
ValueError: If the coordinate format is invalid.

Example:
>>> dms_to_dd('34°25′12″N')
34.42
"""

def dms_to_dd(coord_str):
    # Remove letters "W", "S", "N", or "E"
    coord_str = re.sub(r'[WwSsNnEe]', '', coord_str)
    
    # Regular expression to extract degrees, optional minutes, and optional seconds
    pattern = re.compile(r'(\d+)°(?:(\d+)′)?(?:(\d+)″)?')
    match = pattern.match(coord_str)
    
    if match:
        degrees, minutes, seconds = map(int, match.groups(default='0'))
        dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60)
        
        return dd
    else:
        raise ValueError(f"Invalid coordinate format: {coord_str}")

In [8]:
#Extracting a list of cars per country from a Wikipedia catalog (Total = 150 entries)
def country_cars():
    url_loop = "https://en.wikipedia.org/wiki/List_of_countries_by_vehicles_per_capita"

    response = requests.get(url_loop)

    original_soup_autos = BeautifulSoup(response.content, 'html.parser')
    
    #Limiting data to the Y-Axis
    side_table_autos = original_soup_autos.find("tbody")
    
    list_countries = side_table_autos.find_all("span", class_="flagicon")
    
    #Creating list of countries and amount of vehicles
    list_countries_extract = []
    list_countries_extract_autos = []
    for i in range(len(list_countries)):
        list_countries_extract.append(list_countries[i].find_next("a").get_text())
        list_countries_extract_autos.append(list_countries[i].find_next("td").find_next("td").get_text())
    for n in range(len(list_countries_extract_autos)):
        list_countries_extract_autos[n] = list_countries_extract_autos[n].replace(",","")
        list_countries_extract_autos[n] = list_countries_extract_autos[n].replace("\n","")
        list_countries_extract_autos[n] = list_countries_extract_autos[n].replace("[","")
        list_countries_extract_autos[n] = list_countries_extract_autos[n].replace("]","")
        list_countries_extract_autos[n] = int(list_countries_extract_autos[n])
        
    countries_df = pd.DataFrame({"country":list_countries_extract, "cars":list_countries_extract_autos})
    return countries_df

In [42]:
countries_df = country_cars()

In [37]:
#Next step is to create the function that gets:
#1) Name of the city using latitude and longitude
#2) Weather forecast in each city of the list

In [10]:
def weather_request(example):
    berlin_timezone = timezone('Europe/Berlin')
    retrieval_time = datetime.now(berlin_timezone).strftime("%Y-%m-%d %H:%M:%S")
    weather_items = []
    latitude = information_df.loc[information_df["city_name"] == example, "latitude"].values[0]
    longitude = information_df.loc[information_df["city_name"] == example, "longitude"].values[0]

    # Construct the API request URL
    url_map = f"http://api.openweathermap.org/data/2.5/weather?lat={latitude}&lon={longitude}&appid=4bcaab846859609d453e249b3b3f0a16"
    
    # Make the API request and handle potential errors
    try:
        city_id_openweather = requests.get(url_map)
        city_id_openweather.raise_for_status()  # Raise an error for bad responses (non-2xx status codes)
        city_id_openweather = city_id_openweather.json()
        city_id_openweather = city_id_openweather["id"]
        
    except requests.exceptions.RequestException as e:
        # Handle request exceptions (e.g., network issues, API errors)
        print(f"Error in request for latitude {latitude}, longitude {longitude}: {e}")
        temperature_request = None  # Append None for temperature to indicate an error
        sky_request = None  # Append None for sky description to indicate an error
    
    url = f"http://api.openweathermap.org/data/2.5/forecast?id={city_id_openweather}&appid=4bcaab846859609d453e249b3b3f0a16"
 
    response = requests.get(url)
    
    response = response.json()
    
    for item in response["list"]:
        weather_item = {
        "city_name" : example,
        "forecast_time": item.get("dt_txt", None),
        "temperature": item["main"].get("temp", None),
        "forecast": item["weather"][0].get("main", None),
        "rain_in_last_3h": item.get("rain", {}).get("3h", 0),
        "wind_speed": item["wind"].get("speed", None),
        "retrieval_time" : retrieval_time
        }
        
        weather_items.append(weather_item)
        weather_df = pd.DataFrame(weather_items)
    return weather_df

In [11]:
#Next step will be to create the functions to push DataFrames into SQL
def connection():
  schema = "gans"
  host = "127.0.0.1"
  user = "root"
  password = "Riverplate121."
  port = 3306
  return f'mysql+pymysql://{user}:{password}@{host}:{port}/{schema}'

In [12]:
def get_cities_data(connection_string):
  return pd.read_sql("cities", con=connection_string)

KeyError: 'city_name'

In [43]:
def retreiving_and_sending_data():
    connection_string=connection()
    #First step is to create send and retrieve Countries, therefore a new Dataframe needs to be created:
    countries_list_df = countries_df.loc[:,"country"].reset_index().drop("index",axis=1)
    #Second step is to send this data to sql and retrieve it to merge it
    countries_list_df.to_sql('countries',
                      if_exists='append',
                      con=connection_string,
                      index=False,
                        )
    countries_from_sql = get_countries_data(connection_string)
    #Now with the IDs, cars tables can be merged, and pushed into SQL as well
    cars = countries_from_sql.merge(countries_df,
                             on="country",
                             how="left")
    cars_df = cars.drop(["country"],axis = 1)
    cars_df.to_sql('cars',
                      if_exists='append',
                      con=connection_string,
                      index=False,
                  )
    
    #Same work has to be done with cities, information and weather
    for i in list_cities_extract:
        cities(i)
    
    #First we will create the DataFrame and erase all NaN rows
    information_df = pd.DataFrame({"city_name": city_name, "population": population, "latitude": latitude, "longitude": longitude, "country": country})
    information_df = information_df.loc[information_df["population"].notnull()&information_df["latitude"].notnull()&information_df["longitude"].notnull(),:]
    
    latitude_decimal = []
    for i in information_df["latitude"]:
        latitude_decimal.append(dms_to_dd(i))

    longitude_decimal =[]
    for n in information_df["longitude"]:
        longitude_decimal.append(dms_to_dd(n))
    
    information_df["latitude"] = latitude_decimal
    information_df["longitude"] = longitude_decimal
    
    
    cities_list = information_df.loc[:,["city_name","country"]].reset_index().drop("index",axis=1)
    cities_list = cities_list.merge(countries_from_sql,
                                    on = "country",
                                    how = "left"
                                   )
    cities_list = cities_list.drop("country",axis =1)
    cities_list.to_sql('cities',
                      if_exists='append',
                      con=connection_string,
                      index=False
                      )
    
    cities_from_sql = get_cities_data(connection_string)
    
    
    information_cities = information_df.merge(cities_from_sql,
                         on = "city_name",
                         how = "left"
                        )
    
    information_cities_countries = information_cities.merge(countries_from_sql,
                                                            on = "country",
                                                            how = "left"
                                                           )
    
    information_cities_countries = pd.DataFrame(information_cities_countries)
    information_cities_countries = information_cities_countries.drop(["city_name","country","country_id_x"],axis = 1)
    information_cities_countries = information_cities_countries.rename(columns = {"country_id_y":"country_id"})
    
    information_cities_countries.to_sql('information',
                      if_exists='append',
                      con=connection_string,
                      index=False,
                  )
    
    weather_df_final = pd.DataFrame()

    for city in information_df["city_name"]:
        result = weather_request(city)
    
    # Check if result is not empty before concatenating
        if not result.empty:
            weather_df_final = pd.concat([weather_df_final, result], ignore_index=True)
            
    weather_df_final = weather_df_final.merge(cities_from_sql,
                                              on = "city_name",
                                              how = "left"
                                             )
                                              
    weather_df_final = weather_df_final.drop("city_name",axis=1)
                                              
    weather_df_final.to_sql('weather',
                      if_exists='append',
                      con=connection_string,
                      index=False,
                           )

In [27]:
def airport_location(latitude_decimal, longitude_decimal):
    airport_iata = []
    cities = []
    url = "https://aerodatabox.p.rapidapi.com/airports/search/location"

    querystring = {"lat": str(latitude_decimal), "lon": str(longitude_decimal), "radiusKm": "50", "limit": "10", "withFlightInfoOnly": "true"}

    headers = {
        "X-RapidAPI-Key": "4b54815ac4mshe11ca88893efc1ep170ae5jsn49ac4b2d04a5",
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, params=querystring)

    if response.status_code == 200:
        airports = response.json()["items"]

        for airport in airports:
            airport_iata.append(airport["iata"])
            cities.append(airport["municipalityName"])

        # Creating a DataFrame
        airport_df = pd.DataFrame({"Airport_IATA": airport_iata, "City": cities})
        return airport_df
    else:
        return None

In [39]:
def airport_schedule(iata_name):
    from datetime import datetime, timedelta
    import requests
    import pandas as pd

    tomorrow = (datetime.now().date() + timedelta(days=1)).strftime("%Y-%m-%dT")
    url = f"https://aerodatabox.p.rapidapi.com/flights/airports/iata/{iata_name}/{tomorrow}00:00/{tomorrow}11:59"

    querystring = {"withLeg": "true", "withCancelled": "true", "withCodeshared": "true", "withCargo": "true",
                   "withPrivate": "true", "withLocation": "false"}

    headers = {
        "X-RapidAPI-Key": "4b54815ac4mshe11ca88893efc1ep170ae5jsn49ac4b2d04a5",
        "X-RapidAPI-Host": "aerodatabox.p.rapidapi.com"
    }

    response = requests.get(url, headers=headers, params=querystring)

    airport_arrivals = []
    iata_names = []  # Corrected variable name
    airlines = []
    status = []

    if response.status_code == 200:
        arrivals = response.json()["arrivals"]

        for arrival in arrivals:
            # Check if the keys exist before accessing them
            if "arrival" in arrival and "scheduledTime" in arrival["arrival"] and "local" in arrival["arrival"]["scheduledTime"]:
                airport_arrivals.append(arrival["arrival"]["scheduledTime"]["local"])
            else:
                airport_arrivals.append(None)

            iata_names.append(iata_name)  # Using the corrected variable name iata_names
           
            if "airline" in arrival and "name" in arrival["airline"]:
                airlines.append(arrival["airline"]["name"])
            else:
                airlines.append(None)
                
            if "status" in arrival:
                status.append(arrival["status"])
            else:
                status.append(None)
    else:
        airport_arrivals.append(None)
        iata_names.append(None)
        airlines.append(None)
        status.append(None)

    return pd.DataFrame({"airport_arrivals":airport_arrivals, "airlines":airlines, "iata_name":iata_names,"status":status})

In [40]:
def list_airports():
    airports_df_final = pd.DataFrame()
    airports_df = pd.DataFrame()
        for city in information_df["city_name"]:
            airports_df = airport_location(information_df.loc[information_df["city_name"]==city,"latitude"].values[0],information_df.loc[information_df["city_name"]== city,"longitude"].values[0])
          # Check if result is not empty before concatenating
        if not result.empty:
            airports_df_final = pd.concat([airports_df_final, airports_df], ignore_index=True)

IndentationError: unexpected indent (1043332848.py, line 4)

In [439]:
aux = pd.DataFrame()
airport_sheduldes_final = pd.DataFrame()

for airport in airports_df_final["Airport_IATA"]:
    aux = airport_shedulde(airport)
    if not aux.empty:
        airport_sheduldes_final = pd.concat([airport_sheduldes_final, aux], ignore_index=True)

In [349]:
#Now its time to merge the tables and export them to SQL
cities_from_sql = pd.read_sql("cities", con=connection())
airports_df_final = airports_df_final.merge(cities_from_sql,
                        left_on = "City",
                        right_on = "city_name",
                        how= "left")

In [352]:
airports_df_final = airports_df_final.drop(["City","city_name"],axis = 1)

In [356]:
import numpy as np
airports_df_final['city_id'].fillna(np.nan, inplace=True)
airports_df_final['country_id'].fillna(np.nan, inplace=True)

In [359]:
airports_df_final.to_sql('airports',
                      if_exists='append',
                      con=connection(),
                      index=False,
                           )

70

In [362]:
airports_from_sql = pd.read_sql("airports", con=connection())
airports_from_sql

Unnamed: 0,airport_id,city_id,country_id,Airport_IATA
0,1,1.0,30.0,BER
1,2,3.0,21.0,LBG
2,3,3.0,21.0,ORY
3,4,3.0,21.0,CDG
4,5,4.0,30.0,HAM
...,...,...,...,...
65,66,30.0,30.0,DTM
66,67,11.0,30.0,CGN
67,68,55.0,,MMX
68,69,,,CPH


In [17]:
#Now its necessary to perform the retrieval of the data of airports, merge it with the flights and export it to SQL
airports_from_sql = pd.read_sql("airports", con=connection())
airport_sheduldes_final = airport_sheduldes_final.merge(airports_from_sql,
                        left_on = "iata_name",
                        right_on = "Airport_IATA",
                        how= "left")


NameError: name 'airport_sheduldes_final' is not defined

In [445]:
airport_sheduldes_final = airport_sheduldes_final.drop(["iata_name"],axis = 1)

In [468]:
# Assuming "airport_arrivals" is the column containing datetime-like strings
airport_sheduldes_final['airport_arrivals'] = pd.to_datetime(airport_sheduldes_final['airport_arrivals'], utc=True)

# Format the datetime column as a string in the desired format
airport_sheduldes_final['airport_arrivals'] = airport_sheduldes_final['airport_arrivals'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [449]:
airport_sheduldes_final = airport_sheduldes_final.drop("Airport_IATA",axis = 1)

In [465]:
airport_sheduldes_final.rename(columns = {"status":"status_"},inplace = True)

In [471]:
airport_sheduldes_final = airport_sheduldes_final.to_sql("airport_sheduldes",
                                                         if_exists='append',
                                                        con=connection(),
                                                        index=False,
                                                                   )
                                                         