# Flight Status - Final Project

In [1]:
import os
import shutil
import warnings
import zipfile

import pandas as pd
#import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od
from sklearn.model_selection import train_test_split

from sqlalchemy import create_engine
import psycopg2
from dotenv import load_dotenv
import sqlite3

warnings.filterwarnings('ignore')

## TAS-12 Data Acquisition

### Data Acquisition - Use Kaggle API

[How to use Kaggle API - Step-by-step guide](https://www.geeksforgeeks.org/how-to-download-kaggle-datasets-into-jupyter-notebook/)

Almost every combined CSV file is far larger than we need (well over 60k records and more than 20 columns), so the best course of action is to use a single year of data. Here we use `Combined_Flights_2022.csv`, with shape `(4078318, 61)`, plus `Airlines.csv` for labeling if needed.

The **Data Acquisition - Use Kaggle API** section only needs to be re-run when the data has to be fetched from the Kaggle API again. After this section we clean the data and save it as CSV files in the `/data/processed/` folder. We then read those files back, create a database with four tables (`flights`, `origins`, `destinations` and `airlines`) holding the corresponding data, and work with the data by retrieving it from the database tables.

In [2]:
!kaggle datasets files robikscube/flight-delay-dataset-20182022

name                            size  creationDate         
-----------------------------  -----  -------------------  
Combined_Flights_2018.parquet  215MB  2022-10-07 16:28:11  
Combined_Flights_2021.parquet  232MB  2022-10-07 16:28:11  
Combined_Flights_2019.parquet  294MB  2022-10-07 16:28:11  
Combined_Flights_2018.csv        2GB  2022-10-07 16:28:11  
readme.md                       36KB  2022-10-07 16:28:11  
Combined_Flights_2019.csv        3GB  2022-10-07 16:28:11  
readme.html                     14KB  2022-10-07 16:28:11  
Airlines.csv                    38KB  2022-10-07 16:28:11  
Combined_Flights_2022.parquet  143MB  2022-10-07 16:28:11  
Combined_Flights_2022.csv        1GB  2022-10-07 16:28:11  
Combined_Flights_2020.csv        2GB  2022-10-07 16:28:11  
Combined_Flights_2021.csv        2GB  2022-10-07 16:28:11  
Combined_Flights_2020.parquet  175MB  2022-10-07 16:28:11  


In [3]:
!kaggle datasets download robikscube/flight-delay-dataset-20182022 -f Combined_Flights_2022.csv --path ../data/processed

Downloading Combined_Flights_2022.csv.zip to ../data/processed




  0%|          | 0.00/206M [00:00<?, ?B/s]
  0%|          | 1.00M/206M [00:00<00:41, 5.19MB/s]
  4%|▍         | 8.00M/206M [00:00<00:06, 32.5MB/s]
  6%|▋         | 13.0M/206M [00:00<00:05, 34.6MB/s]
 10%|▉         | 20.0M/206M [00:00<00:04, 42.8MB/s]
 13%|█▎        | 27.0M/206M [00:00<00:04, 42.9MB/s]
 17%|█▋        | 34.0M/206M [00:00<00:04, 37.1MB/s]
 20%|█▉        | 41.0M/206M [00:01<00:04, 38.1MB/s]
 23%|██▎       | 48.0M/206M [00:01<00:03, 44.5MB/s]
 26%|██▌       | 53.0M/206M [00:01<00:04, 35.5MB/s]
 28%|██▊       | 57.0M/206M [00:01<00:05, 28.3MB/s]
 31%|███       | 64.0M/206M [00:01<00:04, 35.3MB/s]
 34%|███▎      | 69.0M/206M [00:02<00:05, 24.6MB/s]
 36%|███▌      | 74.0M/206M [00:02<00:04, 28.8MB/s]
 39%|███▉      | 81.0M/206M [00:02<00:04, 32.4MB/s]
 43%|████▎     | 89.0M/206M [00:02<00:04, 28.0MB/s]
 47%|████▋     | 97.0M/206M [00:03<00:03, 32.8MB/s]
 51%|█████     | 105M/206M [00:03<00:02, 37.4MB/s] 
 55%|█████▍    | 113M/206M [00:03<00:02, 41.4MB/s]
 59%|█████▉    | 121M

In [4]:
!kaggle datasets download robikscube/flight-delay-dataset-20182022 -f Airlines.csv --path ../data/processed --unzip 

Downloading Airlines.csv to ../data/processed




  0%|          | 0.00/38.1k [00:00<?, ?B/s]
100%|██████████| 38.1k/38.1k [00:00<00:00, 599kB/s]


In [5]:
# Unpack the downloaded archive into the processed-data folder it sits in.
with zipfile.ZipFile('../data/processed/Combined_Flights_2022.csv.zip', mode='r') as archive:
    archive.extractall(path='../data/processed/')

In [6]:
def remove_compressed():
    """Delete the downloaded ``Combined_Flights_2022.csv.zip`` archive.

    Raises whatever ``os.remove`` raises (e.g. ``FileNotFoundError``) when
    the archive is not present.
    """
    archive_path = '../data/processed/Combined_Flights_2022.csv.zip'
    os.remove(archive_path)

In [7]:
remove_compressed()

### Data Acquisition - Clean and Store Data to CSV

In [8]:
data = pd.read_csv('../data/processed/Combined_Flights_2022.csv')

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4078318 entries, 0 to 4078317
Data columns (total 61 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   FlightDate                               object 
 1   Airline                                  object 
 2   Origin                                   object 
 3   Dest                                     object 
 4   Cancelled                                bool   
 5   Diverted                                 bool   
 6   CRSDepTime                               int64  
 7   DepTime                                  float64
 8   DepDelayMinutes                          float64
 9   DepDelay                                 float64
 10  ArrTime                                  float64
 11  ArrDelayMinutes                          float64
 12  AirTime                                  float64
 13  CRSElapsedTime                           float64
 14  ActualElapsedTime 

In [10]:
data = data.sample(
    n = 500000,
    random_state = 42
)

In [11]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 3811797 to 818443
Data columns (total 61 columns):
 #   Column                                   Non-Null Count   Dtype  
---  ------                                   --------------   -----  
 0   FlightDate                               500000 non-null  object 
 1   Airline                                  500000 non-null  object 
 2   Origin                                   500000 non-null  object 
 3   Dest                                     500000 non-null  object 
 4   Cancelled                                500000 non-null  bool   
 5   Diverted                                 500000 non-null  bool   
 6   CRSDepTime                               500000 non-null  int64  
 7   DepTime                                  485442 non-null  float64
 8   DepDelayMinutes                          485439 non-null  float64
 9   DepDelay                                 485439 non-null  float64
 10  ArrTime                        

In [12]:
data.describe()

Unnamed: 0,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,CRSElapsedTime,ActualElapsedTime,Distance,...,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,DistanceGroup,DivAirportLandings
count,500000.0,485442.0,485439.0,485439.0,484973.0,483825.0,483825.0,500000.0,483825.0,500000.0,...,485165.0,485165.0,484972.0,484972.0,500000.0,483825.0,483825.0,483825.0,500000.0,500000.0
mean,1329.12083,1333.669225,16.100501,13.176115,1457.545614,15.86781,111.126192,141.433184,135.96469,798.916914,...,16.968201,1355.691816,1454.706041,7.882546,1485.733588,7.609537,0.216299,-0.058467,3.666546,0.003784
std,490.729503,505.963408,52.437194,53.44516,543.01821,52.165167,70.0182,71.843747,71.938226,591.845864,...,9.492607,507.930657,537.736098,6.678576,518.277098,55.430586,0.411721,2.497218,2.321489,0.116592
min,1.0,1.0,0.0,-55.0,1.0,0.0,8.0,5.0,16.0,31.0,...,1.0,1.0,1.0,1.0,1.0,-91.0,0.0,-2.0,1.0,0.0
25%,914.0,917.0,0.0,-5.0,1046.0,0.0,60.0,89.0,83.0,368.0,...,11.0,932.0,1044.0,4.0,1102.0,-14.0,0.0,-1.0,2.0,0.0
50%,1320.0,1325.0,0.0,-1.0,1459.0,0.0,94.0,125.0,119.0,646.0,...,15.0,1338.0,1456.0,6.0,1512.0,-5.0,0.0,-1.0,3.0,0.0
75%,1735.0,1743.0,11.0,11.0,1913.0,10.0,141.0,172.0,167.0,1035.0,...,19.0,1757.0,1908.0,9.0,1920.0,10.0,0.0,0.0,5.0,0.0
max,2359.0,2400.0,2650.0,2650.0,2400.0,2678.0,673.0,690.0,711.0,5095.0,...,183.0,2400.0,2400.0,251.0,2359.0,2678.0,1.0,12.0,11.0,9.0


In [13]:
def check_nulls(df, percentage=10):
    """Print a summary of missing values in ``df``.

    If at least one column has more nulls than ``percentage`` percent of the
    rows, every column containing any null is listed with its null count
    (not only the columns above the threshold). Otherwise a short message
    states that no column exceeds the threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    percentage : int or float, default 10
        Threshold, as a percentage of ``len(df)``, above which a column is
        considered heavily null.

    Returns
    -------
    None
        The report is written to stdout.
    """
    # Absolute number of rows corresponding to the percentage threshold.
    percent = (percentage * len(df)) / 100
    null_counts = df.isnull().sum()

    over_threshold = null_counts[null_counts > percent]
    columns_with_nulls = null_counts[null_counts > 0]
    if len(over_threshold) > 0:
        for column, count in columns_with_nulls.items():
            print(f"Column '{column}' has {count} null values.")
    else:
        print("The null values in the dataframe don't exceed {percent} values or {percentage}% of the total data".format(percent=percent, percentage=percentage))
        print("Depending on Duplicated values we might want to consider dropping them since that low percentage of null values would hardly make any difference in the EDA or the model creation and prediction")

In [14]:
check_nulls(data)

The null values in the dataframe don't exceed 50000.0 values or 10% of the total data
Depending on Duplicated values we might want to consider dropping them since that low percentage of null values would hardly make any difference in the EDA or the model creation and prediction


In [15]:
# data.duplicated().sum()

In [16]:
def treat_nulls(value):
    """Coerce a possibly-missing scalar to a plain ``int``.

    Anything ``pd.isnull`` reports as missing becomes ``0``; every other
    value is passed through ``int()``.
    """
    return 0 if pd.isnull(value) else int(value)
    
# Define a function to transform values
def transform_time(value):
    """Convert an ``hhmm`` clock reading into an ``'HH:MM'`` string.

    Parameters
    ----------
    value : int or str
        Clock reading such as ``5``, ``930``, ``1826`` or ``'0709'``. Any
        non-string input is converted with ``int()`` first, so numpy
        integers and integral floats are accepted as well.

    Returns
    -------
    str
        The reading left-padded to four digits and split as ``'HH:MM'``.
        ``2400`` is mapped to ``'00:00'`` so that the result can be parsed
        with the ``'%H:%M'`` format.

    Raises
    ------
    ValueError, TypeError
        If a non-string ``value`` cannot be converted with ``int()``
        (for example ``NaN`` or ``None``).
    """
    if not isinstance(value, str):
        # Plain str() on a float would yield '1826.0'; going through int()
        # also covers numpy integer scalars, which have no .zfill().
        value = str(int(value))
    value = value.zfill(4)  # e.g. '5' -> '0005', '930' -> '0930'
    if value == '2400':
        return '00:00'
    # After padding, the first two characters are always the hour and the
    # rest the minutes, including for readings below 01:00.
    return f'{value[:2]}:{value[2:]}'

In [17]:
def format_dates():
    """Normalise the date and clock-time columns of the global ``data`` frame.

    ``FlightDate`` is parsed with ``pd.to_datetime``. ``WheelsOff``,
    ``WheelsOn``, ``ArrTime`` and ``DepTime`` are each passed through
    ``treat_nulls`` and ``transform_time`` and then parsed with the
    ``'%H:%M'`` format into ``datetime.time`` values. Because
    ``treat_nulls`` turns missing readings into ``0``, missing times end up
    as ``00:00``.

    The frame is modified in place; nothing is returned.
    """
    data['FlightDate'] = pd.to_datetime(data['FlightDate'])

    # The four clock-time columns all receive the same treatment.
    for column in ('WheelsOff', 'WheelsOn', 'ArrTime', 'DepTime'):
        as_int = data[column].apply(treat_nulls)
        as_text = as_int.apply(transform_time)
        data[column] = pd.to_datetime(as_text, format='%H:%M').dt.time

In [18]:
format_dates()

In [19]:
# Lookup table of origin airports: the distinct combinations of the origin
# descriptor columns found in the flights frame. Chaining drop_duplicates()
# (rather than calling it with inplace=True on a column selection) avoids
# pandas' SettingWithCopyWarning.
# NOTE(review): rows are de-duplicated on all nine columns, so an
# OriginAirportID whose other attributes vary would appear more than once —
# confirm OriginAirportID is unique before joining on it.
origins = data[['OriginAirportID', 'OriginAirportSeqID', 'OriginCityMarketID', 'Origin', 'OriginCityName', 'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac']].drop_duplicates()
origins.to_csv('../data/processed/origins.csv', index = False)

In [20]:
# Lookup table of destination airports: the distinct combinations of the
# destination descriptor columns found in the flights frame. Chaining
# drop_duplicates() (rather than calling it with inplace=True on a column
# selection) avoids pandas' SettingWithCopyWarning.
# NOTE(review): rows are de-duplicated on all nine columns, so a
# DestAirportID whose other attributes vary would appear more than once —
# confirm DestAirportID is unique before joining on it.
destinations = data[['DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'Dest', 'DestCityName', 'DestState', 'DestStateFips', 'DestStateName', 'DestWac']].drop_duplicates()
destinations.to_csv('../data/processed/destinations.csv', index = False)

In [21]:
# Columns removed from the flights frame. The origin/destination descriptor
# columns are dropped here after being written to origins.csv and
# destinations.csv; only OriginAirportID and DestAirportID are kept.
# Each label is listed once (the original list repeated 'CRSDepTime' and
# 'CRSArrTime').
data = data.drop(
    columns=[
        # Scheduled / derived timing fields
        'CRSDepTime',
        'CRSArrTime',
        'CRSElapsedTime',
        'ActualElapsedTime',
        'TaxiIn',
        'TaxiOut',
        # Pre-bucketed delay / distance fields
        'ArrDel15',
        'ArrivalDelayGroups',
        'ArrTimeBlk',
        'DepDel15',
        'DepartureDelayGroups',
        'DepTimeBlk',
        'DistanceGroup',
        # Origin descriptors
        'OriginAirportSeqID',
        'OriginCityMarketID',
        'Origin',
        'OriginCityName',
        'OriginState',
        'OriginStateFips',
        'OriginStateName',
        'OriginWac',
        # Destination descriptors
        'DestAirportSeqID',
        'DestCityMarketID',
        'Dest',
        'DestCityName',
        'DestState',
        'DestStateFips',
        'DestStateName',
        'DestWac',
    ]
)

In [22]:
colms = data.columns

print(colms)

Index(['FlightDate', 'Airline', 'Cancelled', 'Diverted', 'DepTime',
       'DepDelayMinutes', 'DepDelay', 'ArrTime', 'ArrDelayMinutes', 'AirTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'DestAirportID', 'WheelsOff', 'WheelsOn', 'ArrDelay',
       'DivAirportLandings'],
      dtype='object')


In [23]:
# NOTE: this writes the sampled, reformatted and column-reduced frame to the
# same path the raw download was read from earlier in the notebook, so the
# raw file is replaced. Re-running the cleaning cells without downloading
# again would read this processed file instead of the raw one.
data.to_csv('../data/processed/Combined_Flights_2022.csv', index = False)

In [24]:
data = pd.read_csv('../data/processed/Combined_Flights_2022.csv')
origins = pd.read_csv('../data/processed/origins.csv')
destinations = pd.read_csv('../data/processed/destinations.csv')
airlines = pd.read_csv('../data/processed/Airlines.csv')

In [25]:
data.columns

Index(['FlightDate', 'Airline', 'Cancelled', 'Diverted', 'DepTime',
       'DepDelayMinutes', 'DepDelay', 'ArrTime', 'ArrDelayMinutes', 'AirTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'DestAirportID', 'WheelsOff', 'WheelsOn', 'ArrDelay',
       'DivAirportLandings'],
      dtype='object')

In [26]:
data.head()

Unnamed: 0,FlightDate,Airline,Cancelled,Diverted,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,...,DOT_ID_Operating_Airline,IATA_Code_Operating_Airline,Tail_Number,Flight_Number_Operating_Airline,OriginAirportID,DestAirportID,WheelsOff,WheelsOn,ArrDelay,DivAirportLandings
0,2022-03-19,SkyWest Airlines Inc.,False,False,18:26:00,0.0,-5.0,19:23:00,0.0,31.0,...,20304,OO,N794SK,5745,10372,11292,18:45:00,19:16:00,-10.0,0
1,2022-02-16,SkyWest Airlines Inc.,False,False,16:05:00,0.0,0.0,18:12:00,35.0,194.0,...,20304,OO,N127SY,5733,15412,11292,16:14:00,17:28:00,35.0,0
2,2022-01-18,American Airlines Inc.,False,False,17:14:00,0.0,-5.0,20:52:00,3.0,314.0,...,19805,AA,N400AN,205,12892,13830,17:28:00,20:42:00,3.0,0
3,2022-01-23,Southwest Airlines Co.,False,False,15:33:00,18.0,18.0,23:02:00,27.0,251.0,...,19393,WN,N7828A,312,12889,10821,15:48:00,22:59:00,27.0,0
4,2022-07-01,Delta Air Lines Inc.,False,False,07:09:00,0.0,-6.0,08:37:00,0.0,73.0,...,19790,DL,N333NB,698,11433,14492,07:21:00,08:34:00,-16.0,0


In [27]:
data.Cancelled.dtype

dtype('bool')

In [28]:
origins.head()

Unnamed: 0,OriginAirportID,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac
0,10372,1037205,30372,ASE,"Aspen, CO",CO,8,Colorado,82
1,15412,1541205,35412,TYS,"Knoxville, TN",TN,47,Tennessee,54
2,12892,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91
3,12889,1288903,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85
4,11433,1143302,31295,DTW,"Detroit, MI",MI,26,Michigan,43


In [29]:
destinations.head()

Unnamed: 0,DestAirportID,DestAirportSeqID,DestCityMarketID,Dest,DestCityName,DestState,DestStateFips,DestStateName,DestWac
0,11292,1129202,30325,DEN,"Denver, CO",CO,8,Colorado,82
1,13830,1383002,33830,OGG,"Kahului, HI",HI,15,Hawaii,2
2,10821,1082106,30852,BWI,"Baltimore, MD",MD,24,Maryland,35
3,14492,1449202,34492,RDU,"Raleigh/Durham, NC",NC,37,North Carolina,36
4,14869,1486903,34614,SLC,"Salt Lake City, UT",UT,49,Utah,87


In [30]:
airlines.head()

Unnamed: 0,Code,Description
0,02Q,Titan Airways
1,04Q,Tradewind Aviation
2,05Q,"Comlux Aviation, AG"
3,06Q,Master Top Linhas Aereas Ltd.
4,07Q,Flair Airlines Ltd.


From a business point of view I wouldn't impute null values: for example, `DepTime` might be null because the flight was cancelled, and if we impute or drop that record we might affect the future predictions or even the hypothesis.

### Data Acquisition - Database Creation

In [31]:
load_dotenv()

con = sqlite3.connect("../flight-information.db")
cur = con.cursor()

In [32]:
def map_to_bool(cols, df=None):
    """Cast the given columns to integers (``True`` -> 1, ``False`` -> 0), in place.

    Parameters
    ----------
    cols : iterable of str
        Names of the columns to convert.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook's global ``data`` frame
        is used, which is what existing calls rely on.

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = data if df is None else df

    for c in list(cols):
        # Convert boolean values to integers (0 and 1)
        frame[c] = frame[c].astype(int)

In [33]:
map_to_bool(['Cancelled', 'Diverted'])

In [34]:
data['Diverted'].unique()

array([0, 1])

In [35]:
airlines.columns

Index(['Code', 'Description'], dtype='object')

In [36]:
# Table schemas: each dict maps a DataFrame column name to the SQL type
# declared for it. Dict order is the column order used wherever these
# mappings are iterated.
# NOTE(review): the database opened in this notebook is SQLite; declared
# lengths such as VARCHAR(10) are presumably not enforced there — confirm
# before relying on them for validation.

# airlines lookup table (columns of Airlines.csv).
airlines_cols = {
    'Code': 'VARCHAR(50)', 
    'Description': 'VARCHAR(100)'
}

# Origin airport lookup table (same columns as origins.csv).
orgs_cols = {
    'OriginAirportID': 'INTEGER',
    'OriginAirportSeqID': 'INTEGER',
    'OriginCityMarketID': 'INTEGER',
    'Origin': 'VARCHAR(10)',
    'OriginCityName': 'VARCHAR(60)',
    'OriginState': 'VARCHAR(10)',
    'OriginStateFips': 'INTEGER',
    'OriginStateName': 'VARCHAR(60)',
    'OriginWac': 'INTEGER',
}
# Destination airport lookup table (same columns as destinations.csv).
dests_cols = {
    'DestAirportID': 'INTEGER',
    'DestAirportSeqID': 'INTEGER',
    'DestCityMarketID': 'INTEGER',
    'Dest': 'VARCHAR(10)',
    'DestCityName': 'VARCHAR(60)',
    'DestState': 'VARCHAR(10)',
    'DestStateFips': 'INTEGER',
    'DestStateName': 'VARCHAR(60)',
    'DestWac': 'INTEGER',
}

# Main flights table: the columns remaining in the processed flights frame.
main_cols = {
    'FlightDate': 'DATE',
    'Airline': 'VARCHAR(100)',
    'Cancelled': 'BOOLEAN',
    'Diverted': 'BOOLEAN',
    'DepTime': 'TIME',
    'DepDelayMinutes': 'NUMERIC',
    'DepDelay': 'NUMERIC',
    'ArrTime': 'TIME',
    'ArrDelayMinutes': 'NUMERIC',
    'AirTime': 'NUMERIC',
    'Distance': 'NUMERIC',
    'Year': 'INT',
    'Quarter': 'INT',
    'Month': 'INT',
    'DayofMonth': 'INT',
    'DayOfWeek': 'INT',
    'Marketing_Airline_Network': 'VARCHAR(10)',
    'Operated_or_Branded_Code_Share_Partners': 'VARCHAR(20)',
    'DOT_ID_Marketing_Airline': 'INT',
    'IATA_Code_Marketing_Airline': 'VARCHAR(10)',
    'Flight_Number_Marketing_Airline': 'INT',
    'Operating_Airline': 'VARCHAR(5)',
    'DOT_ID_Operating_Airline': 'INT',
    'IATA_Code_Operating_Airline': 'VARCHAR(5)',
    'Tail_Number': 'VARCHAR(20)', 
    'Flight_Number_Operating_Airline': 'INT',
    # Keys linking a flight to the origins / destinations tables.
    'OriginAirportID': 'INT',
    'DestAirportID': 'INT',
    'WheelsOff': 'TIME',
    'WheelsOn': 'TIME',
    'ArrDelay': 'NUMERIC',
    'DivAirportLandings': 'INT' 
}

In [37]:
def _create_table_sql(table, schema):
    """Build the CREATE TABLE statement for ``table`` from a column -> SQL type mapping."""
    columns = ", ".join(f"{col} {dtype}" for col, dtype in schema.items())
    return f"""
CREATE TABLE {table}(
    {columns}
)
"""


def _insert_frame(table, schema, frame):
    """Bulk-insert the schema's columns of ``frame`` into ``table`` and commit."""
    names = list(schema)
    # Built once per table (not once per row); values are bound through '?'
    # placeholders rather than formatted into the statement.
    insert_sql = f"""
    INSERT INTO {table} ({", ".join(names)}) 
    VALUES ({", ".join("?" for _ in names)})
    """
    # itertuples(name=None) yields plain tuples in schema column order, which
    # executemany consumes lazily; this replaces a per-row iterrows/execute loop.
    rows = frame[names].itertuples(index=False, name=None)
    cur.executemany(insert_sql, rows)
    con.commit()


main_table = _create_table_sql('flights', main_cols)
origins_table = _create_table_sql('origins', orgs_cols)
dests_table = _create_table_sql('destinations', dests_cols)
airlines_table = _create_table_sql('airlines', airlines_cols)

# The database lives in a file, so the tables survive kernel restarts. Drop
# any previous copy first so this cell can be re-run without raising
# "table ... already exists" and without duplicating rows. The tables are
# rebuilt entirely from the DataFrames below.
for table_name, ddl in (
    ('flights', main_table),
    ('origins', origins_table),
    ('destinations', dests_table),
    ('airlines', airlines_table),
):
    con.execute(f"DROP TABLE IF EXISTS {table_name}")
    con.execute(ddl)

# Load each DataFrame into its table; every load is committed separately.
_insert_frame('flights', main_cols, data)
_insert_frame('origins', orgs_cols, origins)
_insert_frame('destinations', dests_cols, destinations)
_insert_frame('airlines', airlines_cols, airlines)

### Data Acquisition - Queries to obtain the data

In [38]:
def get_information(connection=None):
    """Return every flight joined with its origin and destination details.

    Runs a single query that selects all ``flights`` columns plus the
    descriptive columns of ``destinations`` and ``origins`` (everything
    except the airport-ID join keys, which ``flights`` already carries).

    Parameters
    ----------
    connection : DB-API connection, optional
        Connection to query. When omitted, the notebook's global ``con``
        is used, which is what existing calls rely on.

    Returns
    -------
    pandas.DataFrame
        One row per matching (flight, origin, destination) combination.
        Both joins are INNER joins, so flights without a matching row in
        either lookup table are left out.
        NOTE(review): if a lookup table holds several rows for the same
        airport ID, the corresponding flights are repeated — confirm the
        IDs are unique in ``origins`` and ``destinations``.
    """
    sql_query = """
    SELECT 
    flights.*, 
    destinations.DestAirportSeqID,
    destinations.DestCityMarketID,
    destinations.Dest,
    destinations.DestCityName,
    destinations.DestState,
    destinations.DestStateFips,
    destinations.DestStateName,
    destinations.DestWac,
    origins.OriginAirportSeqID,
    origins.OriginCityMarketID,
    origins.Origin,
    origins.OriginCityName,
    origins.OriginState,
    origins.OriginStateFips,
    origins.OriginStateName,
    origins.OriginWac
    FROM flights
    INNER JOIN origins ON flights.OriginAirportID = origins.OriginAirportID
    INNER JOIN destinations ON flights.DestAirportID = destinations.DestAirportID
    """
    # Resolve the global lazily so an explicit connection never touches it.
    active_connection = con if connection is None else connection
    return pd.read_sql_query(sql_query, active_connection)

In [39]:
df = get_information()

In [40]:
cur.close()
con.close()

In [41]:
df.head()

Unnamed: 0,FlightDate,Airline,Cancelled,Diverted,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,...,DestStateName,DestWac,OriginAirportSeqID,OriginCityMarketID,Origin,OriginCityName,OriginState,OriginStateFips,OriginStateName,OriginWac
0,2022-03-19,SkyWest Airlines Inc.,0,0,18:26:00,0.0,-5.0,19:23:00,0.0,31.0,...,Colorado,82,1037205,30372,ASE,"Aspen, CO",CO,8,Colorado,82
1,2022-02-16,SkyWest Airlines Inc.,0,0,16:05:00,0.0,0.0,18:12:00,35.0,194.0,...,Colorado,82,1541205,35412,TYS,"Knoxville, TN",TN,47,Tennessee,54
2,2022-01-18,American Airlines Inc.,0,0,17:14:00,0.0,-5.0,20:52:00,3.0,314.0,...,Hawaii,2,1289208,32575,LAX,"Los Angeles, CA",CA,6,California,91
3,2022-01-23,Southwest Airlines Co.,0,0,15:33:00,18.0,18.0,23:02:00,27.0,251.0,...,Maryland,35,1288903,32211,LAS,"Las Vegas, NV",NV,32,Nevada,85
4,2022-07-01,Delta Air Lines Inc.,0,0,07:09:00,0.0,-6.0,08:37:00,0.0,73.0,...,North Carolina,36,1143302,31295,DTW,"Detroit, MI",MI,26,Michigan,43
