# Flight Status - Final Project

In [16]:
import os
import shutil
import warnings

import pandas as pd
#import pandas_profiling
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import opendatasets as od

from sqlalchemy import create_engine
from dotenv import load_dotenv


warnings.filterwarnings('ignore')

## TAS-12 Data Acquisition

### Data Acquisition - Use Kaggle API

[How to use Kaggle API - Step-by-step guide](https://www.geeksforgeeks.org/how-to-download-kaggle-datasets-into-jupyter-notebook/)

Almost all of the combined CSV files have more than 60k records and more than 20 columns, so the best course of action is to use a single year of data. In this case we use `Combined_Flights_2022.csv`, which has shape `(4078318, 61)`, together with `Airlines.csv` for labeling if needed.

The **Data Acquisition - Use Kaggle API** section only needs to be re-run if we have to fetch the data from the Kaggle API again. After this section we clean the data and save it as CSV in the `/data/processed/` folder. From there we'll create a DB with 2 tables, `airlines` and `flights`, load the corresponding data into them, and work with the data by retrieving it from the DB tables.

In [2]:
# ONLY RUN ONCE
# Downloads the Kaggle flight-delay dataset; the call prompts for Kaggle credentials
# and, per the logged output below, writes to ./flight-delay-dataset-20182022 (~3.7 GB zip).
od.download('https://www.kaggle.com/datasets/robikscube/flight-delay-dataset-20182022')

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:Your Kaggle Key:Downloading flight-delay-dataset-20182022.zip to .\flight-delay-dataset-20182022


100%|██████████| 3.73G/3.73G [01:45<00:00, 37.9MB/s]





In [3]:
# Safe to re-run: a second call is a no-op once the folder has been moved
def move_folder(source='./flight-delay-dataset-20182022',
                destination='../data/raw/flight-delay-dataset-20182022'):
    """Move the downloaded dataset folder to its raw-data location.

    Parameters
    ----------
    source : str
        Folder to move (defaults to the download location).
    destination : str
        Final path of the folder; its parent directory is created if missing.

    Raises
    ------
    FileExistsError
        If both ``source`` and ``destination`` exist. ``shutil.move`` would
        otherwise nest ``source`` inside the existing ``destination`` folder.
    FileNotFoundError
        If neither ``source`` nor ``destination`` exists.
    """
    source_exists = os.path.exists(source)

    if os.path.exists(destination):
        if source_exists:
            raise FileExistsError(
                f"Both '{source}' and '{destination}' exist; refusing to nest one inside the other"
            )
        # Source is gone and destination is present: the move already happened.
        return

    if not source_exists:
        raise FileNotFoundError(f"Nothing to move: '{source}' does not exist")

    parent = os.path.dirname(destination)
    if parent:
        os.makedirs(parent, exist_ok=True)

    shutil.move(source, destination)

In [4]:
# Safe to re-run: already-removed entries are simply skipped
def remove_unnecesary(destination='../data/raw/flight-delay-dataset-20182022'):
    """Prune the raw dataset folder down to the files this project uses.

    Removes the ``raw`` sub-folder (if present) and every file whose name
    contains neither ``'2022.csv'`` nor ``'Airlines'``.

    Parameters
    ----------
    destination : str
        Folder to prune (defaults to the raw dataset location).
    """
    raw_dir = os.path.join(destination, 'raw')
    # Guarded so that a second run does not fail once 'raw' is already gone.
    if os.path.isdir(raw_dir):
        shutil.rmtree(raw_dir)

    for name in os.listdir(destination):
        if '2022.csv' in name or 'Airlines' in name:
            continue
        path = os.path.join(destination, name)
        # os.remove only works on files; leave any other directory untouched.
        if os.path.isfile(path):
            os.remove(path)


In [5]:
# ONLY RUN ONCE
# Relocate ./flight-delay-dataset-20182022 to ../data/raw/flight-delay-dataset-20182022
move_folder()

In [6]:
# ONLY RUN ONCE
# Drop the 'raw' sub-folder and every file that is neither a *2022.csv nor an Airlines file
remove_unnecesary()

In [8]:
# ONLY RUN ONCE
# Move the files kept in the raw dataset folder into ../data/processed.
# The destination folder is created first so a file is never renamed to 'processed' itself.
os.makedirs('../data/processed', exist_ok=True)
for file in os.listdir('../data/raw/flight-delay-dataset-20182022'):
    # The file name must be joined into the destination path; it was previously
    # passed as shutil.move's third positional argument (copy_function).
    shutil.move(
        os.path.join('../data/raw/flight-delay-dataset-20182022/', file),
        os.path.join('../data/processed', file),
    )

### Data Acquisition - Clean and Store Data to CSV

In [9]:
# Load the 2022 combined flights CSV from the processed-data folder
data = pd.read_csv('../data/processed/Combined_Flights_2022.csv')

In [10]:
# Overview of the frame: row count, column names and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4078318 entries, 0 to 4078317
Data columns (total 61 columns):
 #   Column                                   Dtype  
---  ------                                   -----  
 0   FlightDate                               object 
 1   Airline                                  object 
 2   Origin                                   object 
 3   Dest                                     object 
 4   Cancelled                                bool   
 5   Diverted                                 bool   
 6   CRSDepTime                               int64  
 7   DepTime                                  float64
 8   DepDelayMinutes                          float64
 9   DepDelay                                 float64
 10  ArrTime                                  float64
 11  ArrDelayMinutes                          float64
 12  AirTime                                  float64
 13  CRSElapsedTime                           float64
 14  ActualElapsedTime 

In [11]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns
data.describe()

Unnamed: 0,CRSDepTime,DepTime,DepDelayMinutes,DepDelay,ArrTime,ArrDelayMinutes,AirTime,CRSElapsedTime,ActualElapsedTime,Distance,...,TaxiOut,WheelsOff,WheelsOn,TaxiIn,CRSArrTime,ArrDelay,ArrDel15,ArrivalDelayGroups,DistanceGroup,DivAirportLandings
count,4078318.0,3957885.0,3957823.0,3957823.0,3954079.0,3944916.0,3944916.0,4078318.0,3944916.0,4078318.0,...,3955652.0,3955652.0,3954076.0,3954076.0,4078318.0,3944916.0,3944916.0,3944916.0,4078318.0,4078318.0
mean,1329.587,1334.374,16.01494,13.09049,1457.886,15.78307,111.0075,141.3211,135.8624,797.8657,...,16.97375,1356.576,1455.073,7.894387,1486.058,7.528486,0.2164715,-0.06256103,3.663516,0.003685098
std,490.4801,505.6219,52.31498,53.32016,543.1841,51.98424,69.96246,71.79635,71.85501,591.4742,...,9.495407,507.558,537.8428,6.663118,518.5078,55.24625,0.4118393,2.487442,2.320848,0.1141331
min,1.0,1.0,0.0,-78.0,1.0,0.0,8.0,-48.0,14.0,31.0,...,1.0,1.0,1.0,1.0,1.0,-100.0,0.0,-2.0,1.0,0.0
25%,914.0,917.0,0.0,-5.0,1046.0,0.0,60.0,89.0,83.0,368.0,...,11.0,932.0,1044.0,4.0,1103.0,-14.0,0.0,-1.0,2.0,0.0
50%,1320.0,1325.0,0.0,-2.0,1500.0,0.0,94.0,124.0,119.0,643.0,...,15.0,1338.0,1456.0,6.0,1513.0,-5.0,0.0,-1.0,3.0,0.0
75%,1735.0,1744.0,11.0,11.0,1914.0,10.0,141.0,171.0,167.0,1035.0,...,19.0,1758.0,1909.0,9.0,1920.0,10.0,0.0,0.0,5.0,0.0
max,2359.0,2400.0,7223.0,7223.0,2400.0,7232.0,727.0,690.0,764.0,5095.0,...,221.0,2400.0,2400.0,290.0,2359.0,7232.0,1.0,12.0,11.0,9.0


In [12]:
def check_nulls(df, percentage=10):
    """Print a null-value report for ``df``.

    If at least one column has more null values than ``percentage`` percent
    of the row count, the null count of every column that contains nulls is
    printed. Otherwise a summary message is printed instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    percentage : int or float, default 10
        Threshold, as a percentage of ``len(df)``, above which a column's
        null count triggers the per-column report.
    """
    # Absolute number of rows that corresponds to the percentage threshold.
    percent = (percentage * len(df)) / 100
    null_counts = df.isnull().sum()

    if (null_counts > percent).any():
        # One column over the threshold is enough to list every column with nulls.
        for column, count in null_counts[null_counts > 0].items():
            print(f"Column '{column}' has {count} null values.")
    else:
        print("The null values in the dataframe don't exceed {percent} values or {percentage}% of the total data".format(percent=percent, percentage=percentage))
        print("Depending on Duplicated values we might want to consider dropping them since that low percentage of null values would hardly make any difference in the EDA or the model creation and prediction")

In [13]:
# Report null counts for the flights dataframe
check_nulls(data)

The null values in the dataframe don't exceed 407831.8 values or 10% of the total data
Depending on Duplicated values we might want to consider dropping them since that low percentage of null values would hardly make any difference in the EDA or the model creation and prediction


In [14]:
# Number of rows flagged by DataFrame.duplicated()
data.duplicated().sum()

0

In [15]:
# Column labels of the flights dataframe
data.columns

Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled', 'Diverted',
       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay', 'ArrTime',
       'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime', 'ActualElapsedTime',
       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',
       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',
       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',
       'Flight_Number_Marketing_Airline', 'Operating_Airline',
       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',
       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',
       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',
       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',
       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',
       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',
       'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOu

From a business point of view I wouldn't impute null values: for example, `DepTime` might be null because the flight was cancelled, and if we impute or drop that record we might affect future predictions or even the hypothesis.

### Data Acquisition - Database Creation

```
# Load the .env file variables so os.getenv() can read the
# DB_USER, DB_PASSWORD, DB_HOST and DB_NAME settings used below
load_dotenv()

# 1) Connect to the database here using the SQLAlchemy's create_engine function
def conn_string():
    """Build the PostgreSQL connection URL from environment variables.

    Reads DB_USER, DB_PASSWORD, DB_HOST and DB_NAME. The user and password
    are URL-escaped so characters such as '@', '/' or ':' cannot corrupt
    the URL structure.

    Returns
    -------
    str
        URL of the form ``postgresql://user:password@host/database``.

    Raises
    ------
    ValueError
        If any of the four variables is not set.
    """
    # Local import keeps this snippet self-contained.
    from urllib.parse import quote_plus

    names = ('DB_USER', 'DB_PASSWORD', 'DB_HOST', 'DB_NAME')
    settings = {name: os.getenv(name) for name in names}

    # Fail early instead of producing a URL containing the literal text "None".
    missing = [name for name in names if settings[name] is None]
    if missing:
        raise ValueError(f"Missing environment variables: {', '.join(missing)}")

    user = quote_plus(settings['DB_USER'])
    pwd = quote_plus(settings['DB_PASSWORD'])
    host = settings['DB_HOST']
    db = settings['DB_NAME']

    return f"postgresql://{user}:{pwd}@{host}/{db}"

def connect():
    """Create the SQLAlchemy engine, store it in the global ``engine`` and return it.

    A connection is opened once as a connectivity check and released
    immediately; any connection error is raised from this call.
    """
    global engine
    print("Starting connection...")
    engine = create_engine(conn_string()).execution_options(autocommit=True)

    # Use the connection as a context manager so the probe connection is
    # released instead of staying checked out.
    with engine.connect():
        pass

    return engine

connect()

# NOTE(review): Engine.execute() is legacy API that was removed in SQLAlchemy 2.0 —
# confirm the pinned SQLAlchemy version before running this snippet.

# 2) Execute the SQL sentences to create your tables using the SQLAlchemy's execute function
# IF NOT EXISTS keeps the statement from failing when the snippet is run again.
engine.execute(
    """
    CREATE TABLE IF NOT EXISTS publishers(
        publisher_id INT NOT NULL,
        name VARCHAR(255) NOT NULL,
        PRIMARY KEY(publisher_id)
    );
    """
)

# 3) Execute the SQL sentences to insert your data using the SQLAlchemy's execute function
# ON CONFLICT DO NOTHING avoids a primary-key violation when the row already exists.
engine.execute(
    """
    INSERT INTO publishers(publisher_id, name) VALUES (1, 'O Reilly Media')
    ON CONFLICT (publisher_id) DO NOTHING;

    """
)
# 4) Use pandas to print one of the tables as dataframes using read_sql function
# Reads the table created above; 'authors' is never created in this snippet.
df = pd.read_sql('SELECT * FROM publishers', engine)
print(df)
```

### Data Acquisition - Queries to obtain the data