## Import dependencies

In [2]:
import numpy as np
import pandas as pd

import dask.dataframe as dd
from dask.distributed import Client
from dask_ml.model_selection import train_test_split
from joblib import parallel_backend
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sqlalchemy import create_engine

## Connect to database and import data

In [2]:
# Connect to Flights_DB

db_path = Path('Resources/Flights_DB.sqlite')
engine = create_engine(f'sqlite:///{db_path}')

conn = engine.connect()

In [3]:
# Load Flights table, join Cancelled_Flights and Delayed_Flights for reasons

flights_df = pd.read_sql_query('''
                                  SELECT F.Operating_Airline
                                      ,F.Origin_Airport
                                      ,F.Destination_Airport
                                      ,F.Flight_Date
                                      ,F.Scheduled_Arrival_Time
                                      ,F.Scheduled_Departure_Time
                                      ,D.Carrier_Delay_Minutes
                                      ,D.Weather_Delay_Minutes
                                      ,D.NAS_Delay_Minutes
                                      ,D.Security_Delay_Minutes
                                      ,D.Late_Aircraft_Delay_Minutes
                                      ,C.Cancellation_Code
                                      ,F.Target
                                  FROM Flights F
                                  LEFT JOIN Cancelled_Flights C ON F.ID = C.ID
                                  LEFT JOIN Delayed_Flights D ON F.ID = D.ID
                               ''',
                               con=engine,
                               dtype={
                                   'Operating_Airline': object,
                                   'Origin_Airport': object,
                                   'Destination_Airport': object,
                                   'Flight_Date': np.datetime64,
                                   'Scheduled_Arrival_Time': np.datetime64,
                                   'Scheduled_Departure_Time': np.datetime64,
                                   'Carrier_Delay_Minutes': object,
                                   'Weather_Delay_Minutes': object,
                                   'NAS_Delay_Minutes': object,
                                   'Security_Delay_Minutes': object,
                                   'Late_Aircraft_Delay_Minutes': object,
                                   'Cancellation_Code': object,
                                   'Target': np.int32
                               })

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target
0,DL,FLL,LGA,2022-01-06,2023-02-15 14:19:00,2023-02-15 11:26:00,,,,,,A,2
1,DL,ATL,FLL,2022-01-06,2023-02-15 18:21:00,2023-02-15 16:31:00,,,,,,,0
2,DL,FLL,ATL,2022-01-06,2023-02-15 21:27:00,2023-02-15 19:31:00,,,,,,,0
3,DL,FLL,RDU,2022-01-06,2023-02-15 12:27:00,2023-02-15 10:24:00,,,,,,,0
4,DL,ATL,JAN,2022-01-06,2023-02-15 11:42:00,2023-02-15 11:17:00,,,,,,,0
5,DL,JAN,ATL,2022-01-06,2023-02-15 14:58:00,2023-02-15 12:37:00,,,,,,,0
6,DL,RIC,ATL,2022-01-06,2023-02-15 10:50:00,2023-02-15 09:00:00,,,,,,,0
7,DL,MSP,RSW,2022-01-06,2023-02-15 14:33:00,2023-02-15 10:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1
8,DL,ATL,RDU,2022-01-06,2023-02-15 15:33:00,2023-02-15 14:14:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1
9,DL,RDU,ATL,2022-01-06,2023-02-15 18:14:00,2023-02-15 16:43:00,,,,,,,0


In [5]:
# Checking to make sure timedeltas have values

flights_df.nunique()

Operating_Airline                21
Origin_Airport                  375
Destination_Airport             375
Flight_Date                     212
Scheduled_Arrival_Time         1397
Scheduled_Departure_Time       1320
Carrier_Delay_Minutes          1433
Weather_Delay_Minutes           889
NAS_Delay_Minutes               696
Security_Delay_Minutes          170
Late_Aircraft_Delay_Minutes    1075
Cancellation_Code                 4
Target                            3
dtype: int64

## Preprocessing

In [6]:
# Make separate columns for Month, Day and Weekday

flights_df['Month'] = flights_df['Flight_Date'].dt.month
flights_df['Day'] = flights_df['Flight_Date'].dt.day
flights_df['Weekday'] = flights_df['Flight_Date'].dt.dayofweek

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,2022-01-06,2023-02-15 14:19:00,2023-02-15 11:26:00,,,,,,A,2,1,6,3
1,DL,ATL,FLL,2022-01-06,2023-02-15 18:21:00,2023-02-15 16:31:00,,,,,,,0,1,6,3
2,DL,FLL,ATL,2022-01-06,2023-02-15 21:27:00,2023-02-15 19:31:00,,,,,,,0,1,6,3
3,DL,FLL,RDU,2022-01-06,2023-02-15 12:27:00,2023-02-15 10:24:00,,,,,,,0,1,6,3
4,DL,ATL,JAN,2022-01-06,2023-02-15 11:42:00,2023-02-15 11:17:00,,,,,,,0,1,6,3
5,DL,JAN,ATL,2022-01-06,2023-02-15 14:58:00,2023-02-15 12:37:00,,,,,,,0,1,6,3
6,DL,RIC,ATL,2022-01-06,2023-02-15 10:50:00,2023-02-15 09:00:00,,,,,,,0,1,6,3
7,DL,MSP,RSW,2022-01-06,2023-02-15 14:33:00,2023-02-15 10:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1,1,6,3
8,DL,ATL,RDU,2022-01-06,2023-02-15 15:33:00,2023-02-15 14:14:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1,1,6,3
9,DL,RDU,ATL,2022-01-06,2023-02-15 18:14:00,2023-02-15 16:43:00,,,,,,,0,1,6,3


In [7]:
# Extract hour from arrival/departure columns

time_columns = ['Scheduled_Arrival_Time', 'Scheduled_Departure_Time']

for col in time_columns:
    flights_df[col] = flights_df[col].dt.hour

In [8]:
# Fill None in intervals with 0

none_cols = ['Carrier_Delay_Minutes', 'Weather_Delay_Minutes', 'NAS_Delay_Minutes',
                 'Security_Delay_Minutes', 'Late_Aircraft_Delay_Minutes', 'Cancellation_Code']

flights_df[none_cols] = flights_df[none_cols].apply(lambda x: x.fillna(0))

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,2022-01-06,14,11,0,0,0,0,0,A,2,1,6,3
1,DL,ATL,FLL,2022-01-06,18,16,0,0,0,0,0,0,0,1,6,3
2,DL,FLL,ATL,2022-01-06,21,19,0,0,0,0,0,0,0,1,6,3
3,DL,FLL,RDU,2022-01-06,12,10,0,0,0,0,0,0,0,1,6,3
4,DL,ATL,JAN,2022-01-06,11,11,0,0,0,0,0,0,0,1,6,3
5,DL,JAN,ATL,2022-01-06,14,12,0,0,0,0,0,0,0,1,6,3
6,DL,RIC,ATL,2022-01-06,10,9,0,0,0,0,0,0,0,1,6,3
7,DL,MSP,RSW,2022-01-06,14,10,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
8,DL,ATL,RDU,2022-01-06,15,14,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
9,DL,RDU,ATL,2022-01-06,18,16,0,0,0,0,0,0,0,1,6,3


In [9]:
# Double check existence of nulls

flights_df.isna().sum()

Operating_Airline              0
Origin_Airport                 0
Destination_Airport            0
Flight_Date                    0
Scheduled_Arrival_Time         0
Scheduled_Departure_Time       0
Carrier_Delay_Minutes          0
Weather_Delay_Minutes          0
NAS_Delay_Minutes              0
Security_Delay_Minutes         0
Late_Aircraft_Delay_Minutes    0
Cancellation_Code              0
Target                         0
Month                          0
Day                            0
Weekday                        0
dtype: int64

In [10]:
# Drop Flight_Date column

flights_df = flights_df.drop(columns='Flight_Date')

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,14,11,0,0,0,0,0,A,2,1,6,3
1,DL,ATL,FLL,18,16,0,0,0,0,0,0,0,1,6,3
2,DL,FLL,ATL,21,19,0,0,0,0,0,0,0,1,6,3
3,DL,FLL,RDU,12,10,0,0,0,0,0,0,0,1,6,3
4,DL,ATL,JAN,11,11,0,0,0,0,0,0,0,1,6,3
5,DL,JAN,ATL,14,12,0,0,0,0,0,0,0,1,6,3
6,DL,RIC,ATL,10,9,0,0,0,0,0,0,0,1,6,3
7,DL,MSP,RSW,14,10,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
8,DL,ATL,RDU,15,14,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
9,DL,RDU,ATL,18,16,0,0,0,0,0,0,0,1,6,3


In [11]:
# Extract numbers from Minutes column string

interval_cols = ['Carrier_Delay_Minutes', 'Weather_Delay_Minutes', 'NAS_Delay_Minutes',
                 'Security_Delay_Minutes', 'Late_Aircraft_Delay_Minutes']

flights_df[interval_cols] = flights_df[interval_cols].apply(lambda x: pd.to_timedelta(x).dt.total_seconds().astype(int) // 60)

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,14,11,0,0,0,0,0,A,2,1,6,3
1,DL,ATL,FLL,18,16,0,0,0,0,0,0,0,1,6,3
2,DL,FLL,ATL,21,19,0,0,0,0,0,0,0,1,6,3
3,DL,FLL,RDU,12,10,0,0,0,0,0,0,0,1,6,3
4,DL,ATL,JAN,11,11,0,0,0,0,0,0,0,1,6,3
5,DL,JAN,ATL,14,12,0,0,0,0,0,0,0,1,6,3
6,DL,RIC,ATL,10,9,0,0,0,0,0,0,0,1,6,3
7,DL,MSP,RSW,14,10,0,0,0,0,0,0,1,1,6,3
8,DL,ATL,RDU,15,14,0,0,0,0,0,0,1,1,6,3
9,DL,RDU,ATL,18,16,0,0,0,0,0,0,0,1,6,3


In [12]:
# Check unique values to make sure no data lost

flights_df.nunique()

Operating_Airline                21
Origin_Airport                  375
Destination_Airport             375
Scheduled_Arrival_Time           24
Scheduled_Departure_Time         24
Carrier_Delay_Minutes          1433
Weather_Delay_Minutes           889
NAS_Delay_Minutes               696
Security_Delay_Minutes          169
Late_Aircraft_Delay_Minutes    1075
Cancellation_Code                 5
Target                            3
Month                             7
Day                              31
Weekday                           7
dtype: int64

In [13]:
# Make columns that track reasons for both delays and cancellations as a binary

def check_condition(delay_col, cancel_code):
    
    condition = ((flights_df[delay_col] > 0) | (flights_df['Cancellation_Code'] == cancel_code))
    
    return condition

carrier = check_condition('Carrier_Delay_Minutes', 'A')
bad_weather = check_condition('Weather_Delay_Minutes', 'B')
NAS = check_condition('NAS_Delay_Minutes', 'C')
security = check_condition('Security_Delay_Minutes', 'D')
late_aircraft = (flights_df['Late_Aircraft_Delay_Minutes'] > 0)

flights_df['Carrier_Issue'] = np.where(carrier, 1, 0)
flights_df['Bad_Weather'] = np.where(bad_weather, 1, 0)
flights_df['NAS'] = np.where(NAS, 1, 0)
flights_df['Security_Issue'] = np.where(security, 1, 0)
flights_df['Late_Aircraft'] = np.where(security, 1, 0)

In [14]:
# Create function to output encoding dictionaries 

def encoder (MasterDF, ColName, Dictionary):
    uni_row_vals =[MasterDF[ColName].unique()][0]
    for x in range(len(uni_row_vals)):
        Dictionary[uni_row_vals[x]] = x

In [15]:
# Integer encode Origin and Destination airports

loc_dict = {}
encoder(flights_df, 'Origin_Airport', loc_dict)
flights_df['Origin_Airport'] = flights_df['Origin_Airport'].apply(lambda x: loc_dict[x])
flights_df['Destination_Airport'] = flights_df['Destination_Airport'].apply(lambda x: loc_dict[x])

In [16]:
# Integer encode operating airlines

airline_dict = {}
encoder(flights_df, 'Operating_Airline', airline_dict)
flights_df['Operating_Airline'] = flights_df['Operating_Airline'].apply(lambda x: airline_dict[x])

In [17]:
# Drop extra columns and check for complete DF

drop_cols = ['Carrier_Delay_Minutes', 'Weather_Delay_Minutes', 'NAS_Delay_Minutes',
                 'Security_Delay_Minutes', 'Late_Aircraft_Delay_Minutes', 'Cancellation_Code']

flights_df = flights_df.drop(columns=drop_cols).copy()

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Target,Month,Day,Weekday,Carrier_Issue,Bad_Weather,NAS,Security_Issue,Late_Aircraft
0,0,0,8,14,11,2,1,6,3,1,0,0,0,0
1,0,1,0,18,16,0,1,6,3,0,0,0,0,0
2,0,0,1,21,19,0,1,6,3,0,0,0,0,0
3,0,0,5,12,10,0,1,6,3,0,0,0,0,0
4,0,1,2,11,11,0,1,6,3,0,0,0,0,0
5,0,2,1,14,12,0,1,6,3,0,0,0,0,0
6,0,3,1,10,9,0,1,6,3,0,0,0,0,0
7,0,4,17,14,10,1,1,6,3,0,0,0,0,0
8,0,1,5,15,14,1,1,6,3,0,0,0,0,0
9,0,5,1,18,16,0,1,6,3,0,0,0,0,0


In [18]:
# Checkpoint DF for use in future sessions

model_df_path = Path('Resources/flights_model_df.csv')
flights_df.to_csv(model_df_path, index=False)

## Load DF as Dask DF and start parallel processing client

In [3]:
# Load DF as a Dask DF in order to use ML model on large dataset

model_df_path = Path('Resources/flights_model_df.csv')
flights_df = dd.read_csv(model_df_path)

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Target,Month,Day,Weekday,Carrier_Issue,Bad_Weather,NAS,Security_Issue,Late_Aircraft
0,0,0,8,14,11,2,1,6,3,1,0,0,0,0
1,0,1,0,18,16,0,1,6,3,0,0,0,0,0
2,0,0,1,21,19,0,1,6,3,0,0,0,0,0
3,0,0,5,12,10,0,1,6,3,0,0,0,0,0
4,0,1,2,11,11,0,1,6,3,0,0,0,0,0
5,0,2,1,14,12,0,1,6,3,0,0,0,0,0
6,0,3,1,10,9,0,1,6,3,0,0,0,0,0
7,0,4,17,14,10,1,1,6,3,0,0,0,0,0
8,0,1,5,15,14,1,1,6,3,0,0,0,0,0
9,0,5,1,18,16,0,1,6,3,0,0,0,0,0


In [3]:
# Check datatypes

flights_df.dtypes

Operating_Airline           int64
Origin_Airport              int64
Destination_Airport         int64
Scheduled_Arrival_Time      int64
Scheduled_Departure_Time    int64
Target                      int64
Month                       int64
Day                         int64
Weekday                     int64
Carrier_Issue               int64
Bad_Weather                 int64
NAS                         int64
Security_Issue              int64
Late_Aircraft               int64
dtype: object

In [4]:
# Start client for parallel computation

client = Client()

## Random Forest Classifier Model Function

In [6]:
# Function that runs a Random Forest Model and outputs stats on model

def rfmodel_complete (MasterDF,ColumnList,estimators=100,randomstate=52):

    X = MasterDF[ColumnList]
    y = MasterDF['Target']

    X_train, X_test, y_train, y_test = train_test_split(X,y,shuffle=True,random_state=randomstate)
    
    with parallel_backend('threading', n_jobs=2):
        
        rf_model = RandomForestClassifier(n_estimators=estimators,random_state=randomstate, n_jobs=2)
        
        rf_model.fit(X_train,y_train)
            
    rf_predictions = rf_model.predict(X_test)

    cm = confusion_matrix(y_test, rf_predictions)

    acc_score = accuracy_score(y_test, rf_predictions)

    # Create a classification report
    class_report = classification_report(y_test, rf_predictions)
    
    # Create a DataFrame from the confusion matrix.
    cm_df = pd.DataFrame(
        cm, index=["On Time", "Delayed","Cancelled"], columns=["On Time (Pred)", "Delayed (Pred)", "Cancelled (Pred)"])

    # Calculate feature importance in the Random Forest model.
    importances = rf_model.feature_importances_

    importances_dict = sorted(zip(importances, X.columns), reverse=True)
    
    return (# Displaying results
        print("\nConfusion Matrix\n"),
        
        print(cm_df),
        
        print(f"\nAccuracy Score : {acc_score}\n"),
        
        print("\nClassification Report\n"),

        print(class_report),
        
        "Feature Importances",
        
        importances_dict
    )

In [7]:
flights_df.columns

Index(['Operating_Airline', 'Origin_Airport', 'Destination_Airport',
       'Scheduled_Arrival_Time', 'Scheduled_Departure_Time', 'Target', 'Month',
       'Day', 'Weekday', 'Carrier_Issue', 'Bad_Weather', 'NAS',
       'Security_Issue', 'Late_Aircraft'],
      dtype='object')

In [8]:
# Run through with all features

features = ['Operating_Airline', 'Origin_Airport', 'Destination_Airport',
       'Scheduled_Arrival_Time', 'Scheduled_Departure_Time', 'Month',
       'Day', 'Weekday', 'Carrier_Issue', 'Bad_Weather', 'NAS',
       'Security_Issue', 'Late_Aircraft']

rfmodel_complete(flights_df, features)


Confusion Matrix

           On Time (Pred)  Delayed (Pred)  Cancelled (Pred)
On Time            167353           37528                 0
Delayed             71791          118441              1716
Cancelled               0            4409              7548

Accuracy Score : 0.7175930682557622


Classification Report

              precision    recall  f1-score   support

           0       0.70      0.82      0.75    204881
           1       0.74      0.62      0.67    191948
           2       0.81      0.63      0.71     11957

    accuracy                           0.72    408786
   macro avg       0.75      0.69      0.71    408786
weighted avg       0.72      0.72      0.71    408786



(None,
 None,
 None,
 None,
 None,
 'Feature Importances',
 [(0.1695194925780946, 'Destination_Airport'),
  (0.15928295869264444, 'Origin_Airport'),
  (0.15356977277409223, 'Day'),
  (0.1218390179575396, 'Carrier_Issue'),
  (0.08294031651725764, 'NAS'),
  (0.06302347340334041, 'Scheduled_Departure_Time'),
  (0.05529173450211047, 'Scheduled_Arrival_Time'),
  (0.05511680046955792, 'Month'),
  (0.05361333613920521, 'Operating_Airline'),
  (0.04940863243334124, 'Weekday'),
  (0.035248866797902095, 'Bad_Weather'),
  (0.0005961122064201825, 'Late_Aircraft'),
  (0.0005494855284940783, 'Security_Issue')])