## Import dependencies

In [1]:
import numpy as np
import pandas as pd

from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine

## Connect to database and import data

In [2]:
# Open the SQLite flights database through SQLAlchemy

db_path = Path('Resources') / 'Flights_DB.sqlite'
engine = create_engine('sqlite:///' + str(db_path))

# NOTE(review): the load query in this notebook passes `engine`, not `conn`;
# confirm whether this explicit connection is still needed.
conn = engine.connect()

In [3]:
# Load the Flights table and left-join Cancelled_Flights / Delayed_Flights so
# each flight carries its delay minutes and cancellation code (when it has any)

flights_df = pd.read_sql_query('''
                                  SELECT F.Operating_Airline
                                      ,F.Origin_Airport
                                      ,F.Destination_Airport
                                      ,F.Flight_Date
                                      ,F.Scheduled_Arrival_Time
                                      ,F.Scheduled_Departure_Time
                                      ,D.Carrier_Delay_Minutes
                                      ,D.Weather_Delay_Minutes
                                      ,D.NAS_Delay_Minutes
                                      ,D.Security_Delay_Minutes
                                      ,D.Late_Aircraft_Delay_Minutes
                                      ,C.Cancellation_Code
                                      ,F.Target
                                  FROM Flights F
                                  LEFT JOIN Cancelled_Flights C ON F.ID = C.ID
                                  LEFT JOIN Delayed_Flights D ON F.ID = D.ID
                               ''',
                               con=engine,
                               dtype={
                                   'Operating_Airline': object,
                                   'Origin_Airport': object,
                                   'Destination_Airport': object,
                                   # An explicit unit is required: pandas 2.x refuses to
                                   # cast to the unit-less np.datetime64 dtype.
                                   'Flight_Date': 'datetime64[ns]',
                                   'Scheduled_Arrival_Time': 'datetime64[ns]',
                                   'Scheduled_Departure_Time': 'datetime64[ns]',
                                   # Delay intervals stay as raw objects here; they are
                                   # converted to whole minutes later in preprocessing.
                                   'Carrier_Delay_Minutes': object,
                                   'Weather_Delay_Minutes': object,
                                   'NAS_Delay_Minutes': object,
                                   'Security_Delay_Minutes': object,
                                   'Late_Aircraft_Delay_Minutes': object,
                                   'Cancellation_Code': object,
                                   'Target': np.int32
                               })

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target
0,DL,FLL,LGA,2022-01-06,2023-02-14 14:19:00,2023-02-14 11:26:00,,,,,,A,2
1,DL,ATL,FLL,2022-01-06,2023-02-14 18:21:00,2023-02-14 16:31:00,,,,,,,0
2,DL,FLL,ATL,2022-01-06,2023-02-14 21:27:00,2023-02-14 19:31:00,,,,,,,0
3,DL,FLL,RDU,2022-01-06,2023-02-14 12:27:00,2023-02-14 10:24:00,,,,,,,0
4,DL,ATL,JAN,2022-01-06,2023-02-14 11:42:00,2023-02-14 11:17:00,,,,,,,0
5,DL,JAN,ATL,2022-01-06,2023-02-14 14:58:00,2023-02-14 12:37:00,,,,,,,0
6,DL,RIC,ATL,2022-01-06,2023-02-14 10:50:00,2023-02-14 09:00:00,,,,,,,0
7,DL,MSP,RSW,2022-01-06,2023-02-14 14:33:00,2023-02-14 10:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1
8,DL,ATL,RDU,2022-01-06,2023-02-14 15:33:00,2023-02-14 14:14:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1
9,DL,RDU,ATL,2022-01-06,2023-02-14 18:14:00,2023-02-14 16:43:00,,,,,,,0


In [5]:
# Count distinct (non-null) values per column; counts above 1 in the
# *_Delay_Minutes columns confirm the timedelta values were actually loaded

flights_df.nunique()

Operating_Airline                21
Origin_Airport                  375
Destination_Airport             375
Flight_Date                     212
Scheduled_Arrival_Time         1397
Scheduled_Departure_Time       1320
Carrier_Delay_Minutes          1433
Weather_Delay_Minutes           889
NAS_Delay_Minutes               696
Security_Delay_Minutes          170
Late_Aircraft_Delay_Minutes    1075
Cancellation_Code                 4
Target                            3
dtype: int64

## Preprocessing

In [6]:
# Derive calendar features from Flight_Date: month, day of month and weekday
# (pandas dayofweek: Monday=0 ... Sunday=6)

date_parts = {'Month': 'month', 'Day': 'day', 'Weekday': 'dayofweek'}

for new_col, dt_attr in date_parts.items():
    flights_df[new_col] = getattr(flights_df['Flight_Date'].dt, dt_attr)

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,2022-01-06,2023-02-14 14:19:00,2023-02-14 11:26:00,,,,,,A,2,1,6,3
1,DL,ATL,FLL,2022-01-06,2023-02-14 18:21:00,2023-02-14 16:31:00,,,,,,,0,1,6,3
2,DL,FLL,ATL,2022-01-06,2023-02-14 21:27:00,2023-02-14 19:31:00,,,,,,,0,1,6,3
3,DL,FLL,RDU,2022-01-06,2023-02-14 12:27:00,2023-02-14 10:24:00,,,,,,,0,1,6,3
4,DL,ATL,JAN,2022-01-06,2023-02-14 11:42:00,2023-02-14 11:17:00,,,,,,,0,1,6,3
5,DL,JAN,ATL,2022-01-06,2023-02-14 14:58:00,2023-02-14 12:37:00,,,,,,,0,1,6,3
6,DL,RIC,ATL,2022-01-06,2023-02-14 10:50:00,2023-02-14 09:00:00,,,,,,,0,1,6,3
7,DL,MSP,RSW,2022-01-06,2023-02-14 14:33:00,2023-02-14 10:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1,1,6,3
8,DL,ATL,RDU,2022-01-06,2023-02-14 15:33:00,2023-02-14 14:14:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,,1,1,6,3
9,DL,RDU,ATL,2022-01-06,2023-02-14 18:14:00,2023-02-14 16:43:00,,,,,,,0,1,6,3


In [7]:
# The scheduled times carry a date component that is not part of the schedule;
# keep only the clock time, formatted as an 'HH:MM' string

time_columns = ['Scheduled_Arrival_Time', 'Scheduled_Departure_Time']

for col in time_columns:
    clock_times = flights_df[col].dt.strftime('%H:%M')
    flights_df[col] = clock_times

In [9]:
# Replace missing delay intervals and missing cancellation codes with 0

none_cols = [
    'Carrier_Delay_Minutes',
    'Weather_Delay_Minutes',
    'NAS_Delay_Minutes',
    'Security_Delay_Minutes',
    'Late_Aircraft_Delay_Minutes',
    'Cancellation_Code',
]

flights_df[none_cols] = flights_df[none_cols].fillna(0)

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Flight_Date,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,2022-01-06,14:19,11:26,0,0,0,0,0,A,2,1,6,3
1,DL,ATL,FLL,2022-01-06,18:21,16:31,0,0,0,0,0,0,0,1,6,3
2,DL,FLL,ATL,2022-01-06,21:27,19:31,0,0,0,0,0,0,0,1,6,3
3,DL,FLL,RDU,2022-01-06,12:27,10:24,0,0,0,0,0,0,0,1,6,3
4,DL,ATL,JAN,2022-01-06,11:42,11:17,0,0,0,0,0,0,0,1,6,3
5,DL,JAN,ATL,2022-01-06,14:58,12:37,0,0,0,0,0,0,0,1,6,3
6,DL,RIC,ATL,2022-01-06,10:50,09:00,0,0,0,0,0,0,0,1,6,3
7,DL,MSP,RSW,2022-01-06,14:33,10:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
8,DL,ATL,RDU,2022-01-06,15:33,14:14,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
9,DL,RDU,ATL,2022-01-06,18:14,16:43,0,0,0,0,0,0,0,1,6,3


In [10]:
# Count the missing values left in each column (all zeros means the fill worked)

flights_df.isna().sum()

Operating_Airline              0
Origin_Airport                 0
Destination_Airport            0
Flight_Date                    0
Scheduled_Arrival_Time         0
Scheduled_Departure_Time       0
Carrier_Delay_Minutes          0
Weather_Delay_Minutes          0
NAS_Delay_Minutes              0
Security_Delay_Minutes         0
Late_Aircraft_Delay_Minutes    0
Cancellation_Code              0
Target                         0
Month                          0
Day                            0
Weekday                        0
dtype: int64

In [11]:
# Flight_Date has already been split into Month/Day/Weekday, so remove it

flights_df = flights_df.drop('Flight_Date', axis=1)

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,14:19,11:26,0,0,0,0,0,A,2,1,6,3
1,DL,ATL,FLL,18:21,16:31,0,0,0,0,0,0,0,1,6,3
2,DL,FLL,ATL,21:27,19:31,0,0,0,0,0,0,0,1,6,3
3,DL,FLL,RDU,12:27,10:24,0,0,0,0,0,0,0,1,6,3
4,DL,ATL,JAN,11:42,11:17,0,0,0,0,0,0,0,1,6,3
5,DL,JAN,ATL,14:58,12:37,0,0,0,0,0,0,0,1,6,3
6,DL,RIC,ATL,10:50,09:00,0,0,0,0,0,0,0,1,6,3
7,DL,MSP,RSW,14:33,10:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
8,DL,ATL,RDU,15:33,14:14,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0 days 00:00:00,0,1,1,6,3
9,DL,RDU,ATL,18:14,16:43,0,0,0,0,0,0,0,1,6,3


In [16]:
# Convert each delay interval column to an integer number of whole minutes

interval_cols = [
    'Carrier_Delay_Minutes',
    'Weather_Delay_Minutes',
    'NAS_Delay_Minutes',
    'Security_Delay_Minutes',
    'Late_Aircraft_Delay_Minutes',
]


def _whole_minutes(delay_col):
    """Parse a column as timedeltas and return its length in whole minutes."""
    seconds = pd.to_timedelta(delay_col).dt.total_seconds()
    return seconds.astype(int) // 60


flights_df[interval_cols] = flights_df[interval_cols].apply(_whole_minutes)

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Carrier_Delay_Minutes,Weather_Delay_Minutes,NAS_Delay_Minutes,Security_Delay_Minutes,Late_Aircraft_Delay_Minutes,Cancellation_Code,Target,Month,Day,Weekday
0,DL,FLL,LGA,14:19,11:26,0,0,0,0,0,A,2,1,6,3
1,DL,ATL,FLL,18:21,16:31,0,0,0,0,0,0,0,1,6,3
2,DL,FLL,ATL,21:27,19:31,0,0,0,0,0,0,0,1,6,3
3,DL,FLL,RDU,12:27,10:24,0,0,0,0,0,0,0,1,6,3
4,DL,ATL,JAN,11:42,11:17,0,0,0,0,0,0,0,1,6,3
5,DL,JAN,ATL,14:58,12:37,0,0,0,0,0,0,0,1,6,3
6,DL,RIC,ATL,10:50,09:00,0,0,0,0,0,0,0,1,6,3
7,DL,MSP,RSW,14:33,10:00,0,0,0,0,0,0,1,1,6,3
8,DL,ATL,RDU,15:33,14:14,0,0,0,0,0,0,1,1,6,3
9,DL,RDU,ATL,18:14,16:43,0,0,0,0,0,0,0,1,6,3


In [17]:
# Recount distinct values per column to compare against the counts taken right
# after loading.
# NOTE(review): Cancellation_Code gains one distinct value because missing codes
# were filled with 0. A lower count in a delay column presumably means two
# distinct intervals collapsed to the same whole minute - confirm this is acceptable.

flights_df.nunique()

Operating_Airline                21
Origin_Airport                  375
Destination_Airport             375
Scheduled_Arrival_Time         1397
Scheduled_Departure_Time       1320
Carrier_Delay_Minutes          1433
Weather_Delay_Minutes           889
NAS_Delay_Minutes               696
Security_Delay_Minutes          169
Late_Aircraft_Delay_Minutes    1075
Cancellation_Code                 5
Target                            3
Month                             7
Day                              31
Weekday                           7
dtype: int64

In [18]:
# Make columns that track reasons for both delays and cancellations as a binary

def check_condition(delay_col, cancel_code):
    """Return a boolean mask of flights affected by one delay/cancellation reason.

    A row is flagged when its value in `delay_col` is greater than 0 or when its
    'Cancellation_Code' equals `cancel_code`. Reads the module-level `flights_df`.
    """
    was_delayed = flights_df[delay_col].gt(0)
    was_cancelled = flights_df['Cancellation_Code'].eq(cancel_code)

    return was_delayed | was_cancelled

# Boolean masks: a reason applies when its delay minutes are positive or the
# flight was cancelled with the paired cancellation code
carrier = check_condition('Carrier_Delay_Minutes', 'A')
bad_weather = check_condition('Weather_Delay_Minutes', 'B')
NAS = check_condition('NAS_Delay_Minutes', 'C')
security = check_condition('Security_Delay_Minutes', 'D')
# No cancellation code is paired with late aircraft here, so only delay minutes count
late_aircraft = (flights_df['Late_Aircraft_Delay_Minutes'] > 0)

# Store each mask as a 0/1 indicator column
flights_df['Carrier_Issue'] = np.where(carrier, 1, 0)
flights_df['Bad_Weather'] = np.where(bad_weather, 1, 0)
flights_df['NAS'] = np.where(NAS, 1, 0)
flights_df['Security_Issue'] = np.where(security, 1, 0)
# Use the late-aircraft mask; the previous code reused the security mask here,
# which made Late_Aircraft a copy of Security_Issue.
flights_df['Late_Aircraft'] = np.where(late_aircraft, 1, 0)

In [19]:
# The raw reason columns are now summarised by the binary flags; remove them
# and preview the resulting frame

drop_cols = [
    'Carrier_Delay_Minutes',
    'Weather_Delay_Minutes',
    'NAS_Delay_Minutes',
    'Security_Delay_Minutes',
    'Late_Aircraft_Delay_Minutes',
    'Cancellation_Code',
]

flights_df = flights_df.drop(drop_cols, axis=1).copy()

flights_df.head(10)

Unnamed: 0,Operating_Airline,Origin_Airport,Destination_Airport,Scheduled_Arrival_Time,Scheduled_Departure_Time,Target,Month,Day,Weekday,Carrier_Issue,Bad_Weather,NAS,Security_Issue,Late_Aircraft
0,DL,FLL,LGA,14:19,11:26,2,1,6,3,1,0,0,0,0
1,DL,ATL,FLL,18:21,16:31,0,1,6,3,0,0,0,0,0
2,DL,FLL,ATL,21:27,19:31,0,1,6,3,0,0,0,0,0
3,DL,FLL,RDU,12:27,10:24,0,1,6,3,0,0,0,0,0
4,DL,ATL,JAN,11:42,11:17,0,1,6,3,0,0,0,0,0
5,DL,JAN,ATL,14:58,12:37,0,1,6,3,0,0,0,0,0
6,DL,RIC,ATL,10:50,09:00,0,1,6,3,0,0,0,0,0
7,DL,MSP,RSW,14:33,10:00,1,1,6,3,0,0,0,0,0
8,DL,ATL,RDU,15:33,14:14,1,1,6,3,0,0,0,0,0
9,DL,RDU,ATL,18:14,16:43,0,1,6,3,0,0,0,0,0


In [20]:
# Save the preprocessed frame to CSV (without the index) so later sessions can
# start from this checkpoint

model_df_path = Path('Resources') / 'flights_model_df.csv'
flights_df.to_csv(path_or_buf=model_df_path, index=False)

## Random Forest Classifier Model Function

In [None]:
# Function that runs a Random Forest Model and outputs stats on model

def rfmodel_complete(MasterDF, ColumnList, estimators=500, randomstate=52):
    """Fit a random forest on the selected columns and print evaluation stats.

    Parameters
    ----------
    MasterDF : pandas.DataFrame
        Frame containing the feature columns and a 'Target' column.
    ColumnList : list of str
        Names of the columns in `MasterDF` to use as features.
    estimators : int, default 500
        Number of trees, passed as `n_estimators`.
    randomstate : int, default 52
        Seed used for both the train/test split and the forest.

    Returns
    -------
    tuple
        Seven entries. The first five are None (they were the results of
        `print` calls in the original return expression and are kept so the
        tuple shape is unchanged), followed by the string
        "Feature Importances" and a list of (importance, column) pairs sorted
        from most to least important.
    """
    # Target codes in the order the confusion-matrix labels were written for
    class_codes = [0, 1, 2]
    class_names = ["On Time", "Delayed", "Cancelled"]

    X = MasterDF[ColumnList]
    y = MasterDF['Target']

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=randomstate)

    rf_model = RandomForestClassifier(n_estimators=estimators, random_state=randomstate)
    rf_model = rf_model.fit(X_train, y_train)

    rf_predictions = rf_model.predict(X_test)

    # Pin the label set so the matrix is always 3x3, even when a class is absent
    # from the test split; otherwise the fixed row/column names would not fit.
    cm = confusion_matrix(y_test, rf_predictions, labels=class_codes)

    # Create a DataFrame from the confusion matrix.
    cm_df = pd.DataFrame(
        cm,
        index=class_names,
        columns=[f"{name} (Pred)" for name in class_names],
    )

    acc_score = accuracy_score(y_test, rf_predictions)

    # Create a classification report
    class_report = classification_report(y_test, rf_predictions)

    # Rank features by their importance in the fitted forest, highest first.
    ranked_importances = sorted(
        zip(rf_model.feature_importances_, X.columns), reverse=True
    )

    # Displaying results
    print("\nConfusion Matrix\n")
    print(cm_df)
    print(f"\nAccuracy Score : {acc_score}\n")
    print("\nClassification Report\n")
    print(class_report)

    return (None, None, None, None, None, "Feature Importances", ranked_importances)

In [None]:
# List the columns currently in flights_df before choosing model features
flights_df.columns

In [None]:
# Run through with all features

# The *_Delay_Minutes columns and Cancellation_Code were dropped during
# preprocessing, so the feature list uses the columns that remain in flights_df.
categorical_cols = ['Operating_Airline', 'Origin_Airport', 'Destination_Airport']
clock_cols = ['Scheduled_Arrival_Time', 'Scheduled_Departure_Time']

# Encode on a copy so flights_df keeps its readable values
model_df = flights_df.copy()

# scikit-learn needs numeric input: replace each text label with an integer code
for col in categorical_cols:
    model_df[col] = model_df[col].astype('category').cat.codes

# Turn the 'HH:MM' strings into minutes after midnight
for col in clock_cols:
    hours_minutes = model_df[col].str.split(':', expand=True).astype(int)
    model_df[col] = hours_minutes[0] * 60 + hours_minutes[1]

# NOTE(review): the reason flags are derived from delay/cancellation data, so
# they likely leak the target - confirm they belong in the feature set.
features = ['Operating_Airline', 'Origin_Airport', 'Destination_Airport',
            'Scheduled_Arrival_Time', 'Scheduled_Departure_Time',
            'Month', 'Day', 'Weekday',
            'Carrier_Issue', 'Bad_Weather', 'NAS', 'Security_Issue', 'Late_Aircraft']

rfmodel_complete(model_df, features)