In [72]:
import pandas as pd
import numpy as np
import os

In [73]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns
sns.set()

In [74]:
from math import sqrt

In [86]:
from sklearn.metrics import (r2_score, mean_squared_error, mean_absolute_error)
from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, 
                                     KFold, StratifiedKFold, GridSearchCV)
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

In [78]:
selection = 25
year = '2019'
airline = 'DL'
download_path = r'/home/desbrium/Metis/PredictingFlightDelays/Data/BTS Departure Data'
file_path = os.path.join(download_path, f'{selection}airports{airline}{year}.csv')

In [79]:
delta_df = pd.read_csv(file_path)

In [8]:
X = delta_df[['Origin Airport', 'Origin Region', 'Actual Departure Time Name',
       'Distance Range Name', 'Minutes Delayed By Carrier',
       'Minutes Delayed By Weather', 'Minutes Delayed By NAS',
       'Minutes Delayed By Security', 'Minutes Delayed By Late Arrival',
        ]]

In [9]:
y = delta_df['Delayed Departure']

In [10]:
delayed = y.sum()
perc_delayed = round(y.sum()/y.count(),2)
print(f'Number of delayed Delta flights in 2019: {delayed}, {int(perc_delayed*100)}% were delayed')

not_delayed = delta_df_target.count() - delayed 
perc_not_delayed = 1 - perc_delayed
print(f'Number of not delayed Delta flights in 2019: {not_delayed}, {int(perc_not_delayed*100)}% were not delayed')

Number of delayed Delta flights in 2019: 105882, 15% were delayed
Number of not delayed Delta flights in 2019: -105881, 85% were not delayed


In [18]:
X = pd.get_dummies(X,columns=['Origin Airport', 'Origin Region', 'Actual Departure Time Name',
       'Distance Range Name'],drop_first=True)

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify = y_train)

# Random Forest

In [21]:
rf = RandomForestClassifier(n_estimators=100, max_depth=7)

In [22]:
rf.fit(X_train2, y_train2)

rf.score(X_train2, y_train2)

0.9479018807665011

In [23]:
y_val_pred = rf.predict(X_val)

In [24]:
print(f'recall_score:{recall_score(y_val, y_val_pred)}')
print(f'precision_score:{precision_score(y_val, y_val_pred)}')
print(f'accuracy_score:{accuracy_score(y_val, y_val_pred)}')
print(f'f1_score:{f1_score(y_val, y_val_pred)}')
print(f'roc_auc_score:{roc_auc_score(y_val, y_val_pred)}')

recall_score:0.6804202821557169
precision_score:0.9751290076981642
accuracy_score:0.949361249112846
f1_score:0.8015437034976706
roc_auc_score:0.8386753578790361


In [25]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

cross_val_score(rf, X_train2, y_train2, cv=kfold)

array([0.94732523, 0.94749157, 0.94758029, 0.94608322, 0.94853398])

In [26]:
rf_features_dict = {'Features': X.columns, 'Importance': rf.feature_importances_}
rf_features = pd.DataFrame(rf_features_dict)
rf_features = rf_features.sort_values('Importance', ascending = False)
rf_features

In [27]:
rf_features = pd.DataFrame(rf_features_dict)

In [31]:
rf_features = rf_features.sort_values('Importance', ascending = False)

In [33]:
rf_features

Unnamed: 0,Features,Importance
0,Minutes Delayed By Carrier,0.461026
4,Minutes Delayed By Late Arrival,0.298448
2,Minutes Delayed By NAS,0.140121
1,Minutes Delayed By Weather,0.052594
34,Actual Departure Time Name_Night,0.012884
30,Actual Departure Time Name_Early Morning,0.008998
32,Actual Departure Time Name_Late Night,0.007277
33,Actual Departure Time Name_Morning,0.005187
31,Actual Departure Time Name_Evening,0.003329
27,Origin Region_Northeast,0.001846


In [40]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10)

In [41]:
rf.fit(X_train2, y_train2)

rf.score(X_train2, y_train2)

0.9522001419446416

In [42]:
y_val_pred = rf.predict(X_val)

In [43]:
print(f'recall_score:{recall_score(y_val, y_val_pred)}')
print(f'precision_score:{precision_score(y_val, y_val_pred)}')
print(f'accuracy_score:{accuracy_score(y_val, y_val_pred)}')
print(f'f1_score:{f1_score(y_val, y_val_pred)}')
print(f'roc_auc_score:{roc_auc_score(y_val, y_val_pred)}')

recall_score:0.7058024909981702
precision_score:0.9800819672131148
accuracy_score:0.9536284599006387
f1_score:0.8206307264678632
roc_auc_score:0.8516327002020992


# Using Less Features

In [44]:
X = delta_df[['Origin Region', 'Actual Departure Time Name',
       'Minutes Delayed By Carrier','Minutes Delayed By Weather', 'Minutes Delayed By NAS',
       'Minutes Delayed By Security', 'Minutes Delayed By Late Arrival',
        ]]

In [46]:
X = pd.get_dummies(X,columns=['Origin Region', 'Actual Departure Time Name'],drop_first=True)

In [47]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify = y_train)

In [60]:
rf = RandomForestClassifier(n_estimators=100, max_depth=12)

In [61]:
rf.fit(X_train2, y_train2)

rf.score(X_train2, y_train2)

0.9562699609652235

In [62]:
y_val_pred = rf.predict(X_val)

In [63]:
print(f'recall_score:{recall_score(y_val, y_val_pred)}')
print(f'precision_score:{precision_score(y_val, y_val_pred)}')
print(f'accuracy_score:{accuracy_score(y_val, y_val_pred)}')
print(f'f1_score:{f1_score(y_val, y_val_pred)}')
print(f'roc_auc_score:{roc_auc_score(y_val, y_val_pred)}')

recall_score:0.7262853432501033
precision_score:0.9836117995043568
accuracy_score:0.9570440028388928
f1_score:0.8355857385398981
roc_auc_score:0.8620724996666892


In [64]:
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

cross_val_score(rf, X_train2, y_train2, cv=kfold)

array([0.95567557, 0.95580864, 0.95535397, 0.95442246, 0.95677342])

In [None]:
#Baseline on couple features with random forest, then boosting. Feature engineering and hyper parametersa(complexity)

# Trying Different Feature Combinations

In [82]:
delta_df.columns

Index(['Unique_Id', 'Carrier Code', 'Flight Date', 'Flight Number', 'Month',
       'Tail Number', 'Origin Airport', 'Origin City', 'Origin State',
       'Origin Divison', 'Origin Region', 'Dest Airport', 'Dest City',
       'Dest State', 'Dest Divison', 'Dest Region', 'Scheduled Departure Time',
       'Actual Departure Time', 'Actual Departure Time Name',
       'Minutes Delayed Departing', 'Delayed Departure',
       'Scheduled Elapsed Time', 'Actual Elapsed Time', 'Wheels-off Time',
       'Taxi-out Time', 'Minutes Delayed By Carrier',
       'Minutes Delayed By Weather', 'Minutes Delayed By NAS',
       'Minutes Delayed By Security', 'Minutes Delayed By Late Arrival',
       'Cancelled', 'Diverted', 'Flights', 'Distance', 'Distance Name',
       'Distance Range Name', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Minutes Delayed Arriving', 'AirTime'],
      dtype='object')

In [90]:
X = delta_df[['Origin Airport', 'Delayed Departure']]

delay_by_airport_ratio = delta_df[['Origin Airport', 'Delayed Departure']].groupby(['Origin Airport']).sum()/ delta_df[['Origin Airport', 'Delayed Departure']].groupby(['Origin Airport']).count()

delay_by_airport_ratio = delay_by_airport_ratio.reset_index()

X = X.merge(delay_by_airport_ratio, how = 'left', on = 'Origin Airport')

In [91]:
X.columns

Index(['Origin Airport', 'Delayed Departure_x', 'Delayed Departure_y'], dtype='object')

In [92]:
y = X['Delayed Departure_x']
X = X[['Origin Airport', 'Delayed Departure_y']]

In [99]:
X = pd.get_dummies(X,columns=['Origin Airport'],drop_first=True)

In [100]:
rf = RandomForestClassifier(n_estimators=100, max_depth=12)

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify = y_train)

In [103]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

cross_val_score(rf, X_train2, y_train2, cv=skfold)

array([0.84971611, 0.84970502, 0.84970502, 0.84970502, 0.84970502])

# Naive Bayes

In [69]:
nb = GaussianNB()
nb.fit(X_train2, y_train2)
nb.score(X_val, y_val)

0.943443931866572

In [70]:
y_val_pred = nb.predict(X_val)

In [71]:
print(f'recall_score:{recall_score(y_val, y_val_pred)}')
print(f'precision_score:{precision_score(y_val, y_val_pred)}')
print(f'accuracy_score:{accuracy_score(y_val, y_val_pred)}')
print(f'f1_score:{f1_score(y_val, y_val_pred)}')
print(f'roc_auc_score:{roc_auc_score(y_val, y_val_pred)}')

recall_score:0.7406882710583791
precision_score:0.8635925671025465
accuracy_score:0.943443931866572
f1_score:0.7974325569571986
roc_auc_score:0.8599973998146802


In [68]:
nb = BernoulliNB()
nb.fit(X_train2,y_train2)
nb.score(X_val,y_val)

0.9400195173882185