In [2]:
import pandas as pd
import numpy as np
import os

In [2]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
import seaborn as sns
sns.set()

In [3]:
from math import sqrt

In [34]:
from sklearn.metrics import (r2_score, mean_squared_error, mean_absolute_error)
from sklearn.model_selection import (cross_val_score, cross_val_predict, train_test_split, 
                                     KFold, StratifiedKFold, GridSearchCV)
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, fbeta_score, roc_auc_score, roc_curve, make_scorer
from sklearn.metrics import confusion_matrix

In [5]:
from sklearn.linear_model import LogisticRegression

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
import xgboost as xgb

In [8]:
selection = 25
year = '2019'
airline = 'DL'
download_path = r'/home/desbrium/Metis/PredictingFlightDelays/Data/BTS Departure Data'
file_path = os.path.join(download_path, f'{selection}airports{airline}{year}.csv')

In [9]:
delta_df = pd.read_csv(file_path)

In [11]:
X = delta_df[['Avg Min Delayed Due To Delta Yesterday','Avg Min Delayed Due To NAS Yesterday','Avg Min Delayed Due To Late Arrival Yesterday']]

In [12]:
y = delta_df['Delayed Departure']

In [14]:
delayed = y.sum()
perc_delayed = round(y.sum()/y.count(),2)
print(f'Number of delayed Delta flights in 2019: {delayed}, {int(perc_delayed*100)}% were delayed')

not_delayed = y.count() - delayed 
perc_not_delayed = 1 - perc_delayed
print(f'Number of not delayed Delta flights in 2019: {not_delayed}, {int(perc_not_delayed*100)}% were not delayed')

Number of delayed Delta flights in 2019: 105882, 15% were delayed
Number of not delayed Delta flights in 2019: 598619, 85% were not delayed


In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42, stratify = y)

X_train2, X_val, y_train2, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42, stratify = y_train)

# Logistic Regression

In [17]:
lr = LogisticRegression(C=1000)

In [30]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"Precision Scores: {cross_val_score(lr, X_train2, y_train2, cv=skfold, scoring = 'precision')}")

Precision Scores: [0.4595843  0.41119221 0.40625    0.37193764 0.3987069 ]


In [31]:
print(f"Recall Scores: {cross_val_score(lr, X_train2, y_train2, cv=skfold, scoring = 'recall')}")

Recall Scores: [0.01468418 0.01246956 0.01342876 0.012322   0.01365011]


In [36]:
print(f"F1 Scores: {cross_val_score(lr, X_train2, y_train2, cv=skfold, scoring = make_scorer(fbeta_score, beta = .8))}")

F1 Scores: [0.035839   0.0305077  0.03272118 0.03002109 0.0332023 ]


In [6]:
np.mean([0.4595843,  0.41119221, 0.40625 ,0.37193764 ,0.3987069 ])

0.40953421

In [7]:
np.mean([[0.01468418, 0.01246956, 0.01342876 ,0.012322, 0.01365011]])

0.013310922

# Random Forest

In [42]:
rf = RandomForestClassifier(n_estimators=100, max_depth=10)

In [38]:
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(f"Precision Scores: {cross_val_score(rf, X_train2, y_train2, cv=skfold, scoring = 'precision')}")

Precision Scores: [0.7124183  0.82857143 0.65949821 0.61507937 0.67272727]


In [3]:
np.mean([0.7124183,0.82857143,0.65949821,0.61507937,0.67272727])

0.697658916

In [43]:
print(f"Recall Scores: {cross_val_score(rf, X_train2, y_train2, cv=skfold, scoring = 'recall')}")

Recall Scores: [0.03969894 0.0411717  0.04028628 0.03563787 0.03644949]


In [4]:
np.mean([0.03969894 ,0.0411717 ,0.04028628, 0.03563787 ,0.03644949])

0.038648856

In [None]:
print(f"F1 Scores: {cross_val_score(rf, X_train2, y_train2, cv=skfold, scoring = make_scorer(fbeta_score, beta = .8))}")

# XGBoost

In [44]:
gbm = xgb.XGBClassifier(
                        n_estimators=100,
                        max_depth = 3,
                        learning_rate=.1, #lowering learning_rate,
                        early_stopping_rounds=20
                        )

In [45]:
print(f"Precision Scores: {cross_val_score(gbm, X_train2, y_train2, cv=skfold, scoring = 'precision')}")



Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "early_stopping_rounds" } might not be used.

  This may not be accurate due to some para

In [8]:
np.mean([0.7202381, 0.55882353, 0.85321101, 0.8 ,0.68539326])

0.72353318

In [46]:
print(f"Recall Scores: {cross_val_score(rf, X_train2, y_train2, cv=skfold, scoring = 'recall')}")

Recall Scores: [0.04058442 0.04036007 0.03917952 0.03563787 0.03770383]


In [9]:
np.mean([0.04058442, 0.04036007, 0.03917952, 0.03563787, 0.03770383])

0.038693142