In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.express as px
import os

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
#Indicate columns and target
columns = [
    "id","Airline","Delay","Flight","AirportFrom","AirportTo","DayOfWeek","Length"
]

target = ["Delay"]


In [6]:
file_path = Path("new_airlines")
airlines_df = pd.read_csv(file_path)
airlines_df

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Length,Delay
0,1,CO,269,SFO,IAH,Thursday,205,1
1,2,US,1558,PHX,CLT,Thursday,222,1
2,3,AA,2400,LAX,DFW,Thursday,165,1
3,4,AA,2466,SFO,DFW,Thursday,195,1
4,5,AS,108,ANC,SEA,Thursday,202,0
...,...,...,...,...,...,...,...,...
539378,539379,CO,178,OGG,SNA,Saturday,326,0
539379,539380,FL,398,SEA,ATL,Saturday,305,0
539380,539381,FL,609,SFO,MKE,Saturday,255,0
539381,539382,UA,78,HNL,SFO,Saturday,313,1


In [7]:
#Read data into pandas
#data = pd.read_csv("Airlines.csv")

airlines_df = airlines_df.loc[:, columns].copy()
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Length
0,1,CO,1,269,SFO,IAH,Thursday,205
1,2,US,1,1558,PHX,CLT,Thursday,222
2,3,AA,1,2400,LAX,DFW,Thursday,165
3,4,AA,1,2466,SFO,DFW,Thursday,195
4,5,AS,0,108,ANC,SEA,Thursday,202
...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,Saturday,326
539379,539380,FL,0,398,SEA,ATL,Saturday,305
539380,539381,FL,0,609,SFO,MKE,Saturday,255
539381,539382,UA,1,78,HNL,SFO,Saturday,313


In [8]:
#Drop the null columns where all values are null
airlines_df = airlines_df.dropna(axis='columns', how='all')

# Drop the null rows
airlines_df = airlines_df.dropna()
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Length
0,1,CO,1,269,SFO,IAH,Thursday,205
1,2,US,1,1558,PHX,CLT,Thursday,222
2,3,AA,1,2400,LAX,DFW,Thursday,165
3,4,AA,1,2466,SFO,DFW,Thursday,195
4,5,AS,0,108,ANC,SEA,Thursday,202
...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,Saturday,326
539379,539380,FL,0,398,SEA,ATL,Saturday,305
539380,539381,FL,0,609,SFO,MKE,Saturday,255
539381,539382,UA,1,78,HNL,SFO,Saturday,313


In [9]:
#Remove info that does not tell us anything about delay or no delay 
#Flight numbers cannot be used as unique identifier b/c some flights have same flight number but
#are based out of different airports
airlines_df = airlines_df.drop(columns =['id','Flight'],axis=1)
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Length
0,CO,1,SFO,IAH,Thursday,205
1,US,1,PHX,CLT,Thursday,222
2,AA,1,LAX,DFW,Thursday,165
3,AA,1,SFO,DFW,Thursday,195
4,AS,0,ANC,SEA,Thursday,202
...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,Saturday,326
539379,FL,0,SEA,ATL,Saturday,305
539380,FL,0,SFO,MKE,Saturday,255
539381,UA,1,HNL,SFO,Saturday,313


# Initial Visulizations/Counts Using Plotly

In [10]:
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [11]:
#Create plot grouped by airline to count 
#df = airlines_df[['Airline','Delay']].groupby('Airline').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Airline', y='Delay')
#fig.show()

In [12]:
#Create plot grouped by DayOfWeek to count 
#df = airlines_df[['DayOfWeek','Delay']].groupby('Airline').agg('count').reset_index()
#fig = px.bar(airlines_df, x='DayOfWeek', y='Delay')
#fig.show()

In [13]:

#Create plot grouped by departing airport to count 
#airlines_df = airlines_df[['AirportFrom','Delay']].groupby('AirportFrom').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportFrom', y='Delay')
#fig.show()

In [14]:
#Create plot grouped by departing airport to count 
#airlines_df = airlines_df[['AirportTo','Delay']].groupby('AirportTo').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportTo', y='Delay')
#fig.show()

In [15]:
#Create plot grouped by departing airport to count 
#airlines_df = data[['Length','Delay']].groupby('Length').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Length', y='Delay')
#fig.show()

# Data Preparation

In [16]:
#Change numbers to day of week to later be converted to 1s and 0s - removed from code because already incorporated in updated CSV
#days = {1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday', 6:'Saturday', 7:'Sunday'}

In [17]:
#airlines_df['DayOfWeek'] = airlines_df['DayOfWeek'].map(days) - already updated in CSV
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Length
0,CO,1,SFO,IAH,Thursday,205
1,US,1,PHX,CLT,Thursday,222
2,AA,1,LAX,DFW,Thursday,165
3,AA,1,SFO,DFW,Thursday,195
4,AS,0,ANC,SEA,Thursday,202
...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,Saturday,326
539379,FL,0,SEA,ATL,Saturday,305
539380,FL,0,SFO,MKE,Saturday,255
539381,UA,1,HNL,SFO,Saturday,313


In [18]:
# Create our features
# Convert string columns to numbers and drop 'delay' column then assign to X
X = pd.get_dummies(airlines_df, columns=['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek',]).drop('Delay', axis=1)
X

Unnamed: 0,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,Airline_F9,Airline_FL,...,AirportTo_WRG,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,205,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,222,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,165,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,195,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,202,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539378,326,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
539379,305,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
539380,255,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
539381,313,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [19]:
X.shape

(469504, 610)

In [20]:
# Create our target
y = airlines_df['Delay']
X.head()

Unnamed: 0,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,Airline_F9,Airline_FL,...,AirportTo_WRG,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,205,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,222,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,165,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,195,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,202,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [21]:
X.describe()

Unnamed: 0,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,Airline_F9,Airline_FL,...,AirportTo_WRG,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
count,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,...,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0,469504.0
mean,132.114483,0.03836,0.08453,0.021271,0.033429,0.039282,0.11286,0.051868,0.012049,0.038675,...,0.000109,0.0021,0.000104,0.000622,0.194769,0.18157,0.125571,0.191151,0.154991,0.151948
std,70.053219,0.192063,0.278181,0.144288,0.179754,0.194265,0.316421,0.22176,0.109104,0.192819,...,0.010422,0.045779,0.010215,0.024931,0.396023,0.38549,0.331365,0.393208,0.361897,0.35897
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,655.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
y.value_counts()

0    260933
1    208571
Name: Delay, dtype: int64

In [23]:
## Labels are the values we want to predict
#labels = np.array(X['Delay'])

# Remove the labels from the features
# axis 1 refers to the columns
#X = X.drop('Delay', axis = 1)
# Saving feature names for later use
#X_list = list(X.columns)
# Convert to numpy array
#X = np.array(X)

In [24]:
#d = {}    
# airlines_df = airlines_df.replace(x)

# Split into training and testing splits

In [25]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
from collections import Counter


# Split the data into training and testing sets - used random_state to make the data reproducible and ensure that same rows are assigned to train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [40]:
# Initial imports.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [41]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [42]:
#Check balances
print(Counter(y_train))
print(Counter(y_test))

Counter({0: 195700, 1: 156428})
Counter({0: 65233, 1: 52143})


# Ensemble Learners
## Used to compare which algorithm results in best performance
## Chosen in an effort to improve the accuracy and robustness of the model, decrease variance of the model, and increase overall performance of the model

# Balanced Random Forest Classifier
## Chosen to rank importance of input variables, run efficiently on large datasets, and are robust against overfitting 

In [44]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
random_f = RandomForestClassifier(n_estimators = 128)

In [45]:
#Fit the model
random_f = random_f.fit(X_train_scaled, y_train, sample_weight=None)

In [46]:
# Calculated the balanced accuracy score
y_pred = random_f.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.5927217829903274

In [47]:
# Display the confusion matrix
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)
cm

array([[59245,  5988],
       [37687, 14456]])

In [48]:
#Create dataframe for the cm 
cm_df = pd.DataFrame(
    cm, index=["Actual Not_Delayed", "Actual Delayed"], columns=["Predicted Not_Delayed", "Predicted Delayed"])
cm_df

Unnamed: 0,Predicted Not_Delayed,Predicted Delayed
Actual Not_Delayed,59245,5988
Actual Delayed,37687,14456


In [49]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, y_pred)

In [50]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted Not_Delayed,Predicted Delayed
Actual Not_Delayed,59245,5988
Actual Delayed,37687,14456


Accuracy Score : 0.627905193565976
Classification Report
              precision    recall  f1-score   support

           0       0.61      0.91      0.73     65233
           1       0.71      0.28      0.40     52143

    accuracy                           0.63    117376
   macro avg       0.66      0.59      0.56    117376
weighted avg       0.65      0.63      0.58    117376



In [51]:
# Calculate feature importance in the Random Forest model.
importances = random_f.feature_importances_
importances

array([4.57237371e-01, 2.05857052e-03, 3.32105637e-03, 1.84486492e-03,
       1.95260248e-03, 4.62759930e-03, 4.13599627e-03, 2.08424164e-03,
       1.18156228e-03, 5.78032836e-03, 8.00382429e-04, 3.77144670e-03,
       4.23653641e-03, 3.26007555e-03, 5.50720184e-03, 4.86182275e-03,
       7.16721459e-02, 2.62517883e-03, 5.25062755e-03, 2.60384741e-04,
       6.72113088e-05, 9.98617186e-04, 4.19638232e-05, 2.05470267e-05,
       1.17421188e-04, 3.63402705e-05, 1.49836794e-05, 3.52001367e-05,
       1.77385298e-04, 1.06185027e-04, 6.96418064e-04, 2.45206628e-04,
       3.71893656e-04, 1.68042476e-04, 2.87768504e-03, 3.35679769e-04,
       1.41882059e-03, 3.25030641e-04, 1.35049997e-04, 1.37285485e-04,
       1.15264990e-03, 6.88538296e-05, 1.31676696e-04, 5.67773718e-05,
       3.64642069e-05, 7.89153266e-04, 2.02008558e-04, 2.13547877e-04,
       3.84440391e-05, 1.58030180e-05, 3.28444781e-04, 1.54813625e-03,
       6.15074061e-04, 2.37402788e-03, 5.78175109e-05, 1.24032627e-04,
      

In [52]:
# We can sort the features by their importance.
sorted(zip(random_f.feature_importances_, X.columns), reverse=True)

[(0.45723737067498704, 'Length'),
 (0.07167214589933849, 'Airline_WN'),
 (0.019407803229071065, 'DayOfWeek_Friday'),
 (0.018619079163938953, 'DayOfWeek_Wednesday'),
 (0.016791944049454645, 'DayOfWeek_Tuesday'),
 (0.0162527048292347, 'DayOfWeek_Thursday'),
 (0.015412887221083627, 'DayOfWeek_Saturday'),
 (0.0133182275817866, 'DayOfWeek_Sunday'),
 (0.005780328355744227, 'Airline_FL'),
 (0.0055902547290859, 'AirportFrom_ORD'),
 (0.005507201844888948, 'Airline_UA'),
 (0.005250627546514792, 'Airline_YV'),
 (0.00486182275038647, 'Airline_US'),
 (0.0046275992969647215, 'Airline_CO'),
 (0.004459163279586243, 'AirportFrom_MDW'),
 (0.0042365364092031834, 'Airline_OH'),
 (0.004177986558225411, 'AirportTo_DFW'),
 (0.004135996269982753, 'Airline_DL'),
 (0.0037714466996526196, 'Airline_MQ'),
 (0.0033210563666646532, 'Airline_AA'),
 (0.00330000315939908, 'AirportTo_ATL'),
 (0.0032600755477790503, 'Airline_OO'),
 (0.003009994715061303, 'AirportFrom_LAX'),
 (0.0030012912633895668, 'AirportTo_ORD'),
 (0.

In [38]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.66      0.60      0.62      0.63      0.61      0.37     65233
          1       0.55      0.62      0.60      0.58      0.61      0.37     52143

avg / total       0.61      0.61      0.61      0.61      0.61      0.37    117376

