In [1]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings raised by the
# libraries used below - consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.express as px
import os

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Columns to keep from the raw file, in the order they should appear.
columns = [
    "id",
    "Airline",
    "Delay",
    "Flight",
    "AirportFrom",
    "AirportTo",
    "DayOfWeek",
    "Length",
]

# Name of the label column, kept as a one-element list.
target = ["Delay"]


In [6]:
# Load the flight records into a DataFrame and display it.
# NOTE(review): the path has no file extension - confirm the file on disk is really
# named "new_airlines" (and not "new_airlines.csv").
file_path = Path("new_airlines")
airlines_df = pd.read_csv(file_path)
airlines_df

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Length,Delay
0,1,CO,269,SFO,IAH,Wednesday,205,1
1,2,US,1558,PHX,CLT,Wednesday,222,1
2,3,AA,2400,LAX,DFW,Wednesday,165,1
3,4,AA,2466,SFO,DFW,Wednesday,195,1
4,5,AS,108,ANC,SEA,Wednesday,202,0
...,...,...,...,...,...,...,...,...
539378,539379,CO,178,OGG,SNA,Friday,326,0
539379,539380,FL,398,SEA,ATL,Friday,305,0
539380,539381,FL,609,SFO,MKE,Friday,255,0
539381,539382,UA,78,HNL,SFO,Friday,313,1


In [7]:
# Keep only the columns listed in `columns`, in that order, as an independent copy.
airlines_df = airlines_df[columns].copy()
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Length
0,1,CO,1,269,SFO,IAH,Wednesday,205
1,2,US,1,1558,PHX,CLT,Wednesday,222
2,3,AA,1,2400,LAX,DFW,Wednesday,165
3,4,AA,1,2466,SFO,DFW,Wednesday,195
4,5,AS,0,108,ANC,SEA,Wednesday,202
...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,Friday,326
539379,539380,FL,0,398,SEA,ATL,Friday,305
539380,539381,FL,0,609,SFO,MKE,Friday,255
539381,539382,UA,1,78,HNL,SFO,Friday,313


In [8]:
# Remove columns that are entirely null, then any row that still contains a null.
airlines_df = (
    airlines_df
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='any')
)
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Length
0,1,CO,1,269,SFO,IAH,Wednesday,205
1,2,US,1,1558,PHX,CLT,Wednesday,222
2,3,AA,1,2400,LAX,DFW,Wednesday,165
3,4,AA,1,2466,SFO,DFW,Wednesday,195
4,5,AS,0,108,ANC,SEA,Wednesday,202
...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,Friday,326
539379,539380,FL,0,398,SEA,ATL,Friday,305
539380,539381,FL,0,609,SFO,MKE,Friday,255
539381,539382,UA,1,78,HNL,SFO,Friday,313


In [9]:
# 'id' and 'Flight' say nothing about whether a flight is delayed, so drop them.
# A flight number is not a unique identifier: the same number can be used by
# flights based out of different airports.
airlines_df = airlines_df.drop(columns=['id', 'Flight'])
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Length
0,CO,1,SFO,IAH,Wednesday,205
1,US,1,PHX,CLT,Wednesday,222
2,AA,1,LAX,DFW,Wednesday,165
3,AA,1,SFO,DFW,Wednesday,195
4,AS,0,ANC,SEA,Wednesday,202
...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,Friday,326
539379,FL,0,SEA,ATL,Friday,305
539380,FL,0,SFO,MKE,Friday,255
539381,UA,1,HNL,SFO,Friday,313


# Initial Visualizations/Counts Using Plotly - run as a separate file (see initial_comparison)

In [10]:
import pandas as pd
# Use the standard-library pathlib.Path. Importing Path from the third-party `path`
# package requires an extra dependency and rebinds the pathlib.Path already
# imported at the top of the notebook to a different class.
from pathlib import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [11]:
#Create plot grouped by airline to count 
#df = airlines_df[['Airline','Delay']].groupby('Airline').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Airline', y='Delay')
#fig.show()

In [12]:
#Create plot grouped by DayOfWeek to count 
#df = airlines_df[['DayOfWeek','Delay']].groupby('DayOfWeek').agg('count').reset_index()
#fig = px.bar(airlines_df, x='DayOfWeek', y='Delay')
#fig.show()

In [13]:

#Create plot grouped by departing airport to count 
#airlines_df = airlines_df[['AirportFrom','Delay']].groupby('AirportFrom').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportFrom', y='Delay')
#fig.show()

In [14]:
#Create plot grouped by arrival airport to count
#airlines_df = airlines_df[['AirportTo','Delay']].groupby('AirportTo').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportTo', y='Delay')
#fig.show()

In [15]:
#Create plot grouped by flight length to count
#airlines_df = data[['Length','Delay']].groupby('Length').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Length', y='Delay')
#fig.show()

# Data Preparation

In [16]:
#Change numbers to day of week to later be converted to 1s and 0s - removed from code because already incorporated in updated CSV
#days = {1:'Monday', 2:'Tuesday', 3:'Wednesday', 4:'Thursday', 5:'Friday', 6:'Saturday', 7:'Sunday'}

In [17]:
# The day-number -> day-name mapping below is intentionally disabled: the CSV
# already stores day names.
#airlines_df['DayOfWeek'] = airlines_df['DayOfWeek'].map(days) - already updated in CSV
# Display the cleaned frame before it is encoded.
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Length
0,CO,1,SFO,IAH,Wednesday,205
1,US,1,PHX,CLT,Wednesday,222
2,AA,1,LAX,DFW,Wednesday,165
3,AA,1,SFO,DFW,Wednesday,195
4,AS,0,ANC,SEA,Wednesday,202
...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,Friday,326
539379,FL,0,SEA,ATL,Friday,305
539380,FL,0,SFO,MKE,Friday,255
539381,UA,1,HNL,SFO,Friday,313


In [18]:
# Build the feature matrix: set the 'Delay' label aside, then one-hot encode the
# string columns so that only numeric inputs remain.
categorical_columns = ['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek']
X = pd.get_dummies(airlines_df.drop(columns='Delay'), columns=categorical_columns)
X

Unnamed: 0,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,Airline_F9,Airline_FL,...,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,205,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,222,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,165,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,195,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,202,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539378,326,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
539379,305,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
539380,255,0,0,0,0,0,0,0,0,1,...,0,0,0,1,0,0,0,0,0,0
539381,313,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [19]:
# Show (rows, columns) of the one-hot encoded feature matrix.
X.shape

(539383, 612)

In [20]:
# Target vector: the 'Delay' column of the cleaned frame.
y = airlines_df.loc[:, 'Delay']
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,Airline_F9,Airline_FL,...,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,205,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,222,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,165,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,195,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,202,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [21]:
# Summary statistics for every feature column.
X.describe()

Unnamed: 0,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,Airline_F9,Airline_FL,...,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
count,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,...,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0,539383.0
mean,132.202007,0.038351,0.084645,0.021267,0.033579,0.039152,0.112981,0.05188,0.011969,0.038613,...,0.002082,0.000106,0.000627,0.158047,0.134912,0.109303,0.129554,0.169536,0.132262,0.166386
std,70.117016,0.192043,0.278353,0.144273,0.180143,0.193957,0.31657,0.221784,0.108747,0.19267,...,0.045582,0.010279,0.025025,0.364786,0.341629,0.312019,0.335812,0.375225,0.338776,0.372427
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,162.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,655.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [22]:
# Count the rows in each 'Delay' class to check how balanced the target is.
y.value_counts()

0    299119
1    240264
Name: Delay, dtype: int64

In [23]:
## Labels are the values we want to predict
#labels = np.array(X['Delay'])

# Remove the labels from the features
# axis 1 refers to the columns
#X = X.drop('Delay', axis = 1)
# Saving feature names for later use
#X_list = list(X.columns)
# Convert to numpy array
#X = np.array(X)

In [24]:
#d = {}    
# airlines_df = airlines_df.replace(x)

# Split into training and testing splits

In [64]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
from collections import Counter


# Hold out 30% of the rows for testing. A fixed random_state makes the split
# reproducible, so the same rows land in the train and test sets on every run.
# The previous random_state=None drew a different split each time, so the split
# and the metrics reported further down could not be reproduced.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)

In [65]:
# Initial imports.
import pandas as pd
# Use the standard-library pathlib.Path. Importing Path from the third-party `path`
# package requires an extra dependency and rebinds the pathlib.Path already
# imported at the top of the notebook to a different class.
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [66]:
# Creating a StandardScaler instance.
#scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
#X_scaler = scaler.fit(X_train)

# Scaling the data.
#X_train_scaled = X_scaler.transform(X_train)
#X_test_scaled = X_scaler.transform(X_test)

In [68]:
# Check the class balance of each split (training first, then testing).
for split_labels in (y_train, y_test):
    print(Counter(split_labels))

Counter({0: 209318, 1: 168250})
Counter({0: 89801, 1: 72014})


# Ensemble Learners
## Used to compare which algorithm results in best performance
## Chosen in an effort to improve the accuracy and robustness of the model, decrease variance of the model, and increase overall performance of the model

# Random Forest Classifier
## Chosen because random forests rank the importance of input variables, run efficiently on large datasets, and are robust against overfitting

In [69]:
# Build a RandomForestClassifier with 128 trees and a fixed random_state.
# NOTE(review): BalancedRandomForestClassifier is imported here but never used -
# the model instantiated below is scikit-learn's plain RandomForestClassifier.
from imblearn.ensemble import BalancedRandomForestClassifier
random_f = RandomForestClassifier(n_estimators = 128, random_state=42)

In [70]:
# Train the forest on the training split. Sample weights are left at their
# default, and the result is assigned back so the cell displays nothing.
random_f = random_f.fit(X=X_train, y=y_train)

In [71]:
# Predict the 'Delay' class for every row of the held-out test features,
# then display the resulting array.
predictions = random_f.predict(X_test)
predictions

array([0, 0, 0, ..., 1, 1, 1])

In [72]:
# Calculate the balanced accuracy score.
# Reuse the test-set predictions already computed in the preceding cell instead of
# running a second, identical predict pass over X_test.
y_pred = predictions
balanced_accuracy_score(y_test, y_pred)

0.6045733905724486

In [73]:
# Build the confusion matrix from the true and predicted test labels, then display it.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[59779, 30022],
       [32877, 39137]])

In [74]:
# Wrap the confusion matrix in a labelled DataFrame so it is easier to read.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual Not_Delayed", "Actual Delayed"],
    columns=["Predicted Not_Delayed", "Predicted Delayed"],
)
cm_df

Unnamed: 0,Predicted Not_Delayed,Predicted Delayed
Actual Not_Delayed,59779,30022
Actual Delayed,32877,39137


In [75]:
# Plain (unbalanced) accuracy of the predictions on the test split.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)
acc_score

0.6112906714457869

In [76]:
# Summarise the evaluation: confusion matrix, accuracy, then the per-class report.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report", classification_report(y_test, y_pred), sep="\n")

Confusion Matrix


Unnamed: 0,Predicted Not_Delayed,Predicted Delayed
Actual Not_Delayed,59779,30022
Actual Delayed,32877,39137


Accuracy Score : 0.6112906714457869
Classification Report
              precision    recall  f1-score   support

           0       0.65      0.67      0.66     89801
           1       0.57      0.54      0.55     72014

    accuracy                           0.61    161815
   macro avg       0.61      0.60      0.60    161815
weighted avg       0.61      0.61      0.61    161815



In [77]:
# Read the per-feature importance scores from the fitted Random Forest model
# and display the raw array.
importances = random_f.feature_importances_
importances

array([4.56885434e-01, 1.97719920e-03, 3.17505850e-03, 1.97365751e-03,
       1.84696399e-03, 4.64097713e-03, 4.00590263e-03, 2.13663990e-03,
       1.13020722e-03, 4.77430740e-03, 7.17847498e-04, 3.84147271e-03,
       3.98777219e-03, 3.14787724e-03, 5.46297359e-03, 5.13468511e-03,
       6.37096440e-02, 2.46559182e-03, 5.31822288e-03, 2.58092642e-04,
       9.29712892e-05, 1.06397379e-03, 3.94302402e-06, 4.78758243e-05,
       2.51559513e-05, 1.11778643e-04, 4.19384748e-05, 1.36452536e-05,
       2.77003263e-05, 1.61491709e-04, 1.40268636e-04, 7.48388464e-04,
       2.57287624e-04, 3.47088499e-04, 1.91286592e-04, 2.81420491e-03,
       2.67971184e-04, 1.45286131e-03, 2.68282184e-04, 1.44892955e-04,
       1.60375477e-04, 1.10230973e-03, 6.91334688e-05, 1.50491943e-04,
       5.77394948e-05, 4.51116735e-05, 8.21923269e-04, 2.13125169e-04,
       2.23621313e-04, 3.93182155e-05, 1.66024939e-05, 2.86984413e-04,
       1.45843048e-03, 5.58860142e-04, 2.26544371e-03, 5.80186416e-05,
      

In [78]:
# Pair every importance score with its feature name and list the pairs from the
# most to the least important.
importance_pairs = list(zip(random_f.feature_importances_, X.columns))
importance_pairs.sort(reverse=True)
importance_pairs

[(0.45688543449091334, 'Length'),
 (0.06370964404935864, 'Airline_WN'),
 (0.019147734221829665, 'DayOfWeek_Thursday'),
 (0.018799388931829975, 'DayOfWeek_Tuesday'),
 (0.017773646087613528, 'DayOfWeek_Sunday'),
 (0.017645191297543987, 'DayOfWeek_Monday'),
 (0.015937261469867566, 'DayOfWeek_Wednesday'),
 (0.014757673710975914, 'DayOfWeek_Friday'),
 (0.012062734901788419, 'DayOfWeek_Saturday'),
 (0.005462973589413295, 'Airline_UA'),
 (0.0053182228838776924, 'Airline_YV'),
 (0.00513468510907757, 'Airline_US'),
 (0.005021087875409323, 'AirportFrom_ORD'),
 (0.004774307402136019, 'Airline_FL'),
 (0.004640977134271785, 'Airline_CO'),
 (0.004157821343145359, 'AirportFrom_MDW'),
 (0.004005902631075975, 'Airline_DL'),
 (0.003987772189768624, 'Airline_OH'),
 (0.0038414727106356613, 'Airline_MQ'),
 (0.003731719978276882, 'AirportTo_DFW'),
 (0.0031750584994072986, 'Airline_AA'),
 (0.0031478772369977343, 'Airline_OO'),
 (0.0030516075456975982, 'AirportFrom_LAX'),
 (0.0030441691632468936, 'AirportTo_A

In [79]:
# Print imbalanced-learn's classification report for the test-set predictions
# (y_test vs. y_pred).
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.65      0.67      0.54      0.66      0.60      0.37     89801
          1       0.57      0.54      0.67      0.55      0.60      0.36     72014

avg / total       0.61      0.61      0.60      0.61      0.60      0.36    161815

