In [1]:
import warnings
# Install a blanket 'ignore' filter: no category or module is given, so every
# warning raised after this cell is silenced for the rest of the session.
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
import plotly.express as px
import os

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
from sklearn.preprocessing import LabelEncoder

In [5]:
# Columns to keep from the raw file, in the order they should appear
columns = [
    "id",
    "Airline",
    "Delay",
    "Flight",
    "AirportFrom",
    "AirportTo",
    "DayOfWeek",
    "Length",
]

# Name of the label column, kept as a one-element list
target = ["Delay"]


In [6]:
# Load the flight records. The path is relative (and has no extension), so it is
# resolved against the kernel's current working directory.
file_path = Path("new_airlines")
airlines_df = pd.read_csv(filepath_or_buffer=file_path)
airlines_df

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Length,Delay
0,1,CO,269,SFO,IAH,Thursday,205,1
1,2,US,1558,PHX,CLT,Thursday,222,1
2,3,AA,2400,LAX,DFW,Thursday,165,1
3,4,AA,2466,SFO,DFW,Thursday,195,1
4,5,AS,108,ANC,SEA,Thursday,202,0
...,...,...,...,...,...,...,...,...
539378,539379,CO,178,OGG,SNA,Saturday,326,0
539379,539380,FL,398,SEA,ATL,Saturday,305,0
539380,539381,FL,609,SFO,MKE,Saturday,255,0
539381,539382,UA,78,HNL,SFO,Saturday,313,1


In [7]:
#Read data into pandas
#data = pd.read_csv("Airlines.csv")

# Keep only the columns named in `columns` (in that order) as an independent copy
airlines_df = airlines_df[columns].copy()
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Length
0,1,CO,1,269,SFO,IAH,Thursday,205
1,2,US,1,1558,PHX,CLT,Thursday,222
2,3,AA,1,2400,LAX,DFW,Thursday,165
3,4,AA,1,2466,SFO,DFW,Thursday,195
4,5,AS,0,108,ANC,SEA,Thursday,202
...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,Saturday,326
539379,539380,FL,0,398,SEA,ATL,Saturday,305
539380,539381,FL,0,609,SFO,MKE,Saturday,255
539381,539382,UA,1,78,HNL,SFO,Saturday,313


In [8]:
# Two-step null clean-up:
#   1. discard columns in which every value is null
#   2. discard any row that still contains at least one null
airlines_df = (
    airlines_df
    .dropna(axis='columns', how='all')
    .dropna()
)
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Length
0,1,CO,1,269,SFO,IAH,Thursday,205
1,2,US,1,1558,PHX,CLT,Thursday,222
2,3,AA,1,2400,LAX,DFW,Thursday,165
3,4,AA,1,2466,SFO,DFW,Thursday,195
4,5,AS,0,108,ANC,SEA,Thursday,202
...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,Saturday,326
539379,539380,FL,0,398,SEA,ATL,Saturday,305
539380,539381,FL,0,609,SFO,MKE,Saturday,255
539381,539382,UA,1,78,HNL,SFO,Saturday,313


In [9]:
# 'id' and 'Flight' are removed because they do not tell us anything about delay or no delay.
# A flight number is not a unique identifier either: some flights share the same
# number but are based out of different airports.
airlines_df = airlines_df.drop(columns=['id', 'Flight'])
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Length
0,CO,1,SFO,IAH,Thursday,205
1,US,1,PHX,CLT,Thursday,222
2,AA,1,LAX,DFW,Thursday,165
3,AA,1,SFO,DFW,Thursday,195
4,AS,0,ANC,SEA,Thursday,202
...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,Saturday,326
539379,FL,0,SEA,ATL,Saturday,305
539380,FL,0,SFO,MKE,Saturday,255
539381,UA,1,HNL,SFO,Saturday,313


# Initial Visualizations/Counts Using Plotly

In [None]:
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [None]:
#Create plot grouped by airline to count
# Aggregate first: one row per airline, with 'Delay' holding the number of
# (non-null) flight records for that airline.
df = airlines_df[['Airline','Delay']].groupby('Airline').agg('count').reset_index()

# Plot the aggregated frame. The original call passed the raw `airlines_df`,
# leaving `df` unused and asking plotly to draw one bar segment per flight row.
fig = px.bar(
    df,
    x='Airline',
    y='Delay',
    labels={'Delay': 'Number of flights'},
    title='Flight records per airline',
)
fig.show()

In [None]:
#Create plot grouped by DayOfWeek to count 
#df = airlines_df[['DayOfWeek','Delay']].groupby('Airline').agg('count').reset_index()
#fig = px.bar(airlines_df, x='DayOfWeek', y='Delay')
#fig.show()

In [None]:

#Create plot grouped by departing airport to count 
#airlines_df = airlines_df[['AirportFrom','Delay']].groupby('AirportFrom').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportFrom', y='Delay')
#fig.show()

In [None]:
#Create plot grouped by arrival airport to count 
#airlines_df = airlines_df[['AirportTo','Delay']].groupby('AirportTo').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportTo', y='Delay')
#fig.show()

In [None]:
#Create plot grouped by flight length to count 
#airlines_df = data[['Length','Delay']].groupby('Length').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Length', y='Delay')
#fig.show()

# Data Preparation

In [None]:
# Lookup from day number (1 = Monday ... 7 = Sunday) to day name; the names are
# later one-hot encoded into 1s and 0s.
days = dict(zip(
    range(1, 8),
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
))

In [None]:
# Translate numeric day codes (1-7) into day names.
# `replace` is used instead of `map` because `map` turns every value that is not a
# key of `days` into NaN. The frame loaded above already shows day names
# ('Thursday', 'Saturday', ...), so `map` would blank out the whole column and
# get_dummies would then produce no DayOfWeek features at all.
# `replace` leaves values that are not keys untouched, which also makes this cell
# safe to re-run.
airlines_df['DayOfWeek'] = airlines_df['DayOfWeek'].replace(days)
airlines_df

In [None]:
# Build the feature matrix:
#   - remove the label column 'Delay' so it cannot leak into the features
#   - one-hot encode the string columns; the remaining columns pass through unchanged
categorical_cols = ['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek']
X = pd.get_dummies(airlines_df.drop('Delay', axis=1), columns=categorical_cols)
X

In [None]:
# Show the (rows, columns) dimensions of the encoded feature matrix X
X.shape

In [None]:
# Target vector: the 'Delay' column of the cleaned frame
y = airlines_df.loc[:, 'Delay']

# Preview the first rows of the feature matrix
X.head()

In [None]:
# Convert the target column values to delayed and not delayed based on their values
#y = {'0': 'no delay'}   
#airlines_df = airlines_df.replace(y)

#airlines_df.reset_index(inplace=True, drop=True)
#airlines_df

In [None]:
# Summary statistics for the columns of the feature matrix
X.describe()

In [None]:
# Count how many rows fall into each target class, to see how balanced the classes are
y.value_counts()

In [None]:
## Labels are the values we want to predict
#labels = np.array(X['Delay'])

# Remove the labels from the features
# axis 1 refers to the columns
#X = X.drop('Delay', axis = 1)
# Saving feature names for later use
#X_list = list(X.columns)
# Convert to numpy array
#X = np.array(X)

In [None]:
#d = {}    
# airlines_df = airlines_df.replace(x)

# Split into training and testing splits

In [None]:
# Using scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Stratify on y so the split is made with respect to the class labels;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

In [None]:
#Check balances
# Counter was never imported anywhere in the notebook, so this cell raised
# NameError on a fresh kernel; import it here where it is used.
from collections import Counter

# Class counts in each split
print(Counter(y_train))
print(Counter(y_test))

# Ensemble Learners
## Used to compare which algorithm results in best performance
## Chosen in an effort to improve the accuracy and robustness of the model, decrease variance of the model, and increase overall performance of the model

# Balanced Random Forest Classifier
## Chosen because it ranks the importance of input variables, runs efficiently on large datasets, and is robust against overfitting

In [None]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

# Seed the forest so that the fitted model, and every metric computed from it
# below, is reproducible across kernel restarts. The seed matches the one used
# for train_test_split.
random_f = BalancedRandomForestClassifier(n_estimators=100, random_state=1)

In [None]:
# Train the forest on the training split (no per-sample weights are supplied)
random_f = random_f.fit(X_train, y_train)

In [None]:
# Predict on the held-out test split, then report the balanced accuracy score
y_pred = random_f.predict(X_test)
balanced_accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Compute the confusion matrix of actual vs. predicted labels and display it
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# Wrap the confusion matrix in a labelled DataFrame for readable display
row_labels = ["Actual 0", "Actual 1"]
col_labels = ["Predicted 0", "Predicted 1"]
cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)
cm_df

In [None]:
# Plain (unbalanced) accuracy of the predictions on the test split
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Summarise the evaluation: confusion matrix table, accuracy, per-class report
print("Confusion Matrix")
display(cm_df)

print(f"Accuracy Score : {acc_score}")

print("Classification Report")
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Read the per-feature importance scores from the fitted forest and display them.
# The array is positional; the cell that zips it with X.columns pairs each score
# with its feature name.
importances = random_f.feature_importances_
importances

In [None]:
# Pair every importance score with its feature name and list them from most to
# least important.
ranked_features = list(zip(random_f.feature_importances_, X.columns))
ranked_features.sort(reverse=True)
ranked_features

In [None]:
# Print imbalanced-learn's classification report for the test-set predictions
print(classification_report_imbalanced(y_test, y_pred))