In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import plotly.graph_objects as go
import plotly.express as px
import os

In [3]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

# Read the CSV and Perform Basic Data Cleaning

In [4]:
from sklearn.preprocessing import LabelEncoder

In [7]:
# Columns to keep from the raw CSV, listed in the order they should appear
# in the working DataFrame.
columns = [
    "id",
    "Airline",
    "Delay",
    "Flight",
    "AirportFrom",
    "AirportTo",
    "DayOfWeek",
    "Time",
    "Length",
]

# Column(s) holding the value we want to predict.
target = ["Delay"]


In [8]:
# Path to the raw flight records (relative, so it is resolved against the
# notebook's working directory).
file_path = Path("Airlines.csv")
# Parse the CSV into a DataFrame and preview its first five rows.
airlines_df = pd.read_csv(filepath_or_buffer=file_path)
airlines_df.head(n=5)

Unnamed: 0,id,Airline,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length,Delay
0,1,CO,269,SFO,IAH,3,15,205,1
1,2,US,1558,PHX,CLT,3,15,222,1
2,3,AA,2400,LAX,DFW,3,20,165,1
3,4,AA,2466,SFO,DFW,3,20,195,1
4,5,AS,108,ANC,SEA,3,30,202,0


In [9]:
# Keep only the columns named in `columns`, in that order, and work on an
# independent copy so later edits never touch the frame returned by read_csv.
airlines_df = airlines_df[columns].copy()
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length
0,1,CO,1,269,SFO,IAH,3,15,205
1,2,US,1,1558,PHX,CLT,3,15,222
2,3,AA,1,2400,LAX,DFW,3,20,165
3,4,AA,1,2466,SFO,DFW,3,20,195
4,5,AS,0,108,ANC,SEA,3,30,202
...,...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,5,1439,326
539379,539380,FL,0,398,SEA,ATL,5,1439,305
539380,539381,FL,0,609,SFO,MKE,5,1439,255
539381,539382,UA,1,78,HNL,SFO,5,1439,313


In [10]:
# Two-stage null cleanup:
#   1. discard every column in which all values are missing;
#   2. discard every row that still contains at least one missing value.
airlines_df = (
    airlines_df
    .dropna(axis=1, how='all')
    .dropna(axis=0, how='any')
)
airlines_df

Unnamed: 0,id,Airline,Delay,Flight,AirportFrom,AirportTo,DayOfWeek,Time,Length
0,1,CO,1,269,SFO,IAH,3,15,205
1,2,US,1,1558,PHX,CLT,3,15,222
2,3,AA,1,2400,LAX,DFW,3,20,165
3,4,AA,1,2466,SFO,DFW,3,20,195
4,5,AS,0,108,ANC,SEA,3,30,202
...,...,...,...,...,...,...,...,...,...
539378,539379,CO,0,178,OGG,SNA,5,1439,326
539379,539380,FL,0,398,SEA,ATL,5,1439,305
539380,539381,FL,0,609,SFO,MKE,5,1439,255
539381,539382,UA,1,78,HNL,SFO,5,1439,313


In [11]:
# Drop the columns that tell us nothing about delay vs. no delay.
# Flight numbers cannot serve as a unique identifier because some flights share
# the same flight number but are based out of different airports.
# `columns=` already names the axis, so no separate `axis` argument is passed.
uninformative = ['id', 'Flight']
airlines_df = airlines_df.drop(columns=uninformative)
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Time,Length
0,CO,1,SFO,IAH,3,15,205
1,US,1,PHX,CLT,3,15,222
2,AA,1,LAX,DFW,3,20,165
3,AA,1,SFO,DFW,3,20,195
4,AS,0,ANC,SEA,3,30,202
...,...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,5,1439,326
539379,FL,0,SEA,ATL,5,1439,305
539380,FL,0,SFO,MKE,5,1439,255
539381,UA,1,HNL,SFO,5,1439,313


# Initial Visualizations/Counts Using Plotly

In [12]:
# Imports for the modelling part of the notebook.
# NOTE(review): `pd` and `confusion_matrix` were already imported in earlier
# cells; repeating them here is harmless but redundant.
import pandas as pd
# NOTE(review): this rebinds `Path`, which an earlier cell imported from the
# standard-library `pathlib`. `path` appears to be a separate third-party
# package — confirm it is installed and that the rebinding is intended.
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [13]:
#Create plot grouped by airline to count 
#df = airlines_df[['Airline','Delay']].groupby('Airline').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Airline', y='Delay')
#fig.show()

In [14]:
#Create plot grouped by DayOfWeek to count 
#df = airlines_df[['DayOfWeek','Delay']].groupby('Airline').agg('count').reset_index()
#fig = px.bar(airlines_df, x='DayOfWeek', y='Delay')
#fig.show()

In [15]:

#Create plot grouped by departing airport to count 
#airlines_df = airlines_df[['AirportFrom','Delay']].groupby('AirportFrom').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportFrom', y='Delay')
#fig.show()

In [16]:
#Create plot grouped by destination airport (AirportTo) to count
#airlines_df = airlines_df[['AirportTo','Delay']].groupby('AirportTo').agg('count').reset_index()
#fig = px.bar(airlines_df, x='AirportTo', y='Delay')
#fig.show()

In [17]:
#Create plot grouped by flight length (Length) to count
#airlines_df = data[['Length','Delay']].groupby('Length').agg('count').reset_index()
#fig = px.bar(airlines_df, x='Length', y='Delay')
#fig.show()

# Data Preparation

In [18]:
# Lookup table from the numeric day-of-week code (1-7) to the day's name.
# The names are turned into 1/0 indicator columns later on.
days = dict(
    enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
)

In [19]:
# Replace the numeric day codes with day names.
# Values that are not keys of `days` are passed through unchanged. This makes
# the cell safe to re-run: once the column already holds names, a plain
# `.map(days)` would find no matching keys and silently turn every value into
# NaN, whereas this version leaves the names intact.
airlines_df['DayOfWeek'] = airlines_df['DayOfWeek'].map(lambda code: days.get(code, code))
airlines_df

Unnamed: 0,Airline,Delay,AirportFrom,AirportTo,DayOfWeek,Time,Length
0,CO,1,SFO,IAH,Wednesday,15,205
1,US,1,PHX,CLT,Wednesday,15,222
2,AA,1,LAX,DFW,Wednesday,20,165
3,AA,1,SFO,DFW,Wednesday,20,195
4,AS,0,ANC,SEA,Wednesday,30,202
...,...,...,...,...,...,...,...
539378,CO,0,OGG,SNA,Friday,1439,326
539379,FL,0,SEA,ATL,Friday,1439,305
539380,FL,0,SFO,MKE,Friday,1439,255
539381,UA,1,HNL,SFO,Friday,1439,313


In [20]:
# Create our features.
# One-hot encode the categorical string columns. 'Delay' is deliberately still
# part of the frame at this point; it is separated into the label array and
# dropped from the features in a later cell.
# The indicator dtype is pinned to uint8 so the new columns are numeric 0/1 on
# every pandas version (pandas 2.0 changed the default to bool). This keeps the
# later np.array(X) conversion from mixing bool and integer columns.
X = pd.get_dummies(
    airlines_df,
    columns=['Airline', 'AirportFrom', 'AirportTo', 'DayOfWeek'],
    dtype=np.uint8,
)
X

Unnamed: 0,Delay,Time,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,...,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,1,15,205,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,15,222,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,20,165,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,20,195,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,30,202,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
539378,0,1439,326,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
539379,0,1439,305,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
539380,0,1439,255,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
539381,1,1439,313,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [21]:
# Dimensions of the encoded frame as (rows, columns); the column count still
# includes 'Delay', which has not been dropped from X yet.
X.shape

(539383, 614)

In [22]:
# Create our target: the 'Delay' column of the cleaned frame.
y = airlines_df.loc[:, 'Delay']
# Preview the first five rows of the encoded frame (it still carries 'Delay').
X.head(n=5)

Unnamed: 0,Delay,Time,Length,Airline_9E,Airline_AA,Airline_AS,Airline_B6,Airline_CO,Airline_DL,Airline_EV,...,AirportTo_XNA,AirportTo_YAK,AirportTo_YUM,DayOfWeek_Friday,DayOfWeek_Monday,DayOfWeek_Saturday,DayOfWeek_Sunday,DayOfWeek_Thursday,DayOfWeek_Tuesday,DayOfWeek_Wednesday
0,1,15,205,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
1,1,15,222,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,1,20,165,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,1,20,195,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,30,202,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [23]:
# The labels are the values we want to predict: copy the 'Delay' column out
# of the encoded frame into a NumPy array.
labels = np.array(X.loc[:, 'Delay'])

# The target must not remain among the model inputs, so drop its column.
X = X.drop(columns=['Delay'])
# Remember the feature names now; they are lost once X becomes a bare array.
X_list = X.columns.tolist()
# From here on X is a plain NumPy array rather than a DataFrame.
X = np.array(X)

# Split into training and testing sets

In [24]:
# Use scikit-learn to divide the data into training and testing sets.
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
(train_X, test_X,
 train_labels, test_labels) = train_test_split(
    X,
    labels,
    test_size=0.25,
    random_state=42,
)

In [25]:
# Sanity-check the split by printing the shape of each piece. If the split is
# correct, the training features and the testing features have the same number
# of columns.
for caption, split_array in (
    ('Training Features Shape:', train_X),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_X),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, split_array.shape)

Training Features Shape: (404537, 613)
Training Labels Shape: (404537,)
Testing Features Shape: (134846, 613)
Testing Labels Shape: (134846,)
