In [11]:
pip list

Package                            Version
---------------------------------- -------------------
absl-py                            1.2.0
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
argh                               0.26.2
asn1crypto                         1.3.0
astroid                            2.4.2
astropy                            4.0.1.post1
astunparse                         1.6.3
atomicwrites                       1.4.0
attrs                              19.3.0
autopep8                           1.5.3
Babel                              2.8.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.4
backports.shutil-get-terminal-size 1.0.0
backports.tempfile                 1.0
backports.weakref                  1.0.post1
bcrypt                             3.1.7
beautifulsoup4                     4.9.1
bitarray                     



In [74]:
#All the necessary packages to run the code in this file
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from math import sqrt

import keras
from keras.models import Sequential
from keras.layers import Dense
from keras import backend as ker

In [75]:
#importing the dataset
df = pd.read_csv('CDS492_Dataset.csv') 

In [76]:
#creating a new dataset with the variables of interest
df2 = df[['Carrier_Name', 'Delays_Over_Flights', 'Employees_Over_Flights', 'Passengers_Over_Flights']]
print(df2.shape)
df2.describe()

(1344, 4)


Unnamed: 0,Delays_Over_Flights,Employees_Over_Flights,Passengers_Over_Flights
count,1344.0,1344.0,1344.0
mean,0.041969,0.733795,95.976385
std,0.027366,0.854767,61.444475
min,0.0,0.0,0.0
25%,0.02661,0.219882,47.18485
50%,0.041558,0.518705,119.77352
75%,0.0582,1.013264,148.265143
max,0.183047,11.683366,201.00111


In [77]:
#creating dummy values for the categorical variable Carrier Name
unit_col_name='Carrier_Name'

df_dummies = pd.get_dummies(df[unit_col_name])

df2_with_dummies = df2.join(df_dummies)

In [78]:
df2_with_dummies

Unnamed: 0,Carrier_Name,Delays_Over_Flights,Employees_Over_Flights,Passengers_Over_Flights,Alaska_Airlines,Allegiant_Air,American_Airlines,Delta_Air_Lines,Endeavor,Envoy_Air,Frontier_Airlines,Hawaiian_Airlines,JetBlue_Airways,Mesa_Airlines,PSA_Airlines,Republic_Airline,SkyWest_Airlines,Southwest_Airlines,Spirit_Air_Lines,United_Air_Lines
0,Endeavor,0.036250,0.230664,50.725178,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,American_Airlines,0.062584,1.362790,169.241256,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Alaska_Airlines,0.084691,1.010009,137.334261,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,JetBlue_Airways,0.139842,0.976347,142.952221,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
4,Delta_Air_Lines,0.075346,1.191750,151.452237,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1339,SkyWest_Airlines,0.058195,0.217567,45.478114,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
1340,United_Air_Lines,0.085845,2.186847,175.845449,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1341,Southwest_Airlines,0.059475,0.473151,100.321745,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1342,Mesa_Airlines,0.000000,0.000000,0.000000,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [79]:
#Designating explanatory and response variables for later use
df2_with_dummies = df2_with_dummies.drop('Carrier_Name', axis=1)
target_column = ['Delays_Over_Flights']
predictors = list(set(list(df2_with_dummies.columns))-set(target_column))

In [91]:
#Train-test split the data
X = df2_with_dummies[predictors].values
y = df2_with_dummies[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=40)
print(X_train.shape); print(X_test.shape)

(940, 18)
(404, 18)


In [92]:
#Keras model creation with 4 layers total which include 2 hidden layers
model = Sequential()
model.add(Dense(500, input_dim=18, activation= "relu"))
model.add(Dense(100, activation= "relu"))
model.add(Dense(50, activation= "relu"))
model.add(Dense(1))

In [93]:
#Compiling the model with the adam optimizer and fitting it to the train data
model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])
model.fit(X_train, y_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1cad7d86700>

In [94]:
#Predicting using the train data and finding performance metrics for the model
pred_train= model.predict(X_train)
print("Mean Squared Error: ", mean_squared_error(y_train, pred_train))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_train,pred_train)))
print("R2 Score: ", r2_score(y_train, pred_train))
print("Mean Absolute Error: ", mean_absolute_error(y_train, pred_train))

#Predicting using the test data and finding performance metrics for the model
pred= model.predict(X_test)
print("Mean Squared Error: ", mean_squared_error(y_test, pred))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test,pred)))
print("R2 Score: ", r2_score(y_test, pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test, pred))

Mean Squared Error:  0.0002335945468752036
Root Mean Squared Error:  0.015283800145094924
R2 Score:  0.683867090426757
Mean Absolute Error:  0.010546479139512408
Mean Squared Error:  0.00024927907921439756
Root Mean Squared Error:  0.015788574324947693
R2 Score:  0.6762975931983458
Mean Absolute Error:  0.011371575284330415


In [123]:
#New dataset using the airline data which was determined to have a strong correlation
df3 = df2[df2['Carrier_Name'].isin(['Endeavor', 'Allegiant_Air', 'Envoy_Air', 'PSA_Airlines'])]

In [124]:
#Looking at the correlation of the new dataset
df3.corr()

Unnamed: 0,Delays_Over_Flights,Employees_Over_Flights,Passengers_Over_Flights
Delays_Over_Flights,1.0,0.623206,0.880081
Employees_Over_Flights,0.623206,1.0,0.604622
Passengers_Over_Flights,0.880081,0.604622,1.0


In [125]:
#Creating a train-test split for the new dataset
df3 = df3.drop('Carrier_Name', axis=1)
target_column = ['Delays_Over_Flights']
predictors = list(set(list(df3.columns))-set(target_column))

X_new = df3[predictors].values
y_new = df3[target_column].values

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_new, y_new, test_size=0.30, random_state=40)
print(X_train_new.shape); print(X_test_new.shape)

(235, 2)
(101, 2)


In [126]:
#New keras model for the new dataset with the same amount of layers and hidden layers as the last model
model_select = Sequential()
model_select.add(Dense(500, input_dim=2, activation= "relu"))
model_select.add(Dense(100, activation= "relu"))
model_select.add(Dense(50, activation= "relu"))
model_select.add(Dense(1))

In [127]:
#Compile and fit data using the adam optimizer
model_select.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])
model_select.fit(X_train_new, y_train_new, epochs=25)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x1cad91f8f40>

In [128]:
#Predict data using the train data and find the performance metrics for the model
pred_train= model_select.predict(X_train_new)
print("Mean Squared Error: ", mean_squared_error(y_train_new, pred_train))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_train_new,pred_train)))
print("R2 Score: ", r2_score(y_train_new, pred_train))
print("Mean Absolute Error: ", mean_absolute_error(y_train_new, pred_train))

#Predict data using the test data and find the performance metrics for the model
pred= model_select.predict(X_test_new)
print("Mean Squared Error: ", mean_squared_error(y_test_new, pred))
print("Root Mean Squared Error: ", np.sqrt(mean_squared_error(y_test_new,pred)))
print("R2 Score: ", r2_score(y_test_new, pred))
print("Mean Absolute Error: ", mean_absolute_error(y_test_new, pred))

Mean Squared Error:  0.00010181114491626289
Root Mean Squared Error:  0.010090150886694554
R2 Score:  0.8333614341850994
Mean Absolute Error:  0.0055429413648833276
Mean Squared Error:  9.829163706563589e-05
Root Mean Squared Error:  0.009914213890452227
R2 Score:  0.8303707634103078
Mean Absolute Error:  0.0053672494924434655
