In [97]:
# Import Dependencies
from path import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from collections import defaultdict
import pickle, os
import tensorflow as tf


# Preliminary data preprocessing
Original Dataset was provided by a Kaggle Project.  For the intial analysis of the dataset the information was imported to a local directory and then imported read into this jupyter notebook.  Later we will have the connection to the database here. The dataset already came as a csv file so no further processing was necessary.  

Merging of the weather data was accomplished by using the World Weather Online API.  The download of the required information was achieved in a seperate jupyter notebook.  Data was formated as a simple csv file and was imported below.

In [120]:
# Import testing dataset
flights_2019 = pd.read_csv('Database/Data/jan_19_clean.csv')
flights_2020 = pd.read_csv('Database/Data/jan_20_clean.csv')

print('Total Flights for 2019: ', len(flights_2019)
      , '  Total Flights for 2020: ', len(flights_2020) )

flights_2019.head()

Total Flights for 2019:  565963   Total Flights for 2020:  599268


Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,2,20363,9E,N8688C,3280,11953,1195302,GNV,10397,1039707,ATL,601.0,0.0,0600-0659,722.0,0.0,0,0,300
1,1,2,20363,9E,N348PQ,3281,13487,1348702,MSP,11193,1119302,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0,0,596
2,1,2,20363,9E,N8896A,3282,11433,1143302,DTW,11193,1119302,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0,0,229
3,1,2,20363,9E,N8886A,3283,15249,1524906,TLH,10397,1039707,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0,0,223
4,1,2,20363,9E,N8974C,3284,10397,1039707,ATL,11778,1177801,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0,0,579


In [121]:
flights_2019['Year'] = 2019
flights_2020['Year'] = 2020

In [122]:
len(flights_2020.ORIGIN.unique())

351

In [123]:
# Import Weather Data
weather_2018 = pd.read_csv('Database/weather/weather_data/weather_2019_df.csv')
weather_2019 = pd.read_csv('Database/weather/weather_data/weather_2019_df.csv')
weather_2020 = pd.read_csv('Database/weather/weather_data/weather_2020_df.csv')

# Combine the weather dataframes
weather_df = pd.concat([weather_2018, weather_2019, weather_2020])

weather_df.head()

Unnamed: 0,location,date_time,precipMM,visibility,cloudcover,windspeedKmph,humidity
0,"islip,ny",2019-01-01 00:00:00,8.5,5,100,10,91
1,"islip,ny",2019-01-01 01:00:00,4.3,5,100,12,93
2,"islip,ny",2019-01-01 02:00:00,4.3,6,100,15,94
3,"islip,ny",2019-01-01 03:00:00,5.8,6,100,18,96
4,"islip,ny",2019-01-01 04:00:00,2.9,7,98,20,96


In [124]:
len(weather_df.location.unique())

318

In [125]:
len(weather_df)

709776

In [140]:
# Import airport city information
city_lookup_df = pd.read_csv("Database/Data/airport_codes.csv")
lookup_columns = list(city_lookup_df.columns)
city_lookup_df.head()

Unnamed: 0,ORIGIN,CITY,STATE
0,ABR,Aberdeen,SD
1,ABI,Abilene,TX
2,ADK,Adak Island,AK
3,CAK,Akron/Canton,OH
4,ALB,Albany,NY


In [141]:
city_lookup_df.CITY = city_lookup_df.CITY.str.replace("-", "/")

In [142]:

city_lookup_df['Primary_City'] = city_lookup_df.CITY.str.split('/', expand=True)[0].str.replace('None', "")
city_lookup_df['Secondary_City'] = city_lookup_df.CITY.str.split('/', expand=True)[1].str.replace('None', "")
city_lookup_df['Tertiary_City'] = city_lookup_df.CITY.str.split('/', expand=True)[2].str.replace('None', "")

In [143]:
city_lookup_df.head()

Unnamed: 0,ORIGIN,CITY,STATE,Primary_City,Secondary_City,Tertiary_City
0,ABR,Aberdeen,SD,Aberdeen,,
1,ABI,Abilene,TX,Abilene,,
2,ADK,Adak Island,AK,Adak Island,,
3,CAK,Akron/Canton,OH,Akron,Canton,
4,ALB,Albany,NY,Albany,,


In [156]:
city_lookup_df[city_lookup_df['Secondary_City'].notnull()]

Unnamed: 0,ORIGIN,CITY,STATE,Primary_City,Secondary_City,Tertiary_City
3,CAK,Akron/Canton,OH,Akron,Canton,
7,ABE,Allentown/Bethlehem/Easton,PA,Allentown,Bethlehem,Easton
12,ACV,Eureka/Arcata,CA,Eureka,Arcata,
13,AVL,Asheville/Hendersonville,NC,Asheville,Hendersonville,
24,MBS,Bay City/Midland/Saginaw,MI,Bay City,Midland,Saginaw
25,BPT,Beaumont/Port Arthur,TX,Beaumont,Port Arthur,
31,GPT,Biloxi/Gulfport,MS,Biloxi,Gulfport,
32,BGM,Binghamton/Endicott/Johnson City,NY,Binghamton,Endicott,Johnson City
48,MRY,Carmel/Monterey,CA,Carmel,Monterey,
52,CMI,Champaign/Urbana,IL,Champaign,Urbana,


In [149]:
cities_option = ['Primary_City', 'Secondary_City', 'Tertiary_City']
lookup_columns.append("locations")

location_lookup_df = pd.DataFrame(columns=lookup_columns) 
for city_option in cities_option:
    x = city_lookup_df[city_lookup_df[city_option].notnull()]
    
    location_lookup_df = location_lookup_df.append(
        x[[city_option, 'STATE']].apply(lambda row: ','.join(row.values.astype(str)), axis=1))



KeyError: ('Primary_City', 'STATE')

In [None]:
Additional_locations = 

In [None]:
city_lookup_df['location'] = city_lookup_df['location'].str.lower()
city_lookup_df['location'] = city_lookup_df['location'].str.replace(" ","")

In [89]:
city_lookup_df.head()

Unnamed: 0,ORIGIN,CITY,STATE,Primary_City,Secondary_City,location
0,ABR,Aberdeen,SD,Aberdeen,,"aberdeen,sd"
1,ABI,Abilene,TX,Abilene,,"abilene,tx"
2,ADK,Adak Island,AK,Adak Island,,"adakisland,ak"
3,CAK,Akron/Canton,OH,Akron,Canton,"akron,oh"
4,ALB,Albany,NY,Albany,,"albany,ny"


In [91]:
weather_merged = pd.merge(weather_df, city_lookup_df, on = "location", how = "left")
weather_merged.head()

Unnamed: 0,location,date_time,precipMM,visibility,cloudcover,windspeedKmph,humidity,ORIGIN,CITY,STATE,Primary_City,Secondary_City
0,"islip,ny",2019-01-01 00:00:00,8.5,5,100,10,91,ISP,Islip,NY,Islip,
1,"islip,ny",2019-01-01 01:00:00,4.3,5,100,12,93,ISP,Islip,NY,Islip,
2,"islip,ny",2019-01-01 02:00:00,4.3,6,100,15,94,ISP,Islip,NY,Islip,
3,"islip,ny",2019-01-01 03:00:00,5.8,6,100,18,96,ISP,Islip,NY,Islip,
4,"islip,ny",2019-01-01 04:00:00,2.9,7,98,20,96,ISP,Islip,NY,Islip,


In [93]:
weather_merged[weather_merged.ORIGIN.isnull()].location.unique()

array(['saultstemarie,mi,', 'bristol,tn', 'saginaw,mi', 'gulfport,ms',
       'deadhorse,ak', 'huntington,wv', 'rhinelander,wi,', 'monterey,ca',
       'longview,tx', 'durham,nc', 'wausau,wi', 'yellowstone,wy',
       'kauai,hi', 'fortworth,tx', 'universitypark,pa', 'easton,pa',
       'odessa,tx', 'arcata,ca', 'maui,hi'], dtype=object)

In [90]:
city_lookup_df[city_lookup_df.CITY == "San Francisco"]

Unnamed: 0,ORIGIN,CITY,STATE,Primary_City,Secondary_City,location
266,SFO,San Francisco,CA,San Francisco,,"sanfrancisco,ca"


## Preliminary Feature Engineering
Beyond a visual inspection of the head of the data the following steps were performed below:

1) Determined if there are any rows that need to be dropped because they are missing data.

2) Determine which columns of data will need to be included in the training set.

3) Perform the inital encoding of the columns that contained objects as numerical values so it would be easier for the machine learning model to process it.

In [3]:
na_count = flights_2019.DAY_OF_MONTH.count() - flights_2019.dropna(axis=0).DAY_OF_MONTH.count()
print('Total rows with missing information from 2019 data was:  ', na_count )

Total rows with missing information from 2019 data was:   0


In [4]:
na_count = flights_2020.DAY_OF_MONTH.count() - flights_2020.dropna(axis=0).DAY_OF_MONTH.count()
print('Total rows with missing information from 2019 data was:  ', na_count )

Total rows with missing information from 2019 data was:   0


### Step one complete.  The datasets are already cleaned with no missing information in any of the rows.

In [5]:
# Combine the two data sets to give more data for the model to work with
flights = pd.concat([flights_2019, flights_2020])

In [6]:
#  Look at the available columns
flights.columns

Index(['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK',
       'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

### Step 2:
#### Feature Selection:

To determine which features to include it was important to determine how we want to use the predictive model.
In this case we will be taking a set of inputs from the user and displaying which time slot is most likely to not
have a flight delay. So we will only include information the model that we can get from the user or provide look up tables for.

The filterdList below included all the data that we either we will be able to get from the user, will be a look up
or is the primary feature we want to predict. 

In this case it is whether there will be a departure delay or not the column name DEP_DEL15 will the be feature we want to predict.

In [7]:
# Create the list that we are going to keep as features
filteredList = ['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER', 'ORIGIN', 'DEST', 'DEP_DEL15', 'DISTANCE'
                ,'DEP_TIME_BLK', 'TAIL_NUM']

# Keep the list of columns that were removed...possibly for later use.
columnsRemoved = ['OP_CARRIER_AIRLINE_ID', 'DEST_AIRPORT_SEQ_ID', 'ORIGIN_AIRPORT_SEQ_ID' , 'ORIGIN_AIRPORT_ID'
                  , 'OP_CARRIER_FL_NUM', 'DEST_AIRPORT_ID', 'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DEP_TIME']

# Filter the data to only include the columns we want
machine_model_df = flights.filter(filteredList)
machine_model_df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15,DISTANCE,DEP_TIME_BLK,TAIL_NUM
0,1,2,9E,GNV,ATL,0.0,300,0600-0659,N8688C
1,1,2,9E,MSP,CVG,0.0,596,1400-1459,N348PQ
2,1,2,9E,DTW,CVG,0.0,229,1200-1259,N8896A
3,1,2,9E,TLH,ATL,0.0,223,1500-1559,N8886A
4,1,2,9E,ATL,FSM,0.0,579,1900-1959,N8974C


In [8]:
# Set the feature we want to predict as a integer since on import it was made a float
machine_model_df.DEP_DEL15 = machine_model_df.DEP_DEL15.astype('int')


In [9]:
# We are evaluating the impact of including Departure Time and a floating number in the dataset.
# if it is there then make it an integer
if "DEP_TIME" in machine_model_df.columns:
    machine_model_df.DEP_TIME = machine_model_df.DEP_TIME.astype('int')

In [10]:
# Examine features and determine how the various features will need to be encoded
machine_model_df.dtypes

DAY_OF_MONTH     int64
DAY_OF_WEEK      int64
OP_CARRIER      object
ORIGIN          object
DEST            object
DEP_DEL15        int32
DISTANCE         int64
DEP_TIME_BLK    object
TAIL_NUM        object
dtype: object

In [11]:
# Generate our categorical variable list
flights_cat = machine_model_df.dtypes[machine_model_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
machine_model_df[flights_cat].nunique()

OP_CARRIER        17
ORIGIN           353
DEST             353
DEP_TIME_BLK      19
TAIL_NUM        5854
dtype: int64

In [12]:
# Evaluate how many records are available in the dataset
print('Total number of records:  ', machine_model_df.DAY_OF_MONTH.count())
print('Total number of columns:  ', len(machine_model_df.columns))

Total number of records:   1165231
Total number of columns:   9


###  Step 2 - Encoding:
#### Encoding Method:
A couple of things to note, the complete dataset has over a million records and there are currently 9 columns.  Given the number of columns that have unique string data, to attempt to apply get_dummies would dramatically increase the datatable size.  This was attempted on the Carrier and Origin the result was a error code stating that there was inadequate resources.  

We will reserve the get_dummy encoding method for the OP_Carrier and Departure Time Blocks only and will use LabelEncoder for Origin, Destination, and Tail Number.  It is necessary to not limit ourselves to only the top ten of any of these options because these will be unique entries by the user later when the model is being implemented.

In [13]:
# Perform get_dummies method on the OP_Carrier and Departure Time Blocks columns
dummy_columns = ['OP_CARRIER', 'DEP_TIME_BLK']
prefix = ['Carrier', 'Time_Block']
dummy_df = pd.get_dummies(machine_model_df[dummy_columns], prefix)

In [14]:
# Tranform each column with LabelEncoder (https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn)
lableEncoder_columns = ['ORIGIN', 'DEST', 'TAIL_NUM']
d = defaultdict(LabelEncoder)

df = machine_model_df[lableEncoder_columns]

# Encoding the variable
labelEncoded_df = df.apply(lambda x: d[x.name].fit_transform(x))

# Retaining this code here for later reference
## Inverse the encoded
# fit.apply(lambda x: d[x.name].inverse_transform(x))

## Using the dictionary to label future data
# df.apply(lambda x: d[x.name].transform(x))

In [15]:
# Concatenate the encoded columns
machine_model_df_encoded = machine_model_df.drop(flights_cat, axis=1)
machine_model_df_encoded = pd.concat([machine_model_df_encoded, dummy_df, labelEncoded_df], axis = 1)
machine_model_df_encoded.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,Carrier_DL,Carrier_EV,...,Time_Block_1700-1759,Time_Block_1800-1859,Time_Block_1900-1959,Time_Block_2000-2059,Time_Block_2100-2159,Time_Block_2200-2259,Time_Block_2300-2359,ORIGIN,DEST,TAIL_NUM
0,1,2,0,300,1,0,0,0,0,0,...,0,0,0,0,0,0,0,130,19,4648
1,1,2,0,596,1,0,0,0,0,0,...,0,0,0,0,0,0,0,227,82,1542
2,1,2,0,229,1,0,0,0,0,0,...,0,0,0,0,0,0,0,97,82,4810
3,1,2,0,223,1,0,0,0,0,0,...,0,0,0,0,0,0,0,331,19,4806
4,1,2,0,579,1,0,0,0,0,0,...,0,0,1,0,0,0,0,19,122,4867


In [16]:
# Create Feature and Target Variables
y = machine_model_df_encoded["DEP_DEL15"]  # Target
X = machine_model_df_encoded.drop(columns="DEP_DEL15") # Features

##  Splitting Data:
Using the method imported from the package sklearn, the dataset is split into a training set and a test set.
The seperation is achived randomly.  
By adding the condition stratify = y, we are insuring that the test and training sets contain the proportion 
of values as provided in the target set.  By setting random_state to a specific value we can get repeatedly the same split of the data so it is possible to make sure that at least the testing and training sets are consistent.

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=34, 
                                                    stratify=y)
X_train.shape

(873923, 42)

In [19]:
# Preprocess numerical data

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

## Model Evaluation

Multiple models were evaluated to determine how well they are able to predict the end result.  It is important to note that there are far fewer delayed flights than flights that are on time.  Because that is true, when evaluating how accurate the model is at predicting we will need to evaluate it using precision, recall, and F1-score.

### LogisticRegression Model

In [20]:
# Create the Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=2000,
                                random_state=10)

In [25]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=2000, random_state=10)

In [26]:
# Make predictions 
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [27]:
metrics.confusion_matrix(y_test, y_pred)

array([[246370,      0],
       [ 44938,      0]], dtype=int64)

In [28]:
# Because dataset is imbalanced we cant use accuracy metric
print(metrics.classification_report(y_test, y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92    246370
           1       0.00      0.00      0.00     44938

    accuracy                           0.85    291308
   macro avg       0.42      0.50      0.46    291308
weighted avg       0.72      0.85      0.78    291308



# Decision Tree Analysis

In [39]:
clf_dt = DecisionTreeClassifier(random_state = 0)
model_dt = clf_dt.fit(X_train_scaled, y_train)

In [40]:
y_pred = model_dt.predict(X_test_scaled)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86    246370
           1       0.27      0.29      0.28     44938

    accuracy                           0.77    291308
   macro avg       0.57      0.57      0.57    291308
weighted avg       0.78      0.77      0.77    291308



In [41]:
metrics.confusion_matrix(y_test, y_pred)

array([[210549,  35821],
       [ 31706,  13232]], dtype=int64)

In [42]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,0,0
2,0,0
3,1,0
4,1,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


# Random Forest Classifier
This one is too slow and is demonstrated to be a poor predictor

In [33]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(max_depth=10)
model_rf = clf_rf.fit(X_train_scaled, y_train)

In [34]:
y_pred = model_rf.predict(X_test_scaled)
print(metrics.classification_report(y_test,y_pred))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92    246370
           1       0.00      0.00      0.00     44938

    accuracy                           0.85    291308
   macro avg       0.42      0.50      0.46    291308
weighted avg       0.72      0.85      0.78    291308



In [35]:
metrics.confusion_matrix(y_test, y_pred)

array([[246370,      0],
       [ 44938,      0]], dtype=int64)

### NeuralNetwork

In [50]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  1.5 * len(X_train_scaled[0])
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=100)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [67]:
# Get Predictions
y_pred = tf.round(nn.predict(X_test_scaled))


In [69]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92    246370
           1       0.52      0.01      0.01     44938

    accuracy                           0.85    291308
   macro avg       0.68      0.50      0.47    291308
weighted avg       0.80      0.85      0.78    291308



In [70]:
metrics.confusion_matrix(y_test, y_pred)

array([[246083,    287],
       [ 44625,    313]], dtype=int64)

### Model Final Selection
After evaluating each model using the entire combined dataset, it was found that Decision Tree was the best at predicting delays.  That will be the model we will use going forward.