In [92]:
# Import Dependencies
from path import Path
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle, os
import tensorflow as tf
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func

import sys
sys.path.insert(1, '../../Programming Resources/')
from config import flightDataPostgres_password

# Preliminary data preprocessing
Original Dataset was provided by a Kaggle Project.  For the intial analysis of the dataset the information was imported to a local directory and then imported read into this jupyter notebook.  All preprocessign steps were executed in the jupyter notebook named -----.  

It was determined during initial evaluation that we also needed weather data, since the original dataset did not include weather information we needed to get historical weather data.  This was achieved by usng the World Weather Online API.

Merging of the weather data was accomplished by using the World Weather Online API.  The download of the required information was achieved in a seperate jupyter notebook named ---.  Data was formated as a simple csv file and was imported below.

## Import Flight Data for the training of the Machine Model

In [94]:
# Create engine using connection string to postgres database
flightDataPostgres_path = "@flightsdata.cxtoxxxge4vx.us-east-2.rds.amazonaws.com/flightsdata"
engine = create_engine("postgres://postgres:" 
                       + flightDataPostgres_password
                       + flightDataPostgres_path)

In [114]:
# Import flight data into the databases
flights_2019 = pd.read_sql_query("SELECT * FROM flight_data_2019", engine)
flights_2020 = pd.read_sql_query("SELECT * FROM flight_data_2020", engine)

In [115]:
print('Total Flights for 2019: ', len(flights_2019)
      , '  Total Flights for 2020: ', len(flights_2020))

Total Flights for 2019:  563126   Total Flights for 2020:  596157


In [116]:
flights_2019.head()

Unnamed: 0,origin,origin_city,day_of_month,day_of_week,op_carrier_airline_id,op_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,...,dest,dep_time,dep_del15,dep_time_blk,arr_time,arr_del15,cancelled,diverted,distance,dest_city
0,ABR,"aberdeen,sd",2019-01-01,2,20304,OO,N910EV,7363,10141,1014106,...,MSP,1411.0,1.0,1300-1359,1521.0,1.0,0.0,0.0,257,"minneapolis,mn"
1,ABR,"aberdeen,sd",2019-01-01,2,20304,OO,N432SW,7365,10141,1014106,...,MSP,642.0,0.0,0600-0659,800.0,0.0,0.0,0.0,257,"minneapolis,mn"
2,ABR,"aberdeen,sd",2019-01-02,3,20304,OO,N498CA,7363,10141,1014106,...,MSP,1310.0,0.0,1300-1359,1424.0,0.0,0.0,0.0,257,"minneapolis,mn"
3,ABR,"aberdeen,sd",2019-01-02,3,20304,OO,N439SW,7365,10141,1014106,...,MSP,502.0,0.0,0001-0559,622.0,0.0,0.0,0.0,257,"minneapolis,mn"
4,ABR,"aberdeen,sd",2019-01-03,4,20304,OO,N8965E,7363,10141,1014106,...,MSP,1307.0,0.0,1300-1359,1411.0,0.0,0.0,0.0,257,"minneapolis,mn"


## Import Weather Data

In [119]:
# Import Weather Data
weather_2018 = pd.read_csv('Database/weather/weather_data/weather_2019_df.csv')
weather_2019 = pd.read_csv('Database/weather/weather_data/weather_2019_df.csv')
weather_2020 = pd.read_csv('Database/weather/weather_data/weather_2020_df.csv')

# Combine the weather dataframes
weather_df = pd.concat([weather_2018, weather_2019, weather_2020])

weather_df.head()

Unnamed: 0,location,date_time,precipMM,visibility,cloudcover,windspeedKmph,humidity
0,"islip,ny",2019-01-01 00:00:00,8.5,5,100,10,91
1,"islip,ny",2019-01-01 01:00:00,4.3,5,100,12,93
2,"islip,ny",2019-01-01 02:00:00,4.3,6,100,15,94
3,"islip,ny",2019-01-01 03:00:00,5.8,6,100,18,96
4,"islip,ny",2019-01-01 04:00:00,2.9,7,98,20,96


In [120]:
# Import airport city information that was used in creating the weather data
locations_lookup_df = pd.read_csv("Flight_data_files/post_hardcode_cities.csv")
locations_lookup_df.drop(columns = "Unnamed: 0", axis = 1, inplace = True)
locations_lookup_df.rename(columns = {"City": "location"}, inplace = True )
locations_lookup_df.head()

Unnamed: 0,Airport Code,location
0,ABR,"aberdeen,sd"
1,ABI,"abilene,tx"
2,ADK,"adakisland,ak"
3,CAK,"akron,oh"
4,ALB,"albany,ny"


In [121]:
# Merge to get Airport Codes into the dataframe so it can be merged with the flight data
weather_merged = pd.merge(weather_df, locations_lookup_df, on = "location", how = "left")


## Merge the Weather Data and Flight Data

In [134]:
# Add columns to prepare weather data to be merged with the flight data.
weather_merged.date_time = weather_merged.date_time.astype('datetime64[ns]')
weather_merged['Year'] = weather_merged.date_time.dt.year
weather_merged['Month'] = weather_merged.date_time.dt.month
weather_merged['Day'] = weather_merged.date_time.dt.day
weather_merged['Hour'] = weather_merged.date_time.dt.hour

weather_merged.head()

Unnamed: 0,location,date_time,precipMM,visibility,cloudcover,windspeedKmph,humidity,Airport Code,Year,Month,Day,Hour
0,"islip,ny",2019-01-01 00:00:00,8.5,5,100,10,91,ISP,2019,1,1,0
1,"islip,ny",2019-01-01 01:00:00,4.3,5,100,12,93,ISP,2019,1,1,1
2,"islip,ny",2019-01-01 02:00:00,4.3,6,100,15,94,ISP,2019,1,1,2
3,"islip,ny",2019-01-01 03:00:00,5.8,6,100,18,96,ISP,2019,1,1,3
4,"islip,ny",2019-01-01 04:00:00,2.9,7,98,20,96,ISP,2019,1,1,4


In [142]:
# Create year column before concatenating flight data 
flights_2019['Year'] = 2019
flights_2020['Year'] = 2020

# Combine the two data sets to give more data for the model to work with
flights = pd.concat([flights_2019, flights_2020])

# Create Columns to prepare for weather data merge
flights['Month'] = 1
flights['DEP_Hour'] = (flights['dep_time']/100).astype("int")
flights['DEP_Hour'] = flights['DEP_Hour'].replace(24, 0) # replace 24 hours with 0 to match weather data
flights['day_of_month'] = flights['day_of_month'].astype('datetime64[ns]').dt.day.astype("int")

In [144]:
# Merge flight and weather data
flights_merged = pd.merge(flights, weather_merged
                          ,  how='left'
                          , left_on=['origin', 'Year', 'Month', 'day_of_month' , 'DEP_Hour']
                          , right_on = ['Airport Code', 'Year', 'Month', 'Day', 'Hour'])

In [145]:
# Remove any duplicate data that was created during merge with flight data
filteredList = ['day_of_month', 'day_of_week', 'op_carrier', 'origin', 'dest', 'dep_del15', 'distance'
                ,'dep_time_blk', 'tail_num']

flights_merged = flights_merged.drop_duplicates(subset = filteredList)

# Preliminary Feature Engineering
Beyond a visual inspection of the head of the data the following steps were performed below:

1) Determine which columns of data will need to be included in the training set.  This was achieved earlier by iteratively evaluating which features improve the model results and which do not.

2) Perform the inital encoding of the columns that contained objects as numerical values so it would be easier for the machine learning model to process it.

In [148]:
#  Look at the available columns
flights_merged.columns

Index(['origin', 'origin_city', 'day_of_month', 'day_of_week',
       'op_carrier_airline_id', 'op_carrier', 'tail_num', 'op_carrier_fl_num',
       'origin_airport_id', 'origin_airport_seq_id', 'dest_airport_id',
       'dest_airport_seq_id', 'dest', 'dep_time', 'dep_del15', 'dep_time_blk',
       'arr_time', 'arr_del15', 'cancelled', 'diverted', 'distance',
       'dest_city', 'Year', 'Month', 'DEP_Hour', 'location', 'date_time',
       'precipMM', 'visibility', 'cloudcover', 'windspeedKmph', 'humidity',
       'Airport Code', 'Day', 'Hour'],
      dtype='object')

In [149]:
len(flights_merged)

1159283

In [150]:
flights_merged.head()

Unnamed: 0,origin,origin_city,day_of_month,day_of_week,op_carrier_airline_id,op_carrier,tail_num,op_carrier_fl_num,origin_airport_id,origin_airport_seq_id,...,location,date_time,precipMM,visibility,cloudcover,windspeedKmph,humidity,Airport Code,Day,Hour
0,ABR,"aberdeen,sd",1,2,20304,OO,N910EV,7363,10141,1014106,...,"aberdeen,sd",2019-01-01 14:00:00,0.0,10.0,13.0,10.0,81.0,ABR,1.0,14.0
2,ABR,"aberdeen,sd",1,2,20304,OO,N432SW,7365,10141,1014106,...,"aberdeen,sd",2019-01-01 06:00:00,0.0,10.0,42.0,15.0,82.0,ABR,1.0,6.0
4,ABR,"aberdeen,sd",2,3,20304,OO,N498CA,7363,10141,1014106,...,"aberdeen,sd",2019-01-02 13:00:00,0.0,10.0,57.0,15.0,84.0,ABR,2.0,13.0
6,ABR,"aberdeen,sd",2,3,20304,OO,N439SW,7365,10141,1014106,...,"aberdeen,sd",2019-01-02 05:00:00,0.0,10.0,68.0,20.0,82.0,ABR,2.0,5.0
8,ABR,"aberdeen,sd",3,4,20304,OO,N8965E,7363,10141,1014106,...,"aberdeen,sd",2019-01-03 13:00:00,0.0,10.0,5.0,18.0,88.0,ABR,3.0,13.0


In [154]:
#  Because of complications with the weather API not all cities were retrieved
#  This was only an issue with a few small cities.
missing_Airports = flights_merged[flights_merged['Airport Code'].isnull()].origin.unique()
missing_Airports

array(['SFB', 'PBG', 'USA', 'IAG', 'STX', 'BQN', 'STT', 'HOB', 'ECP',
       'ACY', 'DAL', 'FCA', 'ABY', 'MMH', 'LCK', 'GUM', 'SPN', 'PVU',
       'RFD', 'PPG', 'LYH', 'XWA', 'PAE', 'BFM'], dtype=object)

In [155]:
# Removing airports that do not have weather data
flights_cleaned_merged = flights_merged.dropna()
len(flights_merged) - len(flights_cleaned_merged)

18457

In [157]:
# get top 50 most frequent ORIGINS
n = 50
top_50_ORIGIN = flights_cleaned_merged.origin.value_counts()[:n].index.tolist()

In [160]:
# Output the top 50 for use for the dashboard
top_50 = pd.DataFrame((flights_cleaned_merged.groupby('origin').origin.count().sort_values(ascending=False)[:50]))
top_50.index.name = "Airport"
top_50.columns = ['flight_count']
top_50.to_csv("top_50_ORIGIN_Airports.csv")


In [162]:
top_50.head()

Unnamed: 0_level_0,flight_count
Airport,Unnamed: 1_level_1
ATL,62696
ORD,48102
DFW,45934
CLT,38377
DEN,38373


In [165]:
# To concentrate on only the most frequently used airport, we filtere for the top 50 
flights_cleaned_filtered = flights_cleaned_merged[flights_cleaned_merged.origin.isin(top_50_ORIGIN)]
len(flights_cleaned_filtered)

909510

### Step 2:
#### Feature Selection:

To determine which features to include it was important to determine how we want to use the predictive model.
In this case we will be taking a set of inputs from the user and displaying which time slot is most likely to not
have a flight delay. So we will only include information the model that we can get from the user or provide look up tables for.

The filterdList below included all the data that we either we will be able to get from the user, will be a look up
or is the primary feature we want to predict. 

In this case it is whether there will be a departure delay or not the column name DEP_DEL15 will the be feature we want to predict.

In [235]:
# Create the list that we are going to keep as features
filteredList = ['op_carrier', 'origin', 'dest', 'dep_del15'
                ,'DEP_Hour', 'windspeedKmph', 'precipMM']

# Filter the data to only include the columns we want
machine_model_df = flights_cleaned_filtered.filter(filteredList)


In [236]:
# Set the feature we want to predict as a integer since on import it was made a float
machine_model_df.dep_del15 = machine_model_df.dep_del15.astype('int')

In [237]:
machine_model_df.head()

Unnamed: 0,op_carrier,origin,dest,dep_del15,DEP_Hour,windspeedKmph,precipMM
15084,9E,ATL,FSM,0,18,10.0,2.4
15086,9E,ATL,BMI,0,12,12.0,0.1
15088,9E,ATL,AGS,0,10,10.0,0.1
15090,9E,ATL,OAJ,0,22,6.0,0.7
15092,9E,ATL,GNV,0,15,11.0,0.8


In [238]:
machine_model_df.dtypes

op_carrier        object
origin            object
dest              object
dep_del15          int32
DEP_Hour           int32
windspeedKmph    float64
precipMM         float64
dtype: object

In [239]:
# Generate our categorical variable list
flights_cat = machine_model_df.dtypes[machine_model_df.dtypes == "object"].index.tolist()

# Check the number of unique values in each column
machine_model_df[flights_cat].nunique()

op_carrier     17
origin         50
dest          328
dtype: int64

In [240]:
# Evaluate how many records are available in the dataset
print('Total number of records:  ', machine_model_df.origin.count())
print('Total number of columns:  ', len(machine_model_df.columns))

Total number of records:   909510
Total number of columns:   7


###  Step 2 - Encoding:
#### Encoding Method:
A couple of things to note, the complete dataset has over a million records and there are currently 9 columns.  Given the number of columns that have unique string data, to attempt to apply get_dummies would dramatically increase the datatable size.  This was attempted on the Carrier and Origin the result was a error code stating that there was inadequate resources.  

We will reserve the get_dummy encoding method for the OP_Carrier and Departure Time Blocks only and will use LabelEncoder for Origin, Destination, and Tail Number.  It is necessary to not limit ourselves to only the top ten of any of these options because these will be unique entries by the user later when the model is being implemented.

In [315]:
# Perform get_dummies method on the OP_Carrier and Departure Time Blocks columns
dummy_columns = ['op_carrier']
prefix = ['Carrier']
dummy_df = pd.get_dummies(machine_model_df[dummy_columns], prefix)

machine_model_df_encoded = machine_model_df.drop(['op_carrier'], axis = 1)


In [317]:
# Tranform each column with LabelEncoder
le = LabelEncoder()

# Fit and Transform Origin and Destination
le_origin = le.fit(machine_model_df['origin'])
le_dest = le.fit(machine_model_df['dest'])
machine_model_df_encoded['origin_enc'] = le_origin.transform(machine_model_df['origin'])
machine_model_df_encoded['dest_enc'] = le_dest.transform(machine_model_df['dest'])

# Export Encoder Information
pickle.dump(le_origin, open('pickle_files/Origin_encoder.pkl', 'wb'))
pickle.dump(le_dest, open('pickle_files/Dest_encoder.pkl', 'wb'))


machine_model_df_encoded.head()

Unnamed: 0,origin,dest,dep_del15,DEP_Hour,windspeedKmph,precipMM,origin_enc,dest_enc
15084,ATL,FSM,0,18,10.0,2.4,17,113
15086,ATL,BMI,0,12,12.0,0.1,17,36
15088,ATL,AGS,0,10,10.0,0.1,17,9
15090,ATL,OAJ,0,22,6.0,0.7,17,219
15092,ATL,GNV,0,15,11.0,0.8,17,121


In [318]:
# Import Encoder Information
le_origin_1 = pickle.load(open('pickle_files/Origin_encoder.pkl', 'rb')) 
le_dest_1 = pickle.load(open('pickle_files/Dest_encoder.pkl', 'rb'))


# Test Transform if it matches earlier output
machine_model_df_encoded['origin_enc_1'] = le_origin_1.transform(machine_model_df['origin'])
machine_model_df_encoded['dest_enc_1'] = le_dest_1.transform(machine_model_df['dest'])

machine_model_df_encoded.head()

Unnamed: 0,origin,dest,dep_del15,DEP_Hour,windspeedKmph,precipMM,origin_enc,dest_enc,origin_enc_1,dest_enc_1
15084,ATL,FSM,0,18,10.0,2.4,17,113,17,113
15086,ATL,BMI,0,12,12.0,0.1,17,36,17,36
15088,ATL,AGS,0,10,10.0,0.1,17,9,17,9
15090,ATL,OAJ,0,22,6.0,0.7,17,219,17,219
15092,ATL,GNV,0,15,11.0,0.8,17,121,17,121


In [319]:
# Clean up the dataframe and remove duplicate columns
machine_model_df_encoded['origin'] = machine_model_df_encoded['origin_enc']
machine_model_df_encoded['dest'] = machine_model_df_encoded['dest_enc']
machine_model_df_encoded.drop(['origin_enc','origin_enc_1','dest_enc','dest_enc_1'], axis=1, inplace = True)

machine_model_df.head()

Unnamed: 0,op_carrier,origin,dest,dep_del15,DEP_Hour,windspeedKmph,precipMM
15084,9E,ATL,FSM,0,18,10.0,2.4
15086,9E,ATL,BMI,0,12,12.0,0.1
15088,9E,ATL,AGS,0,10,10.0,0.1
15090,9E,ATL,OAJ,0,22,6.0,0.7
15092,9E,ATL,GNV,0,15,11.0,0.8


In [320]:
# Concatenate the encoded columns
machine_model_df_encoded = pd.concat([machine_model_df_encoded, dummy_df], axis = 1)
machine_model_df_encoded.head()

Unnamed: 0,origin,dest,dep_del15,DEP_Hour,windspeedKmph,precipMM,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,...,Carrier_G4,Carrier_HA,Carrier_MQ,Carrier_NK,Carrier_OH,Carrier_OO,Carrier_UA,Carrier_WN,Carrier_YV,Carrier_YX
15084,17,113,0,18,10.0,2.4,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15086,17,36,0,12,12.0,0.1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15088,17,9,0,10,10.0,0.1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15090,17,219,0,22,6.0,0.7,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
15092,17,121,0,15,11.0,0.8,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [321]:
# To evaluate how ballanced the data set is we look at how often flights are indicated as being delayed
# as indicated by a 1
machine_model_df_encoded.groupby(['dep_del15']).size()

dep_del15
0    767333
1    142177
dtype: int64

In [284]:
# Create Feature and Target Variables
y = machine_model_df_encoded["dep_del15"]  # Target
X = machine_model_df_encoded.drop(columns="dep_del15") # Features

In [323]:
# Get Column Headings for setup of production implementation of model
X.columns


Index(['origin', 'dest', 'DEP_Hour', 'windspeedKmph', 'precipMM', 'Carrier_9E',
       'Carrier_AA', 'Carrier_AS', 'Carrier_B6', 'Carrier_DL', 'Carrier_EV',
       'Carrier_F9', 'Carrier_G4', 'Carrier_HA', 'Carrier_MQ', 'Carrier_NK',
       'Carrier_OH', 'Carrier_OO', 'Carrier_UA', 'Carrier_WN', 'Carrier_YV',
       'Carrier_YX'],
      dtype='object')

##  Splitting Data:
Using the method imported from the package sklearn, the dataset is split into a training set and a test set.
The seperation is achived randomly.  
By adding the condition stratify = y, we are insuring that the test and training sets contain the proportion 
of values as provided in the target set.  By setting random_state to a specific value we can get repeatedly the same split of the data so it is possible to make sure that at least the testing and training sets are consistent.

In [285]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=34, 
                                                    stratify=y)
X_train.shape

(682132, 22)

In [286]:
# Preprocess numerical data

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [287]:
# summarize the scale of each input variable
for i in range(X_test.shape[1]):
    print('>%d, train: min=%.3f, max=%.3f, test: min=%.3f, max=%.3f' %
        (i, X_train_scaled[:, i].min(), X_train_scaled[:, i].max(),
        X_test_scaled[:, i].min(), X_test_scaled[:, i].max()))

>0, train: min=-1.607, max=1.745, test: min=-1.607, max=1.745
>1, train: min=-1.827, max=1.831, test: min=-1.827, max=1.831
>2, train: min=-2.679, max=1.946, test: min=-2.679, max=1.946
>3, train: min=-1.714, max=5.267, test: min=-1.714, max=5.670
>4, train: min=-0.242, max=35.176, test: min=-0.242, max=35.176
>5, train: min=-0.181, max=5.526, test: min=-0.181, max=5.526
>6, train: min=-0.422, max=2.371, test: min=-0.422, max=2.371
>7, train: min=-0.194, max=5.153, test: min=-0.194, max=5.153
>8, train: min=-0.212, max=4.719, test: min=-0.212, max=4.719
>9, train: min=-0.414, max=2.413, test: min=-0.414, max=2.413
>10, train: min=-0.124, max=8.039, test: min=-0.124, max=8.039
>11, train: min=-0.143, max=6.991, test: min=-0.143, max=6.991
>12, train: min=-0.059, max=17.059, test: min=-0.059, max=17.059
>13, train: min=-0.090, max=11.141, test: min=-0.090, max=11.141
>14, train: min=-0.181, max=5.511, test: min=-0.181, max=5.511
>15, train: min=-0.184, max=5.447, test: min=-0.184, max=5.

In [310]:
X_test_scaled[:5]

array([[-0.82944738,  0.57830369,  0.13634161,  0.43437741, -0.24186667,
        -0.18095926,  2.37051961, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [ 0.65793671,  1.64100602, -0.06475808,  0.83711877, -0.24186667,
        -0.18095926, -0.42184844, -0.19405879, -0.21189003,  2.41288255,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [ 0.14307298, -1.60303266, -1.27135625, -0.90809381, -0.24186667,
        -0.18095926, -0.42184844, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336,  2.03765904,
        -0.16038655, -0.22515479],
       [ 0.90964786

In [309]:
# Save the Scaler
pickle.dump(scaler, open('pickle_files/scaler.pkl', 'wb'))

In [311]:
# load the scaler
scaler_1 = pickle.load(open('scaler.pkl', 'rb'))
# transform the test dataset
X_test_scaled_1 = scaler_1.transform(X_test)

X_test_scaled_1[:5]

array([[-0.82944738,  0.57830369,  0.13634161,  0.43437741, -0.24186667,
        -0.18095926,  2.37051961, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [ 0.65793671,  1.64100602, -0.06475808,  0.83711877, -0.24186667,
        -0.18095926, -0.42184844, -0.19405879, -0.21189003,  2.41288255,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [ 0.14307298, -1.60303266, -1.27135625, -0.90809381, -0.24186667,
        -0.18095926, -0.42184844, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336,  2.03765904,
        -0.16038655, -0.22515479],
       [ 0.90964786

## Model Evaluation

Multiple models were evaluated to determine how well they are able to predict the end result.  It is important to note that there are far fewer delayed flights than flights that are on time.  Because that is true, when evaluating how accurate the model is at predicting we will need to evaluate it using precision, recall, and F1-score.

### LogisticRegression Model

In [291]:
# Create the Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=2000,
                                random_state=10)

In [292]:
# Fit the model
classifier.fit(X_train_scaled, y_train)

LogisticRegression(max_iter=2000, random_state=10)

In [293]:
# Make predictions 
y_pred = classifier.predict(X_test_scaled)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
5,0,1
6,0,0
7,0,0
8,0,0
9,0,0


In [294]:
metrics.confusion_matrix(y_test, y_pred)

array([[191776,     58],
       [ 35509,     35]], dtype=int64)

In [295]:
# Because dataset is imbalanced we cant use accuracy metric
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      1.00      0.92    191834
           1       0.38      0.00      0.00     35544

    accuracy                           0.84    227378
   macro avg       0.61      0.50      0.46    227378
weighted avg       0.77      0.84      0.77    227378



# Decision Tree Analysis

In [296]:
clf_dt = DecisionTreeClassifier(random_state = 0)
model_dt = clf_dt.fit(X_train_scaled, y_train)

In [297]:
y_pred = model_dt.predict(X_test_scaled)
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90    191834
           1       0.42      0.35      0.38     35544

    accuracy                           0.82    227378
   macro avg       0.65      0.63      0.64    227378
weighted avg       0.81      0.82      0.82    227378



In [298]:
metrics.confusion_matrix(y_test, y_pred)

array([[174954,  16880],
       [ 23154,  12390]], dtype=int64)

In [299]:
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,1
3,0,0
4,1,0
5,0,1
6,1,0
7,0,0
8,0,0
9,0,0


In [312]:
# save the model
pickle.dump(model_dt, open('pickle_files/model_dt.pkl', 'wb'))


In [313]:
# load the model
model_dt_1 = pickle.load(open('pickle_files/model_dt.pkl', 'rb'))

# Test loaded model
y_pred_1 = model_dt_1.predict(X_test_scaled)
print(metrics.classification_report(y_test, y_pred_1))

              precision    recall  f1-score   support

           0       0.88      0.91      0.90    191834
           1       0.42      0.35      0.38     35544

    accuracy                           0.82    227378
   macro avg       0.65      0.63      0.64    227378
weighted avg       0.81      0.82      0.82    227378



# Random Forest Classifier
This one is too slow and is demonstrated to be a poor predictor

In [89]:
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier(n_estimators=10, class_weight = 'balanced_subsample')
model_rf = clf_rf.fit(X_train_scaled, y_train)

In [90]:
y_pred = model_rf.predict(X_test_scaled)
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.87      0.92      0.90    192122
           1       0.38      0.26      0.31     35598

    accuracy                           0.82    227720
   macro avg       0.62      0.59      0.60    227720
weighted avg       0.79      0.82      0.80    227720



In [91]:
metrics.confusion_matrix(y_test, y_pred)

array([[177100,  15022],
       [ 26415,   9183]], dtype=int64)

### NeuralNetwork

In [85]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 =  1.5 * len(X_train_scaled[0])
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
7117/7117 - 3s - loss: 0.4009 - accuracy: 0.8476
Loss: 0.40092071890830994, Accuracy: 0.8475540280342102


In [86]:
# Get Predictions
y_pred = tf.round(nn.predict(X_test_scaled))


In [87]:
print(metrics.classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      1.00      0.92    192122
           1       0.72      0.04      0.08     35598

    accuracy                           0.85    227720
   macro avg       0.79      0.52      0.50    227720
weighted avg       0.83      0.85      0.79    227720



In [88]:
metrics.confusion_matrix(y_test, y_pred)

array([[191569,    553],
       [ 34162,   1436]], dtype=int64)

### Model Final Selection
After evaluating each model using the entire combined dataset, it was found that Decision Tree was the best at predicting delays.  That will be the model we will use going forward.

## Import Encoding and Machine Model for use in Production Test

In [None]:
# Import Scalers

# Import Encoder Information for origin and destination
le_origin = pickle.load(open('pickle_files/Origin_encoder.pkl', 'rb')) 
le_dest = pickle.load(open('pickle_files/Dest_encoder.pkl', 'rb'))

# load the scaler for all input columns
scaler = pickle.load(open('scaler.pkl', 'rb'))

# Load the model
# load the model
model_dt = pickle.load(open('pickle_files/model_dt.pkl', 'rb'))

In [335]:
input_columns = ['origin', 'dest', 'DEP_Hour', 'windspeedKmph', 'precipMM', 'Carrier_9E'
                           , 'Carrier_AA', 'Carrier_AS', 'Carrier_B6', 'Carrier_DL', 'Carrier_EV'
                           , 'Carrier_F9', 'Carrier_G4', 'Carrier_HA', 'Carrier_MQ', 'Carrier_NK'
                           , 'Carrier_OH', 'Carrier_OO', 'Carrier_UA', 'Carrier_WN', 'Carrier_YV'
                           , 'Carrier_YX']
op_carrier = '9E'
origin = 'ATL'
dest = 'FSM'
DEP_Hour = range(0,24)
windspeedKmph = 10.0
precipMM = 2.4

inputs_df = pd.DataFrame(columns = input_columns)

inputs_df['DEP_Hour'] = range(0, 24, 1)
inputs_df['origin'] = origin
inputs_df['dest'] = dest
inputs_df['windspeedKmph'] = windspeedKmph
inputs_df['precipMM'] = precipMM

carrier_column = "Carrier_" + op_carrier
inputs_df[carrier_column] = 1


inputs_df = inputs_df.fillna(0)


In [336]:
inputs_df

Unnamed: 0,origin,dest,DEP_Hour,windspeedKmph,precipMM,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,Carrier_DL,...,Carrier_G4,Carrier_HA,Carrier_MQ,Carrier_NK,Carrier_OH,Carrier_OO,Carrier_UA,Carrier_WN,Carrier_YV,Carrier_YX
0,ATL,FSM,0,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ATL,FSM,1,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,ATL,FSM,2,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ATL,FSM,3,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ATL,FSM,4,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,ATL,FSM,5,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,ATL,FSM,6,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,ATL,FSM,7,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,ATL,FSM,8,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,ATL,FSM,9,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [337]:
inputs_df_encoded = inputs_df

# Test Transform if it matches earlier output
inputs_df_encoded['origin'] = le_origin.transform(inputs_df['origin'])
inputs_df_encoded['dest'] = le_dest.transform(inputs_df['dest'])

inputs_df_encoded.head()

Unnamed: 0,origin,dest,DEP_Hour,windspeedKmph,precipMM,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,Carrier_DL,...,Carrier_G4,Carrier_HA,Carrier_MQ,Carrier_NK,Carrier_OH,Carrier_OO,Carrier_UA,Carrier_WN,Carrier_YV,Carrier_YX
0,17,113,0,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,17,113,1,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,17,113,2,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,17,113,3,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,17,113,4,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [339]:
# transform the test dataset
inputs_scaled = scaler.transform(inputs_df_encoded)

inputs_scaled[:5]

array([[-1.60746367, -0.56270302, -2.67905412, -0.37110532,  5.74424583,
         5.52610556, -0.42184844, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [-1.60746367, -0.56270302, -2.47795442, -0.37110532,  5.74424583,
         5.52610556, -0.42184844, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [-1.60746367, -0.56270302, -2.27685473, -0.37110532,  5.74424583,
         5.52610556, -0.42184844, -0.19405879, -0.21189003, -0.41444205,
        -0.1243917 , -0.1430406 , -0.05862018, -0.0897607 , -0.18146412,
        -0.18358663, -0.17489525, -0.30943841, -0.32530336, -0.49075924,
        -0.16038655, -0.22515479],
       [-1.60746367

In [340]:

# Test loaded model
y_pred = model_dt.predict(inputs_scaled)

y_pred

array([1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1])

In [341]:
inputs_df['delayed'] = y_pred

In [343]:
inputs_df

Unnamed: 0,origin,dest,DEP_Hour,windspeedKmph,precipMM,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,Carrier_DL,...,Carrier_HA,Carrier_MQ,Carrier_NK,Carrier_OH,Carrier_OO,Carrier_UA,Carrier_WN,Carrier_YV,Carrier_YX,delayed
0,17,113,0,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,17,113,1,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,17,113,2,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,17,113,3,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,17,113,4,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5,17,113,5,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,17,113,6,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,17,113,7,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,17,113,8,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,17,113,9,10.0,2.4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
