In [1]:
# Import Dependencies
from path import Path
import pandas as pd
from sklearn.preprocessing import StandardScaler,OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from collections import defaultdict
import pickle, os


# Preliminary data preprocessing
The original dataset was provided by a Kaggle project.  For the initial analysis, the data was downloaded to a local directory and then read into this Jupyter notebook.  Later, the connection to the database will be added here. The dataset already came as a CSV file, so no further processing was necessary.

Merging of the weather data was accomplished by using the World Weather Online API.  The download of the required information was done in a separate Jupyter notebook.  The data was formatted as a simple CSV file and is imported below.

In [3]:
# Read the cleaned January 2019 and January 2020 flight records from the local CSV exports
flights_2019, flights_2020 = (
    pd.read_csv(csv_file)
    for csv_file in (
        'Database/Data/jan_19_clean_data.csv',
        'Database/Data/jan_20_clean_data.csv',
    )
)

# Preview the first rows of the 2019 data
flights_2019.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER_AIRLINE_ID,OP_CARRIER,TAIL_NUM,OP_CARRIER_FL_NUM,ORIGIN_AIRPORT_ID,ORIGIN_AIRPORT_SEQ_ID,ORIGIN,DEST_AIRPORT_ID,DEST_AIRPORT_SEQ_ID,DEST,DEP_TIME,DEP_DEL15,DEP_TIME_BLK,ARR_TIME,ARR_DEL15,CANCELLED,DIVERTED,DISTANCE
0,1,2,20363,9E,N8688C,3280,11953,1195302,GNV,10397,1039707,ATL,601.0,0.0,0600-0659,722.0,0.0,0,0,300
1,1,2,20363,9E,N348PQ,3281,13487,1348702,MSP,11193,1119302,CVG,1359.0,0.0,1400-1459,1633.0,0.0,0,0,596
2,1,2,20363,9E,N8896A,3282,11433,1143302,DTW,11193,1119302,CVG,1215.0,0.0,1200-1259,1329.0,0.0,0,0,229
3,1,2,20363,9E,N8886A,3283,15249,1524906,TLH,10397,1039707,ATL,1521.0,0.0,1500-1559,1625.0,0.0,0,0,223
4,1,2,20363,9E,N8974C,3284,10397,1039707,ATL,11778,1177801,FSM,1847.0,0.0,1900-1959,1940.0,0.0,0,0,579


## Preliminary Feature Engineering
Beyond a visual inspection of the head of the data the following steps were performed below:

1) Determine if there are any rows that need to be dropped because they are missing data.

2) Determine which columns of data will need to be included in the training set.

3) Perform the initial encoding of the columns that contain objects into numerical values so that it is easier for the machine learning model to process them.

In [5]:
# Count the 2019 rows that contain at least one missing value by comparing the
# frame's length before and after dropping incomplete rows.  len() is used instead
# of DAY_OF_MONTH.count() because Series.count() skips NaN: a row whose
# DAY_OF_MONTH is itself missing would be left out of both sides of the
# subtraction and never be counted.
na_count = len(flights_2019) - len(flights_2019.dropna(axis=0))
print('Total rows with missing information from 2019 data was:  ', na_count )

Total rows with missing information from 2019 data was:   0


In [6]:
# Count the 2020 rows that contain at least one missing value by comparing the
# frame's length before and after dropping incomplete rows.  len() is used instead
# of DAY_OF_MONTH.count() because Series.count() skips NaN: a row whose
# DAY_OF_MONTH is itself missing would be left out of both sides of the
# subtraction and never be counted.
na_count = len(flights_2020) - len(flights_2020.dropna(axis=0))
# The message names 2020 because this cell inspects flights_2020
print('Total rows with missing information from 2020 data was:  ', na_count )

Total rows with missing information from 2019 data was:   0


### Step one complete.  The datasets are already cleaned with no missing information in any of the rows.

In [9]:
# Combine the two data sets to give more data for the model to work with.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it each
# year keeps its own 0-based labels, so the same label would refer to two
# different flights and label-based lookups or alignment become ambiguous.
flights = pd.concat([flights_2019, flights_2020], ignore_index=True)

In [10]:
# List the columns of the combined flights frame so the features to keep can be chosen
flights.columns

Index(['DAY_OF_MONTH', 'DAY_OF_WEEK', 'OP_CARRIER_AIRLINE_ID', 'OP_CARRIER',
       'TAIL_NUM', 'OP_CARRIER_FL_NUM', 'ORIGIN_AIRPORT_ID',
       'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN', 'DEST_AIRPORT_ID',
       'DEST_AIRPORT_SEQ_ID', 'DEST', 'DEP_TIME', 'DEP_DEL15', 'DEP_TIME_BLK',
       'ARR_TIME', 'ARR_DEL15', 'CANCELLED', 'DIVERTED', 'DISTANCE'],
      dtype='object')

### Step 2:
#### Feature Selection:

To determine which features to include, it was important to determine how we want to use the predictive model.
In this case we will be taking a set of inputs from the user and displaying which time slot is most likely not to
have a flight delay. So we will only include information in the model that we can get from the user or provide lookup tables for.

The filteredList below includes all the data that we will either be able to get from the user, will be a lookup,
or is the primary feature we want to predict. 

In this case that is whether or not there will be a departure delay, so the column DEP_DEL15 will be the feature we want to predict.

In [11]:
# Columns retained for the model (DEP_DEL15 is the value we want to predict)
filteredList = [
    'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'OP_CARRIER',
    'ORIGIN',
    'DEST',
    'DEP_DEL15',
    'DISTANCE',
    'DEP_TIME_BLK',
    'TAIL_NUM',
]

# Columns left out of the model; recorded here in case they are needed later
columnsRemoved = [
    'OP_CARRIER_AIRLINE_ID',
    'DEST_AIRPORT_SEQ_ID',
    'ORIGIN_AIRPORT_SEQ_ID',
    'ORIGIN_AIRPORT_ID',
    'OP_CARRIER_FL_NUM',
    'DEST_AIRPORT_ID',
    'ARR_TIME',
    'ARR_DEL15',
    'CANCELLED',
    'DIVERTED',
    'DEP_TIME',
]

# Restrict the combined flights frame to the retained columns
machine_model_df = flights.filter(items=filteredList)
machine_model_df.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,OP_CARRIER,ORIGIN,DEST,DEP_DEL15,DISTANCE,DEP_TIME_BLK,TAIL_NUM
0,1,2,9E,GNV,ATL,0.0,300,0600-0659,N8688C
1,1,2,9E,MSP,CVG,0.0,596,1400-1459,N348PQ
2,1,2,9E,DTW,CVG,0.0,229,1200-1259,N8896A
3,1,2,9E,TLH,ATL,0.0,223,1500-1559,N8886A
4,1,2,9E,ATL,FSM,0.0,579,1900-1959,N8974C


In [12]:
# DEP_DEL15 (the prediction target) was read in as a float; store it as an integer
machine_model_df['DEP_DEL15'] = machine_model_df['DEP_DEL15'].astype('int')


In [13]:
# DEP_TIME is only present while we evaluate its impact as a feature.
# When it is part of the frame, convert it from float to integer.
if "DEP_TIME" in machine_model_df:
    machine_model_df["DEP_TIME"] = machine_model_df["DEP_TIME"].astype('int')

In [58]:
# Show each column's dtype to decide how it must be encoded:
# object columns hold strings and need a categorical encoding
machine_model_df.dtypes

DAY_OF_MONTH     int64
DAY_OF_WEEK      int64
OP_CARRIER      object
ORIGIN          object
DEST            object
DEP_DEL15        int32
DISTANCE         int64
DEP_TIME_BLK    object
TAIL_NUM        object
dtype: object

In [24]:
# Collect the names of the columns stored with the object dtype (the categorical features)
object_dtype_mask = machine_model_df.dtypes == "object"
flights_cat = object_dtype_mask[object_dtype_mask].index.tolist()

# Count the distinct values per categorical column
machine_model_df[flights_cat].nunique()

OP_CARRIER        17
ORIGIN           353
DEST             353
DEP_TIME_BLK      19
TAIL_NUM        5854
dtype: int64

In [18]:
# Report the size of the dataset: non-null DAY_OF_MONTH entries and number of columns
record_total = machine_model_df['DAY_OF_MONTH'].count()
column_total = len(machine_model_df.columns)

print('Total number of records:  ', record_total)
print('Total number of columns:  ', column_total)

Total number of records:   1165231
Total number of columns:   9


###  Step 2 - Encoding:
#### Encoding Method:
A couple of things to note: the complete dataset has over a million records and there are currently 9 columns.  Given the number of columns that have unique string data, applying get_dummies to all of them would dramatically increase the size of the data table.  This was attempted on the Carrier and Origin columns, and the result was an error stating that there were inadequate resources.  

We will reserve the get_dummies encoding method for the OP_CARRIER and Departure Time Block columns only and will use LabelEncoder for Origin, Destination, and Tail Number.  It is necessary not to limit ourselves to only the top ten of any of these options, because these will be unique entries by the user later when the model is being implemented.

In [20]:
# One-hot encode the carrier and departure time block columns;
# the prefixes are paired with the columns by position
dummy_columns = ['OP_CARRIER', 'DEP_TIME_BLK']
prefix = ['Carrier', 'Time_Block']
columns_to_dummy = machine_model_df[dummy_columns]
dummy_df = pd.get_dummies(columns_to_dummy, prefix=prefix)

In [22]:
# Transform each column with its own LabelEncoder
# (https://stackoverflow.com/questions/24458645/label-encoding-across-multiple-columns-in-scikit-learn)
lableEncoder_columns = ['ORIGIN', 'DEST', 'TAIL_NUM']

# One LabelEncoder per column name, created the first time the name is looked up
d = defaultdict(LabelEncoder)

df = machine_model_df[lableEncoder_columns]


def _fit_and_encode(column):
    """Fit the encoder stored under the column's name and return the encoded values."""
    return d[column.name].fit_transform(column)


# Encode every selected column
labelEncoded_df = df.apply(_fit_and_encode)

# Retaining this code here for later reference
## Reverse the encoding
# fit.apply(lambda x: d[x.name].inverse_transform(x))

## Use the fitted encoders in the dictionary to label future data
# df.apply(lambda x: d[x.name].transform(x))

In [25]:
# Replace the raw categorical columns with their encoded versions:
# keep the non-categorical columns, then append the one-hot and label-encoded frames
non_categorical_df = machine_model_df.drop(columns=flights_cat)
machine_model_df_encoded = pd.concat(
    [non_categorical_df, dummy_df, labelEncoded_df],
    axis=1,
)
machine_model_df_encoded.head()

Unnamed: 0,DAY_OF_MONTH,DAY_OF_WEEK,DEP_DEL15,DISTANCE,Carrier_9E,Carrier_AA,Carrier_AS,Carrier_B6,Carrier_DL,Carrier_EV,...,Time_Block_1700-1759,Time_Block_1800-1859,Time_Block_1900-1959,Time_Block_2000-2059,Time_Block_2100-2159,Time_Block_2200-2259,Time_Block_2300-2359,ORIGIN,DEST,TAIL_NUM
0,1,2,0,300,1,0,0,0,0,0,...,0,0,0,0,0,0,0,130,19,4648
1,1,2,0,596,1,0,0,0,0,0,...,0,0,0,0,0,0,0,227,82,1542
2,1,2,0,229,1,0,0,0,0,0,...,0,0,0,0,0,0,0,97,82,4810
3,1,2,0,223,1,0,0,0,0,0,...,0,0,0,0,0,0,0,331,19,4806
4,1,2,0,579,1,0,0,0,0,0,...,0,0,1,0,0,0,0,19,122,4867


In [26]:
# Separate the encoded frame into the feature matrix X and the target vector y
target_column = "DEP_DEL15"
X = machine_model_df_encoded.drop(columns=target_column)  # every column except the target
y = machine_model_df_encoded[target_column]               # 1 = departure delayed, per DEP_DEL15

##  Splitting Data:
Using the train_test_split method imported from the sklearn package, the dataset is split into a training set and a test set.
The separation is done randomly.  
By adding the argument stratify = y, we ensure that the training and test sets contain the same proportion 
of each target value as the full target set.

In [65]:
# Random split with a fixed seed; stratify=y keeps the class proportions of the
# target the same in the training and test sets
split_options = dict(random_state=34, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)
X_train.shape

(873923, 8)

In [66]:
# Standardize the features, learning the scaling from the training split only

# Create the StandardScaler instance
scaler = StandardScaler()

# Fit on the training features and keep the fitted scaler
X_scaler = scaler.fit(X_train)

# Apply the same fitted scaling to the training and the test features
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

# LogisticRegression Model

In [67]:
# Configure the logistic regression model: lbfgs solver, up to 2000 iterations,
# fixed seed
logreg_options = {
    'solver': 'lbfgs',
    'max_iter': 2000,
    'random_state': 10,
}
classifier = LogisticRegression(**logreg_options)

In [75]:
# Fit the model
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=2000, random_state=10)

In [76]:
# Make predictions 
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [71]:
# Confusion matrix of the current predictions (y_pred) against the test labels (y_test)
metrics.confusion_matrix(y_test, y_pred)

array([[246370,      0],
       [ 44938,      0]], dtype=int64)

In [77]:
# Because the dataset is imbalanced, accuracy on its own is not a useful metric;
# look at the per-class precision, recall and f1-score instead
report_text = metrics.classification_report(y_test, y_pred)
print(report_text)

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.85      1.00      0.92    246370
           1       0.00      0.00      0.00     44938

    accuracy                           0.85    291308
   macro avg       0.42      0.50      0.46    291308
weighted avg       0.72      0.85      0.78    291308



# Decision Tree Analysis

In [80]:
from sklearn.tree import DecisionTreeClassifier

# Train a decision tree on the training split with a fixed seed
clf_dt = DecisionTreeClassifier(random_state=0)
clf_dt.fit(X_train, y_train)
model_dt = clf_dt  # fit() returns the estimator itself, so both names refer to the fitted tree

In [81]:
# Predict on the test split with the decision tree and print the per-class metrics
y_pred = model_dt.predict(X_test)
dt_report = metrics.classification_report(y_test, y_pred)
print(dt_report)

              precision    recall  f1-score   support

           0       0.87      0.85      0.86    246370
           1       0.27      0.29      0.28     44938

    accuracy                           0.77    291308
   macro avg       0.57      0.57      0.57    291308
weighted avg       0.78      0.77      0.77    291308



In [82]:
# Confusion matrix of the current predictions (y_pred) against the test labels (y_test)
metrics.confusion_matrix(y_test, y_pred)

array([[209717,  36653],
       [ 31721,  13217]], dtype=int64)

In [None]:
# Rebuild the comparison table from the current y_pred before displaying it.
# `results` was last built from the logistic regression predictions, so showing
# it unchanged here would display that model's output instead of this section's.
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

# Random Forest Classifier
This one is too slow and is demonstrated to be a poor predictor

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train a random forest with trees limited to depth 15.
# random_state makes the run reproducible, like the other seeded models in this
# notebook; n_jobs=-1 builds the trees on all available cores to cut the
# training time without changing the fitted model.
clf_rf = RandomForestClassifier(max_depth=15, random_state=0, n_jobs=-1)
model_rf = clf_rf.fit(X_train, y_train)

In [None]:
# Predict on the test split with the random forest and print the per-class metrics
y_pred = model_rf.predict(X_test)
rf_report = metrics.classification_report(y_test, y_pred)
print(rf_report)

In [None]:
# Confusion matrix of the current predictions (y_pred) against the test labels (y_test)
metrics.confusion_matrix(y_test, y_pred)

# Support Vector Machines

In [None]:
# Train a support vector classifier with a linear kernel on the training split.
# NOTE(review): this cell has no recorded execution; kernel SVC training is known to
# scale poorly with the number of samples, and the training split here is large
# -- confirm it finishes in reasonable time or consider a linear-specific solver.
from sklearn import svm 
svc = svm.SVC(kernel='linear') 
svc.fit(X_train, y_train)


In [None]:
# Predict on the test split with the support vector classifier and print the per-class metrics
y_pred = svc.predict(X_test)
svc_report = metrics.classification_report(y_test, y_pred)
print(svc_report)

In [None]:
# Confusion matrix of the current predictions (y_pred) against the test labels (y_test)
metrics.confusion_matrix(y_test, y_pred)