## Machine Learning Model 

### Delay Classification Model: Classify whether a flight will be delayed by 15 minutes or more (1) or not (0)

1. Read the data from the folder.
2. Convert the categorical columns to dummy variables.
3. Split the data into training and test sets and train a logistic regression model.
4. Train Random Forest models with 50 trees and with 100 trees and compare their accuracy.

# Predicting Arrival Delay

In [16]:
import pandas as pd
import numpy as np
import time
from sklearn.externals import joblib

# Read the data from the folder:

tic = time.time()
df = pd.read_csv('../python-introduction-th2669/juneairline_data1.csv')
toc = time.time()
print("Finished reading CSV file in " + str(toc-tic) + " seconds")
# `df.head` without parentheses only references the bound method and shows
# nothing; call it and print the preview so it appears above the dtypes.
print(df.head())
df.dtypes

Finished reading CSV file in 0.529555082321167 seconds


DAY_OF_WEEK         int64
UNIQUE_CARRIER     object
ORIGIN             object
DEST               object
ARR_DELAY         float64
DEP_HOUR            int64
ARR_HOUR            int64
dtype: object

In [17]:
# DATA preparation
# Turns the raw arrival frame into a model-ready frame: one-hot encoded
# categorical features plus a binary ARR_DELAY target.
tic = time.time()

# Columns that are one-hot encoded below (the first level of each is dropped).
categorical_cols = ['DAY_OF_WEEK','UNIQUE_CARRIER','ORIGIN','DEST','DEP_HOUR','ARR_HOUR']

# Remove data redundancy: fold hour 24 into hour 0
# (presumably both encode midnight -- TODO confirm against the data source).
df['ARR_HOUR'] = df['ARR_HOUR'].replace(24, 0)

# Drop rows with Null Values
df = df.dropna()

# Convert to Dummy Variables.
# A single get_dummies call replaces six concat + get_dummies passes and the
# follow-up drop of the source columns; the resulting column names
# ("<COLUMN>_<value>") and their order are unchanged.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

#ARR_DELAY -> Delay Yes or No -> 1 if Delay >= 15 minutes, else 0
# Vectorized comparison instead of a per-row Python lambda.
df['ARR_DELAY'] = (df['ARR_DELAY'] >= 15).astype('int64')

toc = time.time()
print("Finished preparing data in " + str(toc-tic) + " seconds")

Finished preparing data in 27.491552114486694 seconds


### Try Logistic Regression Model 

In [18]:
#Create the train and test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
target_col = 'ARR_DELAY'
X_train, X_test, y_train, y_test = train_test_split(
    df.drop([target_col], axis=1),
    df[target_col],
    test_size=0.30,
    random_state=42,
)



In [19]:
from sklearn.linear_model import LogisticRegression

# Fit an L2-penalised logistic regression on the training split
# (fit() returns the estimator itself, so the calls can be chained).
logmodel_arr = LogisticRegression(penalty='l2').fit(X_train, y_train)

# Predict the delay class for every row of the held-out test split.
predictions = logmodel_arr.predict(X_test)

In [20]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

# Count the confusion-matrix cells directly on the label vectors. Only the
# counts are needed, so there is no reason to slice row subsets out of the
# wide one-hot X_test frame just to read their length.
actual = np.asarray(y_test)
predicted = np.asarray(predictions)
correct = predicted == actual

TP = int(np.count_nonzero((predicted == 1) & correct))   # predicted delayed, was delayed
FP = int(np.count_nonzero((predicted == 1) & ~correct))  # predicted delayed, was not
TN = int(np.count_nonzero((predicted == 0) & correct))   # predicted on time, was on time
FN = int(np.count_nonzero((predicted == 0) & ~correct))  # predicted on time, was delayed

# Accuracy = share of test rows whose predicted label matches the actual label.
accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

             precision    recall  f1-score   support

          0       0.82      0.98      0.89    114257
          1       0.67      0.19      0.30     30018

avg / total       0.79      0.81      0.77    144275

Accuracy: 0.8125108300121296


In [6]:
import pickle

# Persist the fitted arrival-delay logistic model using pickle protocol 2.
with open('logmodel_arr.pkl', 'wb') as model_file:
    pickle.dump(logmodel_arr, model_file, protocol=2)

In [7]:
# Build and store a lookup from feature column name to column position, so
# that user input can later be expanded into the same dummy-variable layout.
# Every column except the ARR_DELAY target is a feature column.
flightdata = df.drop('ARR_DELAY',axis=1)
index_dict = {column: position for position, column in enumerate(flightdata.columns)}

# Write the lookup to disk (pickle protocol 2).
with open('flightdata', 'wb') as lookup_file:
    pickle.dump(index_dict, lookup_file, protocol=2)

### Let's Try Random Forest Model (n=50) 

In [19]:
from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Create Random Forest classifier with 50 trees.
# random_state fixes the forest's internal randomness so the reported metrics
# and the model that is pickled later are reproducible from run to run.
randomforest_arr = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=42)
randomforest_arr.fit(X_train, y_train)

# Evaluate on test set
predictions = randomforest_arr.predict(X_test)



In [20]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

# Count the confusion-matrix cells directly on the label vectors. Only the
# counts are needed, so there is no reason to slice row subsets out of the
# wide one-hot X_test frame just to read their length.
actual = np.asarray(y_test)
predicted = np.asarray(predictions)
correct = predicted == actual

TP = int(np.count_nonzero((predicted == 1) & correct))   # predicted delayed, was delayed
FP = int(np.count_nonzero((predicted == 1) & ~correct))  # predicted delayed, was not
TN = int(np.count_nonzero((predicted == 0) & correct))   # predicted on time, was on time
FN = int(np.count_nonzero((predicted == 0) & ~correct))  # predicted on time, was delayed

# Accuracy = share of test rows whose predicted label matches the actual label.
accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

             precision    recall  f1-score   support

          0       0.88      0.94      0.91    114165
          1       0.71      0.53      0.61     30110

avg / total       0.85      0.86      0.85    144275

Accuracy: 0.857425056316063


### Let's Try Random Forest Model (n=100) 

In [5]:
from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Create Random Forest classifier with 100 trees.
# Stored under its own name so it does not overwrite `randomforest_arr`, the
# 50-tree model that is pickled further down; reusing the name would make a
# top-to-bottom run save the 100-tree forest instead.
# random_state makes the comparison against the 50-tree forest repeatable.
randomforest_arr_100 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42)
randomforest_arr_100.fit(X_train, y_train)

# Evaluate on test set
predictions = randomforest_arr_100.predict(X_test)

In [6]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

# Count the confusion-matrix cells directly on the label vectors. Only the
# counts are needed, so there is no reason to slice row subsets out of the
# wide one-hot X_test frame just to read their length.
actual = np.asarray(y_test)
predicted = np.asarray(predictions)
correct = predicted == actual

TP = int(np.count_nonzero((predicted == 1) & correct))   # predicted delayed, was delayed
FP = int(np.count_nonzero((predicted == 1) & ~correct))  # predicted delayed, was not
TN = int(np.count_nonzero((predicted == 0) & correct))   # predicted on time, was on time
FN = int(np.count_nonzero((predicted == 0) & ~correct))  # predicted on time, was delayed

# Accuracy = share of test rows whose predicted label matches the actual label.
accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

             precision    recall  f1-score   support

          0       0.88      0.95      0.91    114165
          1       0.72      0.53      0.61     30110

avg / total       0.85      0.86      0.85    144275

Accuracy: 0.8590053716860163


In [21]:
import pickle

# Persist the arrival-delay random forest using pickle protocol 2.
with open('randomforest_arr.pkl', 'wb') as model_file:
    pickle.dump(randomforest_arr, model_file, protocol=2)

# Predicting Departure Delay with the same method

In [8]:
import pandas as pd
import numpy as np
import time
from sklearn.externals import joblib

# Read the data from the folder:

tic = time.time()
df = pd.read_csv('../python-introduction-th2669/juneairline_data2.csv')
toc = time.time()
print("Finished reading CSV file in " + str(toc-tic) + " seconds")
# `df.head` without parentheses only references the bound method and shows
# nothing; call it and print the preview so it appears above the dtypes.
print(df.head())
df.dtypes

Finished reading CSV file in 0.5333318710327148 seconds


DAY_OF_WEEK         int64
UNIQUE_CARRIER     object
ORIGIN             object
DEST               object
DEP_DELAY         float64
DEP_HOUR            int64
ARR_HOUR            int64
dtype: object

In [9]:
# DATA preparation
# Turns the raw departure frame into a model-ready frame: one-hot encoded
# categorical features plus a binary DEP_DELAY target.
tic = time.time()

# Columns that are one-hot encoded below (the first level of each is dropped).
categorical_cols = ['DAY_OF_WEEK','UNIQUE_CARRIER','ORIGIN','DEST','DEP_HOUR','ARR_HOUR']

# Remove data redundancy: fold hour 24 into hour 0
# (presumably both encode midnight -- TODO confirm against the data source).
df['ARR_HOUR'] = df['ARR_HOUR'].replace(24, 0)

# Drop rows with Null Values
df = df.dropna()

# Convert to Dummy Variables.
# A single get_dummies call replaces six concat + get_dummies passes and the
# follow-up drop of the source columns; the resulting column names
# ("<COLUMN>_<value>") and their order are unchanged.
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

#DEP_DELAY -> Delay Yes or No -> 1 if Delay >= 15 minutes, else 0
# Vectorized comparison instead of a per-row Python lambda.
df['DEP_DELAY'] = (df['DEP_DELAY'] >= 15).astype('int64')

toc = time.time()
print("Finished preparing data in " + str(toc-tic) + " seconds")


Finished preparing data in 30.257885932922363 seconds


## Logistic Regression Model

In [10]:
#Create the train and test split

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
target_col = 'DEP_DELAY'
X_train, X_test, y_train, y_test = train_test_split(
    df.drop([target_col], axis=1),
    df[target_col],
    test_size=0.30,
    random_state=101,
)



In [11]:
from sklearn.linear_model import LogisticRegression

# Fit an L2-penalised logistic regression on the training split
# (fit() returns the estimator itself, so the calls can be chained).
logmodel_dep = LogisticRegression(penalty='l2').fit(X_train, y_train)

# Predict the delay class for every row of the held-out test split.
predictions = logmodel_dep.predict(X_test)

In [12]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

# Count the confusion-matrix cells directly on the label vectors. Only the
# counts are needed, so there is no reason to slice row subsets out of the
# wide one-hot X_test frame just to read their length.
actual = np.asarray(y_test)
predicted = np.asarray(predictions)
correct = predicted == actual

TP = int(np.count_nonzero((predicted == 1) & correct))   # predicted delayed, was delayed
FP = int(np.count_nonzero((predicted == 1) & ~correct))  # predicted delayed, was not
TN = int(np.count_nonzero((predicted == 0) & correct))   # predicted on time, was on time
FN = int(np.count_nonzero((predicted == 0) & ~correct))  # predicted on time, was delayed

# Accuracy = share of test rows whose predicted label matches the actual label.
accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

             precision    recall  f1-score   support

          0       0.82      0.98      0.89    114638
          1       0.69      0.18      0.28     29637

avg / total       0.79      0.81      0.77    144275

Accuracy: 0.8145971235487783


In [13]:
import pickle

# Persist the fitted departure-delay logistic model using pickle protocol 2.
with open('logmodel_dep.pkl', 'wb') as model_file:
    pickle.dump(logmodel_dep, model_file, protocol=2)

### Let's Try Random Forest Model (n=50) 

In [10]:
from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Create Random Forest classifier with 50 trees.
# random_state fixes the forest's internal randomness so the reported metrics
# and the model that is pickled later are reproducible from run to run.
randomforest_dep = RandomForestClassifier(n_estimators=50, n_jobs=-1, random_state=101)
randomforest_dep.fit(X_train, y_train)

# Evaluate on test set
predictions = randomforest_dep.predict(X_test)



In [11]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

# Count the confusion-matrix cells directly on the label vectors. Only the
# counts are needed, so there is no reason to slice row subsets out of the
# wide one-hot X_test frame just to read their length.
actual = np.asarray(y_test)
predicted = np.asarray(predictions)
correct = predicted == actual

TP = int(np.count_nonzero((predicted == 1) & correct))   # predicted delayed, was delayed
FP = int(np.count_nonzero((predicted == 1) & ~correct))  # predicted delayed, was not
TN = int(np.count_nonzero((predicted == 0) & correct))   # predicted on time, was on time
FN = int(np.count_nonzero((predicted == 0) & ~correct))  # predicted on time, was delayed

# Accuracy = share of test rows whose predicted label matches the actual label.
accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

             precision    recall  f1-score   support

          0       0.87      0.93      0.90    114638
          1       0.64      0.47      0.54     29637

avg / total       0.82      0.84      0.83    144275

Accuracy: 0.8367977820135158


### Let's Try Random Forest Model (n=100) 

In [8]:
from sklearn import linear_model, cross_validation, metrics, svm
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

# Create Random Forest classifier with 100 trees.
# Stored under its own name so it does not overwrite `randomforest_dep`, the
# 50-tree model that is pickled further down; reusing the name would make a
# top-to-bottom run save the 100-tree forest instead.
# random_state makes the comparison against the 50-tree forest repeatable.
randomforest_dep_100 = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=101)
randomforest_dep_100.fit(X_train, y_train)

# Evaluate on test set
predictions = randomforest_dep_100.predict(X_test)



In [9]:
#Model Evaluation
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))

# Count the confusion-matrix cells directly on the label vectors. Only the
# counts are needed, so there is no reason to slice row subsets out of the
# wide one-hot X_test frame just to read their length.
actual = np.asarray(y_test)
predicted = np.asarray(predictions)
correct = predicted == actual

TP = int(np.count_nonzero((predicted == 1) & correct))   # predicted delayed, was delayed
FP = int(np.count_nonzero((predicted == 1) & ~correct))  # predicted delayed, was not
TN = int(np.count_nonzero((predicted == 0) & correct))   # predicted on time, was on time
FN = int(np.count_nonzero((predicted == 0) & ~correct))  # predicted on time, was delayed

# Accuracy = share of test rows whose predicted label matches the actual label.
accuracy = float(TP + TN)/float(TP + TN + FP + FN)
print('Accuracy: '+str(accuracy))

             precision    recall  f1-score   support

          0       0.87      0.93      0.90    114638
          1       0.65      0.47      0.55     29637

avg / total       0.83      0.84      0.83    144275

Accuracy: 0.8395286778721193


The random forest model with 100 trees barely improves the results over the 50-tree model, and it takes longer to train. Therefore, we choose to use 50 trees.

In [12]:
import pickle

# Persist the departure-delay random forest using pickle protocol 2.
with open('randomforest_dep.pkl', 'wb') as model_file:
    pickle.dump(randomforest_dep, model_file, protocol=2)