In [63]:
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from dmba import classificationSummary

In [64]:
# Load the flight records. The path is relative, so the CSV is expected in the
# kernel's current working directory.
fd = pd.read_csv('FlightDelays.csv')

In [65]:
# Inspect the inferred dtypes to see which columns are numeric and which are
# text (object) and will need encoding before modelling.
fd.dtypes

CRS_DEP_TIME      int64
CARRIER          object
DEP_TIME          int64
DEST             object
DISTANCE          int64
FL_DATE          object
FL_NUM            int64
ORIGIN           object
Weather           int64
DAY_WEEK          int64
DAY_OF_MONTH      int64
TAIL_NUM         object
Flight Status    object
dtype: object

In [66]:
# Remove columns that are not used as predictors. The result is reassigned
# instead of mutating with inplace=True, so the cell has no hidden in-place
# side effect on the frame object.
# NOTE(review): DEP_TIME looks like the actual departure time, which would leak
# the outcome being predicted - confirm that is why it is dropped.
fd = fd.drop(columns=['DEP_TIME', 'FL_DATE', 'TAIL_NUM'])

In [67]:
# Recode the target labels as integers so the classifiers get a numeric 0/1
# outcome. Only the 'Flight Status' column is mapped: a frame-wide
# DataFrame.applymap would visit every cell of every column and is deprecated
# since pandas 2.1.
mymap = {'ontime': 0, 'delayed':1}

# Values that are not keys of mymap are passed through unchanged, so re-running
# this cell on already-encoded data leaves the column as it is.
fd = fd.assign(**{
    'Flight Status': fd['Flight Status'].map(lambda status: mymap.get(status, status))
})

In [68]:
# Preview the first rows to check the column drop and the status recoding.
fd.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEST,DISTANCE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,Flight Status
0,1455,OH,JFK,184,5935,BWI,0,4,1,0
1,1640,DH,JFK,213,6155,DCA,0,4,1,0
2,1245,DH,LGA,229,7208,IAD,0,4,1,0
3,1715,DH,LGA,229,7215,IAD,0,4,1,0
4,1039,DH,LGA,229,7792,IAD,0,4,1,0


In [69]:
# Number of rows (flight records) in the frame.
len(fd)

2201

In [70]:
# Sanity check: list the distinct target values to confirm that only the
# encoded labels remain.
print(fd['Flight Status'].unique())

[0 1]


In [71]:
# List the distinct day-of-month values present in the data.
print(fd['DAY_OF_MONTH'].unique())

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31]


In [72]:
# List the distinct day-of-week codes before converting the column to a
# categorical dtype.
print(fd.DAY_WEEK.unique())

[4 5 6 7 1 2 3]


In [73]:
# Treat day-of-week as a nominal category rather than a number, so that the
# later pd.get_dummies call one-hot encodes it instead of leaving it numeric.
fd['DAY_WEEK'] = fd['DAY_WEEK'].astype('category')

In [74]:
# Re-check dtypes to confirm that DAY_WEEK is now categorical.
fd.dtypes

CRS_DEP_TIME        int64
CARRIER            object
DEST               object
DISTANCE            int64
FL_NUM              int64
ORIGIN             object
Weather             int64
DAY_WEEK         category
DAY_OF_MONTH        int64
Flight Status       int64
dtype: object

In [75]:
# Discretise the scheduled departure time into 8 equal-width intervals; the
# resulting interval categories are one-hot encoded later.
# NOTE(review): the values look like HHMM clock times (e.g. 1455), so
# equal-width bins over the raw integers do not correspond to equal spans of
# time - confirm this is acceptable or bin on the hour instead.
fd['CRS_DEP_TIME'] = pd.cut(fd['CRS_DEP_TIME'], bins=8)

In [76]:
# Show the distinct departure-time intervals produced by pd.cut.
fd.CRS_DEP_TIME.unique()

[(1365.0, 1556.25], (1556.25, 1747.5], (1173.75, 1365.0], (982.5, 1173.75], (791.25, 982.5], (1938.75, 2130.0], (598.47, 791.25], (1747.5, 1938.75]]
Categories (8, interval[float64]): [(598.47, 791.25] < (791.25, 982.5] < (982.5, 1173.75] < (1173.75, 1365.0] < (1365.0, 1556.25] < (1556.25, 1747.5] < (1747.5, 1938.75] < (1938.75, 2130.0]]

In [77]:
# List the distinct origin airport codes.
fd.ORIGIN.unique()

array(['BWI', 'DCA', 'IAD'], dtype=object)

In [78]:
# One-hot encode the object and categorical columns. drop_first=True omits the
# first level of each encoded column, giving k-1 indicators for k levels.
fd = pd.get_dummies(data=fd, drop_first=True)

In [79]:
# Preview the frame after one-hot encoding to check the generated indicator
# columns.
fd.head()

Unnamed: 0,DISTANCE,FL_NUM,Weather,DAY_OF_MONTH,Flight Status,"CRS_DEP_TIME_(791.25, 982.5]","CRS_DEP_TIME_(982.5, 1173.75]","CRS_DEP_TIME_(1173.75, 1365.0]","CRS_DEP_TIME_(1365.0, 1556.25]","CRS_DEP_TIME_(1556.25, 1747.5]",...,DEST_JFK,DEST_LGA,ORIGIN_DCA,ORIGIN_IAD,DAY_WEEK_2,DAY_WEEK_3,DAY_WEEK_4,DAY_WEEK_5,DAY_WEEK_6,DAY_WEEK_7
0,184,5935,0,1,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,0,0
1,213,6155,0,1,0,0,0,0,0,1,...,1,0,1,0,0,0,1,0,0,0
2,229,7208,0,1,0,0,0,1,0,0,...,0,1,0,1,0,0,1,0,0,0
3,229,7215,0,1,0,0,0,0,0,1,...,0,1,0,1,0,0,1,0,0,0
4,229,7792,0,1,0,0,1,0,0,0,...,0,1,0,1,0,0,1,0,0,0


In [80]:
# Separate the target from the predictors, then hold out 40% of the rows for
# validation. The fixed random_state makes the partition reproducible.
y = fd['Flight Status']
X = fd.drop(columns=['Flight Status'])
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.40,
    random_state=3,
)

In [82]:
# Baseline model: a single decision tree with default hyper-parameters, fitted
# on the training partition. random_state fixes the tree's tie-breaking.
defaultTree = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)

# Report the confusion matrix and accuracy on the held-out validation rows.
classes = defaultTree.classes_
classificationSummary(y_valid, defaultTree.predict(X_valid), class_names=classes)

Confusion Matrix (Accuracy 0.7185)

       Prediction
Actual   0   1
     0 557 130
     1 118  76


In [83]:
# Boosted ensemble: AdaBoost with up to 500 rounds over a default decision tree,
# fitted on the same training partition as the single tree.
# NOTE(review): the base tree has no depth limit; boosting is normally run on
# weak learners (scikit-learn's default base estimator is a depth-1 stump) -
# confirm an unpruned base tree is intended.
boost = AdaBoostClassifier(
    DecisionTreeClassifier(random_state=1),
    n_estimators=500,
    random_state=1,
).fit(X_train, y_train)

# Report the confusion matrix and accuracy on the held-out validation rows.
classes = boost.classes_
classificationSummary(y_valid, boost.predict(X_valid), class_names=classes)

Confusion Matrix (Accuracy 0.7321)

       Prediction
Actual   0   1
     0 565 122
     1 114  80


In terms of overall accuracy, the boosted tree performs better than the single tree: 73.21% versus 71.85% on the validation set, a difference of 1.36 percentage points.

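In terms of sensitivity (correctly identifying delayed flights, the positive class), the boosted tree is also slightly better: it catches 80 of the 194 delayed flights in the validation set, versus 76 for the single tree.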