In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd

#Load the flight-delay records; the relative path means FlightDelays.csv must be in the notebook's working directory
data = pd.read_csv("FlightDelays.csv")

In [2]:
#Inspect first few rows of data
data.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight Status
0,1455,OH,1455,JFK,184,01/01/2004,5935,BWI,0,4,1,N940CA,ontime
1,1640,DH,1640,JFK,213,01/01/2004,6155,DCA,0,4,1,N405FJ,ontime
2,1245,DH,1245,LGA,229,01/01/2004,7208,IAD,0,4,1,N695BR,ontime
3,1715,DH,1709,LGA,229,01/01/2004,7215,IAD,0,4,1,N662BR,ontime
4,1039,DH,1035,LGA,229,01/01/2004,7792,IAD,0,4,1,N698BR,ontime


In [3]:
#Look at the datatypes
data.dtypes

CRS_DEP_TIME      int64
CARRIER          object
DEP_TIME          int64
DEST             object
DISTANCE          int64
FL_DATE          object
FL_NUM            int64
ORIGIN           object
Weather           int64
DAY_WEEK          int64
DAY_OF_MONTH      int64
TAIL_NUM         object
Flight Status    object
dtype: object

In [4]:
#Drop columns that are not needed in the data for this assignment (each label listed once)
data = data.drop(columns=["TAIL_NUM", "DISTANCE", "Weather", "FL_NUM"])

In [5]:
#Verify the new dataset looks good
data.head()

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,FL_DATE,ORIGIN,DAY_WEEK,DAY_OF_MONTH,Flight Status
0,1455,OH,1455,JFK,01/01/2004,BWI,4,1,ontime
1,1640,DH,1640,JFK,01/01/2004,DCA,4,1,ontime
2,1245,DH,1245,LGA,01/01/2004,IAD,4,1,ontime
3,1715,DH,1709,LGA,01/01/2004,IAD,4,1,ontime
4,1039,DH,1035,LGA,01/01/2004,IAD,4,1,ontime


In [6]:
#Get some summary information about the numeric values
data.describe()

Unnamed: 0,CRS_DEP_TIME,DEP_TIME,DAY_WEEK,DAY_OF_MONTH
count,2201.0,2201.0,2201.0,2201.0
mean,1371.938664,1369.298955,3.905498,16.024989
std,432.697149,442.462754,1.903149,8.67739
min,600.0,10.0,1.0,1.0
25%,1000.0,1004.0,2.0,8.0
50%,1455.0,1450.0,4.0,16.0
75%,1710.0,1709.0,5.0,23.0
max,2130.0,2330.0,7.0,31.0


In [7]:
#Inspect dependent variable
data['Flight Status'].unique()

array(['ontime', 'delayed'], dtype=object)

In [8]:
#Create dependent variable
y = data['Flight Status']

In [9]:
y

0       ontime
1       ontime
2       ontime
3       ontime
4       ontime
         ...  
2196    ontime
2197    ontime
2198    ontime
2199    ontime
2200    ontime
Name: Flight Status, Length: 2201, dtype: object

In [10]:
#Create the predictor table from the five columns used by the model
predictor_columns = ['DAY_WEEK','CRS_DEP_TIME','ORIGIN','DEST','CARRIER']
x = data[predictor_columns]

In [11]:
x

Unnamed: 0,DAY_WEEK,CRS_DEP_TIME,ORIGIN,DEST,CARRIER
0,4,1455,BWI,JFK,OH
1,4,1640,DCA,JFK,DH
2,4,1245,IAD,LGA,DH
3,4,1715,IAD,LGA,DH
4,4,1039,IAD,LGA,DH
...,...,...,...,...,...
2196,6,645,DCA,EWR,RU
2197,6,1700,IAD,EWR,RU
2198,6,1600,DCA,EWR,RU
2199,6,1359,DCA,EWR,RU


In [12]:
#Inspect coded values
x['DAY_WEEK'].unique()

array([4, 5, 6, 7, 1, 2, 3], dtype=int64)

In [13]:
#Inspect the distinct CRS_DEP_TIME values (presumably scheduled departure times in HHMM form - confirm)
x['CRS_DEP_TIME'].unique()

array([1455, 1640, 1245, 1715, 1039,  840, 1240, 1645, 2120,  930, 1230,
       1430, 1730, 2030, 1530,  600, 1830,  900, 1300, 1400, 1500, 1900,
        850, 1100, 1700, 2100, 1720, 1030,  700, 1710, 1525, 1515, 1630,
        640, 1610,  759,  630,  830, 1725, 1600,  730, 1000, 1200, 1800,
       2000, 1315, 1605, 1130, 1330, 1930,  800,  735, 1359,  645, 1040,
        925, 2130,  845, 1520], dtype=int64)

In [14]:
#Count the distinct CRS_DEP_TIME values
len(x['CRS_DEP_TIME'].unique())

59

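There are 59 distinct raw departure times here, so it is not yet clear why the instructions describe this variable as 18 groups between 6AM and 10PM; presumably the times are meant to be binned into broader intervals (e.g. hourly) rather than used as raw HHMM values.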

In [15]:
#Show the distinct values of each categorical predictor, one array per line
for column_name in ('ORIGIN', 'DEST', 'CARRIER'):
    print(x[column_name].unique())

['BWI' 'DCA' 'IAD']
['JFK' 'LGA' 'EWR']
['OH' 'DH' 'DL' 'MQ' 'UA' 'US' 'RU' 'CO']


In [16]:
#Integer codes for the target (cleanup_numsy) and for each categorical predictor (cleanup_numsx)
cleanup_numsy = {"ontime": 0, "delayed": 1}
cleanup_numsx = {
    "ORIGIN": {"BWI": 1, "DCA": 2, "IAD": 3},
    "DEST": {"JFK": 1, "LGA": 2, "EWR": 3},
    "CARRIER": {"OH": 1, "DH": 2, "DL": 3, "MQ": 4, "UA": 5, "US": 6, "RU": 7, "CO": 8},
}

#Apply every mapping in a single pass and rebind the name instead of mutating the frame in place.
#The target mapping is scoped to 'Flight Status' so the same strings in any other column are never touched
data = data.replace({**cleanup_numsx, "Flight Status": cleanup_numsy})

In [17]:
#Rebuild the target (as a one-column DataFrame) and the predictors from the recoded data
target_columns = ['Flight Status']
feature_columns = ['DAY_WEEK','CRS_DEP_TIME','ORIGIN','DEST','CARRIER']
y = data[target_columns]
x = data[feature_columns]

In [18]:
type(y)

pandas.core.frame.DataFrame

In [19]:
type(x)

pandas.core.frame.DataFrame

In [20]:
#Show the rows whose 'Flight Status' equals 1, the code assigned to "delayed"
y[y['Flight Status'] == 1]

Unnamed: 0,Flight Status
53,1
56,1
62,1
96,1
97,1
...,...
2146,1
2149,1
2150,1
2156,1


In [21]:
#Hold out 40% of the rows for testing (60/40 train/test ratio); the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=0,
)

In [22]:
y_train['Flight Status']

895     0
372     0
576     0
211     1
520     0
       ..
1033    0
1731    0
763     0
835     0
1653    0
Name: Flight Status, Length: 1320, dtype: int64

In [29]:
#Instantiate both Naive Bayes estimators. Only mnb (MultinomialNB) is fit in the cells shown below;
#gnb (GaussianNB) is created but not used in the visible part of the notebook
gnb = GaussianNB()
mnb = MultinomialNB()

In [37]:
#Smoke-test MultinomialNB on synthetic data (independent of the flight data):
#6 samples of 100 random integer features in [0, 5), with one distinct label per sample
rng = np.random.RandomState(1)
A = rng.randint(5, size=(6, 100))
B = np.arange(1, 7)

#fit() returns the estimator itself, so construction and fitting can be chained
clf = MultinomialNB().fit(A, B)

#Predict the label of the third training row
print(clf.predict(A[2:3]))

[3]


In [39]:
#Fit the multinomial model on the training predictors; the target is passed as a 1-D Series
#NOTE(review): MultinomialNB models features as counts, but ORIGIN/DEST/CARRIER hold arbitrary integer codes
#and CRS_DEP_TIME is a raw time value - consider one-hot encoding or CategoricalNB; confirm against the assignment
mnb.fit(x_train, y_train['Flight Status'])

MultinomialNB()

In [40]:
#Predict the class (0 = ontime, 1 = delayed) for every row of the test set
y_pred = mnb.predict(x_test)

In [41]:
y_pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [36]:
#Confirm there is exactly one prediction per test row: the two printed lengths should match
#NOTE(review): a 40% test split of the 2201 rows should give 881 here; the recorded 1321 suggests this
#output was produced against a different split - re-run the notebook top to bottom to confirm
print(len(x_test), len(y_pred), sep="\n")

1321
1321


In [31]:
#Count how many test rows were predicted as delayed (class 1).
#The comparison is by value: an identity test such as `i is 1` is never true for the NumPy
#integers stored in y_pred, so it would report 0 even when delays are predicted
count = int(np.count_nonzero(np.asarray(y_pred) == 1))
print(count)

0


In [22]:
from sklearn import metrics

#Share of test rows whose predicted class equals the true class.
#NOTE(review): in the recorded run this equals the share of on-time flights (1062 of 1321),
#because every prediction is 0 - the score says nothing about the model's ability to detect delays
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.8039364118092355


In [34]:
from sklearn.metrics import confusion_matrix
#Rows are the true classes and columns the predicted classes, both ordered 0 (ontime), 1 (delayed)
confusion_matrix(y_test, y_pred)

array([[1062,    0],
       [ 259,    0]], dtype=int64)

In [35]:
#Same matrix with every cell divided by the total number of test rows (normalize='all')
confusion_matrix(y_test, y_pred, normalize='all')

array([[0.80393641, 0.        ],
       [0.19606359, 0.        ]])

In [66]:
#Make sure y_test is a standalone DataFrame before a column is added to it.
#pd.DataFrame() accepts either a Series or a DataFrame, whereas .to_frame() exists only on Series and
#raises AttributeError here because y was built as a DataFrame; copy() detaches it from the split output
y_test = pd.DataFrame(y_test).copy()

In [67]:
y_test

Unnamed: 0,Flight Status
1446,0
289,1
724,0
1326,0
1588,0
...,...
255,0
1054,0
662,1
1825,0


In [94]:
#Store the predictions next to the true labels; y_pred was produced from x_test, so it is in the same row order as y_test
y_test['pred'] = y_pred

In [95]:
#Combine the test predictors with the true and predicted labels, matched on the row index
results = x_test.join(y_test)

In [96]:
results.head()

Unnamed: 0,DAY_WEEK,CRS_DEP_TIME,ORIGIN,DEST,CARRIER,Flight Status,pred
1446,3,630,2,2,3,0,0
289,1,1710,3,3,2,1,0
724,7,1900,2,2,4,0,0
1326,1,1300,2,3,8,0,0
1588,4,2120,3,3,2,0,0


In [97]:
#List the distinct predicted classes
results['pred'].unique()

array([0], dtype=int64)