# Chapter 8: The Naive Bayes Classifier

> (c) 2019 Galit Shmueli, Peter C. Bruce, Peter Gedeck 
>
> Code included in
>
> _Data Mining for Business Analytics: Concepts, Techniques, and Applications in Python_ (First Edition) 
> Galit Shmueli, Peter C. Bruce, Peter Gedeck, and Nitin R. Patel. 2019.

## Import required packages

In [84]:
%matplotlib inline

from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import matplotlib.pylab as plt

!pip install dmba
from dmba import classificationSummary, gainsChart




## Import Dataset

* import dataset
* convert to categorical
* convert num to categorical by creating bins
* run NB model
* predict probabilities using NB model

In [85]:
# Load the flight records; the relative path is resolved against the working directory.
delays = pd.read_csv(filepath_or_buffer='FlightDelays.csv')

In [86]:
# Give the outcome column an underscore name (no embedded space).
delays = delays.rename(columns={'Flight Status': 'Flight_Status'})

In [87]:
# Treat the coded predictors as nominal categories.
for column in ('DAY_WEEK', 'CARRIER', 'ORIGIN', 'DEST'):
    delays[column] = delays[column].astype('category')

# Bin the scheduled departure time by dividing by 100 and rounding
# (presumably HHMM -> hour; verify against the data dictionary).
# Python's round() resolves exact .5 ties to the nearest even integer.
delays['CRS_DEP_TIME'] = [round(dep_time / 100) for dep_time in delays['CRS_DEP_TIME']]
delays['CRS_DEP_TIME'] = delays['CRS_DEP_TIME'].astype('category')

In [88]:
# Column roles for the model.
outcome = 'Flight_Status'
predictors = ['DAY_WEEK', 'CARRIER', 'ORIGIN', 'DEST', 'CRS_DEP_TIME']

# Dummy-code the categorical predictors into indicator columns.
X = pd.get_dummies(delays.loc[:, predictors])

# Binary target: 1 where the flight status is 'delayed', 0 otherwise.
# `classes` lists the display labels in that same 0/1 order.
y = delays[outcome].eq('delayed').astype(int)
classes = ['ontime', 'delayed']
y.to_frame()

Unnamed: 0,Flight_Status
0,0
1,0
2,0
3,0
4,0
...,...
2196,0
2197,0
2198,0
2199,0


In [89]:
# Hold out 40% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Fit a multinomial naive Bayes model on the dummy-coded training predictors.
delays_nb = MultinomialNB(alpha=0.1).fit(X_train, y_train)

# Hard 0/1 class predictions for each partition.
y_train_pred = delays_nb.predict(X_train)
y_valid_pred = delays_nb.predict(X_valid)

# Per-class probabilities for each partition (one column per class).
pred_prob_train = delays_nb.predict_proba(X_train)
pred_prob_valid = delays_nb.predict_proba(X_valid)

In [90]:
# Show the validation-set class predictions as a table.
pd.DataFrame(data=y_valid_pred)

Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0
...,...
876,0
877,0
878,0
879,0


In [None]:
# Peek at the first rows of the dummy-coded predictor matrix.
X.head(n=5)

In [None]:
# Peek at the first rows of the training partition (note the shuffled row labels).
X_train.head(n=5)

In [91]:
# X_train.index holds the row *labels* of `delays` that were assigned to the
# training partition, so the matching rows are selected by label with .loc.
# Positional .iloc returns the same rows only while `delays` keeps the default
# 0..n-1 index it gets from read_csv.
index = X_train.index
indices = index.to_list()
delays.loc[indices]

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight_Status
1215,21,DH,2110,LGA,229,1/18/2004,7684,IAD,0,7,18,N632BR,ontime
1476,7,US,659,LGA,214,1/21/2004,2160,DCA,0,3,21,N713UW,ontime
1897,9,US,858,LGA,214,1/27/2004,2164,DCA,0,2,27,N733UW,ontime
83,13,US,1258,LGA,214,01/02/2004,2172,DCA,0,5,2,N736UW,ontime
1172,15,DL,1509,JFK,213,1/17/2004,746,DCA,0,6,17,N909DL,delayed
...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,18,MQ,1825,JFK,213,1/14/2004,4784,DCA,0,3,14,N728MQ,ontime
905,7,RU,653,EWR,169,1/13/2004,2703,BWI,0,2,13,N12530,ontime
1096,7,DL,739,LGA,214,1/16/2004,1742,DCA,0,5,16,N224DA,ontime
235,16,DH,1700,JFK,228,01/05/2004,7810,IAD,0,1,5,N331UE,ontime


In [92]:
# Wrap the training class predictions in a one-column frame labelled 'delayed'.
y_train_pred = pd.DataFrame(y_train_pred).rename(columns={0: 'delayed'})
y_train_pred.head(n=5)

Unnamed: 0,delayed
0,0
1,0
2,0
3,0
4,0


In [93]:
# Label the two probability columns: column 0 belongs to class 0 (ontime),
# column 1 to class 1 (delayed), matching the 0/1 encoding of y.
pred_prob_train = pd.DataFrame(pred_prob_train).rename(
    columns={0: 'ontime_prob', 1: 'delay_prob'}
)
pred_prob_train

Unnamed: 0,ontime_prob,delay_prob
0,0.538788,0.461212
1,0.969970,0.030030
2,0.983705,0.016295
3,0.969531,0.030469
4,0.905152,0.094848
...,...,...
1315,0.824814,0.175186
1316,0.648865,0.351135
1317,0.939830,0.060170
1318,0.540771,0.459229


In [94]:
# Put the original training rows next to their predicted probabilities and class.
# Rows are looked up by label straight from X_train.index, so this cell does not
# depend on the `indices` variable (which a later cell rebinds to the validation
# rows) or on label == position. reset_index() moves the original row labels
# into an 'index' column and gives a 0..n-1 index that lines up with the
# prediction frames for the column-wise concat.
train_predictions = pd.concat(
    [delays.loc[X_train.index].reset_index(), pred_prob_train, y_train_pred],
    axis=1,
)

In [95]:
# X_valid.index holds the row *labels* of `delays` that were assigned to the
# validation partition, so the matching rows are selected by label with .loc.
# Positional .iloc returns the same rows only while `delays` keeps the default
# 0..n-1 index it gets from read_csv.
index = X_valid.index
indices = index.to_list()
delays.loc[indices]

Unnamed: 0,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight_Status
1276,16,DH,1641,JFK,228,1/19/2004,7810,IAD,0,1,19,N327UE,ontime
1446,6,DL,630,LGA,214,1/21/2004,1740,DCA,0,3,21,N242DL,ontime
335,6,MQ,558,JFK,213,01/06/2004,4760,DCA,0,2,6,N739MQ,ontime
1458,18,DL,1828,LGA,214,1/21/2004,1764,DCA,0,3,21,N242DL,ontime
2038,18,US,1758,LGA,214,1/29/2004,2182,DCA,0,4,29,N704UW,ontime
...,...,...,...,...,...,...,...,...,...,...,...,...,...
460,14,DH,1439,EWR,213,01/07/2004,7307,IAD,0,3,7,N324UE,ontime
2063,13,RU,1259,EWR,213,1/29/2004,2692,IAD,0,4,29,N17507,ontime
159,17,RU,1738,EWR,199,01/03/2004,2097,DCA,0,6,3,N14998,ontime
2027,7,US,657,LGA,214,1/29/2004,2160,DCA,0,4,29,N751UW,delayed


In [96]:
# Wrap the validation class predictions in a one-column frame labelled 'delayed'.
y_valid_pred = pd.DataFrame(y_valid_pred).rename(columns={0: 'delayed'})

In [97]:
# Label the two probability columns: column 0 belongs to class 0 (ontime),
# column 1 to class 1 (delayed), matching the 0/1 encoding of y.
pred_prob_valid = pd.DataFrame(pred_prob_valid).rename(
    columns={0: 'ontime_prob', 1: 'delay_prob'}
)

In [98]:
# Put the original validation rows next to their predicted probabilities and class.
# Rows are looked up by label straight from X_valid.index, so this cell does not
# depend on whichever partition the shared `indices` variable currently holds, or
# on label == position. reset_index() moves the original row labels into an
# 'index' column and gives a 0..n-1 index that lines up with the prediction
# frames for the column-wise concat.
valid_predictions = pd.concat(
    [delays.loc[X_valid.index].reset_index(), pred_prob_valid, y_valid_pred],
    axis=1,
)
valid_predictions.head()

Unnamed: 0,index,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight_Status,ontime_prob,delay_prob,delayed
0,1276,16,DH,1641,JFK,228,1/19/2004,7810,IAD,0,1,19,N327UE,ontime,0.540771,0.459229,0
1,1446,6,DL,630,LGA,214,1/21/2004,1740,DCA,0,3,21,N242DL,ontime,0.970798,0.029202,0
2,335,6,MQ,558,JFK,213,01/06/2004,4760,DCA,0,2,6,N739MQ,ontime,0.780133,0.219867,0
3,1458,18,DL,1828,LGA,214,1/21/2004,1764,DCA,0,3,21,N242DL,ontime,0.97084,0.02916,0
4,2038,18,US,1758,LGA,214,1/29/2004,2182,DCA,0,4,29,N704UW,ontime,0.982379,0.017621,0


In [99]:
# Training flights matching the worked example:
# carrier DL, day-of-week 7, departure bucket 10, DCA -> LGA.
train_match = (
    (train_predictions['CARRIER'] == 'DL')
    & (train_predictions['DAY_WEEK'] == 7)
    & (train_predictions['CRS_DEP_TIME'] == 10)
    & (train_predictions['DEST'] == 'LGA')
    & (train_predictions['ORIGIN'] == 'DCA')
)
train_predictions[train_match]

Unnamed: 0,index,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight_Status,ontime_prob,delay_prob,delayed
277,180,10,DL,1028,LGA,214,01/04/2004,1748,DCA,0,7,4,N221DL,ontime,0.941701,0.058299,0
1058,1748,10,DL,1026,LGA,214,1/25/2004,1748,DCA,0,7,25,N225DL,ontime,0.941701,0.058299,0
1119,706,10,DL,1028,LGA,214,01/11/2004,1748,DCA,0,7,11,N242DL,ontime,0.941701,0.058299,0


In [100]:
# Validation flights matching the worked example:
# carrier DL, day-of-week 7, departure bucket 10, DCA -> LGA.
valid_match = (
    (valid_predictions['CARRIER'] == 'DL')
    & (valid_predictions['DAY_WEEK'] == 7)
    & (valid_predictions['CRS_DEP_TIME'] == 10)
    & (valid_predictions['DEST'] == 'LGA')
    & (valid_predictions['ORIGIN'] == 'DCA')
)
valid_predictions[valid_match]

Unnamed: 0,index,CRS_DEP_TIME,CARRIER,DEP_TIME,DEST,DISTANCE,FL_DATE,FL_NUM,ORIGIN,Weather,DAY_WEEK,DAY_OF_MONTH,TAIL_NUM,Flight_Status,ontime_prob,delay_prob,delayed
473,1225,10,DL,1029,LGA,214,1/18/2004,1748,DCA,0,7,18,N242DL,ontime,0.941701,0.058299,0


## Create a Probability table
First construct a frequency table and then convert it to a probability table.

* create a frequency table for all the predictors
* create a proportion table for all the predictors
* calculate the probability for the following scenarios:

1.  P(delayed | Carrier = DL, Day_Week = 7, Dep_Time = 10, Dest = LGA, Origin = DCA)
2. P(ontime | Carrier = DL, Day_Week = 7, Dep_Time = 10, Dest = LGA, Origin = DCA)
3. Total P(delayed|....)
4. Total P(ontime|.....)



## Comparison of the manually calculated probabilities vs. the NB model prediction

In [103]:
# Split the full frame with the same test_size and random_state as the X/y
# split above, so the manual tables are built from a training partition of the
# same size and seed.
train_df, valid_df = train_test_split(delays, test_size=0.4, random_state=1)

for predictor in predictors:
    # Frequency table: number of training flights per (status, predictor level).
    # fill_value=0 records never-observed combinations as a count of 0 rather
    # than NaN, so a single empty cell cannot turn the whole row into NaN
    # when it is normalised.
    df = train_df[['Flight_Status', predictor]]
    freq_table = df.pivot_table(
        index='Flight_Status', columns=predictor, aggfunc=len, fill_value=0
    )

    # Proportion table: divide each row by its total, giving the share of each
    # predictor level within a flight status, i.e. P(level | status).
    prop_table = freq_table.div(freq_table.sum(axis=1), axis=0)
    print(prop_table)
    print()

DAY_WEEK              1         2         3  ...         5         6         7
Flight_Status                                ...                              
delayed        0.191571  0.149425  0.114943  ...  0.187739  0.068966  0.160920
ontime         0.124646  0.141643  0.144476  ...  0.169027  0.135977  0.104816

[2 rows x 7 columns]

CARRIER              CO        DH        DL  ...        RU        UA        US
Flight_Status                                ...                              
delayed        0.057471  0.314176  0.095785  ...  0.218391  0.015326  0.068966
ontime         0.034939  0.229462  0.203966  ...  0.169027  0.016997  0.218130

[2 rows x 8 columns]

ORIGIN              BWI       DCA       IAD
Flight_Status                              
delayed        0.080460  0.521073  0.398467
ontime         0.060434  0.647781  0.291785

DEST                EWR       JFK       LGA
Flight_Status                              
delayed        0.379310  0.199234  0.421456
ontime       

## Confusion Matrix

* build a confusion matrix on the training set
* build a confusion matrix on the validation set

In [104]:
# Confusion matrix and accuracy for the validation partition; `classes` supplies
# the row/column labels in 0/1 order (ontime, delayed).
# NOTE(review): the section outline also lists a training-set confusion matrix,
# but only the validation set is summarized here.
classificationSummary(y_valid,y_valid_pred, class_names=classes)

Confusion Matrix (Accuracy 0.7866)

        Prediction
 Actual  ontime delayed
 ontime     667      47
delayed     141      26
