# Building Basic predictive models over the NYC Taxi Trip Dataset

Targets: 
    Choosing most suitable evaluation metrics.
    Benchmark model.
    K-Nearest Neighbours model.
    Linear model, interpret the variable coefficients of the model.
    Decision Tree model, interpret the variable importance.
    Plot Bar plots.

In [9]:
#Importing Required libraries.

import numpy as np
import pandas as pd
from datetime import timedelta
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [10]:
#Reading data with pandas and giving vaiable name tt
tt = pd.read_csv('nyc_taxi_trip_duration.csv')
tt.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


In [11]:
#Rows and columns
tt.shape

(729322, 11)

We have 729322 rows and 11 columns.

In [12]:
#Columns and values
tt.columns
tt.iloc[1,:]

id                              id0889885
vendor_id                               1
pickup_datetime       2016-03-11 23:35:37
dropoff_datetime      2016-03-11 23:53:57
passenger_count                         2
pickup_longitude               -73.988312
pickup_latitude                 40.731743
dropoff_longitude              -73.994751
dropoff_latitude                40.694931
store_and_fwd_flag                      N
trip_duration                        1100
Name: 1, dtype: object

In [None]:
#Checking missing values. 
np.sum(pd.isnull(tt))

# The target variable is continous so I'm using RMSE (Root Mean Squared Error) in Evaluation metric.

# Benchmark Model

In [14]:
#Creating column trip duration hour with trip duration column values.
tt['trip_duration_hour'] = tt['trip_duration'].apply(lambda x: x/3600)

In [15]:
#Creating mean column for the duration hour column.
tt["trip_duration_hour_mean"]=tt["trip_duration_hour"].mean()
tt["trip_duration_hour_mean"].head()

0    0.264508
1    0.264508
2    0.264508
3    0.264508
4    0.264508
Name: trip_duration_hour_mean, dtype: float64

In [16]:
tt.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,trip_duration_hour,trip_duration_hour_mean
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400,0.111111,0.264508
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100,0.305556,0.264508
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635,0.454167,0.264508
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141,0.316944,0.264508
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848,0.235556,0.264508


Need to analyze the trip duration mean with respect to mean of pickup time of the day, dropoff time of the day, passener count and store fwd flag columns

#To have a exact pick time of the day and drop off time of the day we need to extract the values given from the columns pickup_datetime and dropoff_datetime.

In [17]:
#changing the format of the datetime columns
tt['pickup_datetime'] = pd.to_datetime(tt['pickup_datetime'], format = '%Y-%m-%d %H:%M:%S')
tt['dropoff_datetime'] = pd.to_datetime(tt['dropoff_datetime'], format = '%Y-%m-%d %H:%M:%S')

In [18]:
#Extract every values and store seperately in different columns.

tt['drop_hour'] = tt['dropoff_datetime'].dt.hour
tt['drop_weekday'] = tt['dropoff_datetime'].dt.weekday
tt['drop_day'] = tt['dropoff_datetime'].dt.day
tt['drop_month'] = tt['dropoff_datetime'].dt.month
tt['drop_year'] = tt['dropoff_datetime'].dt.year
tt['drop_date'] = tt['dropoff_datetime'].dt.date


tt['pick_hour'] = tt['pickup_datetime'].dt.hour
tt['pick_weekday'] = tt['pickup_datetime'].dt.weekday
tt['pick_day'] = tt['pickup_datetime'].dt.day
tt['pick_month'] = tt['pickup_datetime'].dt.month
tt['pick_year'] = tt['pickup_datetime'].dt.year
tt['pick_date'] = tt['pickup_datetime'].dt.date

In [19]:
tt.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,drop_day,drop_month,drop_year,drop_date,pick_hour,pick_weekday,pick_day,pick_month,pick_year,pick_date
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,...,29,2,2016,2016-02-29,16,0,29,2,2016,2016-02-29
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,...,11,3,2016,2016-03-11,23,4,11,3,2016,2016-03-11
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,...,21,2,2016,2016-02-21,17,6,21,2,2016,2016-02-21
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,...,5,1,2016,2016-01-05,9,1,5,1,2016,2016-01-05
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,...,17,2,2016,2016-02-17,6,2,17,2,2016,2016-02-17


In [20]:
#importing the shuffle library to create train and test datasets.
from sklearn.utils import shuffle
# Shuffling the Dataset
tt = shuffle(tt, random_state = 42)
#creating 4 divisions
div = int(tt.shape[0]/4)
# 3 sets to train and 1 set to test.
train = tt.loc[:3*div+1,:]
test = tt.loc[3*div+1:]

In [21]:
train.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,drop_day,drop_month,drop_year,drop_date,pick_hour,pick_weekday,pick_day,pick_month,pick_year,pick_date
469114,id2380741,2,2016-05-21 10:40:14,2016-05-21 10:51:11,1,-73.981796,40.762035,-73.972267,40.781265,N,...,21,5,2016,2016-05-21,10,5,21,5,2016,2016-05-21
694852,id3946961,2,2016-01-08 18:49:27,2016-01-08 18:52:42,5,-73.980965,40.747677,-73.982704,40.741161,N,...,8,1,2016,2016-01-08,18,4,8,1,2016,2016-01-08
696324,id0833913,1,2016-05-22 00:54:10,2016-05-22 01:08:10,1,-73.951065,40.782722,-73.867691,40.833664,N,...,22,5,2016,2016-05-22,0,6,22,5,2016,2016-05-22
356496,id1336849,1,2016-06-11 10:32:12,2016-06-11 10:38:50,1,-73.987625,40.762791,-73.973518,40.762909,N,...,11,6,2016,2016-06-11,10,5,11,6,2016,2016-06-11
645318,id1610858,1,2016-04-03 10:45:51,2016-04-03 10:57:13,3,-73.964333,40.792503,-73.988609,40.758369,N,...,3,4,2016,2016-04-03,10,6,3,4,2016,2016-04-03


In [22]:
train.shape

(589451, 25)

In [23]:
test.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,drop_day,drop_month,drop_year,drop_date,pick_hour,pick_weekday,pick_day,pick_month,pick_year,pick_date
546991,id2240736,1,2016-05-25 07:59:16,2016-05-25 08:05:02,1,-73.991364,40.73259,-74.000526,40.742283,N,...,25,5,2016,2016-05-25,7,2,25,5,2016,2016-05-25
43126,id1423404,1,2016-01-18 12:17:13,2016-01-18 12:21:13,2,-73.966225,40.768059,-73.967606,40.763073,N,...,18,1,2016,2016-01-18,12,0,18,1,2016,2016-01-18
641450,id1317268,2,2016-03-02 18:39:01,2016-03-02 18:50:12,1,-73.994926,40.766018,-74.004219,40.742523,N,...,2,3,2016,2016-03-02,18,2,2,3,2016,2016-03-02
611380,id3335546,1,2016-04-06 19:17:20,2016-04-06 19:18:03,1,-73.974388,40.793781,-73.976006,40.792339,N,...,6,4,2016,2016-04-06,19,2,6,4,2016,2016-04-06
62690,id2174190,2,2016-06-21 18:35:31,2016-06-21 18:40:56,3,-73.96344,40.798557,-73.979736,40.777878,N,...,21,6,2016,2016-06-21,18,1,21,6,2016,2016-06-21


In [24]:
test.shape

(139872, 25)

In [25]:
# RMSE (Root Mean Squared Error)
# Import mean square error as mse from sklearn and square root from math
from sklearn.metrics import mean_squared_error as mse
from math import sqrt

trip_dur_mean_error = sqrt((mse(test['trip_duration_hour'] , test['trip_duration_hour_mean'])))
trip_dur_mean_error

0.8904067655425832

In [26]:
#Pickup time of the day.
pick_up = pd.pivot_table(train, values='trip_duration_hour', index = ['pick_hour'], aggfunc=np.mean)
pick_up

Unnamed: 0_level_0,trip_duration_hour
pick_hour,Unnamed: 1_level_1
0,0.272852
1,0.254333
2,0.237052
3,0.24626
4,0.247365
5,0.230513
6,0.20176
7,0.225605
8,0.260481
9,0.258606


In [27]:
# Initialize new column to zero
test['p_up'] = 0

# Assign mean value corresponding to unique entry
for i in train['pick_hour'].unique():
    test['p_up'][test['pick_hour'] == str(i)] = train['trip_duration_hour'][train['pick_hour'] == str(i)].mean()

In [28]:
#Calculating RMSE
pickup_error = sqrt(mse(test['trip_duration_hour'] , test['p_up'] ))
pickup_error

0.9290781075032716

In [29]:
#Dropoff time of the day
drop_off = pd.pivot_table(train, values='trip_duration_hour', index = ['drop_hour'], aggfunc=np.mean)
drop_off

Unnamed: 0_level_0,trip_duration_hour
drop_hour,Unnamed: 1_level_1
0,0.292695
1,0.25573
2,0.231592
3,0.258366
4,0.229382
5,0.229485
6,0.190441
7,0.197141
8,0.248227
9,0.261141


In [30]:
# Initialize new column to zero
test['d_off'] = 0

# Assign mean value to unique entry
for i in train['drop_hour'].unique():
    test['d_off'][test['drop_hour'] == str(i)] = train['trip_duration_hour'][train['drop_hour'] == str(i)].mean()

In [31]:
#Calculating RMSE
dropoff_error = sqrt(mse(test['trip_duration_hour'] , test['d_off'] ))
dropoff_error

0.9290781075032716

In [32]:
#Passenger Count
p_count = pd.pivot_table(train, values='trip_duration_hour', index = ["passenger_count"], aggfunc=np.mean)
p_count

Unnamed: 0_level_0,trip_duration_hour
passenger_count,Unnamed: 1_level_1
0,0.092981
1,0.255343
2,0.277822
3,0.287332
4,0.285759
5,0.299641
6,0.300193


In [33]:
# Initialize new column to zero
test['pass_count'] = 0

# Assign the mean value to unique entry
for i in train['passenger_count'].unique(): 
    test['pass_count'][test['passenger_count'] == str(i)] = train['trip_duration_hour'][train['passenger_count'] == str(i)].mean()

In [34]:
passenger_error = sqrt(mse(test['trip_duration_hour'] , test['pass_count'] ))
passenger_error

0.9290781075032716

In [35]:
#Store and Fwd
store_and_fwd = pd.pivot_table(train, values='trip_duration_hour', index = ["store_and_fwd_flag"], aggfunc=np.mean)
store_and_fwd

Unnamed: 0_level_0,trip_duration_hour
store_and_fwd_flag,Unnamed: 1_level_1
N,0.264109
Y,0.304058


In [42]:
# initializing new column to zero
test['store_fwd'] = 0

# Assign the mean value corresponding to unique entry
for i in train['store_and_fwd_flag'].unique():
    test['store_fwd'][test['store_and_fwd_flag'] == str(i)] = train['trip_duration_hour'][train['store_and_fwd_flag'] == str(i)].mean()

In [43]:
str_fwd_error = sqrt(mse(test['trip_duration_hour'], test['store_fwd'] ))
str_fwd_error

0.8904020727484228

In [44]:
#vendor id
v_id = pd.pivot_table(train, values='trip_duration_hour', index = ["vendor_id"], aggfunc=np.mean)
v_id

Unnamed: 0_level_0,trip_duration_hour
vendor_id,Unnamed: 1_level_1
1,0.232826
2,0.291652


In [45]:
# initializing new column to zero
test['ven_id'] = 0

# Assign the mean value corresponding to unique entry
for i in train['vendor_id'].unique():
    test['ven_id'][test['vendor_id'] == str(i)] = train['trip_duration_hour'][train['vendor_id'] == str(i)].mean()

In [46]:
v_error = sqrt(mse(test['vendor_id'] , test['trip_duration_hour'] ))
v_error

1.6191911214509642

In [48]:
combo = pd.pivot_table(train, values = 'trip_duration_hour', index = ['passenger_count','pick_hour','drop_hour'], aggfunc = np.mean)
combo

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,trip_duration_hour
passenger_count,pick_hour,drop_hour,Unnamed: 3_level_1
0,0,0,0.011458
0,1,1,0.004028
0,2,2,0.311389
0,3,3,0.018889
0,4,4,0.001667
...,...,...,...
6,21,22,0.317244
6,22,22,0.212188
6,22,23,0.338074
6,23,0,0.341636


In [49]:
test['Super_mean'] = 0

s2 = 'pick_hour'
s1 = 'drop_hour'


for i in test[s1].unique():
  
  for j in test[s2].unique():
    test['Super_mean'][(test[s1] == i) & (test[s2]==str(j))] = train['trip_duration'][(train[s1] == i) & (train[s2]==str(j))].mean()

In [51]:
super_mean_error = sqrt(mse(test['trip_duration'] , test['Super_mean'] ))
super_mean_error

3344.6811870117776

The RMSE error of Pick hour is 0.9290781075032716

The RMSE error of drop hour is 0.9290781075032716

The RMSE error of Passenger is 0.9290781075032716

The RMSE error of Store and Fwd flag is 0.8904020727484228

The RMSE error of vendor_id is 1.6191911214509642

The RMSE of vendor_id is little high camparing to other errors.

The pickup, dropoff and passenger errors are equal to each other.

The store and FWd flag error is little bit less then other errors.