### Import libraries


In [42]:
import pandas as pd
import numpy as np 
import seaborn as sns
%matplotlib inline

### Load data

In [43]:
taxi_data = pd.read_csv('nyc_taxi_trip_duration.csv')

### Data wrangling

In [44]:
#check head of  data to confirm we have the correct data
taxi_data.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id1080784,2,2016-02-29 16:40:21,2016-02-29 16:47:01,1,-73.953918,40.778873,-73.963875,40.771164,N,400
1,id0889885,1,2016-03-11 23:35:37,2016-03-11 23:53:57,2,-73.988312,40.731743,-73.994751,40.694931,N,1100
2,id0857912,2,2016-02-21 17:59:33,2016-02-21 18:26:48,2,-73.997314,40.721458,-73.948029,40.774918,N,1635
3,id3744273,2,2016-01-05 09:44:31,2016-01-05 10:03:32,6,-73.96167,40.75972,-73.956779,40.780628,N,1141
4,id0232939,1,2016-02-17 06:42:23,2016-02-17 06:56:31,1,-74.01712,40.708469,-73.988182,40.740631,N,848


In [45]:
#column types
taxi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 729322 entries, 0 to 729321
Data columns (total 11 columns):
id                    729322 non-null object
vendor_id             729322 non-null int64
pickup_datetime       729322 non-null object
dropoff_datetime      729322 non-null object
passenger_count       729322 non-null int64
pickup_longitude      729322 non-null float64
pickup_latitude       729322 non-null float64
dropoff_longitude     729322 non-null float64
dropoff_latitude      729322 non-null float64
store_and_fwd_flag    729322 non-null object
trip_duration         729322 non-null int64
dtypes: float64(4), int64(3), object(4)
memory usage: 61.2+ MB


We have to convert the date columns to datetime format so that python knows they are dates

In [46]:
#convert date columns to date format
taxi_data['pickup_datetime'] = pd.to_datetime(taxi_data['pickup_datetime'])
taxi_data['dropoff_datetime'] = pd.to_datetime(taxi_data['dropoff_datetime'])

In [47]:
#data dimension 
print('The numnber of rows and columnsis {} and {} respectively'.format(taxi_data.shape[0],taxi_data.shape[1]))

The numnber of rows and columnsis 729322 and 11 respectively


In [48]:
#desciptive statistics on numerical columns
taxi_data.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0,729322.0
mean,1.535403,1.662055,-73.973513,40.750919,-73.973422,40.751775,952.2291
std,0.498745,1.312446,0.069754,0.033594,0.069588,0.036037,3864.626
min,1.0,0.0,-121.933342,34.712234,-121.933304,32.181141,1.0
25%,1.0,1.0,-73.991859,40.737335,-73.991318,40.735931,397.0
50%,2.0,1.0,-73.981758,40.75407,-73.979759,40.754509,663.0
75%,2.0,2.0,-73.967361,40.768314,-73.963036,40.769741,1075.0
max,2.0,9.0,-65.897385,51.881084,-65.897385,43.921028,1939736.0


We can see from the above table that there are some trips that took a duration of 1 second. We go ahead and delete trips that took less than 60 seconds since it doesn't make sense to have a trip that short.

In [49]:
#Remove outliers---trips less than 60 seconds
taxi_data = taxi_data[taxi_data['trip_duration']>60]

We can also check the top most trips to see if there are no extreme values.

In [50]:
#check longest trips
taxi_data['trip_duration'].sort_values(ascending=False).head()

21813     1939736
259437      86391
119185      86387
177225      86378
496391      86377
Name: trip_duration, dtype: int64

The longest trip is more than 20 times longer than the closest trip to it. This is clearly an outlier and hence we get rid of it.

In [51]:
#Remove outliers---longest trip
taxi_data = taxi_data[taxi_data['trip_duration']<1939736]

In [52]:
#check new shape after removing outliers
print('The numnber of rows and columns after removing outliers is {} and {} respectively'.format(taxi_data.shape[0],taxi_data.shape[1]))

The numnber of rows and columns after removing outliers is 724959 and 11 respectively


In [53]:
##check if we have missing values
taxi_data.isnull().any()

id                    False
vendor_id             False
pickup_datetime       False
dropoff_datetime      False
passenger_count       False
pickup_longitude      False
pickup_latitude       False
dropoff_longitude     False
dropoff_latitude      False
store_and_fwd_flag    False
trip_duration         False
dtype: bool

The are no missing values in any any column of the dataset

### Feature Engineering

In [54]:
## Create hour, day and  month variables
taxi_data['pickup_hour'] = taxi_data['pickup_datetime'].dt.hour
taxi_data['dropoff_hour'] = taxi_data['dropoff_datetime'].dt.hour
taxi_data['pickup_day'] = taxi_data['pickup_datetime'].dt.day_name()
taxi_data['pickup_month'] = taxi_data['pickup_datetime'].dt.month
taxi_data['dropoff_day'] = taxi_data['dropoff_datetime'].dt.day_name()
taxi_data['dropoff_month'] = taxi_data['dropoff_datetime'].dt.month

There are variables that might not add value to the model and hence we need to drop them to avois making our model slower for no reason.

In [55]:
#specify columns to drop
drop_cols = ['id','pickup_datetime','dropoff_datetime']

In [56]:
#create new data frame without the unwanted columns
taxi_data = taxi_data.drop(drop_cols,axis=1)

Some variables are categorical and the model doesn't work with categorical features so we have to convert them. We use one-hot encoding to conver them.

In [57]:
# specify categorical variables
cat_cols = ['store_and_fwd_flag','pickup_day','dropoff_day']

In [58]:
# one-hot encode categorical features
taxi_data = pd.get_dummies(taxi_data,columns=cat_cols)

In [62]:
#separate x_features from the target feature
y = taxi_data['trip_duration']
x = taxi_data.drop('trip_duration',axis=1)

In [64]:
#seperate training and validation dataset
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val =train_test_split(x,y,test_size = 0.2, random_state=42)

### Model building

#### Linear regressor

In [68]:
from sklearn.linear_model import LinearRegression

#create instance of linear regression
lnr = LinearRegression()

In [76]:
#fit
lnr.fit(x_train,y_train)

#predict on the training set
predictions_train  = lnr.predict(x_train)

#predictions on the validation set
predictions_val  = lnr.predict(x_val)

#### Evaluation

In [77]:
from sklearn.metrics import mean_squared_error

RMSE for training set is 3106.546525567574


In [78]:
print('RMSE for training set is {}'.format(np.sqrt(mean_squared_error(y_train, predictions_train))))

RMSE for training set is 3106.546525567574


In [79]:
print('RMSE for validation set is {}'.format(np.sqrt(mean_squared_error(y_val, predictions_val))))

RMSE for validation set is 3209.1648562211176
