# Setup

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import boxcox
import matplotlib.pyplot as plt

import seaborn as sns
train = pd.read_csv('../input/nyc-taxi-trip-duration/train.zip',parse_dates=['pickup_datetime','dropoff_datetime'])
test = pd.read_csv('../input/nyc-taxi-trip-duration/test.zip',parse_dates=['pickup_datetime'])

train['trip_duration'],_ = boxcox(train['trip_duration'])
train = train[train['trip_duration'] !=0].reset_index(drop=True)

y = train.trip_duration

train = train.drop(['trip_duration','vendor_id','store_and_fwd_flag'],axis=1)
test = test.drop(['vendor_id','store_and_fwd_flag'],axis=1)

print ('SETUP COMPLETE')

SETUP COMPLETE


In [2]:
train.head()

Unnamed: 0,id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
0,id2875421,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602
1,id2377394,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152
2,id3858529,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087
3,id3504673,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718
4,id2181028,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252


# Feature Engineering

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458611 entries, 0 to 1458610
Data columns (total 8 columns):
 #   Column             Non-Null Count    Dtype         
---  ------             --------------    -----         
 0   id                 1458611 non-null  object        
 1   pickup_datetime    1458611 non-null  datetime64[ns]
 2   dropoff_datetime   1458611 non-null  datetime64[ns]
 3   passenger_count    1458611 non-null  int64         
 4   pickup_longitude   1458611 non-null  float64       
 5   pickup_latitude    1458611 non-null  float64       
 6   dropoff_longitude  1458611 non-null  float64       
 7   dropoff_latitude   1458611 non-null  float64       
dtypes: datetime64[ns](2), float64(4), int64(1), object(1)
memory usage: 89.0+ MB


## Creating Distance feature

In [4]:
train['x'] = (train.pickup_longitude - train.dropoff_longitude)**2
train['y'] = (train.pickup_latitude - train.dropoff_latitude)**2
train['distance_sqrt'] = train.x + train.y
train['distance'] = np.sqrt(train.distance_sqrt) * 1000

test['x'] = (test.pickup_longitude - test.dropoff_longitude)**2
test['y'] = (test.pickup_latitude - test.dropoff_latitude)**2
test['distance_sqrt'] = test.x + test.y
test['distance'] = np.sqrt(test.distance_sqrt) *1000

train = train.drop(['x','y','distance_sqrt'],axis=1)
test = test.drop(['x','y','distance_sqrt'],axis=1)
print(len(train))

1458611


## Creating Month, Day, Hour features

In [5]:
train = train.assign(pmonth=train.pickup_datetime.dt.month, pday = train.pickup_datetime.dt.day, phour = train.pickup_datetime.dt.hour)
test = test.assign(pmonth=test.pickup_datetime.dt.month, pday = test.pickup_datetime.dt.day, phour = test.pickup_datetime.dt.hour)
print(len(train))

1458611


## Dropping pickup and dropoff time features

In [6]:
X_train = train.drop(['pickup_datetime','dropoff_datetime','id'],axis=1)
X_test = test.drop(['pickup_datetime','id'],axis=1)
print(len(train))
print(len(X_train))

1458611
1458611


In [7]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458611 entries, 0 to 1458610
Data columns (total 9 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   passenger_count    1458611 non-null  int64  
 1   pickup_longitude   1458611 non-null  float64
 2   pickup_latitude    1458611 non-null  float64
 3   dropoff_longitude  1458611 non-null  float64
 4   dropoff_latitude   1458611 non-null  float64
 5   distance           1458611 non-null  float64
 6   pmonth             1458611 non-null  int64  
 7   pday               1458611 non-null  int64  
 8   phour              1458611 non-null  int64  
dtypes: float64(5), int64(4)
memory usage: 100.2 MB


## Creating Pickup Boroughs for train dataset

In [8]:
train_pickup_borough = []
for j,k in zip(X_train.pickup_longitude,X_train.pickup_latitude):
    if ((j>=-74.040269) and (j<=-73.865036)) and ((k>=40.574031) and (k<=40.736728)):
        train_pickup_borough.append('Brooklyn')
    elif ((j>=-74.245856) and (j<=-73.057562)) and ((k>=40.502863) and (k<=40.647234)):
        train_pickup_borough.append('Staten_island')
    elif ((j>=-74.011645) and (j<=-73.913772)) and ((k>=40.703384) and (k<=40.876058)):
        train_pickup_borough.append('Manhattan')
    elif ((j>=-73.931573) and (j<=-73.781473)) and ((k>=40.797766) and (k<=40.912460)):
        train_pickup_borough.append('Bronx')
    else: train_pickup_borough.append('Queens')
Borough_ptrain = pd.DataFrame(train_pickup_borough,columns=['pickup_borough'])
X_train = pd.concat([X_train,Borough_ptrain], axis=1)

## Creating Pickup Borough for test dataset

In [9]:
test_pickup_borough = []
for g,h in zip(X_test.pickup_longitude,X_test.pickup_latitude):
    if ((g>=-74.040269) and (g<=-73.865036)) and ((h>=40.574031) and (h<=40.736728)):
        test_pickup_borough.append('Brooklyn')
    elif ((g>=-74.245856) and (g<=-73.057562)) and ((h>=40.502863) and (h<=40.647234)):
        test_pickup_borough.append('Staten_island')
    elif ((g>=-74.011645) and (g<=-73.913772)) and ((h>=40.703384) and (h<=40.876058)):
        test_pickup_borough.append('Manhattan')
    elif ((g>=-73.931573) and (g<=-73.781473)) and ((h>=40.797766) and (h<=40.912460)):
        test_pickup_borough.append('Bronx')
    else: test_pickup_borough.append('Queens')
Borough_ptest = pd.DataFrame(test_pickup_borough,columns=['pickup_borough'])
X_test = pd.concat([X_test,Borough_ptest],axis=1)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 10 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   passenger_count    625134 non-null  int64  
 1   pickup_longitude   625134 non-null  float64
 2   pickup_latitude    625134 non-null  float64
 3   dropoff_longitude  625134 non-null  float64
 4   dropoff_latitude   625134 non-null  float64
 5   distance           625134 non-null  float64
 6   pmonth             625134 non-null  int64  
 7   pday               625134 non-null  int64  
 8   phour              625134 non-null  int64  
 9   pickup_borough     625134 non-null  object 
dtypes: float64(5), int64(4), object(1)
memory usage: 47.7+ MB


## Creating Dropoff Borough for train dataset 

In [10]:
train_dropoff_borough = []
for d,f in zip(X_train.dropoff_longitude,X_train.dropoff_latitude):
    if ((d>=-74.040269) and (d<=-73.865036)) and ((f>=40.574031) and (f<=40.736728)):
        train_dropoff_borough.append('Brooklyn')
    elif ((d>=-74.245856) and (d<=-73.057562)) and ((f>=40.502863) and (f<=40.647234)):
        train_dropoff_borough.append('Staten_island')
    elif ((d>=-74.011645) and (d<=-73.913772)) and ((f>=40.703384) and (f<=40.876058)):
        train_dropoff_borough.append('Manhattan')
    elif ((d>=-73.931573) and (d<=-73.781473)) and ((f>=40.797766) and (f<=40.912460)):
        train_dropoff_borough.append('Bronx')
    else: train_dropoff_borough.append('Queens')
Borough_dtrain = pd.DataFrame(train_dropoff_borough,columns=['dropoff_borough'])
X_train = pd.concat([X_train,Borough_dtrain],axis=1)
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458611 entries, 0 to 1458610
Data columns (total 11 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   passenger_count    1458611 non-null  int64  
 1   pickup_longitude   1458611 non-null  float64
 2   pickup_latitude    1458611 non-null  float64
 3   dropoff_longitude  1458611 non-null  float64
 4   dropoff_latitude   1458611 non-null  float64
 5   distance           1458611 non-null  float64
 6   pmonth             1458611 non-null  int64  
 7   pday               1458611 non-null  int64  
 8   phour              1458611 non-null  int64  
 9   pickup_borough     1458611 non-null  object 
 10  dropoff_borough    1458611 non-null  object 
dtypes: float64(5), int64(4), object(2)
memory usage: 122.4+ MB


## Creating Dropoff Borough for test dataset

In [11]:
test_dropoff_borough = []
for a,s in zip(X_test.dropoff_longitude,X_test.dropoff_latitude):
    if ((a>=-74.040269) and (a<=-73.865036)) and ((s>=40.574031) and (s<=40.736728)):
        test_dropoff_borough.append('Brooklyn')
    elif ((a>=-74.245856) and (a<=-73.057562)) and ((s>=40.502863) and (s<=40.647234)):
        test_dropoff_borough.append('Staten_island')
    elif ((a>=-74.011645) and (a<=-73.913772)) and ((s>=40.703384) and (s<=40.876058)):
        test_dropoff_borough.append('Manhattan')
    elif ((a>=-73.931573) and (a<=-73.781473)) and ((s>=40.797766) and (s<=40.912460)):
        test_dropoff_borough.append('Bronx')
    else: test_dropoff_borough.append('Queens')
Borough_dtest = pd.DataFrame(test_dropoff_borough,columns=['dropoff_borough'])
X_test = pd.concat([X_test,Borough_dtest],axis=1)
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 11 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   passenger_count    625134 non-null  int64  
 1   pickup_longitude   625134 non-null  float64
 2   pickup_latitude    625134 non-null  float64
 3   dropoff_longitude  625134 non-null  float64
 4   dropoff_latitude   625134 non-null  float64
 5   distance           625134 non-null  float64
 6   pmonth             625134 non-null  int64  
 7   pday               625134 non-null  int64  
 8   phour              625134 non-null  int64  
 9   pickup_borough     625134 non-null  object 
 10  dropoff_borough    625134 non-null  object 
dtypes: float64(5), int64(4), object(2)
memory usage: 52.5+ MB


## Creating avg borough speed feature

In [12]:
X_train['speed'] = X_train['distance'] / y

In [13]:
from typing import List
def assign_borough_speed(df: pd.DataFrame) -> pd.DataFrame:
    borough_speed = []
    for colitter in df['dropoff_borough']:
        if colitter == 'Staten_island':
            borough_speed.append(20.179187)
        elif colitter == 'Bronx':
            borough_speed.append(14.042448)
        elif colitter == 'Queens':
            borough_speed.append(12.183943)
        elif colitter == 'Brooklin':
            borough_speed.append(4.583516)
        else: borough_speed.append(3.450198)
    
    borough_speed = pd.DataFrame(borough_speed, columns=['borough_speed'])
    return pd.concat([df, borough_speed], axis=1) 

In [14]:
X_train = assign_borough_speed(df=X_train)
X_test = assign_borough_speed(df=X_test)

## Transforming borough from characters to numbers

In [15]:
X_train.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance,pmonth,pday,phour,pickup_borough,dropoff_borough,speed,borough_speed
0,1,-73.982155,40.767937,-73.96463,40.765602,17.679539,3,14,17,Manhattan,Manhattan,2.431762,3.450198
1,1,-73.980415,40.738564,-73.999481,40.731152,20.455904,6,12,0,Manhattan,Brooklyn,2.621872,3.450198
2,1,-73.979027,40.763939,-74.005333,40.710087,59.933799,1,19,11,Manhattan,Brooklyn,6.297136,3.450198
3,1,-74.01004,40.719971,-74.012268,40.706718,13.438206,4,6,19,Brooklyn,Brooklyn,1.869498,3.450198
4,1,-73.973053,40.793209,-73.972923,40.78252,10.689569,3,26,13,Manhattan,Manhattan,1.483118,3.450198


In [16]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X_train['pickup_borough'] = encoder.fit_transform(X_train['pickup_borough'])
X_test['pickup_borough'] =  encoder.transform(X_test['pickup_borough'])
X_train['dropoff_borough'] = encoder.transform(X_train['dropoff_borough'])
X_test['dropoff_borough'] =  encoder.transform(X_test['dropoff_borough'])
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458611 entries, 0 to 1458610
Data columns (total 13 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   passenger_count    1458611 non-null  int64  
 1   pickup_longitude   1458611 non-null  float64
 2   pickup_latitude    1458611 non-null  float64
 3   dropoff_longitude  1458611 non-null  float64
 4   dropoff_latitude   1458611 non-null  float64
 5   distance           1458611 non-null  float64
 6   pmonth             1458611 non-null  int64  
 7   pday               1458611 non-null  int64  
 8   phour              1458611 non-null  int64  
 9   pickup_borough     1458611 non-null  int64  
 10  dropoff_borough    1458611 non-null  int64  
 11  speed              1458611 non-null  float64
 12  borough_speed      1458611 non-null  float64
dtypes: float64(7), int64(6)
memory usage: 144.7 MB


In [17]:
X_train.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance,pmonth,pday,phour,pickup_borough,dropoff_borough,speed,borough_speed
0,1,-73.982155,40.767937,-73.96463,40.765602,17.679539,3,14,17,2,2,2.431762,3.450198
1,1,-73.980415,40.738564,-73.999481,40.731152,20.455904,6,12,0,2,1,2.621872,3.450198
2,1,-73.979027,40.763939,-74.005333,40.710087,59.933799,1,19,11,2,1,6.297136,3.450198
3,1,-74.01004,40.719971,-74.012268,40.706718,13.438206,4,6,19,1,1,1.869498,3.450198
4,1,-73.973053,40.793209,-73.972923,40.78252,10.689569,3,26,13,2,2,1.483118,3.450198


In [18]:
X_train = X_train.drop(['speed'],axis=1)
X_train.head()

Unnamed: 0,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,distance,pmonth,pday,phour,pickup_borough,dropoff_borough,borough_speed
0,1,-73.982155,40.767937,-73.96463,40.765602,17.679539,3,14,17,2,2,3.450198
1,1,-73.980415,40.738564,-73.999481,40.731152,20.455904,6,12,0,2,1,3.450198
2,1,-73.979027,40.763939,-74.005333,40.710087,59.933799,1,19,11,2,1,3.450198
3,1,-74.01004,40.719971,-74.012268,40.706718,13.438206,4,6,19,1,1,3.450198
4,1,-73.973053,40.793209,-73.972923,40.78252,10.689569,3,26,13,2,2,3.450198


# Model

In [19]:
from xgboost import XGBRegressor
model = XGBRegressor()
model.fit(X_train,y)
preds = model.predict(X_test)

In [20]:
from scipy.special import inv_boxcox
preds = inv_boxcox(preds,_)

In [21]:
output = pd.DataFrame({'Id':test.id,'trip_duration':preds})
output.to_csv('submission.csv', index=False)
output.head()

Unnamed: 0,Id,trip_duration
0,id3004672,930.96582
1,id3505355,483.621277
2,id1217141,499.025177
3,id2150126,956.119934
4,id1598245,284.216675
