### Import Libraires


In [18]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

### Read Data

In [19]:
path = '../data/nyc_taxi_trip_duration/'
train = pd.read_csv(path + 'train.csv')
test = pd.read_csv(path + 'test.csv')

In [20]:
display(train.tail(2))
display(test.tail(2))

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
1458642,id2714485,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,N,373
1458643,id1209952,1,2016-04-05 14:44:25,2016-04-05 14:47:43,1,-73.979538,40.78175,-73.972809,40.790585,N,198


Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
625132,id1384355,1,2016-01-01 00:00:28,1,-73.976501,40.733562,-73.854263,40.891788,N
625133,id0621643,2,2016-01-01 00:00:22,2,-73.98185,40.716881,-73.96933,40.769379,N


In [21]:
display(train.info())
display(test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 11 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1458644 non-null  object 
 1   vendor_id           1458644 non-null  int64  
 2   pickup_datetime     1458644 non-null  object 
 3   dropoff_datetime    1458644 non-null  object 
 4   passenger_count     1458644 non-null  int64  
 5   pickup_longitude    1458644 non-null  float64
 6   pickup_latitude     1458644 non-null  float64
 7   dropoff_longitude   1458644 non-null  float64
 8   dropoff_latitude    1458644 non-null  float64
 9   store_and_fwd_flag  1458644 non-null  object 
 10  trip_duration       1458644 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 122.4+ MB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  625134 non-null  object 
 1   vendor_id           625134 non-null  int64  
 2   pickup_datetime     625134 non-null  object 
 3   passenger_count     625134 non-null  int64  
 4   pickup_longitude    625134 non-null  float64
 5   pickup_latitude     625134 non-null  float64
 6   dropoff_longitude   625134 non-null  float64
 7   dropoff_latitude    625134 non-null  float64
 8   store_and_fwd_flag  625134 non-null  object 
dtypes: float64(4), int64(2), object(3)
memory usage: 42.9+ MB


None

In [22]:
print(train.columns.difference(test.columns))

Index(['dropoff_datetime', 'trip_duration'], dtype='object')


### Preprocessing 

##### transform to date dtype

In [23]:
train['pickup_datetime'] = pd.to_datetime(train['pickup_datetime'])
test['pickup_datetime'] = pd.to_datetime(test['pickup_datetime'])

##### Label Encoding

In [24]:
le = LabelEncoder()
train['store_and_fwd_flag'] = le.fit_transform(train['store_and_fwd_flag'])
test['store_and_fwd_flag'] = le.transform(test['store_and_fwd_flag'])

##### Create New Feature

In [25]:
# date features
for df in [train, test]:
    df['month'] = df['pickup_datetime'].dt.month
    df['day'] = df['pickup_datetime'].dt.day
    df['weekday'] = df['pickup_datetime'].dt.weekday
    df['hour'] = df['pickup_datetime'].dt.hour
    df['minute'] = df['pickup_datetime'].dt.minute


In [26]:
# distance features
for df in [train, test]:
    df['dist_long'] = df['pickup_longitude'] - df['dropoff_longitude']
    df['dist_lat'] = df['pickup_latitude'] - df['dropoff_latitude']
    df['dist'] = np.sqrt(np.square(df['dist_long'])) + np.square(df['dist_lat'])

In [27]:
# spatial features: count and speed
for df in [train, test]:
    df['pickup_longitude_bin'] = np.round(df['pickup_longitude'], 2)
    df['pickup_latitude_bin'] = np.round(df['pickup_latitude'], 2)
    df['dropoff_longitude_bin'] = np.round(df['dropoff_longitude'], 2)
    df['dropoff_latitude_bin'] = np.round(df['dropoff_latitude'], 2)

In [28]:
# count features
a = pd.concat([train, test]).groupby(['pickup_longitude_bin', 'pickup_latitude_bin']).size().reset_index().\
rename(columns={0 : 'size'})
b = pd.concat([train, test]).groupby(['dropoff_longitude_bin', 'dropoff_latitude_bin'], as_index=False).size()

train = pd.merge(train, a, on=['pickup_longitude_bin', 'pickup_latitude_bin'], how='left')
train = pd.merge(train, b, on=['dropoff_longitude_bin', 'dropoff_latitude_bin'], how='left')

test = pd.merge(test, a, on=['pickup_longitude_bin', 'pickup_latitude_bin'], how='left')
test = pd.merge(test, b, on=['dropoff_longitude_bin', 'dropoff_latitude_bin'], how='left')



In [29]:
train.tail()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,month,day,weekday,hour,minute,dist_long,dist_lat,dist,pickup_longitude_bin,pickup_latitude_bin,dropoff_longitude_bin,dropoff_latitude_bin,size_x,size_y
1458639,id2376096,2,2016-04-08 13:31:04,2016-04-08 13:44:02,4,-73.982201,40.745522,-73.994911,40.74017,0,778,4,8,4,13,31,0.012711,0.005352,0.012739,-73.98,40.75,-73.99,40.74,104115,78992
1458640,id1049543,1,2016-01-10 07:35:15,2016-01-10 07:46:10,1,-74.000946,40.747379,-73.970184,40.796547,0,655,1,10,6,7,35,-0.030762,-0.049168,0.033179,-74.0,40.75,-73.97,40.8,34996,18480
1458641,id2304944,2,2016-04-22 06:57:41,2016-04-22 07:10:25,1,-73.959129,40.768799,-74.004433,40.707371,0,764,4,22,4,6,57,0.045303,0.061428,0.049077,-73.96,40.77,-74.0,40.71,73643,8258
1458642,id2714485,1,2016-01-05 15:56:26,2016-01-05 16:02:39,1,-73.982079,40.749062,-73.974632,40.757107,0,373,1,5,1,15,56,-0.007446,-0.008045,0.007511,-73.98,40.75,-73.97,40.76,104115,110116
1458643,id1209952,1,2016-04-05 14:44:25,2016-04-05 14:47:43,1,-73.979538,40.78175,-73.972809,40.790585,0,198,4,5,1,14,44,-0.006729,-0.008835,0.006807,-73.98,40.78,-73.97,40.79,51802,35290


In [30]:
# speed features
train['speed'] = 100000 * train['dist'] / train['trip_duration']

a = train[['speed', 'pickup_longitude_bin', 'pickup_latitude_bin']].groupby(['pickup_longitude_bin', 'pickup_latitude_bin']).mean().reset_index()
a = a.rename(columns={'speed' : 'ave_speed'})
b = train[['speed', 'dropoff_longitude_bin', 'dropoff_latitude_bin']].groupby(['dropoff_longitude_bin', 'dropoff_latitude_bin']).mean().reset_index()
b = b.rename(columns={'speed' : 'ave_speed'})

train = pd.merge(train, a, on = ['pickup_longitude_bin', 'pickup_latitude_bin'], how='left')
train = pd.merge(train, b, on = ['dropoff_longitude_bin', 'dropoff_latitude_bin'], how='left')

test = pd.merge(test, a, on = ['pickup_longitude_bin', 'pickup_latitude_bin'], how='left')
test = pd.merge(test, b, on = ['dropoff_longitude_bin', 'dropoff_latitude_bin'], how='left')

In [31]:
# drop bins
train = train.drop(['speed', 'pickup_longitude_bin', 'pickup_latitude_bin', 'dropoff_longitude_bin', 'dropoff_latitude_bin'], axis = 1)
test = test.drop(['pickup_longitude_bin', 'pickup_latitude_bin', 'dropoff_longitude_bin', 'dropoff_latitude_bin'], axis = 1)

In [32]:
print(train.columns.difference(test.columns))

Index(['dropoff_datetime', 'trip_duration'], dtype='object')


In [33]:
# weather data
weather = pd.read_csv(path + 'KNYC_Metars.csv')
display(weather.tail(2))

weather['Time'] = pd.to_datetime(weather['Time'])
weather['year'] = weather['Time'].dt.year
weather['month'] = weather['Time'].dt.month
weather['day'] = weather['Time'].dt.day
weather['hour'] = weather['Time'].dt.hour
weather = weather[weather['year']==2016]
display(weather.tail())

Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions
8785,2017-01-02 00:00:00,5.0,3.3,,0.41,1030.6,-7.2,16.1,ENE,7.4,0.0,0.0,,Clear
8786,2017-01-02 01:00:00,5.0,2.1,,0.43,1030.1,-6.7,16.1,ENE,13.0,0.0,0.0,,Clear


Unnamed: 0,Time,Temp.,Windchill,Heat Index,Humidity,Pressure,Dew Point,Visibility,Wind Dir,Wind Speed,Gust Speed,Precip,Events,Conditions,year,month,day,hour
8756,2016-12-31 19:00:00,6.1,,,0.51,1014.0,-3.3,16.1,North,0.0,0.0,0.0,,Partly Cloudy,2016,12,31,19
8757,2016-12-31 20:00:00,6.1,3.8,,0.51,1013.9,-3.3,16.1,WSW,11.1,0.0,0.0,,Overcast,2016,12,31,20
8758,2016-12-31 21:00:00,6.1,4.6,,0.47,1013.8,-4.4,16.1,Variable,7.4,0.0,0.0,,Overcast,2016,12,31,21
8759,2016-12-31 22:00:00,6.1,3.4,,0.47,1012.9,-4.4,16.1,WSW,13.0,38.9,0.0,,Overcast,2016,12,31,22
8760,2016-12-31 23:00:00,6.1,4.2,,0.45,1012.5,-5.0,16.1,Variable,9.3,29.6,0.0,,Overcast,2016,12,31,23


In [35]:
train = pd.merge(train, weather[['Temp.', 'month', 'day', 'hour']], on=['month', 'day', 'hour'], how='left')
test = pd.merge(test, weather[['Temp.', 'month', 'day', 'hour']], on=['month', 'day', 'hour'], how='left')

In [38]:
x_train = train.drop(['id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration'], axis=1)
x_test = test.drop(['id', 'pickup_datetime'], axis=1)
y_train = train['trip_duration']
id_train = train['id']
id_test = test['id']

### Modeling

In [47]:
# xgb parameters
params = {
    'booster' : 'gbtree',
    'objective' : 'reg:linear',
    'learning_rate' : 0.1,
    'max_depth' : 14,
    'subsample' : .8, 
    'colsample_bytree' : .7,
    'colsample_bylevel' : .7,
    'verbosity' : 1,
    'gpu_id' : 0,
    'tree_method': 'gpu_hist'
}

# number of rounds
nrounds = 200

In [48]:
# train model
dtrain = xgb.DMatrix(x_train, np.log(y_train + 1))
gbm = xgb.train(params, dtrain, num_boost_round=nrounds)



In [50]:
# test predictions
pred_test = np.exp(gbm.predict(xgb.DMatrix(x_test))) - 1

In [51]:
# create submission
df = pd.DataFrame({'id' : id_test, 'trip_duration' : pred_test})
df = df.set_index('id')
# df.to_csv(path + 'nyc_taxi_trip_duration2.csv', index=True)