In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split

# Do not truncate columns when a DataFrame is displayed, so wide frames show in full.
pd.set_option('display.max_columns', None)

# Import data and take a look at it

In [2]:
# Load the training data; "train.csv" is resolved relative to the current working directory.
sample_df = pd.read_csv("train.csv")

In [3]:
# (rows, columns) of the loaded frame
sample_df.shape

(1458644, 11)

In [4]:
# Preview the first rows to see the raw column layout
sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


# Data Preprocessing

In [5]:
# Distribution of the raw flag values before they are encoded numerically
sample_df["store_and_fwd_flag"].value_counts()

N    1450599
Y       8045
Name: store_and_fwd_flag, dtype: int64

In [6]:
#Convert character variables to numeric
# Vectorised encoding: 'N' -> 0, every other value -> 1 (same mapping as the
# previous row-wise lambda, without a Python-level call per row).
# NOTE: run this once per load. On an already-encoded column every value
# differs from 'N', so a second run would turn the whole column into 1.
sample_df["store_and_fwd_flag"] = (sample_df["store_and_fwd_flag"] != 'N').astype("int64")

In [7]:
#Check result: the flag column should now contain only the numeric codes 0 and 1
sample_df["store_and_fwd_flag"].value_counts()

0    1450599
1       8045
Name: store_and_fwd_flag, dtype: int64

## Engineer features

In [8]:
#First, convert datetime strings into datetime
# Both timestamp columns share one explicit format, so parse them in a single loop.
timestamp_format = '%Y-%m-%d %H:%M:%S'
for timestamp_column in ("dropoff_datetime", "pickup_datetime"):
    sample_df[timestamp_column] = pd.to_datetime(sample_df[timestamp_column], format=timestamp_format)

In [9]:
#Now construct other variables, like month, date, etc.
# All calendar features are derived from the pickup timestamp only.
pickup_parts = sample_df["pickup_datetime"].dt
sample_df["pickup_month"] = pickup_parts.month
sample_df["pickup_day"] = pickup_parts.day
sample_df["pickup_weekday"] = pickup_parts.weekday  # integer day of week (Monday=0), not the day name
sample_df["pickup_hour"] = pickup_parts.hour
sample_df["pickup_minute"] = pickup_parts.minute

In [10]:
#Get latitude and longitude differences
# Signed differences (dropoff minus pickup), in the same units as the coordinate columns.
sample_df["latitude_difference"] = sample_df["dropoff_latitude"].sub(sample_df["pickup_latitude"])
sample_df["longitude_difference"] = sample_df["dropoff_longitude"].sub(sample_df["pickup_longitude"])

In [11]:
#Convert duration to minutes for easier interpretation
# Vectorised instead of a per-row lambda. Series.round() rounds halves to the
# nearest even integer, exactly like the built-in round() used before, so the
# resulting whole-minute values are unchanged.
# NOTE: this overwrites the column in place; re-running the cell divides by 60 again.
sample_df["trip_duration"] = (sample_df["trip_duration"] / 60).round().astype("int64")

In [12]:
#Convert trip distance from longitude and latitude differences to Manhattan distance.
def _arc_angle(delta_degrees):
    """Return the haversine-style arc angle (radians) for an angular difference given in degrees."""
    half_angle = (abs(delta_degrees) * np.pi / 180) / 2
    sin_squared = np.square(np.sin(half_angle))
    return abs(2 * np.arctan2(np.sqrt(sin_squared), np.sqrt(1 - sin_squared)))

# The distance is the sum of a north-south leg and an east-west leg.
# 6371 looks like Earth's mean radius in km and 0.621371 the km -> miles factor - TODO confirm.
# NOTE(review): the longitude leg is not scaled by cos(latitude), so east-west
# distance is probably overstated away from the equator - confirm this is intended.
sample_df["trip_distance"] = 0.621371 * 6371 * (
    _arc_angle(sample_df["latitude_difference"]) + _arc_angle(sample_df["longitude_difference"])
)

In [13]:
# Preview the frame again, now including the engineered columns
sample_df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,7,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,7,3,26,5,13,30,-0.010689,0.00013,0.747485


# Modeling

In [14]:
# Features are every column except the target, the two id-like columns and the raw timestamps.
non_feature_columns = ["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"]
X = sample_df.drop(columns=non_feature_columns)
# Target: trip duration
y = sample_df["trip_duration"]

In [15]:
#Split the data into training, test, and validation sets
# Step 1: hold out 30% of the rows as the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2018
)
# Step 2: carve 25% of the remaining rows off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.25, random_state=2019
)

In [16]:
#Define evaluation metric
def rmsle(y_true, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Computes sqrt(mean((log(1 + y_pred) - log(1 + y_true)) ** 2)).

    Parameters
    ----------
    y_true, y_pred : array-like of equal length
        Lists, NumPy arrays or pandas Series. Values are compared by position.

    Returns
    -------
    float
        The RMSLE of the two sequences.

    Raises
    ------
    AssertionError
        If the two inputs differ in length.
    """
    # Convert to plain arrays first: subtracting two pandas Series aligns them
    # by index label, which yields NaN when the indexes differ, and plain lists
    # do not support "+ 1" at all.
    true_values = np.asarray(y_true, dtype=float)
    predicted_values = np.asarray(y_pred, dtype=float)
    assert len(true_values) == len(predicted_values)
    log_error = np.log1p(predicted_values) - np.log1p(true_values)
    return np.sqrt(np.mean(np.square(log_error)))

In [17]:
#XGBoost parameters
# Only names that xgb.train recognises as booster parameters belong here.
# The former 'feval' and 'silent' entries were ignored by XGBoost (it reported
# them as "not used"), so they have been replaced by their supported equivalents.
params = {
    'booster':            'gbtree',
    # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective.
    'objective':          'reg:squarederror',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    # The model is trained on log(y + 1) labels, so RMSE on that scale is the
    # RMSLE of the untransformed target; no custom metric is required.
    'eval_metric':        'rmse',
    # Supported replacement for the removed 'silent' flag (0 = silent).
    'verbosity':          0,
}

In [18]:
# Number of boosting rounds; passed to xgb.train as num_boost_round.
nrounds = 200

In [19]:
#Define train and validation sets
# Labels are log(y + 1), so the model is fitted on the log scale of the target.
dtrain = xgb.DMatrix(data=X_train, label=np.log(y_train + 1))
dval = xgb.DMatrix(data=X_val, label=np.log(y_val + 1))

#this is for tracking the error
# Each entry is (matrix, display name); both sets are scored after every boosting round.
watchlist = [
    (dval, 'eval'),
    (dtrain, 'train'),
]

In [20]:
#Train model
# Fit the booster on dtrain and print the watchlist metrics after every round.
gbm = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=nrounds,
    evals=watchlist,
    verbose_eval=True,
)

Parameters: { "feval", "silent" } are not used.

[0]	eval-rmse:2.00600	train-rmse:2.00534
[1]	eval-rmse:1.90977	train-rmse:1.90919
[2]	eval-rmse:1.81870	train-rmse:1.81810
[3]	eval-rmse:1.73243	train-rmse:1.73183
[4]	eval-rmse:1.65036	train-rmse:1.64983
[5]	eval-rmse:1.57251	train-rmse:1.57196
[6]	eval-rmse:1.49886	train-rmse:1.49826
[7]	eval-rmse:1.42934	train-rmse:1.42876
[8]	eval-rmse:1.36317	train-rmse:1.36247
[9]	eval-rmse:1.30042	train-rmse:1.29948
[10]	eval-rmse:1.24151	train-rmse:1.24040
[11]	eval-rmse:1.18619	train-rmse:1.18497
[12]	eval-rmse:1.13395	train-rmse:1.13261
[13]	eval-rmse:1.08365	train-rmse:1.08199
[14]	eval-rmse:1.03615	train-rmse:1.03430
[15]	eval-rmse:0.99152	train-rmse:0.98947
[16]	eval-rmse:0.94900	train-rmse:0.94676
[17]	eval-rmse:0.90896	train-rmse:0.90627
[18]	eval-rmse:0.87147	train-rmse:0.86853
[19]	eval-rmse:0.83576	train-rmse:0.83231
[20]	eval-rmse:0.80234	train-rmse:0.79844
[21]	eval-rmse:0.77079	train-rmse:0.76654
[22]	eval-rmse:0.74102	train-rmse:0.7

[183]	eval-rmse:0.33229	train-rmse:0.22644
[184]	eval-rmse:0.33227	train-rmse:0.22604
[185]	eval-rmse:0.33224	train-rmse:0.22561
[186]	eval-rmse:0.33217	train-rmse:0.22509
[187]	eval-rmse:0.33216	train-rmse:0.22496
[188]	eval-rmse:0.33215	train-rmse:0.22484
[189]	eval-rmse:0.33211	train-rmse:0.22449
[190]	eval-rmse:0.33209	train-rmse:0.22402
[191]	eval-rmse:0.33204	train-rmse:0.22390
[192]	eval-rmse:0.33200	train-rmse:0.22338
[193]	eval-rmse:0.33195	train-rmse:0.22320
[194]	eval-rmse:0.33192	train-rmse:0.22274
[195]	eval-rmse:0.33192	train-rmse:0.22270
[196]	eval-rmse:0.33187	train-rmse:0.22242
[197]	eval-rmse:0.33178	train-rmse:0.22198
[198]	eval-rmse:0.33177	train-rmse:0.22183
[199]	eval-rmse:0.33176	train-rmse:0.22173


In [21]:
#Test predictions
# The model predicts log(y + 1); exp(.) - 1 maps the predictions back to the target's scale.
dtest = xgb.DMatrix(X_test)
log_scale_pred = gbm.predict(dtest)
pred = np.exp(log_scale_pred) - 1

In [22]:
#Use mean absolute error to get a basic estimate of the error
# Average absolute gap between predicted and actual test durations (same units as the target).
absolute_errors = (pred - y_test).abs()
mae = absolute_errors.mean()
mae

4.880538319898825

In [23]:
#Take a look at feature importance
# 'weight' importance counts how many times each feature is used to split, which is what get_fscore() reports.
feature_scores = gbm.get_score(importance_type='weight')
feature_scores

{'passenger_count': 34267.0,
 'pickup_longitude': 172436.0,
 'pickup_latitude': 140689.0,
 'dropoff_longitude': 138972.0,
 'dropoff_latitude': 123669.0,
 'store_and_fwd_flag': 1738.0,
 'pickup_month': 39903.0,
 'pickup_day': 75027.0,
 'pickup_weekday': 29360.0,
 'pickup_hour': 60108.0,
 'pickup_minute': 80870.0,
 'latitude_difference': 122912.0,
 'longitude_difference': 102540.0,
 'trip_distance': 102846.0}

In [24]:
#This is not very telling, so let's scale the features
# Divide every score by the total so the values sum to 1 and read as shares.
summ = 0
for score in feature_scores.values():
    summ += score

feature_scores = {name: score / summ for name, score in feature_scores.items()}

feature_scores

{'passenger_count': 0.027965367894709783,
 'pickup_longitude': 0.14072536779677755,
 'pickup_latitude': 0.11481657699065645,
 'dropoff_longitude': 0.11341532982355058,
 'dropoff_latitude': 0.10092652062248998,
 'store_and_fwd_flag': 0.0014183853095107713,
 'pickup_month': 0.032564918875378775,
 'pickup_day': 0.06122968620061257,
 'pickup_weekday': 0.023960755286096803,
 'pickup_hour': 0.04905426017495595,
 'pickup_minute': 0.06599817029927277,
 'latitude_difference': 0.10030873139389408,
 'longitude_difference': 0.08368310105709695,
 'trip_distance': 0.083932828274997}

# Save the model

In [25]:
# Serialise the trained booster with pickle. The context manager guarantees the
# file handle is flushed and closed even if pickling fails part-way.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute arbitrary code.
filename = "xgb_model.sav"
with open(filename, 'wb') as model_file:
    pickle.dump(gbm, model_file)