In [78]:
import pandas as pd
import numpy as np
import xgboost as xgb

import pickle
from geopy.geocoders import Nominatim
from sklearn.model_selection import train_test_split
#from google.colab import drive

pd.set_option('display.max_columns', None)

In [79]:
sample_df = pd.read_csv("train.csv")

In [80]:
sample_df.shape

(1458644, 11)

In [81]:
sample_df.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [82]:
sample_df["store_and_fwd_flag"].value_counts()

store_and_fwd_flag
N    1450599
Y       8045
Name: count, dtype: int64

In [83]:
#Convert character variables to numeric
f = lambda x: 0 if x == 'N' else 1

sample_df["store_and_fwd_flag"] = sample_df["store_and_fwd_flag"].apply(lambda x: f(x))

In [84]:
#Check result
sample_df["store_and_fwd_flag"].value_counts()

store_and_fwd_flag
0    1450599
1       8045
Name: count, dtype: int64

In [85]:
print(sample_df.columns)


Index(['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime',
       'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'trip_duration'],
      dtype='object')


In [87]:
#First, convert datetime strings into datetime
sample_df["dropoff_datetime"] = pd.to_datetime(sample_df["dropoff_datetime"], format='%Y-%m-%d %H:%M:%S')
sample_df["pickup_datetime"] = pd.to_datetime(sample_df["pickup_datetime"], format='%Y-%m-%d %H:%M:%S')

In [88]:
#Now construct other variables, like month, date, etc.
sample_df["pickup_month"] = sample_df["pickup_datetime"].dt.month
sample_df["pickup_day"] = sample_df["pickup_datetime"].dt.day
sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday #sample_df["pickup_weekday"] = sample_df["pickup_datetime"].dt.weekday_name
sample_df["pickup_hour"] = sample_df["pickup_datetime"].dt.hour
sample_df["pickup_minute"] = sample_df["pickup_datetime"].dt.minute

In [89]:
#Get latitude and longitude differences
sample_df["latitude_difference"] = sample_df["dropoff_latitude"] - sample_df["pickup_latitude"]
sample_df["longitude_difference"] = sample_df["dropoff_longitude"] - sample_df["pickup_longitude"]

In [90]:
#Convert duration to minutes for easier interpretation
sample_df["trip_duration"] = sample_df["trip_duration"].apply(lambda x: round(x/60))

In [91]:
#Convert trip distance from longitude and latitude differences to Manhattan distance.
sample_df["trip_distance"] = 0.621371 * 6371 * (abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2))),
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["latitude_difference"]) * np.pi / 180) / 2)))))) + \
                                     abs(2 * np.arctan2(np.sqrt(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2))),
                                  np.sqrt(1-(np.square(np.sin((abs(sample_df["longitude_difference"]) * np.pi / 180) / 2)))))))

In [92]:
sample_df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_day,pickup_weekday,pickup_hour,pickup_minute,latitude_difference,longitude_difference,trip_distance
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,0,8,3,14,0,17,24,-0.002335,0.017525,1.372146
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,0,11,6,12,6,0,43,-0.007412,-0.019066,1.82944
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,0,35,1,19,1,11,35,-0.053852,-0.026306,5.538397
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,0,7,4,6,2,19,32,-0.013252,-0.002228,1.069567
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,0,7,3,26,5,13,30,-0.010689,0.00013,0.747485


In [None]:
#modeling

In [93]:
X = sample_df.drop(["trip_duration", "id", "vendor_id", "pickup_datetime", "dropoff_datetime"], axis=1)
y = sample_df["trip_duration"]

In [94]:
#Split the data into training, test, and valdiation sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=2019)

In [95]:
#Define evaluation metric
def rmsle(y_true, y_pred):
    assert len(y_true) == len(y_pred)
    return np.square(np.log(y_pred + 1) - np.log(y_true + 1)).mean() ** 0.5

In [96]:
#XGBoost parameters
params = {
    'booster':            'gbtree',
    'objective':          'reg:linear',
    'learning_rate':      0.05,
    'max_depth':          14,
    'subsample':          0.9,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    'silent':             1,
    'feval':              'rmsle'
}

In [97]:
nrounds = 2000

In [98]:
#Define train and validation sets
dtrain = xgb.DMatrix(X_train, np.log(y_train+1))
dval = xgb.DMatrix(X_val,np.log(y_val+1))

#this is for tracking the error
watchlist = [(dval, 'eval'), (dtrain, 'train')]

#, np.log(y_train+1)
#, np.log(y_val+1)

In [99]:
#Train model
gbm = xgb.train(params,
                dtrain,
                num_boost_round = nrounds,
                evals = watchlist,
                verbose_eval = True
                )

Parameters: { "feval", "silent" } are not used.



[0]	eval-rmse:0.67355	train-rmse:0.67406
[1]	eval-rmse:0.65965	train-rmse:0.65981
[2]	eval-rmse:0.63845	train-rmse:0.63812
[3]	eval-rmse:0.62273	train-rmse:0.62210
[4]	eval-rmse:0.60374	train-rmse:0.60255
[5]	eval-rmse:0.58567	train-rmse:0.58388
[6]	eval-rmse:0.56969	train-rmse:0.56718
[7]	eval-rmse:0.55420	train-rmse:0.55101
[8]	eval-rmse:0.54423	train-rmse:0.54047
[9]	eval-rmse:0.53499	train-rmse:0.53068
[10]	eval-rmse:0.52269	train-rmse:0.51786
[11]	eval-rmse:0.51071	train-rmse:0.50520
[12]	eval-rmse:0.49934	train-rmse:0.49296
[13]	eval-rmse:0.48833	train-rmse:0.48102
[14]	eval-rmse:0.47884	train-rmse:0.47108
[15]	eval-rmse:0.47266	train-rmse:0.46440
[16]	eval-rmse:0.46384	train-rmse:0.45464
[17]	eval-rmse:0.45602	train-rmse:0.44577
[18]	eval-rmse:0.44813	train-rmse:0.43700
[19]	eval-rmse:0.44071	train-rmse:0.42895
[20]	eval-rmse:0.43418	train-rmse:0.42171
[21]	eval-rmse:0.42811	train-rmse:0.41445
[22]	eval-rmse:0.42248	train-rmse:0.40825
[23]	eval-rmse:0.41704	train-rmse:0.40168
[2

In [100]:
#Test predictions
pred = np.exp(gbm.predict(xgb.DMatrix(X_test))) - 1

In [101]:
#Use mean absolute error to get a basic estimate of the error
mae = (abs(pred - y_test)).mean()
mae

4.8431692832816156

In [102]:
#Take a look at feature importance
feature_scores = gbm.get_fscore()
feature_scores

{'passenger_count': 561531.0,
 'pickup_longitude': 1492688.0,
 'pickup_latitude': 1402371.0,
 'dropoff_longitude': 1350415.0,
 'dropoff_latitude': 1221993.0,
 'store_and_fwd_flag': 12081.0,
 'pickup_month': 595482.0,
 'pickup_day': 964135.0,
 'pickup_weekday': 569593.0,
 'pickup_hour': 880839.0,
 'pickup_minute': 1101608.0,
 'latitude_difference': 1032852.0,
 'longitude_difference': 945647.0,
 'trip_distance': 933532.0}

In [103]:
#This is not very telling, so let's scale the features
summ = 0
for key in feature_scores:
    summ = summ + feature_scores[key]

for key in feature_scores:
    feature_scores[key] = feature_scores[key] / summ

feature_scores

{'passenger_count': 0.04298055985231118,
 'pickup_longitude': 0.11425293692570254,
 'pickup_latitude': 0.10733991658634248,
 'dropoff_longitude': 0.10336311393842691,
 'dropoff_latitude': 0.09353347059308444,
 'store_and_fwd_flag': 0.0009247007619806768,
 'pickup_month': 0.04557922846997577,
 'pickup_day': 0.07379657057795214,
 'pickup_weekday': 0.043597639360885655,
 'pickup_hour': 0.06742094979573689,
 'pickup_minute': 0.08431899321281427,
 'latitude_difference': 0.07905628933145153,
 'longitude_difference': 0.07238146688723955,
 'trip_distance': 0.07145416370609595}

In [104]:
filename = "trained_model.sav"
pickle.dump(gbm, open(filename, 'wb'))