# Modeling - Predicting Taxi Trip Durations in NYC

## Set up

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme with the white-grid axes style in one call.
sns.set_theme(style='whitegrid')

In [2]:
from load_preprocess_data import load_train_data, load_test_data

# Read the training and test CSVs through the project's loading helpers.
train_path, test_path = 'data/W22P1_train.csv', 'data/W22P1_test.csv'
train_data = load_train_data(train_path)
test_data = load_test_data(test_path)

In [3]:
# Preview the first five rows of the training frame.
train_data.head(n=5)

Unnamed: 0_level_0,pickup_datetime,dayofweek,hour,passenger_count,distance_km,l1_distance_km,bearing,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration,log_trip_duration
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,2016-01-07 19:32:15,3,19,1,1.2597,1.687396,296.295673,-73.986389,40.756615,-73.999794,40.761631,520,6.253829
1,2016-01-27 08:07:32,2,8,1,2.35665,3.146872,334.240476,-73.956039,40.767609,-73.968201,40.78669,989,6.896694
2,2016-01-31 13:52:55,6,13,1,2.806862,3.948055,230.930933,-73.975998,40.751137,-74.001854,40.735229,657,6.487684
3,2016-01-19 08:00:19,1,8,3,3.15551,3.99141,198.443755,-73.960121,40.781952,-73.97197,40.755039,1035,6.942157
4,2016-01-25 23:32:14,0,23,1,1.725446,1.998249,189.977838,-73.987434,40.760139,-73.990982,40.744862,621,6.431331


In [4]:
# Every column available in the test frame.
all_covariates = test_data.columns.tolist()

# Shared building blocks for the covariate subsets below.
base_covariates = ['hour', 'passenger_count']
distance_covariates = ['distance_km', 'l1_distance_km', 'bearing']
coordinate_covariates = ['pickup_longitude', 'pickup_latitude',
                         'dropoff_longitude', 'dropoff_latitude']

# hour, passenger count and the pickup/dropoff coordinates
original_covariates = base_covariates + coordinate_covariates

# numerical covariates: the above plus the distance/bearing features
numerical_covariates = base_covariates + distance_covariates + coordinate_covariates

# categorical + numerical covariates: day of week prepended to the numerical set
cat_numerical_covariates = ['dayofweek'] + numerical_covariates

print('covariates: ', all_covariates)

covariates:  ['pickup_datetime', 'dayofweek', 'hour', 'passenger_count', 'distance_km', 'l1_distance_km', 'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']


In [5]:
# Hold out 10% of the training data so models can be scored locally
# (without submitting). The split is seeded so that Restart & Run All
# reproduces the same partition and therefore comparable scores.
from sklearn.model_selection import train_test_split

SPLIT_SEED = 0
train_train_data, train_test_data = train_test_split(
    train_data, test_size=0.1, random_state=SPLIT_SEED
)

In [6]:
def create_X_y(train_data, test_data, covariates, label):
    """Select feature columns and the label column from two frames.

    Parameters
    ----------
    train_data, test_data : objects indexable by column name(s)
        Frames to take the columns from.
    covariates : list
        Column names used as features.
    label : str
        Column name used as the target.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    frames = (train_data, test_data)

    # Features first, then labels, each in (train, test) order.
    X_train, X_test = (frame[covariates] for frame in frames)
    y_train, y_test = (frame[label] for frame in frames)

    return X_train, X_test, y_train, y_test

In [7]:
from sklearn.metrics import mean_squared_log_error, mean_absolute_error, mean_squared_error

def eval_model(model, X, y, metric='rmsle', log=False):
    '''Evaluate a fitted model on (X, y) with the given metric.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    X : features passed to ``model.predict``
    y : true target values
    metric : {'rmsle', 'msle', 'mse', 'rmse', 'mae'}
        Name of the error metric to compute.
    log : bool
        If True, both the predictions and ``y`` are passed through
        ``np.exp`` before the metric is computed.

    Returns
    -------
    The value of the requested metric.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the supported names.
    '''
    supported_metrics = ('rmsle', 'msle', 'mse', 'rmse', 'mae')
    # Reject unknown metrics up front, before paying for a prediction.
    if metric not in supported_metrics:
        raise ValueError(
            f"unknown metric {metric!r}; expected one of {supported_metrics}"
        )

    y_pred = model.predict(X)
    if log:
        y_pred = np.exp(y_pred)
        y = np.exp(y)

    if metric == 'rmsle':
        return np.sqrt(mean_squared_log_error(y, y_pred))
    # Previously `elif 'msle':` (always truthy), which made every branch
    # below unreachable and returned MSLE for 'mse', 'rmse' and 'mae'.
    elif metric == 'msle':
        return mean_squared_log_error(y, y_pred)
    elif metric == 'mse':
        return mean_squared_error(y, y_pred)
    elif metric == 'rmse':
        return np.sqrt(mean_squared_error(y, y_pred))
    else:  # metric == 'mae'
        return mean_absolute_error(y, y_pred)

In [8]:
def create_submission(model, covariates, log=False, data=None):
    """Build a submission frame of predicted trip durations.

    Parameters
    ----------
    model : object with a ``predict(X)`` method
    covariates : list
        Column names selected from the frame and passed to ``model.predict``.
    log : bool
        If True, predictions are passed through ``np.exp`` before being
        stored.
    data : DataFrame, optional
        Frame to predict on. Defaults to the notebook-level ``test_data``,
        which preserves the original behaviour; passing it explicitly
        avoids relying on global state.

    Returns
    -------
    pandas.DataFrame
        One column, ``trip_duration``, indexed like the frame predicted on.
    """
    if data is None:
        data = test_data

    y_pred = model.predict(data[covariates])
    if log:
        y_pred = np.exp(y_pred)

    return pd.DataFrame(index=data.index, data=y_pred, columns=['trip_duration'])

## Linear Regression

### Linear Regression on Original Features

In [9]:
from sklearn.linear_model import LinearRegression

# Baseline: least squares on `original_covariates` (hour, passenger count and
# pickup/dropoff coordinates), fitted against the log trip duration.
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, original_covariates, 'log_trip_duration'
)

reg = LinearRegression()
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

train rmsle:  0.7428762002036318
test rmsle:  0.7398575040660865


### Linear Regression - Haversine Distance Only

In [10]:
from sklearn.linear_model import LinearRegression

# Single-feature model: only `distance_km` as predictor of log trip duration.
single_feature = ['distance_km']
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, single_feature, 'log_trip_duration'
)

reg = LinearRegression()
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

train rmsle:  0.6234296443615465
test rmsle:  0.6226134271483733


### Linear Regression - All Numerical Features

In [11]:
from sklearn.linear_model import LinearRegression

# Least squares on every column in `numerical_covariates`
# (this set does not include `dayofweek`).
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, numerical_covariates, 'log_trip_duration'
)

reg = LinearRegression()
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

train rmsle:  0.6123319298724955
test rmsle:  0.6168818944836084


### Linear Regression - ElasticNet

In [12]:
from sklearn.linear_model import ElasticNetCV

# Elastic-net regression with the L1/L2 mix and the penalty strength chosen
# by 10-fold cross-validation.
# NOTE(review): no scaling step is visible before this penalised fit —
# confirm that is intended.
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, numerical_covariates, 'log_trip_duration'
)

reg = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 1], n_alphas=100, cv=10)
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

train rmsle:  0.6214283999315265
test rmsle:  0.6210295846481934


In [13]:
# selected features: covariates whose fitted coefficient is non-zero
kept_mask = reg.coef_ != 0
np.array(numerical_covariates)[kept_mask]

array(['hour', 'passenger_count', 'distance_km', 'l1_distance_km',
       'bearing'], dtype='<U17')

### Linear Regression - Recursive Feature Elimination

In [14]:
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LinearRegression

# Recursive feature elimination around a plain linear regression, with the
# number of retained features picked by 10-fold cross-validation.
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, numerical_covariates, 'log_trip_duration'
)

base_estimator = LinearRegression()
reg = RFECV(base_estimator, min_features_to_select=1, cv=10)
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

train rmsle:  0.6123917310690234
test rmsle:  0.61741582503702


In [15]:
# Boolean mask over the input columns (in `numerical_covariates` order)
# marking which features the fitted RFECV kept.
reg.support_ # selected features

array([ True,  True,  True,  True, False,  True,  True,  True,  True])

## Random Forest

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Random forest on the categorical + numerical covariates, predicting the
# log trip duration.
X_train, X_test, y_train, y_test = create_X_y(train_train_data, train_test_data, cat_numerical_covariates, 'log_trip_duration')

# random_state makes the forest (and the scores / importances below)
# reproducible across runs; n_jobs=-1 builds the trees on all cores and does
# not affect the result.
reg = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1).fit(X_train, y_train)

print('train rmsle: ', eval_model(reg, X_train, y_train, metric='rmsle', log=True))
print('test rmsle: ', eval_model(reg, X_test, y_test, metric='rmsle', log=True))

train rmsle:  0.17356661346549815
test rmsle:  0.4549052433354603


In [17]:
# Feature importances of the fitted forest, largest first.
rf_importances = pd.DataFrame({'feature_importance': reg.feature_importances_},
                              index=reg.feature_names_in_)
rf_importances.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
distance_km,0.607879
bearing,0.072173
hour,0.054818
l1_distance_km,0.050401
dropoff_latitude,0.049066
pickup_latitude,0.046219
pickup_longitude,0.041144
dropoff_longitude,0.040892
dayofweek,0.026687
passenger_count,0.010721


## XGBoost Boosted Trees

In [18]:
import xgboost as xgb

# Gradient-boosted trees on the categorical + numerical covariates,
# predicting the log trip duration.
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, cat_numerical_covariates, 'log_trip_duration'
)

xgb_settings = dict(objective='reg:squarederror', importance_type='total_gain', n_jobs=-1)
reg = xgb.XGBRegressor(**xgb_settings)
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

  from pandas import MultiIndex, Int64Index
  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


train rmsle:  0.3073819911729133
test rmsle:  0.4598466190861102


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [19]:
# Feature importances of the fitted booster (importance_type as configured on
# the estimator), largest first.
xgb_importances = pd.DataFrame({'feature_importance': reg.feature_importances_},
                               index=X_train.columns)
xgb_importances.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
distance_km,0.724953
bearing,0.0661
hour,0.0426
dropoff_latitude,0.032092
pickup_longitude,0.027116
pickup_latitude,0.026967
dropoff_longitude,0.026064
l1_distance_km,0.026012
dayofweek,0.024609
passenger_count,0.003486


### XGBoost Boosted Trees Cross-validation Grid Search

In [22]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, cat_numerical_covariates, 'log_trip_duration'
)

# Hyper-parameter grid explored by the search. Further knobs (subsample,
# colsample_bytree, gamma) were considered but are not part of the grid.
params = dict(
    n_estimators=[50, 60, 80, 100],
    max_depth=[3, 4, 5],
    learning_rate=[0.1, 0.5, 1.0, 1.5],
)

reg = xgb.XGBRegressor(objective='reg:squarederror', importance_type='total_gain')

# 5-fold grid search using the estimator's default scorer.
cv = GridSearchCV(reg, params, cv=5, n_jobs=-1, verbose=1)
cv.fit(X_train, y_train)

# Report RMSLE of the search's refit predictor on both splits.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(cv, X_part, y_part, metric='rmsle', log=True))

Fitting 5 folds for each of 24 candidates, totalling 120 fits


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


train rmsle:  0.43467274066404016
test rmsle:  0.44486305260802245


  elif isinstance(data.columns, (pd.Int64Index, pd.RangeIndex)):


In [23]:
# Feature importances of the best estimator found by the grid search,
# largest first.
best_importances = pd.DataFrame(
    {'feature_importance': cv.best_estimator_.feature_importances_},
    index=X_train.columns,
)
best_importances.sort_values('feature_importance', ascending=False)

Unnamed: 0,feature_importance
distance_km,0.780927
l1_distance_km,0.097191
hour,0.03964
dropoff_latitude,0.023518
pickup_longitude,0.018028
dayofweek,0.016689
dropoff_longitude,0.011645
pickup_latitude,0.011578
passenger_count,0.000785


## LightGBM Boosted Trees

In [50]:
import lightgbm

# LightGBM gradient-boosted trees on the categorical + numerical covariates,
# predicting the log trip duration.
X_train, X_test, y_train, y_test = create_X_y(
    train_train_data, train_test_data, cat_numerical_covariates, 'log_trip_duration'
)

lgbm_settings = dict(
    boosting_type='gbdt', n_estimators=500, max_depth=10,
    learning_rate=0.025, reg_alpha=10, reg_lambda=50,
    importance_type='gain', n_jobs=-1,
)
reg = lightgbm.LGBMRegressor(**lgbm_settings)
reg.fit(X_train, y_train)

# Report RMSLE on the fitting split and on the held-out split.
for tag, X_part, y_part in (('train rmsle: ', X_train, y_train),
                            ('test rmsle: ', X_test, y_test)):
    print(tag, eval_model(reg, X_part, y_part, metric='rmsle', log=True))

train rmsle:  0.42202684627655357
test rmsle:  0.4396356295573065


### LightGBM Boosted Trees Cross-validation Grid Search

In [43]:
# This cell uses LightGBM, not XGBoost: import the library it actually needs
# so the cell does not depend on another cell having imported it.
import lightgbm
from sklearn.model_selection import GridSearchCV

X_train, X_test, y_train, y_test = create_X_y(train_train_data, train_test_data, cat_numerical_covariates, 'log_trip_duration')

# Hyper-parameter grid. The dict used to define 'n_estimators' twice; Python
# keeps only the last value for a repeated key, so the effective grid was
# already [250, 500, 750]. The dead first entry (which also contained a
# stray float, 750.) is removed.
params = {
    'boosting_type': ['gbdt', 'dart', 'goss'],
    'n_estimators': [250, 500, 750],
    'max_depth': [-1, 5, 10, 20],
    'learning_rate': [0.025, 0.05, 0.1]
}

reg = lightgbm.LGBMRegressor(importance_type='gain')

# 5-fold grid search scored by negated RMSE on the log-duration target.
cv = GridSearchCV(reg, params, cv=5, n_jobs=-1, verbose=1, scoring='neg_root_mean_squared_error').fit(X_train, y_train)

print('train rmsle: ', eval_model(cv, X_train, y_train, metric='rmsle', log=True))
print('test rmsle: ', eval_model(cv, X_test, y_test, metric='rmsle', log=True))

Fitting 5 folds for each of 12 candidates, totalling 60 fits
train rmsle:  0.3985735560944037
test rmsle:  0.44195960624216324


In [44]:
# Best mean cross-validated score of the fitted grid search. The search was
# scored with 'neg_root_mean_squared_error', so this is a negated RMSE on the
# log-duration target (closer to zero is better).
cv.best_score_

-0.453413727010943

In [45]:
# Hyper-parameter combination selected by the fitted grid search.
cv.best_params_

{'boosting_type': 'gbdt',
 'learning_rate': 0.025,
 'max_depth': 10,
 'n_estimators': 500}

In [55]:
# Importances of the best estimator from the grid search, sorted largest
# first and then rescaled so the column sums to 1.
feat_imp = pd.DataFrame(
    {'feature_importance': cv.best_estimator_.feature_importances_},
    index=X_train.columns,
).sort_values('feature_importance', ascending=False)

importance_total = feat_imp['feature_importance'].sum()
feat_imp['feature_importance'] = feat_imp['feature_importance'] / importance_total
feat_imp

Unnamed: 0,feature_importance
distance_km,0.766675
bearing,0.058955
hour,0.048788
dropoff_latitude,0.030453
dayofweek,0.022719
l1_distance_km,0.019906
pickup_latitude,0.018043
dropoff_longitude,0.017116
pickup_longitude,0.016232
passenger_count,0.001114
