# Model choice notebook

In this notebook, XGBoost and Random Forest models are trained and tuned using GridSearchCV.

Both models are trained and evaluated using the `TrainRF` and `TrainXGB` classes.

5-fold cross-validation is used.

For the sake of reproducibility, the seed for both models and for the train-test split is set to the constant value 12345.

# Imports

In [7]:
import pickle
from math import exp

import pandas as pd

from utils.TrainRF import TrainRF
from utils.TrainXGB import TrainXGB

# XGBoost

## Data augmentation

One-hot encoding has to be applied to the categorical columns.

Datetime columns are dropped; they are replaced by the day, month and year columns.

In [8]:
# Load the clustered dataset and drop the raw datetime columns
# ('pickup_datetime' and 'date').
# NOTE(review): 'fare_amount' and 'fare_amount_log' both remain in the frame
# (see the column listing in the output below); presumably TrainXGB keeps the
# non-target one out of the features - confirm, otherwise the target leaks.
data = pd.read_pickle('clean_data/clustered_data.pkl').drop(
    columns=['pickup_datetime', 'date']
)
# One-hot encode the categorical columns passenger_big_group and pickup_cluster.
data_xgb = pd.get_dummies(
    data,
    columns=['passenger_big_group', 'pickup_cluster'],
)
# Check columns
data_xgb.columns

Index(['dropoff_latitude', 'dropoff_longitude', 'fare_amount', 'feat01',
       'feat02', 'feat03', 'feat04', 'feat05', 'feat06', 'feat07', 'feat08',
       'feat09', 'feat10', 'passenger_count', 'pickup_latitude',
       'pickup_longitude', 'fare_amount_log', 'year', 'month', 'day', 'hour',
       'trip_distance', 'avg_temperature_2m (°C)', 'passenger_big_group_0',
       'passenger_big_group_1', 'pickup_cluster_0', 'pickup_cluster_1',
       'pickup_cluster_2', 'pickup_cluster_3', 'pickup_cluster_4',
       'pickup_cluster_5'],
      dtype='object')

## Parameter Grid

In [9]:
# XGBoost hyper-parameter grid: 1 x 1 x 2 x 2 x 2 = 8 candidate combinations.
param_grid = dict(
    n_estimators=[1000],
    learning_rate=[0.05],
    min_child_weight=[2, 4],
    max_depth=[6, 8],
    colsample_bytree=[0.5, 0.75],
)


## Fare amount

Model fare amount as the target variable.

In [12]:
# Search the XGBoost grid with the raw fare amount as the target
# (kfolds=5, seed=12345).
model_XGB = TrainXGB(
    data_xgb,
    'fare_amount',
    param_grid=param_grid,
    kfolds=5,
    seed=12345,
)
best_model, train_score, test_score, cv_results = model_XGB.train_model()

# Persist the cross-validation results, then the best model.
with open('saved_models/cv_results_XGB.pkl', 'wb') as results_file:
    pickle.dump(cv_results, results_file)
model_XGB.save_best_model('saved_models/XGB_model.pkl')

# Report the train/test scores followed by the CV results.
print(train_score, test_score)
display(cv_results)

0.10730772670253756 2.950971274950159


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,48.853476,0.430866,1.440963,0.118803,0.5,0.05,6,2,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.018469,0.001584,6,-0.004474,-0.004578,-0.004297,-0.004542,-0.004271,-0.004432,0.000126
1,47.822852,0.210947,1.497484,0.062554,0.5,0.05,6,4,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.018326,0.001673,5,-0.004975,-0.005316,-0.004725,-0.005,-0.004961,-0.004995,0.000189
2,92.094519,1.525769,2.899434,0.059569,0.5,0.05,8,2,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.021719,0.001193,8,-0.002049,-0.002088,-0.002128,-0.002114,-0.002134,-0.002102,3.1e-05
3,85.779513,1.751785,3.131114,0.184669,0.5,0.05,8,4,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.02134,0.001475,7,-0.002432,-0.00244,-0.002396,-0.00247,-0.002464,-0.00244,2.6e-05
4,49.295729,0.338211,1.614844,0.07455,0.75,0.05,6,2,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.0165,0.001341,2,-0.002692,-0.00301,-0.002543,-0.002899,-0.002891,-0.002807,0.000167
5,49.201972,0.42561,1.570766,0.134486,0.75,0.05,6,4,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.016483,0.001533,1,-0.003499,-0.003493,-0.003079,-0.00357,-0.003391,-0.003406,0.000173
6,82.456757,0.799178,1.67535,0.041719,0.75,0.05,8,2,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.017932,0.001338,4,-0.001076,-0.001095,-0.001061,-0.001063,-0.001057,-0.00107,1.4e-05
7,65.390338,13.29287,1.365679,0.551407,0.75,0.05,8,4,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.017102,0.001404,3,-0.001347,-0.001352,-0.001275,-0.001328,-0.001352,-0.001331,2.9e-05


## Fare amount log

Model log of fare amount as the target variable.

In [13]:
# Search the XGBoost grid with the log of the fare amount as the target
# (kfolds=5, seed=12345).
model_XGB_log = TrainXGB(
    data_xgb,
    'fare_amount_log',
    param_grid=param_grid,
    kfolds=5,
    seed=12345,
)
best_model, train_score, test_score, cv_results = model_XGB_log.train_model()

# Persist the cross-validation results, then the best model.
with open('saved_models/cv_results_XGB_log.pkl', 'wb') as results_file:
    pickle.dump(cv_results, results_file)
model_XGB_log.save_best_model('saved_models/XGB_model_log.pkl')

# Report the train/test scores followed by the CV results.
print(train_score, test_score)
display(cv_results)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.002815877635209021 0.018692487597072604


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_colsample_bytree,param_learning_rate,param_max_depth,param_min_child_weight,param_n_estimators,params,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,49.524626,0.22967,1.392402,0.096156,0.5,0.05,6,2,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.003849,0.000509,5,-0.000858,-0.000877,-0.000869,-0.000859,-0.000839,-0.00086,1.3e-05
1,49.193049,0.172549,1.481857,0.111412,0.5,0.05,6,4,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.003827,0.000508,4,-0.000952,-0.001004,-0.00096,-0.000984,-0.000938,-0.000968,2.4e-05
2,91.823781,0.661263,2.765845,0.058441,0.5,0.05,8,2,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.004457,0.000394,8,-0.0005,-0.000499,-0.000519,-0.000504,-0.000498,-0.000504,8e-06
3,86.308773,1.05528,2.734748,0.11416,0.5,0.05,8,4,1000,"{'colsample_bytree': 0.5, 'learning_rate': 0.0...",...,-0.004375,0.00041,7,-0.000532,-0.000549,-0.000553,-0.000538,-0.000532,-0.000541,9e-06
4,50.887173,0.307348,1.48703,0.057361,0.75,0.05,6,2,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.003544,0.000582,1,-0.000579,-0.000592,-0.000573,-0.000599,-0.000586,-0.000586,9e-06
5,46.848844,0.856898,1.552693,0.056838,0.75,0.05,6,4,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.003553,0.000603,2,-0.000685,-0.000666,-0.000648,-0.000651,-0.000666,-0.000663,1.3e-05
6,79.56994,1.375975,1.785652,0.093082,0.75,0.05,8,2,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.003896,0.00052,6,-0.000335,-0.000324,-0.000347,-0.000333,-0.000335,-0.000335,7e-06
7,64.606496,11.333186,1.357084,0.521345,0.75,0.05,8,4,1000,"{'colsample_bytree': 0.75, 'learning_rate': 0....",...,-0.003743,0.000518,3,-0.00037,-0.00037,-0.000374,-0.000367,-0.000359,-0.000368,5e-06


In [20]:
# Exponentiate the XGBoost log-target scores.
# Run this cell directly after the XGBoost "Fare amount log" cell above: it
# reads that cell's train_score / test_score instead of hard-coded copies of
# the printed numbers, so the output cannot go stale when the model is retrained.
# `exp` is imported in the notebook's import cell.
# NOTE(review): exp() of an error measured in log space is presumably a
# multiplicative error factor rather than an error in fare units - confirm
# before comparing it with the raw fare_amount scores.
print(exp(train_score), exp(test_score))

1.002819845942519 1.0188682858035676


# Random Forest

## Data augmentation

Datetime columns are dropped; they are replaced by the day, month and year columns.

In [14]:
# Reload the clustered dataset and drop the raw datetime columns
# ('pickup_datetime' and 'date'); no one-hot encoding is applied here.
# NOTE(review): 'fare_amount_log' is not dropped here, so both target columns
# stay in the frame; presumably TrainRF keeps the non-target one out of the
# features - confirm, otherwise the target leaks.
data = pd.read_pickle('clean_data/clustered_data.pkl').drop(
    columns=['pickup_datetime', 'date']
)

## Fare amount

In [15]:
# Random Forest hyper-parameter grid: 1 x 2 x 2 x 2 = 8 candidate combinations.
param_grid = dict(
    n_estimators=[1000],           # Irena: 1000
    max_features=[1.0, 'sqrt'],    # tuned by Irena
    max_depth=[6, 8],
    min_samples_split=[2, 4],      # tuned by Irena
)


In [16]:
# Search the Random Forest grid with the raw fare amount as the target
# (kfolds=5, seed=12345).
model_RF = TrainRF(
    data,
    'fare_amount',
    param_grid=param_grid,
    kfolds=5,
    seed=12345,
)
best_model, train_score, test_score, cv_results = model_RF.train_model()

# Persist the cross-validation results, then the best model.
with open('saved_models/cv_results_RF.pkl', 'wb') as results_file:
    pickle.dump(cv_results, results_file)
model_RF.save_best_model('saved_models/RF_model.pkl')

# Report the train/test scores followed by the CV results.
print(train_score, test_score)
display(cv_results)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.15220014325416023 1.525826892039344


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1971.465744,11.419319,3.234525,0.139486,6,1.0,2,1000,"{'max_depth': 6, 'max_features': 1.0, 'min_sam...",-0.007941,...,-0.009035,0.002054,3,-0.009954,-0.008402,-0.008618,-0.009589,-0.00874,-0.009061,0.000602
1,1973.284979,14.873216,3.025078,0.139926,6,1.0,4,1000,"{'max_depth': 6, 'max_features': 1.0, 'min_sam...",-0.007953,...,-0.009037,0.002053,4,-0.009959,-0.00841,-0.008626,-0.009593,-0.008748,-0.009067,0.0006
2,332.624241,0.567379,2.129956,0.038609,6,sqrt,2,1000,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...",-0.118589,...,-0.118019,0.000614,7,-0.117016,-0.118149,-0.115549,-0.116447,-0.116762,-0.116785,0.000843
3,339.231319,3.548718,2.433706,0.133191,6,sqrt,4,1000,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...",-0.118651,...,-0.118481,0.001056,8,-0.117061,-0.117564,-0.117763,-0.116681,-0.117296,-0.117273,0.00038
4,2332.420956,57.753164,2.548654,0.338324,8,1.0,2,1000,"{'max_depth': 8, 'max_features': 1.0, 'min_sam...",-0.002798,...,-0.002939,0.000914,2,-0.001822,-0.002084,-0.002365,-0.001979,-0.002195,-0.002089,0.000185
5,1765.743726,274.698982,1.74553,0.011948,8,1.0,4,1000,"{'max_depth': 8, 'max_features': 1.0, 'min_sam...",-0.002733,...,-0.002891,0.000889,1,-0.001949,-0.002202,-0.00247,-0.002063,-0.002289,-0.002195,0.00018
6,460.279173,7.330845,2.877313,0.098222,8,sqrt,2,1000,"{'max_depth': 8, 'max_features': 'sqrt', 'min_...",-0.088372,...,-0.088694,0.00068,5,-0.085826,-0.086833,-0.086732,-0.08569,-0.08718,-0.086452,0.000588
7,413.008563,16.80943,2.493263,0.288907,8,sqrt,4,1000,"{'max_depth': 8, 'max_features': 'sqrt', 'min_...",-0.088844,...,-0.088876,0.000836,6,-0.086393,-0.087486,-0.08611,-0.087236,-0.086043,-0.086654,0.000594


## Fare amount log

In [17]:
# Search the Random Forest grid with the log of the fare amount as the target
# (kfolds=5, seed=12345).
model_RF_log = TrainRF(
    data,
    'fare_amount_log',
    param_grid=param_grid,
    kfolds=5,
    seed=12345,
)
best_model, train_score, test_score, cv_results = model_RF_log.train_model()

# Persist the cross-validation results, then the best model.
with open('saved_models/cv_results_RF_log.pkl', 'wb') as results_file:
    pickle.dump(cv_results, results_file)
model_RF_log.save_best_model('saved_models/RF_model_log.pkl')

# Report the train/test scores followed by the CV results.
print(train_score, test_score)
display(cv_results)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
0.0012367290600387903 0.004980158830760376


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_max_features,param_min_samples_split,param_n_estimators,params,split0_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,1971.96753,17.864739,2.853613,0.10313,6,1.0,2,1000,"{'max_depth': 6, 'max_features': 1.0, 'min_sam...",-0.001426,...,-0.001539,0.000184,3,-0.001348,-0.001241,-0.00143,-0.001406,-0.00135,-0.001355,6.5e-05
1,1993.020507,42.699491,2.865165,0.215583,6,1.0,4,1000,"{'max_depth': 6, 'max_features': 1.0, 'min_sam...",-0.001465,...,-0.001541,0.000182,4,-0.00138,-0.001267,-0.001449,-0.001425,-0.001391,-0.001382,6.3e-05
2,355.97449,3.959039,2.637348,0.317489,6,sqrt,2,1000,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...",-0.024992,...,-0.024755,0.000374,7,-0.024448,-0.024697,-0.024895,-0.024024,-0.024084,-0.024429,0.000338
3,354.623183,4.450712,2.279461,0.108991,6,sqrt,4,1000,"{'max_depth': 6, 'max_features': 'sqrt', 'min_...",-0.024885,...,-0.02493,0.000344,8,-0.02436,-0.024625,-0.024972,-0.024731,-0.024348,-0.024607,0.000235
4,2274.911496,64.767201,2.471037,0.411457,8,1.0,2,1000,"{'max_depth': 8, 'max_features': 1.0, 'min_sam...",-0.000748,...,-0.00085,0.000381,1,-0.000395,-0.000446,-0.000466,-0.000435,-0.000424,-0.000433,2.4e-05
5,1726.808351,252.355091,1.735384,0.225786,8,1.0,4,1000,"{'max_depth': 8, 'max_features': 1.0, 'min_sam...",-0.000881,...,-0.000872,0.000385,2,-0.000593,-0.000663,-0.000711,-0.000572,-0.000654,-0.000638,5e-05
6,458.358966,5.11814,3.318657,0.105139,8,sqrt,2,1000,"{'max_depth': 8, 'max_features': 'sqrt', 'min_...",-0.019379,...,-0.01891,0.000258,5,-0.018547,-0.01823,-0.018158,-0.018193,-0.018166,-0.018259,0.000146
7,423.703771,31.486224,2.711215,0.2861,8,sqrt,4,1000,"{'max_depth': 8, 'max_features': 'sqrt', 'min_...",-0.019236,...,-0.019035,0.000182,6,-0.018429,-0.018386,-0.0182,-0.018431,-0.018561,-0.018402,0.000117


In [19]:
# Exponentiate the Random Forest log-target scores produced by the
# "Fare amount log" cell directly above (reads its train_score / test_score).
# `exp` is imported in the notebook's import cell rather than re-imported here.
# NOTE(review): exp() of an error measured in log space is presumably a
# multiplicative error factor rather than an error in fare units - confirm
# before comparing it with the raw fare_amount scores.
print(exp(train_score), exp(test_score))

1.0012374941247828 1.004992580433708
