# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides

# By Christian Wegert

We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium
from sklearn.model_selection import train_test_split

In [2]:
# Read the cleaned ride data that was saved after wrangling and pre-processing in block I.
X = pd.read_csv('../../DATA/train_cleaned.csv')

# Columns that are removed before modelling.
drop_columns = [
    'Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1',
    'key', 'pickup_datetime', 'pickup_date',
    'pickup_latitude_round3', 'pickup_longitude_round3',
    'dropoff_latitude_round3', 'dropoff_longitude_round3',
]

# Drop them and one-hot encode whatever non-numeric columns remain.
X = pd.get_dummies(X.drop(columns=drop_columns))

# Separate the label (fare_amount) from the feature matrix.
y = X['fare_amount']
X = X.drop(columns=['fare_amount'])

In [3]:
# Peek at the first five feature rows to check the encoded columns.
X.head(n=5)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pickup_day,pickup_hour,pickup_day_of_week,pickup_month,pickup_year,...,pickup_borough_manhattan,pickup_borough_others,pickup_borough_queens,pickup_borough_staten_island,dropoff_borough_bronx,dropoff_borough_brooklyn,dropoff_borough_manhattan,dropoff_borough_others,dropoff_borough_queens,dropoff_borough_staten_island
0,-73.844311,40.721319,-73.84161,40.712278,1,15,17,1,6,2009,...,0,0,1,0,0,0,0,0,1,0
1,-74.016048,40.711303,-73.979268,40.782004,1,5,16,2,1,2010,...,1,0,0,0,0,0,1,0,0,0
2,-73.982738,40.76127,-73.991242,40.750562,2,18,0,4,8,2011,...,1,0,0,0,0,0,1,0,0,0
3,-73.98713,40.733143,-73.991567,40.758092,1,21,4,6,4,2012,...,1,0,0,0,0,0,1,0,0,0
4,-73.968095,40.768008,-73.956655,40.783762,1,9,7,2,3,2010,...,1,0,0,0,0,0,1,0,0,0


In [4]:
# Peek at the first five labels (fare_amount).
y.head(n=5)

0     4.5
1    16.9
2     5.7
3     7.7
4     5.3
Name: fare_amount, dtype: float64

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is an AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.



In [5]:
# install 
!pip install scikit-optimize



### E 2.1 Bayesian Optimization of a Random Forest Regression Model
Use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare:
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. a larger number of trees in RF or high C-values in SVM will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to run the job for up to 12h 


In [6]:
# Time-based split: rides before 2015 are used for training, rides from 2015 for testing.
train_rows = X['pickup_year'] < 2015
test_rows = X['pickup_year'] == 2015

X_train, y_train = X[train_rows], y[train_rows]
X_test, y_test = X[test_rows], y[test_rows]

In [7]:
# Shapes of the train/test features and labels, printed on one line.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(372110, 31) (372110,) (27890, 31) (27890,)


In [8]:
# Convert every split to a plain NumPy array (the later cells slice them positionally).
X_train, X_test = np.array(X_train), np.array(X_test)
y_train, y_test = np.array(y_train), np.array(y_test)

In [9]:
from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

In [10]:
# Untuned baselines, one per model family, all created with default settings.
linear = LinearRegression()
rf = RandomForestRegressor()
regr = svm.SVR()  # SVM regression

A few quick tests: fit each model with its default parameters on the first 25,000 training rows and time it:

In [11]:
%%time
linear.fit(X_train[0:25000],y_train[0:25000]) 

Wall time: 18 ms


LinearRegression()

In [12]:
%%time
regr.fit(X_train[0:25000],y_train[0:25000])

Wall time: 23.4 s


SVR()

In [13]:
%%time
rf.fit(X_train[0:25000],y_train[0:25000])

Wall time: 16.8 s


RandomForestRegressor()

In [None]:
%%time

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.pipeline import Pipeline


pipe = Pipeline([
    ('model', SVC())])

svc_search = {
    'model': Categorical([svm.SVR()]),
    'model__C': Real(1e-6, 1e+6),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
    'model__kernel': Categorical(['rbf']),
}

random_forest_search = {
    'model': Categorical([RandomForestRegressor()]),
    'model__n_estimators': (1,5000),
    'model__max_depth': (1,100),
}

linear_search = {
    'model': Categorical([LinearRegression()]),
}

opt = BayesSearchCV(pipe,
    [(svc_search), (random_forest_search), (linear_search)],
    cv=3,
    n_jobs=-1                
)

opt.fit(X_train[0:100], y_train[0:100])

In [None]:
# Best cross-validated score found during the search.
val_score = opt.best_score_
print("val. score: %s" % val_score)

# Score of the fitted search object on the test split.
test_score = opt.score(X_test, y_test)
print("test score: %s" % test_score)

# Hyper-parameters of the winning configuration.
best_params = str(opt.best_params_)
print("best params: %s" % best_params)

# First Try Problem (3in1 optimization):
MemoryError: could not allocate 4784128 bytes

(after 2-3 hours of computing)

# Second Try Problem (3in1 optimization):
ValueError: array must not contain infs or NaNs (possibly caused by linear_search; the 2-in-1 search without the linear model works fine)

# New Method: Run BayesSearchCV twice (once per model) and compare the results!

In [None]:
%%time
from skopt import BayesSearchCV
opt_SVM = BayesSearchCV(
    svm.SVR(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),
        'degree': (1, 8),  
        'kernel': ['rbf'], 
    },
    n_iter=32,
    cv=3,
    n_jobs=-1
)

opt_SVM.fit(X_train[0:25000], y_train[0:25000])

In [None]:
# Best cross-validated score found by the SVR search.
svm_val_score = opt_SVM.best_score_
print("val. score: %s" % svm_val_score)

# Score of the fitted SVR search object on the test split.
svm_test_score = opt_SVM.score(X_test, y_test)
print("test score: %s" % svm_test_score)

# Winning C / gamma / kernel combination.
svm_best_params = str(opt_SVM.best_params_)
print("best params: %s" % svm_best_params)

In [None]:
%%time
opt_RF = BayesSearchCV(
    RandomForestRegressor(),
    {
        'n_estimators': (1,5000),
        'max_depth': (1,100), 
    },
    n_iter=32,
    cv=3,
    n_jobs=-1
)

opt_RF.fit(X_train[0:25000], y_train[0:25000])

In [None]:
# Best cross-validated score found by the random-forest search.
rf_val_score = opt_RF.best_score_
print("val. score: %s" % rf_val_score)

# Score of the fitted random-forest search object on the test split.
rf_test_score = opt_RF.score(X_test, y_test)
print("test score: %s" % rf_test_score)

# Winning n_estimators / max_depth combination.
rf_best_params = str(opt_RF.best_params_)
print("best params: %s" % rf_best_params)

# Third Try Problem (2in1 optimization): after ~20 hours still no result, because the parameter ranges are too expensive


Repeat with a very small subset of the data to show the basic procedure:

In [14]:
%%time
from skopt import BayesSearchCV
opt_SVM = BayesSearchCV(
    svm.SVR(),
    {
        'C': (1e-6, 1e+6, 'log-uniform'),
        'gamma': (1e-6, 1e+1, 'log-uniform'),  
        'kernel': ['rbf'], 
    },
    n_iter=32,
    cv=3,
    n_jobs=-1
)

opt_SVM.fit(X_train[0:100], y_train[0:100])

print("val. score: %s" % opt_SVM.best_score_)
print("test score: %s" % opt_SVM.score(X_test, y_test))
print("best params: %s" % str(opt_SVM.best_params_))

val. score: 0.2796673378405467
test score: 0.5034948733965661
best params: OrderedDict([('C', 22.93037014403077), ('gamma', 0.0018194918088105397), ('kernel', 'rbf')])
Wall time: 17.4 s


In [15]:
%%time
opt_RF = BayesSearchCV(
    RandomForestRegressor(),
    {
        'n_estimators': (1,5000),
        'max_depth': (1,100), 
    },
    n_iter=32,
    cv=3,
    n_jobs=-1
)

opt_RF.fit(X_train[0:100], y_train[0:100])

print("val. score: %s" % opt_RF.best_score_)
print("test score: %s" % opt_RF.score(X_test, y_test))
print("best params: %s" % str(opt_RF.best_params_))



val. score: 0.454184847416059
test score: 0.5569221850016266
best params: OrderedDict([('max_depth', 99), ('n_estimators', 462)])
Wall time: 1min 31s


In [20]:
# Re-create the three models; SVR and the random forest use the best
# hyper-parameters printed by the two small-subset searches above.
linear_optimize = LinearRegression()  # nothing to tune
regr_optimize = svm.SVR(
    kernel='rbf',
    C=22.93037014403077,
    gamma=0.0018194918088105397,
)
rf_optimize = RandomForestRegressor(max_depth=99, n_estimators=462)

In [21]:
%%time
linear_optimize.fit(X_train[0:25000],y_train[0:25000]) 
regr_optimize.fit(X_train[0:25000],y_train[0:25000])
rf_optimize.fit(X_train[0:25000],y_train[0:25000])

Wall time: 1min 39s


RandomForestRegressor(max_depth=99, n_estimators=462)

In [27]:

from sklearn.metrics import mean_squared_error

# Compare untuned and tuned models on the test split.
# The printed values are mean squared errors, so lower is better.
models_to_compare = (
    ("rf score:", rf),
    ("rf_optimize score:", rf_optimize),
    ("linear score:", linear),
    ("regr score:", regr),
    ("regr_optimize score:", regr_optimize),
)
for label, model in models_to_compare:
    print(label)
    print(mean_squared_error(y_test, model.predict(X_test)))

rf score:
26.707272821415568
rf_optimize score:
26.434845630505222
linear score:
32.839326405454266
regr score:
151.19669793706905
regr_optimize score:
28.564977267967507
