# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [0]:
import pandas as pd
import numpy as np
import folium


In [7]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: True
fatal: destination path 'DataScienceSS20' already exists and is not an empty directory.


In [0]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv(path+'/DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [9]:
# install 
!pip install scikit-optimize



### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [0]:
mask = X['pickup_year']< 2015
X_train = X[mask] 
y_train = y[mask]

X_test = X[mask==False] 
y_test = y[mask==False]

In [0]:
from skopt import BayesSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from skopt.space import Real, Categorical, Integer

pipe = Pipeline([('model', LinearRegression())])

rf_search = {
    'model': Categorical([RandomForestRegressor()]),
    'model__n_estimators': Integer(10, 100, 'log-uniform'),
    
}

# explicit dimension classes can be specified like this
svr_search = {
    'model': Categorical([SVR()]),
    'model__C': Real(1e-6, 1e+6, prior='log-uniform'),
    'model__gamma': Real(1e-6, 1e+1, prior='log-uniform'),
}

lin_search = {
    'model' : Categorical([LinearRegression()]),
}


clf = BayesSearchCV(
     pipe,
     [(svr_search, 10), (rf_search, 10), (lin_search,1)],
     cv=5,
     n_iter=32,
     n_jobs = 4,
     scoring='neg_mean_squared_error'
 )

In [13]:
clf.fit(X_train[:10000], y_train[:10000])

BayesSearchCV(cv=5, error_score='raise',
              estimator=Pipeline(memory=None,
                                 steps=[('model',
                                         LinearRegression(copy_X=True,
                                                          fit_intercept=True,
                                                          n_jobs=None,
                                                          normalize=False))],
                                 verbose=False),
              fit_params=None, iid=True, n_iter=32, n_jobs=4, n_points=1,
              optimizer_kwargs=None, pre_dispatch='2*n_jobs', random_state=None,
              refit=True, return_train_score=False,
              scoring='neg_me...
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False),), prior=None),
                               'model__n_estima

In [14]:
clf.cv_results_

defaultdict(list,
            {'mean_fit_time': [16.35885877609253,
              15.158479309082031,
              15.475470638275146,
              14.957780265808106,
              15.616298389434814,
              15.595632457733155,
              18.431623458862305,
              15.434605407714844,
              16.342826938629152,
              15.054692602157592,
              12.216678237915039,
              14.622124862670898,
              12.247656393051148,
              8.175550031661988,
              3.152827596664429,
              13.902809333801269,
              2.1337278842926026,
              14.313216352462769,
              14.431573104858398,
              6.308212089538574,
              0.03404159545898437],
             'mean_score_time': [2.5338961124420165,
              2.539213466644287,
              2.406764507293701,
              2.7406782627105715,
              2.5255290031433106,
              2.5451337814331056,
              2.390886354446411,

In [15]:
clf.best_params_

OrderedDict([('model',
              RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                                    max_depth=None, max_features='auto', max_leaf_nodes=None,
                                    max_samples=None, min_impurity_decrease=0.0,
                                    min_impurity_split=None, min_samples_leaf=1,
                                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                                    n_estimators=75, n_jobs=None, oob_score=False,
                                    random_state=None, verbose=0, warm_start=False)),
             ('model__n_estimators', 75)])