# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# Read the cleaned training set saved after the wrangling / pre-processing of block I.
# Columns listed here are removed before modelling.
drop_columns = [
    'Unnamed: 0',
    'Unnamed: 0.1',
    'Unnamed: 0.1.1',
    'key',
    'pickup_datetime',
    'pickup_date',
    'pickup_latitude_round3',
    'pickup_longitude_round3',
    'dropoff_latitude_round3',
    'dropoff_longitude_round3',
]
# Load -> drop the unwanted columns -> one-hot encode, in a single pipeline.
X = pd.get_dummies(
    pd.read_csv('../../DATA/train_cleaned.csv').drop(columns=drop_columns)
)
# Split off the regression target (the fare) from the feature matrix.
y = X['fare_amount']
X = X.drop(columns=['fare_amount'])

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is an AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.



In [3]:
# install 
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.8.1-py2.py3-none-any.whl (101 kB)
Collecting pyaml>=16.9
  Downloading pyaml-20.4.0-py2.py3-none-any.whl (17 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-20.4.0 scikit-optimize-0.8.1


### E 2.1 Bayesian Optimization of a Random Forest Regression Model
Use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare:
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SVM will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use Colab to run the job for up to 12h


In [7]:
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

from sklearn.metrics import f1_score

from sklearn.model_selection import train_test_split

# Keep only 1% of the rows for training so the hyper-parameter search stays affordable;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.01,
    random_state=0,
)

In [8]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.01, random_state=0)

from sklearn.svm import SVR

optSVC = BayesSearchCV(
    SVR(kernel='rbf'),
    {
        'C': Real(1e-3, 1e3),#, prior='log-uniform'),
        'gamma': ['scale', 'auto'],
    },
    n_iter=32,
    random_state=0,
    n_jobs=-1
)

# executes bayesian optimization
_ = optSVC.fit(X_train, y_train)

Wall time: 6min 35s


In [9]:
# The fitted search object can be saved, used for predictions, or scored directly;
# here it is scored on the held-out split.
print(optSVC.score(X=X_test, y=y_test))

0.6572228144556258


In [10]:
# Show the hyper-parameter combination the SVR search selected.
optSVC.best_params_

OrderedDict([('C', 97.36142832341862), ('gamma', 'auto')])

In [4]:
%%time

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, random_state=0)

Wall time: 273 ms


In [None]:
%%time

from sklearn.svm import SVR

mySVR = SVR(kernel='rbf', gamma='auto', C=97.36142832341862)

mySVR.fit(X_train, y_train)

from joblib import dump, load
dump(mySVR, 'mySVR0.1.joblib')

In [5]:
from joblib import dump, load

# Reload the SVR persisted on disk rather than retraining it.
mySVR = load('mySVR0.1.joblib')

# Predict on the training split first, then on the held-out split.
y_train_predSVR, y_test_predSVR = (
    mySVR.predict(features) for features in (X_train, X_test)
)

In [6]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the SVR on the training split.
mean_squared_error(y_true=y_train, y_pred=y_train_predSVR)

14.58636471868495

In [7]:
# Mean squared error of the SVR on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_test_predSVR)

25.2940772129688

In [None]:
%%time


X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.01, random_state=0)

from sklearn.ensemble import RandomForestRegressor

optRFR = BayesSearchCV(
    RandomForestRegressor(),
    {
        'n_estimators': Integer(10, 1000),
        'max_depth': Integer(3, 30),
    },
    n_iter=32,
    random_state=0,
    n_jobs=-1
)

# executes bayesian optimization
_ = optRFR.fit(X_train, y_train)

In [None]:
# The fitted search object can be saved, used for predictions, or scored directly;
# here it is scored on the held-out split.
print(optRFR.score(X=X_test, y=y_test))

In [None]:
# Show the hyper-parameter combination the random-forest search selected.
optRFR.best_params_

In [8]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=0)

Wall time: 127 ms


In [None]:
%%time

myRFR = RandomForestRegressor(n_estimators = 639, max_depth=6, n_jobs=-1)

myRFR.fit(X_train, y_train)

from joblib import dump, load
dump(myRFR, 'myRFR.joblib')

In [9]:
from joblib import dump, load

# Reload the random forest persisted on disk rather than retraining it.
myRFR = load('myRFR.joblib')

# Predict on the training split first, then on the held-out split.
y_train_predRFR, y_test_predRFR = (
    myRFR.predict(features) for features in (X_train, X_test)
)

In [10]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the random forest on the training split.
mean_squared_error(y_true=y_train, y_pred=y_train_predRFR)

18.776169173548443

In [11]:
# Mean squared error of the random forest on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_test_predRFR)

21.49330474971364

In [12]:
%%time

from sklearn.linear_model import LinearRegression

LinReg = LinearRegression(n_jobs=-1)

LinReg.fit(X_train, y_train)



Wall time: 229 ms


LinearRegression(n_jobs=-1)

In [13]:
%%time

y_train_predLinReg = LinReg.predict(X_train)

y_test_predLinReg = LinReg.predict(X_test)

Wall time: 56 ms


In [14]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the linear model on the training split.
mean_squared_error(y_true=y_train, y_pred=y_train_predLinReg)

25.44901729209009

In [15]:
# Mean squared error of the linear model on the held-out split.
mean_squared_error(y_true=y_test, y_pred=y_test_predLinReg)

29.393393042065448