# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [1]:
import pandas as pd
import numpy as np
import folium


In [2]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv('../../DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [3]:
# install 
!pip install scikit-optimize



In [4]:
#print(X)
#print(y)

### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [5]:
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
#test mit 5000 Trainingsdaten

X_train, X_test, y_train, y_test = train_test_split(X.to_numpy()[:20000,:], y.to_numpy()[:20000], shuffle=False)

In [6]:
print(len(X))
print(len(y))


print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))


400000
400000
15000
15000
5000
5000


### Random Forrest Regressor

In [7]:
import warnings
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

from skopt import BayesSearchCV
from sklearn.ensemble import RandomForestRegressor

random_opt = BayesSearchCV( 
         RandomForestRegressor(),
         {
             'n_estimators':[50, 150, 200, 250, 300, 350],
             'max_depth':[2, 4, 6, 8, 10, 12, 14]
         },
         n_iter=32,
         random_state=0,
         n_jobs=-1,
         cv=3,
         #scoring='neg_mean_squared_error'
     )

random_optimal = random_opt.fit(X_train, y_train)
print(random_opt.score(X_test, y_test))


0.8223070207279266


In [8]:
print(random_optimal.best_score_)
print(random_optimal.best_params_)


0.7974348648123033
OrderedDict([('max_depth', 10), ('n_estimators', 300)])


### SVR

In [9]:
from sklearn.svm import SVR

svr_opt = BayesSearchCV(
    SVR(),
    {
        'C': [0.1, 0.2, 0.3, 0.5, 1.0],
        'gamma': [0.1, 0.01, 0.001, 0.0001, 1.0],
        'kernel': ['rbf'],
    },
    n_iter=32,
    random_state=0,
    n_jobs=-1,
    pre_dispatch='n_jobs'
)


svr_optimal = svr_opt.fit(X_train, y_train)
print(svr_opt.score(X_test, y_test))




0.7528765033343172


In [10]:
svr_optimal.best_score_
svr_optimal.best_params_


OrderedDict([('C', 1.0), ('gamma', 0.001), ('kernel', 'rbf')])

### Linear Regression

In [11]:
from sklearn.linear_model import LinearRegression
linear = LinearRegression()
linear.fit(X_train,y_train)
print(linear.score(X_test, y_test))


0.7404218066372036


## Auswertung Parameter

In [12]:
#Random Forrester mit bestem param

print(random_optimal.best_params_)


OrderedDict([('max_depth', 10), ('n_estimators', 300)])


In [13]:
rnd_best = RandomForestRegressor(
    n_estimators=150,
    max_depth=8)

In [14]:
rnd_best_best = rnd_best.fit(X_train, y_train)
print(rnd_best_best)
print(rnd_best_best.score(X_test, y_test))

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=8, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=150, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)
0.8206656912205776


In [15]:
#df = pd.DataFrame(random_optimal.cv_results_)
#print(df)

In [16]:
#SVR mit bestem param
svr_optimal.best_params_


OrderedDict([('C', 1.0), ('gamma', 0.001), ('kernel', 'rbf')])

In [17]:
svr_best = SVR(C=1.0,
        gamma= 0.001,
        kernel= 'rbf')

In [18]:
svr_best_best = svr_best.fit(X_train, y_train)
print(svr_best_best)
print(svr_best.score(X_test, y_test))


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.001,
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)
0.7528765033343172


In [19]:
#df_svr = pd.DataFrame(svr_optimal.cv_results_)
#print(df_svr)