# Block 6 Exercise 2: finding the best parameters for predicting the fare of taxi rides
We return to our Random Forest Regression and want to automatically optimize all free parameters ...

In [0]:
import pandas as pd
import numpy as np
import folium


In [39]:
#check if notebook runs in colab
import sys
IN_COLAB = 'google.colab' in sys.modules
print('running in Colab:',IN_COLAB)
path='..'
if IN_COLAB:
  #in colab, we need to clone the data from the repo
  !git clone https://github.com/keuperj/DataScienceSS20.git
  path='DataScienceSS20'

running in Colab: True
fatal: destination path 'DataScienceSS20' already exists and is not an empty directory.


In [0]:
# we load the data we have saved after wrangling and pre-processing in block I
X=pd.read_csv(path+'/DATA/train_cleaned.csv')
drop_columns=['Unnamed: 0','Unnamed: 0.1','Unnamed: 0.1.1','key','pickup_datetime','pickup_date','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3']
X=X.drop(drop_columns,axis=1)
X=pd.get_dummies(X)# one hot coding
#generate labels
y=X['fare_amount']
X=X.drop(['fare_amount'],axis=1)

### Scikit Optimize
Scikit Optimize (https://scikit-optimize.github.io/stable/index.html) is a AutoML toolbox wrapped around Scikit-Learn. It allows us to use state-of-the-art automatic hyper-parameter optimization on top of our learning algorithms.   



In [4]:
# install 
!pip install scikit-optimize

Collecting scikit-optimize
[?25l  Downloading https://files.pythonhosted.org/packages/5c/87/310b52debfbc0cb79764e5770fa3f5c18f6f0754809ea9e2fc185e1b67d3/scikit_optimize-0.7.4-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 17.0MB/s eta 0:00:01[K     |████████▏                       | 20kB 1.7MB/s eta 0:00:01[K     |████████████▎                   | 30kB 2.1MB/s eta 0:00:01[K     |████████████████▎               | 40kB 2.3MB/s eta 0:00:01[K     |████████████████████▍           | 51kB 1.9MB/s eta 0:00:01[K     |████████████████████████▌       | 61kB 2.2MB/s eta 0:00:01[K     |████████████████████████████▌   | 71kB 2.3MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 2.2MB/s 
Collecting pyaml>=16.9
  Downloading https://files.pythonhosted.org/packages/15/c4/1310a054d33abc318426a956e7d6df0df76a6ddfa9c66f6310274fb75d42/pyaml-20.4.0-py2.py3-none-any.whl
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-

### E 2.1 Bayesian Optimization of a Random Forest Regression Model
use Bayesian Optimization with Cross-Validation (https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html#skopt.BayesSearchCV) to find the best regression model. Compare
* linear regression (https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression) 
* Random Forest regression (https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor)
* and SVM regression (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR)

NOTES: this can become quite compute intensive! Hence,
* use a smaller subset of the training data to run the experiments 
* think about the range of your parameters (e.g. larger number of trees in RF or high C-values in SMV will make models expensive)
* optimize only the following parameters per model type:
    * linear: no parameters to optimize
    * RF: #trees and depth
    * SVM: C and gamma (use RBF kernel)
* parallelize -> n_jobs
* use CoLab to rum the job for up to 12h 


In [0]:
from skopt import BayesSearchCV
# parameter ranges are specified by one of below
from skopt.space import Real, Categorical, Integer

from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, test_size=0.02, random_state=0)

In [42]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40000 entries, 39831 to 326041
Data columns (total 31 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   pickup_longitude               40000 non-null  float64
 1   pickup_latitude                40000 non-null  float64
 2   dropoff_longitude              40000 non-null  float64
 3   dropoff_latitude               40000 non-null  float64
 4   passenger_count                40000 non-null  int64  
 5   pickup_day                     40000 non-null  int64  
 6   pickup_hour                    40000 non-null  int64  
 7   pickup_day_of_week             40000 non-null  int64  
 8   pickup_month                   40000 non-null  int64  
 9   pickup_year                    40000 non-null  int64  
 10  is_pickup_JFK                  40000 non-null  int64  
 11  is_dropoff_JFK                 40000 non-null  int64  
 12  is_pickup_EWR                  40000 non-

In [0]:
# log-uniform: understand as search over p = exp(x) by varying x
opt = BayesSearchCV(
    SVC(),
    {
        'C': Real(1e-6, 1e+6, prior='log-uniform'),
        'gamma': Real(1e-6, 1e+1, prior='log-uniform'),
        'degree': Integer(1,8),
        'kernel': Categorical(['linear', 'poly', 'rbf']),
    },
    n_iter=32,
    random_state=0
)

In [29]:
# executes bayesian optimization
_ = opt.fit(X_train, y_train)

ValueError: ignored

In [0]:
# model can be saved, used for predictions or scoring
print(opt.score(X_test, y_test))

In [23]:
pip install bayesian-optimization

Collecting bayesian-optimization
  Downloading https://files.pythonhosted.org/packages/bb/7a/fd8059a3881d3ab37ac8f72f56b73937a14e8bb14a9733e68cc8b17dbe3c/bayesian-optimization-1.2.0.tar.gz
Building wheels for collected packages: bayesian-optimization
  Building wheel for bayesian-optimization (setup.py) ... [?25l[?25hdone
  Created wheel for bayesian-optimization: filename=bayesian_optimization-1.2.0-cp36-none-any.whl size=11685 sha256=8c4fa5e1fdc4481bebacb52e52f7a07f5c725fdd8a99fcaba19ad16a510cdc13
  Stored in directory: /root/.cache/pip/wheels/5a/56/ae/e0e3c1fc1954dc3ec712e2df547235ed072b448094d8f94aec
Successfully built bayesian-optimization
Installing collected packages: bayesian-optimization
Successfully installed bayesian-optimization-1.2.0


In [0]:
#Import libraries
import pandas as pd
import numpy as np
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [0]:
#Bayesian optimization
def bayesian_optimization(X_train, y_train, X_test, y_test, function, parameters):
   n_iterations = 5
   gp_params = {"alpha": 1e-4}

   BO = BayesianOptimization(function, parameters)
   BO.maximize(n_iter=n_iterations, **gp_params)

   return BO.max

In [0]:
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.1, test_size=0.02, random_state=0)

In [71]:
#np.all(np.isfinite(X_train))
#np.any(np.isnan(X_train))
# np.nan_to_num(y_train)
# np.nan_to_num(X_train)
# np.nan_to_num(y_test)
# np.nan_to_num(X_test)
#np.isnan(X_test)
np.where(X_train.values >= np.finfo(np.float64).max)
np.where(X_test.values >= np.finfo(np.float64).max)
np.where(y_train.values >= np.finfo(np.float64).max)
np.where(y_test.values >= np.finfo(np.float64).max)

(array([], dtype=int64), array([], dtype=int64))

In [72]:
function, parameters = rfc_optimization(50) 
bayesian_optimization(X_train, y_train, X_test, y_test, function, parameters) #always same error... no matter if check and replace of nan and inf
## same as here: https://www.kaggle.com/c/word2vec-nlp-tutorial/discussion/60665 and https://datascience.stackexchange.com/questions/11928/valueerror-input-contains-nan-infinity-or-a-value-too-large-for-dtypefloat32

|   iter    |  target   | max_depth | min_sa... | n_esti... |
-------------------------------------------------------------
| [0m 1       [0m | [0m nan     [0m | [0m 6.484   [0m | [0m 7.144   [0m | [0m 220.3   [0m |
| [0m 2       [0m | [0m nan     [0m | [0m 36.6    [0m | [0m 8.962   [0m | [0m 747.3   [0m |
| [0m 3       [0m | [0m nan     [0m | [0m 89.29   [0m | [0m 9.767   [0m | [0m 594.7   [0m |
| [0m 4       [0m | [0m nan     [0m | [0m 94.82   [0m | [0m 4.547   [0m | [0m 496.0   [0m |
| [0m 5       [0m | [0m nan     [0m | [0m 35.87   [0m | [0m 6.551   [0m | [0m 363.6   [0m |


ValueError: ignored

In [0]:
def rfc_optimization(cv_splits):
    def function(n_estimators, max_depth, min_samples_split):
        return cross_val_score(
               RandomForestClassifier(
                   n_estimators=int(max(n_estimators,0)),                                                               
                   max_depth=int(max(max_depth,1)),
                   min_samples_split=int(max(min_samples_split,2)), 
                   n_jobs=-1, 
                   random_state=42,   
                   class_weight="balanced"),  
               X=X_train, 
               y=y_train, 
               cv=cv_splits,
               scoring="roc_auc",
               n_jobs=-1).mean()

    parameters = {"n_estimators": (10, 1000),
                  "max_depth": (1, 150),
                  "min_samples_split": (2, 10)}
    
    return function, parameters

In [0]:
def xgb_optimization(cv_splits, eval_set):
    def function(eta, gamma, max_depth):
            return cross_val_score(
                   xgb.XGBClassifier(
                       objective="binary:logistic",
                       learning_rate=max(eta, 0),
                       gamma=max(gamma, 0),
                       max_depth=int(max_depth),                                               
                       seed=42,
                       nthread=-1,
                       scale_pos_weight = len(y_train[y_train == 0])/
                                          len(y_train[y_train == 1])),  
                   X=X_train, 
                   y=y_train, 
                   cv=cv_splits,
                   scoring="roc_auc",
                   fit_params={
                        "early_stopping_rounds": 10, 
                        "eval_metric": "auc", 
                        "eval_set": eval_set},
                   n_jobs=-1).mean()

    parameters = {"eta": (0.001, 0.4),
                  "gamma": (0, 20),
                  "max_depth": (1, 2000)}
    
    return function, parameters

In [0]:
#Train model
def train(X_train, y_train, X_test, y_test, function, parameters):
    dataset = (X_train, y_train, X_test, y_test)
    cv_splits = 4
    
    best_solution = bayesian_optimization(dataset, function, parameters)      
    params = best_solution["params"]

    model = RandomForestClassifier(
             n_estimators=int(max(params["n_estimators"], 0)),
             max_depth=int(max(params["max_depth"], 1)),
             min_samples_split=int(max(params["min_samples_split"], 2)), 
             n_jobs=-1, 
             random_state=42,   
             class_weight="balanced")

    model.fit(X_train, y_train)
    
    return model