## tuning hyperparameters

hyperparameters are the variables that will determine how the model <br>
will be trained, in order to output the results that we need <br>
we have to *tune* them

In [1]:
# creating the evaluating function
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

def evaluate_regression_preds(y_true, y_preds):
    """
    evaluate a regression model performance based on four criteria:
    accuracy, r2_score, MAE and MSE. I print the scores and create
    a dict with the values in it
    """
    r_squared = r2_score(y_true, y_preds)
    mae = mean_absolute_error(y_true, y_preds)
    mse = mean_squared_error(y_true, y_preds)
    metrics_dict = {'R squared': round(r_squared, 2),
                    'mean absolute error': round(mae, 2),
                    'mean squared error': round(mse, 2)}

    print(f'R squared score: {r_squared * 100:.2f}%')
    print(f'mean absolute error: {mae * 100:.2f}')
    print(f'mean squared error: {mse * 100:.2f}')
    
    return metrics_dict

In [2]:
# importing the hungarian chickenpox dataset
import pandas as pd
chickenpox = pd.read_csv('data/hungary_chickenpox.csv')
chickenpox.head()

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,JASZ,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,130,57,2,178,66,64,11,29,87,68
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,80,50,29,141,48,29,58,53,68,26
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,64,46,4,157,33,33,24,18,62,44
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,63,54,14,107,66,50,25,21,43,31
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,61,49,11,124,63,56,7,47,85,60


In [3]:
# creating a column with the total of each case per day 
chickenpox['total'] = chickenpox.sum(axis = 1)
chickenpox.head()

Unnamed: 0,Date,BUDAPEST,BARANYA,BACS,BEKES,BORSOD,CSONGRAD,FEJER,GYOR,HAJDU,...,KOMAROM,NOGRAD,PEST,SOMOGY,SZABOLCS,TOLNA,VAS,VESZPREM,ZALA,total
0,03/01/2005,168,79,30,173,169,42,136,120,162,...,57,2,178,66,64,11,29,87,68,1807
1,10/01/2005,157,60,30,92,200,53,51,70,84,...,50,29,141,48,29,58,53,68,26,1407
2,17/01/2005,96,44,31,86,93,30,93,84,191,...,46,4,157,33,33,24,18,62,44,1284
3,24/01/2005,163,49,43,126,46,39,52,114,107,...,54,14,107,66,50,25,21,43,31,1255
4,31/01/2005,122,78,53,87,103,34,95,131,172,...,49,11,124,63,56,7,47,85,60,1478


In [4]:
from sklearn.ensemble import RandomForestRegressor

rg = RandomForestRegressor()

rg.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

np.random.seed(10)

X = chickenpox.drop(['total', 'Date'], axis = 1)
y = chickenpox.total

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .15)

rg = RandomForestRegressor(n_estimators = 200)

rg.fit(X_train, y_train)

# making baseline predictions
y_preds = rg.predict(X_test)

baseline_metrics = evaluate_regression_preds(y_test, y_preds)

R squared score: 95.10%
mean absolute error: 7412.79
mean squared error: 1415121.74


## using RandomizedSearchCV

In [6]:
from sklearn.model_selection import RandomizedSearchCV

# set up the grid
grid = {'n_estimators': [100, 200, 500, 800, 1100],
        'max_features': ['auto', 'sqrt', 'log2'],
        'min_samples_split': [2, 4, 6],
        'min_samples_leaf': [1, 2, 3, 4]}

# make results reproducible
np.random.seed(10)

# split into X nd y
X = chickenpox.drop(['total', 'Date'], axis = 1)
y = chickenpox.total

# split into train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .15)

# instantiate the model
rg = RandomForestRegressor()

# instantiating the random search
rs_rg = RandomizedSearchCV(estimator = rg,
                            param_distributions = grid,
                            n_iter = 5, # n of different evaluations
                            cv = 5, # cross-fold evaluation, total fits: n_iter * cv
                            verbose = 2) # verbose indicate how much
                                         # information to output while 
                                         # fitting
# fitting the random search 
rs_rg.fit(X_train, y_train);

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.3s
[CV] END max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.5s
[CV] END max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   0.7s
[CV] END max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=500; total time=   5.4s
[CV] END max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=500; total time=   3.4s
[CV] END max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=500; total time=   2.6s
[CV] END max_features=auto, min_samples_leaf=4, min_samples_split=6, n_estimators=500; total

In [7]:
rs_rg.best_params_

{'n_estimators': 800,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt'}

In [8]:
from sklearn.model_selection import GridSearchCV

grid_2 = {'n_estimators': [200, 800],
          'min_samples_split': [2],
          'min_samples_leaf': [1, 2, 3],
          'max_features': ['auto', 'sqrt']}

# GridSearchCV is very similar to RandomSearchCV
gs_rg = GridSearchCV(estimator = rg,
                     param_grid = grid_2,
                     cv = 5,
                     verbose = 2)

gs_rg.fit(X_train, y_train);

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.5s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.1s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.3s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=   1.0s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   5.2s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   5.3s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; total time=   5.3s
[CV] END max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=800; tota

In [9]:
gs_rg.best_params_

{'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 200}

## saving and loading our model

**using pickle**

In [22]:
import pickle as pk

# saving our model
# wb stand for "write binary"
pk.dump(gs_rg, open('random_forest_regressor_model.pkl', 'wb'))

# loading our model
# rb stands for "reading binary"
loaded_pk_model = pk.load(open('random_forest_regressor_model.pkl', 'rb'))

In [23]:
pk_y_preds = loaded_pk_model.predict(X_test)
pickle_model_metrics = evaluate_regression_preds(y_test, pk_y_preds);

R squared score: 96.03%
mean absolute error: 6705.04
mean squared error: 1146566.21


**using joblib**

In [18]:
from joblib import dump, load

# saving our model
dump(gs_rg, filename='random_forest_regressor_model.joblib');

In [19]:
loaded_joblib_model = load(filename='random_forest_regressor_model.joblib')

In [20]:
joblib_y_preds = loaded_joblib_model.predict(X_test)
joblib_model_metrics = evaluate_regression_preds(y_test, joblib_y_preds)

R squared score: 96.03%
mean absolute error: 6705.04
mean squared error: 1146566.21


In [24]:
pd.DataFrame(data = {'pickle model': pickle_model_metrics,
                     'joblib model': joblib_model_metrics})

Unnamed: 0,pickle model,joblib model
R squared,0.96,0.96
mean absolute error,67.05,67.05
mean squared error,11465.66,11465.66
