# Tuning hyperparameters

Author: Fadoua Ghourabi (fadouaghourabi@gmail.com)

Date: June 27, 2019

In [1]:
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

### Cross validation

Recall cross validation from the session _linear models_. Instead of splitting the data into one train set and one test set, cross validation splits data repeatedly and multiple models are trained. The most common cross validation is K-fold cross validation. When performing K-fold, the model is trained and tested on K partitions of the dats. 

In [2]:
iris = datasets.load_iris()
model = LogisticRegression()

# Perform 5-fold cross validation, i.e. cv = 5
scores = cross_val_score(model, iris.data, iris.target, cv=5, scoring='accuracy')
print("Cross validation score: {}".format(scores))
print("Cross validation mean score: {}".format(scores.mean()))
print("Cross validation std score: {}".format(scores.std()))



Cross validation score: [1.         0.96666667 0.93333333 0.9        1.        ]
Cross validation mean score: 0.9600000000000002
Cross validation std score: 0.038873012632301994




``cross_val_score`` computes the scores of the test sets. Since it is a classification problem, the score function is the mean accuracy on the given test set.
$$\text{accuracy} = \frac{\text{# correct predictions}}{\text{# of target values}}$$

**Ohno san asks "How do I know whether the model is overfitting or underfitting?".** In other words, how do I get the score of the training set so that we can compare with the score of the test set? Unfortunately, ``cross_val_score`` does not give such information, but we can get it manually as follows.

In [3]:
def evaluate_fitting_KFold(model, X, y, kfold):
    train_scores, test_scores = [], []
    cv = KFold(kfold) # split into k partitions of train and test sets
    
    # for each partition, we compute the train score and test score
    for train, test in cv.split(X):
        model.fit(X[train], y[train])
        train_scores.append(model.score(X[train], y[train]))
        test_scores.append(model.score(X[test], y[test]))

    mean_train_score = np.mean(train_scores)
    mean_test_score = np.mean(test_scores)
    
    return mean_train_score, mean_test_score

In [5]:
evaluate_fitting_KFold(LogisticRegression(), iris.data, iris.target, 5) # overfitting



(0.9316666666666666, 0.7533333333333333)

In [6]:
boston = datasets.load_boston()

In [11]:
evaluate_fitting_KFold(GradientBoostingRegressor(), boston.data, boston.target, 3) # overfitting

(0.9825743779053129, 0.5765448987355657)

In [9]:
diabetes = datasets.load_diabetes()

In [10]:
evaluate_fitting_KFold(Ridge(), diabetes.data, diabetes.target, 3) # very bad model: overfitting & underfitting

(0.4210754013322908, 0.40942743830329875)

### Grid search with cross validation

Grid search is an approach to hyperparameter tuning that will evaluate a model for each combination of algorithm hyperparameters specified in a grid.

In [12]:
Ridge().get_params().keys()

dict_keys(['alpha', 'copy_X', 'fit_intercept', 'max_iter', 'normalize', 'random_state', 'solver', 'tol'])

In [16]:
#dataset = datasets.load_diabetes()
dataset = datasets.load_boston()

# a range of alpha values to test
alphas = np.array([0.1,0.5,1,10,100])

# a range of solvers
solver = ['sag', 'saga','svd','sparse_cg','lsqr','cholesky']
param_grid = {'alpha': alphas, 'solver': solver}

# create and fit a ridge regression model
model = Ridge()

# test the model on diffrent alphas and solvers
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=2)
grid.fit(dataset.data, dataset.target)

# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)
print(grid.best_estimator_.solver)
print(grid.best_estimator_)

0.603264372336529
100.0
saga
Ridge(alpha=100.0, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='saga', tol=0.001)


In [17]:
grid.best_estimator_.score(X_test, y_test)

0.5911192572696201

In [18]:
results = pd.DataFrame(grid.cv_results_)
display(results)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_solver,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
0,0.023444,0.00561,0.000455,1e-05,0.1,sag,"{'alpha': 0.1, 'solver': 'sag'}",0.597338,0.414181,0.50576,0.091579,13
1,0.035548,0.010256,0.000533,2.3e-05,0.1,saga,"{'alpha': 0.1, 'solver': 'saga'}",0.60258,0.563901,0.58324,0.01934,6
2,0.002903,0.001993,0.000603,4e-05,0.1,svd,"{'alpha': 0.1, 'solver': 'svd'}",0.609503,-1.922376,-0.656436,1.26594,29
3,0.00142,0.000216,0.000501,0.000124,0.1,sparse_cg,"{'alpha': 0.1, 'solver': 'sparse_cg'}",0.555546,-0.671501,-0.057977,0.613524,23
4,0.001697,0.00029,0.000369,7.6e-05,0.1,lsqr,"{'alpha': 0.1, 'solver': 'lsqr'}",0.555542,-0.671501,-0.05798,0.613521,24
5,0.000742,0.000206,0.00037,5.9e-05,0.1,cholesky,"{'alpha': 0.1, 'solver': 'cholesky'}",0.609503,-1.922376,-0.656436,1.26594,30
6,0.027872,0.010077,0.000485,9.1e-05,0.5,sag,"{'alpha': 0.5, 'solver': 'sag'}",0.596884,0.416739,0.506811,0.090072,12
7,0.041539,0.009535,0.000619,7.1e-05,0.5,saga,"{'alpha': 0.5, 'solver': 'saga'}",0.602732,0.563821,0.583276,0.019455,5
8,0.001404,0.00051,0.000433,4e-06,0.5,svd,"{'alpha': 0.5, 'solver': 'svd'}",0.604466,-1.298929,-0.347232,0.951698,27
9,0.001756,0.000165,0.000439,7.8e-05,0.5,sparse_cg,"{'alpha': 0.5, 'solver': 'sparse_cg'}",0.555709,-0.651947,-0.048119,0.603828,21


**Practice.** Find the best score using gradient boosting tree model? Compare with team mates.

In [80]:
GradientBoostingRegressor()

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [34]:
#dataset = datasets.load_diabetes()
dataset = datasets.load_boston()

# a range of learning_rates values
learning_rates = np.arange(0.1,1,0.1)

# a range of number of features to consider when looking for the best split
max_features = ['sqrt', 'log2','auto',2,3,5,6,7,8,9,10,11,12]

# a range of number of trees
n_estimators = np.arange(10,100,5)

param_grid = {'learning_rate': learning_rates, 'max_features': max_features, 'n_estimators': n_estimators}

# create and fit a Gradient boosting regressor regression model
model = GradientBoostingRegressor()

# test the model on diffrent alphas and solvers
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=2)
grid.fit(dataset.data, dataset.target)

# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.learning_rate)
print(grid.best_estimator_.max_features)
print(grid.best_estimator_.n_estimators)
print(grid.best_estimator_)

0.7594051136079744
0.30000000000000004
7
95
GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.30000000000000004, loss='ls',
                          max_depth=3, max_features=7, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=95,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)


### Random search with cross validation

Random search is an approach to parameter tuning that will sample algorithm hyperparameters from a random distribution (i.e. uniform) for a fixed number of iterations. A model is constructed and evaluated for each combination of hyperparameters choice. See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html

In [174]:
# load the diabetes datasets
dataset = datasets.load_diabetes()
# alpha is sampled from a uniform distribution
rand_params = {'alpha': uniform()}

# create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator=model, param_distributions=rand_params, n_iter=100, cv=5)
rsearch.fit(dataset.data, dataset.target)

# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

0.4822780849355016
0.001008132834983133
