# Tuning parameters

Author: Fadoua Ghourabi (fadouaghourabi@gmail.com)

Date: June 20, 2019

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import uniform
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, LinearRegression, LogisticRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

### Cross validation

Recall cross validation from the session _linear models_. Instead of splitting the data into one train set and one test set, cross validation splits the data repeatedly and trains multiple models. The most common form is K-fold cross validation: the data is divided into K partitions (folds), and the model is trained and tested K times, each time using a different fold as the test set.

In [None]:
# Load the iris data set and create the classifier to be cross-validated
iris = datasets.load_iris()
model = LogisticRegression()

# 5-fold cross validation (cv=5): one accuracy score per test fold
scores = cross_val_score(
    model,
    iris.data,
    iris.target,
    scoring='accuracy',
    cv=5,
)
mean_score = scores.mean()
std_score = scores.std()

print("Cross validation score: {}".format(scores))
print("Cross validation mean score: {}".format(mean_score))
print("Cross validation std score: {}".format(std_score))

``cross_val_score`` computes the scores of the test sets. Since it is a classification problem, the score function is the mean accuracy on the given test set.
$$\text{accuracy} = \frac{\text{number of correct predictions}}{\text{number of target values}}$$

**Ohno san asks "How do I know whether the model is overfitting or underfitting?".** In other words, how do we get the score on the training set so that we can compare it with the score on the test set? Unfortunately, ``cross_val_score`` does not give such information, but we can compute it manually as follows.

In [None]:
def evaluate_fitting_KFold(model, X, y, kfold, *, shuffle=False, random_state=None):
    """Return the mean train score and mean test score of ``model`` over K folds.

    The data is split into ``kfold`` train/test partitions with ``KFold``.
    For each partition the model is refitted on the train part and scored,
    with its own ``score`` method, on both the train part and the test part.
    Comparing the two means helps to judge the fit: a train score well above
    the test score hints at overfitting, two low scores hint at underfitting.

    Parameters
    ----------
    model : estimator
        Object providing ``fit(X, y)`` and ``score(X, y)``. It is fitted in
        place, so after the call it holds the fit of the last fold.
    X, y : array-like
        Features and targets. Both must support indexing with an array of
        row indices (``X[train]``), e.g. NumPy arrays.
    kfold : int
        Number of folds.
    shuffle : bool, default False
        Passed to ``KFold``. The default keeps the rows in their original
        order; set it to True when the data is sorted (e.g. by class) so the
        folds are not made of consecutive, unrepresentative rows.
    random_state : int or None, default None
        Passed to ``KFold``; seeds the shuffling when ``shuffle`` is True.

    Returns
    -------
    tuple
        ``(mean_train_score, mean_test_score)``.
    """
    # split into k partitions of train and test sets
    cv = KFold(n_splits=kfold, shuffle=shuffle, random_state=random_state)

    train_scores, test_scores = [], []
    # for each partition, refit the model and record both scores
    for train, test in cv.split(X):
        model.fit(X[train], y[train])
        train_scores.append(model.score(X[train], y[train]))
        test_scores.append(model.score(X[test], y[test]))

    return np.mean(train_scores), np.mean(test_scores)

In [None]:
# Logistic regression on iris, 4 folds -> (mean train score, mean test score)
evaluate_fitting_KFold(
    model=LogisticRegression(),
    X=iris.data,
    y=iris.target,
    kfold=4,
)  # overfitting

In [None]:
# Load the Boston data set used by the regression example in the next cell.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm that the installed scikit-learn version still provides it.
boston = datasets.load_boston()

In [None]:
# Gradient boosting on Boston, 3 folds -> (mean train score, mean test score)
evaluate_fitting_KFold(
    model=GradientBoostingRegressor(),
    X=boston.data,
    y=boston.target,
    kfold=3,
)  # overfitting

In [None]:
# Load the diabetes data set used by the Ridge example in the next cell
diabetes = datasets.load_diabetes()

In [None]:
# Ridge regression on diabetes, 3 folds -> (mean train score, mean test score)
evaluate_fitting_KFold(
    model=Ridge(),
    X=diabetes.data,
    y=diabetes.target,
    kfold=3,
)  # very bad model: overfitting & underfitting

### Grid search with cross validation

Grid search is an approach to parameter tuning that will evaluate a model for each combination of algorithm parameters specified in a grid.

In [None]:
# List the names of the parameters of a default Ridge estimator; these are the
# keys that can be used in a parameter grid
Ridge().get_params().keys()

In [None]:
# Alternative data set: dataset = datasets.load_diabetes()
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 -- confirm the installed version still provides it, or use the
# alternative above.
dataset = datasets.load_boston()

# keep a test split aside so the tuned model can be scored on unseen data
X_train, X_test, y_train, y_test = train_test_split(dataset.data, dataset.target, random_state=0)

# a range of alpha values to test
alphas = np.array([0.1, 0.5, 1, 10, 100])

# a range of solvers
solver = ['sag', 'saga', 'svd', 'sparse_cg', 'lsqr', 'cholesky']
param_grid = {'alpha': alphas, 'solver': solver}

# create the ridge regression model whose parameters are tuned
model = Ridge()

# test the model on different alphas and solvers with 3-fold cross validation.
# The search is fitted on the train split only: fitting on the full data set
# would leak the test rows into the search and make any later score on
# X_test / y_test optimistic.
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='r2', cv=3)
grid.fit(X_train, y_train)

# summarize the results of the grid search
print(grid.best_score_)
print(grid.best_estimator_.alpha)
print(grid.best_estimator_.solver)
print(grid.best_estimator_)

In [None]:
# Score of the best estimator found by the grid search on the X_test / y_test split
best_model = grid.best_estimator_
best_model.score(X_test, y_test)

In [None]:
# Put the cross validation results recorded by the grid search into a table
results = pd.DataFrame(grid.cv_results_)
# display() is provided by the notebook environment and renders the table
display(results)

**Practice.** Find the best score using a gradient boosting tree model. Compare your result with your teammates.

In [None]:
# Starting point for the practice: a gradient boosting regressor created with
# its default parameters
GradientBoostingRegressor()

### Random search with cross validation

Random search is an approach to parameter tuning that samples algorithm parameters from a random distribution (e.g. uniform) for a fixed number of iterations. A model is constructed and evaluated for each combination of parameters chosen. See: https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html

In [None]:
# load the diabetes data set
dataset = datasets.load_diabetes()

# alpha is sampled from a uniform distribution; scipy's uniform() with its
# default arguments covers the interval [0, 1]
rand_params = {'alpha': uniform()}

# create a ridge regression model and evaluate 100 randomly sampled alpha
# values with 5-fold cross validation. random_state seeds the sampling so that
# the same alphas, and therefore the same result, are obtained on every run.
model = Ridge()
rsearch = RandomizedSearchCV(
    estimator=model,
    param_distributions=rand_params,
    n_iter=100,
    cv=5,
    random_state=0,
)
rsearch.fit(dataset.data, dataset.target)

# summarize the results of the random parameter search
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)