# Importance of cross-validation for hyper-parameter selection
Cross-validation is a statistical method used to estimate the skill of machine learning models, as mentioned <a href="https://machinelearningmastery.com/k-fold-cross-validation/">here</a>.
It is commonly used in applied machine learning to compare and select a model for a given predictive modeling problem because it is easy to understand, easy to implement, and results in skill estimates that generally have a lower bias than other methods.


Topics covered: overfitting and underfitting (<a href="https://towardsdatascience.com/what-are-overfitting-and-underfitting-in-machine-learning-a96b30864690">further reading</a>), and <a href="https://scikit-learn.org/stable/modules/cross_validation.html#cross-validation">cross-validation</a> as a model validation strategy.


In [None]:
#Needed Module

#Algebra, Powerful n-dimensional arrays. Numerical computing tools.
import numpy as np
# data visualization
import matplotlib.pyplot as plt
# machine learning library
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

## Dataset creation

In [None]:
# The samples are hard-coded instead of being generated here, so the function
# that produced them stays hidden and you approach the data without
# expectations :-)

# --- Training split: 15 (x, y) pairs, x listed in ascending order ---
X_train = np.array(
    [
        0.07103606, 0.38344152, 0.4236548, 0.43758721, 0.52889492,
        0.54488318, 0.5488135, 0.56804456, 0.60276338, 0.64589411,
        0.71518937, 0.79172504, 0.891773, 0.92559664, 0.96366276,
    ]
)
y_train = np.array(
    [
        0.99292384, -0.17602701, -0.4310528, -0.33073868, -0.83399427,
        -0.81227499, -0.9457809, -0.85624286, -0.95165592, -0.92715604,
        -1.13032117, -0.88798652, -0.51239893, -0.19203875, -0.20370536,
    ]
)

# --- Test split: 30 (x, y) pairs, x running from 0 to 1 ---
X_test = np.array(
    [
        0.0, 0.03448276, 0.06896552, 0.10344828, 0.13793103,
        0.17241379, 0.20689655, 0.24137931, 0.27586207, 0.31034483,
        0.34482759, 0.37931034, 0.4137931, 0.44827586, 0.48275862,
        0.51724138, 0.55172414, 0.5862069, 0.62068966, 0.65517241,
        0.68965517, 0.72413793, 0.75862069, 0.79310345, 0.82758621,
        0.86206897, 0.89655172, 0.93103448, 0.96551724, 1.0,
    ]
)
y_test = np.array(
    [
        0.93275396, 0.95087121, 0.86633854, 0.71088378, 0.81383568,
        0.64752137, 0.39816723, 0.46616733, 0.1767985, 0.11331356,
        0.01877015, -0.20207215, -0.25619809, -0.63903644, -0.60715212,
        -0.83064306, -0.94393689, -0.98686169, -1.00777581, -0.99291688,
        -1.11065294, -0.87346734, -0.86100918, -0.98131337, -0.57717027,
        -0.4155853, -0.35053048, -0.33729401, -0.26885726, 0.10544517,
    ]
)

# --- Experiment settings ---
# Seed NumPy's global random generator so any later random draw is repeatable.
np.random.seed(0)
# Number of training samples (equal to the length of X_train above).
n_samples = 15
# Polynomial degrees to compare; this is the hyper-parameter being selected.
degrees = [1, 4, 12]






## Experiment n.1: training a model without a model validation strategy

### Evaluate the models on the training data

In [None]:
# Experiment 1: for each candidate degree (the hyper-parameter), build a
# polynomial-features + linear-regression pipeline, fit it on the whole
# training set and report the mean squared error on that same training set.
# Scoring a model on the data it was fitted on rewards flexibility, so this
# number alone cannot reveal overfitting.

# Fitted pipelines, stored in the same order as `degrees`. The list is grown
# as we go so it always matches len(degrees), whatever degrees are configured.
trained_model = []
for d in degrees:
    # include_bias=False: LinearRegression fits its own intercept, so an extra
    # constant feature column would be redundant. This also keeps the model
    # identical to the one evaluated in the cross-validation experiment.
    polynomial_features = PolynomialFeatures(degree=d, include_bias=False)
    linear_regression = LinearRegression()
    model = Pipeline([("polynomial_features", polynomial_features),
                      ("linear_regression", linear_regression)])

    # X_train is 1-D; np.newaxis turns it into the 2-D (n_samples, 1) layout
    # that scikit-learn estimators expect.
    model.fit(X_train[:, np.newaxis], y_train)
    trained_model.append(model)

    # Evaluate the model on the training data it has just seen.
    train_scores = mean_squared_error(y_train, model.predict(X_train[:, np.newaxis]))

    print("Degree: {} :: Train MSE = {:.5f}".format(d, train_scores))
    


### Evaluate the models on the test data
What would you expect? What do you get?

In [None]:
# Score each previously fitted pipeline on the held-out test samples.
# `trained_model` holds one pipeline per entry of `degrees`, in the same order,
# so the two lists are walked in lockstep.
for degree, model in zip(degrees, trained_model):
    # Reshape the 1-D inputs into a single-feature column before predicting.
    test_predictions = model.predict(X_test[:, np.newaxis])
    test_scores = mean_squared_error(y_test, test_predictions)
    print("Degree: {} :: Test MSE = {:.5f}".format(degree, test_scores))

## Experiment n.2 training a model with a model validation strategy
Imagine you do not have the test dataset: what would you do?

Cross-validation lets you obtain information similar to what you would get by evaluating against a test set, while using only the training data.

In [None]:
# Experiment 2: for each candidate degree (the hyper-parameter), estimate the
# model's error using the training data only (!!!), via 5-fold cross-validation
# scored with the mean squared error.
#
# NOTE(review): with an integer `cv` and a regressor, scikit-learn splits with
# KFold and no shuffling. X_train is stored in ascending order, so each fold
# holds out a contiguous range of x values. Consider a shuffled KFold if that
# is not the intended evaluation.

for d in degrees:
    polynomial_features = PolynomialFeatures(degree=d, include_bias=False)
    linear_regression = LinearRegression()
    pipeline = Pipeline([("polynomial_features", polynomial_features),
                         ("linear_regression", linear_regression)])

    # No explicit pipeline.fit() here: cross_val_score clones the estimator and
    # fits a fresh copy on the training part of every split, so fitting on the
    # full training set beforehand would be wasted work.
    cv_scores = cross_val_score(pipeline, X_train[:, np.newaxis], y_train,
                                scoring="neg_mean_squared_error", cv=5)

    # The scorer returns the negated MSE (so that higher is better); flip the
    # sign back and average over the folds to report an ordinary MSE.
    print("Degree: {} :: Train CV MSE = {:.5f}".format(d, -cv_scores.mean()))

## For fun, let's have a look at the data and the fitted models

In [None]:
# One panel per degree, side by side: the fitted model's curve, the cosine
# reference curve and the training samples.
fig = plt.figure(figsize=(16, 6))

# Dense grid on [0, 1] used to draw smooth curves; it is the same for every
# panel, so it is built once.
X_fun = np.linspace(0, 1, 1000)

n_panels = len(degrees)
for panel, (degree, model) in enumerate(zip(degrees, trained_model), start=1):
    ax = fig.add_subplot(1, n_panels, panel)

    # Hide tick marks on both axes.
    ax.set_xticks(())
    ax.set_yticks(())

    ax.plot(X_fun, model.predict(X_fun[:, np.newaxis]), label="Model")
    ax.plot(X_fun, np.cos(1.5 * np.pi * X_fun), label="True function")
    ax.scatter(X_train, y_train, edgecolor='b', s=20, label="Train Samples")

    ax.set_xlabel("x")
    ax.set_ylabel("y")
    # Fixed limits so the panels are directly comparable.
    ax.set_xlim((0, 1))
    ax.set_ylim((-2, 2))
    ax.legend(loc="best")
    ax.set_title("Degree: {}".format(degree))

plt.show()