# Testing on the training data

**DO NOT DO IT**, since it is methodologically wrong!

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()

X = iris.data
y = iris.target

clf = RandomForestClassifier(n_estimators=2, random_state=0)
# X is our training data
clf.fit(X, y)

# This is an overly optimistic estimation since we are using X again!
y_pred = clf.predict(X)
acc = accuracy_score(y, y_pred)

print(f'Accuracy: {acc:.2f}')

Accuracy: 0.97


## Two-way holdout

In [2]:
from sklearn.model_selection import train_test_split

# split in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

clf = RandomForestClassifier(n_estimators=2, random_state=0)
clf.fit(X_train, y_train)

# test with unseen data
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print(f'Accuracy: {acc:.2f}')

Accuracy: 0.91


# k-fold cross validation

In [3]:
from sklearn.model_selection import cross_validate
import timeit

def do_cross_validation(clf, print_model=False, print_duration=False):
    start = timeit.default_timer()
    cv = cross_validate(clf, X, y, scoring='accuracy', cv=3)
    scores = ' + '.join(f'{s:.2f}' for s in cv["test_score"])
    mean_ = cv["test_score"].mean()
    msg = f'Cross-validated accuracy: ({scores}) / 3 = {mean_:.2f}'

    if print_model:
        msg = f'\nClassifier: {clf}\n{msg}\n'

    if print_duration:
        msg = f'Duration: {timeit.default_timer() - start}{msg}\n'

    print(msg)

In [4]:
clf = RandomForestClassifier(n_estimators=2, random_state=0)
do_cross_validation(clf, True, True)

Duration: 0.04880734300240874
Classifier: RandomForestClassifier(n_estimators=2, random_state=0)
Cross-validated accuracy: (0.98 + 0.92 + 0.96) / 3 = 0.95




## Applying cross-validation for model selection

In [5]:
from sklearn.svm import SVC

start = timeit.default_timer()
svc = SVC(random_state=0)
print('Default value for kernel: ', svc.kernel)
do_cross_validation(svc, True, True)

Default value for kernel:  rbf
Duration: 0.036153421009657905
Classifier: SVC(random_state=0)
Cross-validated accuracy: (0.96 + 0.98 + 0.94) / 3 = 0.96




In [6]:
do_cross_validation(SVC(kernel='linear', random_state=0), print_model=True)
do_cross_validation(SVC(kernel='poly', random_state=0), print_model=True)
do_cross_validation(RandomForestClassifier(n_estimators=2, random_state=0), print_model=True)
do_cross_validation(RandomForestClassifier(n_estimators=5, random_state=0), print_model=True)


Classifier: SVC(kernel='linear', random_state=0)
Cross-validated accuracy: (1.00 + 1.00 + 0.98) / 3 = 0.99


Classifier: SVC(kernel='poly', random_state=0)
Cross-validated accuracy: (0.98 + 0.94 + 0.98) / 3 = 0.97


Classifier: RandomForestClassifier(n_estimators=2, random_state=0)
Cross-validated accuracy: (0.98 + 0.92 + 0.96) / 3 = 0.95


Classifier: RandomForestClassifier(n_estimators=5, random_state=0)
Cross-validated accuracy: (0.98 + 0.94 + 0.94) / 3 = 0.95



# Nested cross-validation

In [9]:
from sklearn.model_selection import GridSearchCV

start = timeit.default_timer()
# random forest inner loop
clf_grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid={'n_estimators': [2, 5]})
# random forest outer loop
do_cross_validation(clf_grid, print_model=True, print_duration=True)

start = timeit.default_timer()
# svc inner loop
svc_grid = GridSearchCV(SVC(random_state=0), param_grid={'kernel': ['linear', 'poly']})
# svc outer loop
do_cross_validation(svc_grid, print_model=True, print_duration=True)

Duration: 0.5532464570133016
Classifier: GridSearchCV(estimator=RandomForestClassifier(random_state=0),
             param_grid={'n_estimators': [2, 5]})
Cross-validated accuracy: (0.98 + 0.92 + 0.96) / 3 = 0.95


Duration: 0.14918377198046073
Classifier: GridSearchCV(estimator=SVC(random_state=0),
             param_grid={'kernel': ['linear', 'poly']})
Cross-validated accuracy: (1.00 + 0.94 + 0.98) / 3 = 0.97




# Nested CV - getting the final model

Nested cross-validation itself doesn't directly produce a final model. Rather, it is a technique to get an unbiased estimated of the generalization error. 

There are three alternative approaches to produce the final model **after** using nested CV. 
1. The final model is produced by training on the entire dataset, and using the best hyperparameters found during the inner loop.
2. The final model is produced using the algorithm selected in the inner loop, but performing an additional hyperparameter setting on the whole dataset.
3. (Ensemble Model) The final model is built as an ensemble model by combining predictions from the multiple models trained in the inner loop.

Approaches 1 and 2 are the most common ones. Both involve using the entire dataset to refit a model AFTER the generalization error has been estimated.

Notice that in all of the three approaches described above, the estimate of the generalization error to be reported is the one resulting from the nested CV procedure. 

The two code blocks below provide examples of using the second approach.

## Classification example

In [15]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import numpy as np

# `outer_cv` creates 3 folds for estimating generalization error
outer_cv = KFold(3)

# when we train on a certain fold, we use a second cross-validation
# split in order to choose hyperparameters
inner_cv = KFold(3)

# create some regression data
X, y = make_classification(n_samples=1000, n_features=10)

# give shorthand names to models and use those as dictionary keys mapping
# to models and parameter grids for that model
models_and_parameters = {
    'svc': (SVC(),
            {'C': [0.01, 0.05, 0.1, 1]}),
    'rfc': (RandomForestClassifier(),
           {'max_depth': [5, 10, 50, 100, 200, 500]})}

# we will collect the average of the scores on the 3 outer folds in this dictionary
# with keys given by the names of the models in `models_and_parameters`
average_scores_across_outer_folds_for_each_model = dict()

# find the model with the best generalization error
for name, (model, params) in models_and_parameters.items():
    # this object is a regressor that also happens to choose
    # its hyperparameters automatically using `inner_cv`
    regressor_that_optimizes_its_hyperparams = GridSearchCV(
        estimator=model, param_grid=params,
        cv=inner_cv, scoring='accuracy')

    # estimate generalization error on the 3-fold splits of the data
    scores_across_outer_folds = cross_val_score(
        regressor_that_optimizes_its_hyperparams,
        X, y, cv=outer_cv, scoring='accuracy')

    # get the mean MSE across each of outer_cv's 3 folds
    average_scores_across_outer_folds_for_each_model[name] = np.mean(scores_across_outer_folds)
    error_summary = 'Model: {name}\nAccuracy in the 3 outer folds: {scores}.\nAverage acc: {avg}'
    print(error_summary.format(
        name=name, scores=scores_across_outer_folds,
        avg=np.mean(scores_across_outer_folds)))
    print()

print('Average score across the outer folds: ',
      average_scores_across_outer_folds_for_each_model)

many_stars = '\n' + '*' * 100 + '\n'
print(many_stars + 'Now we choose the best model and refit on the whole dataset' + many_stars)

best_model_name, best_model_avg_score = max(
    average_scores_across_outer_folds_for_each_model.items(),
    key=(lambda name_averagescore: name_averagescore[1]))

# get the best model and its associated parameter grid
best_model, best_model_params = models_and_parameters[best_model_name]

# now we refit this best model on the whole dataset so that we can start
# making predictions on other data, and now we have a reliable estimate of
# this model's generalization error and we are confident this is the best model
# among the ones we have tried
final_classifier = GridSearchCV(best_model, best_model_params, cv=inner_cv)
final_classifier.fit(X, y)

print('Best model: \n\t{}'.format(best_model), end='\n\n')
print('Estimation of its generalization error (accuracy):\n\t{}'.format(
    best_model_avg_score), end='\n\n')
print('Best parameter choice for this model: \n\t{params}'
      '\n(according to cross-validation `{cv}` on the whole dataset).'.format(
      params=final_regressor.best_params_, cv=inner_cv))

Model: svc
Accuracy in the 3 outer folds: [0.94011976 0.96696697 0.93393393].
Average acc: 0.9470068871266476

Model: rfc
Accuracy in the 3 outer folds: [0.95808383 0.97597598 0.93993994].
Average acc: 0.9579999160837485

Average score across the outer folds:  {'svc': 0.9470068871266476, 'rfc': 0.9579999160837485}

****************************************************************************************************
Now we choose the best model and refit on the whole dataset
****************************************************************************************************

Best model: 
	RandomForestClassifier()

Estimation of its generalization error (accuracy):
	0.9579999160837485

Best parameter choice for this model: 
	{'max_depth': 50}
(according to cross-validation `KFold(n_splits=3, random_state=None, shuffle=False)` on the whole dataset).


## Regression example

In [12]:
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import numpy as np

# `outer_cv` creates 3 folds for estimating generalization error
outer_cv = KFold(3)

# when we train on a certain fold, we use a second cross-validation
# split in order to choose hyperparameters
inner_cv = KFold(3)

# create some regression data
X, y = make_regression(n_samples=1000, n_features=10)

# give shorthand names to models and use those as dictionary keys mapping
# to models and parameter grids for that model
models_and_parameters = {
    'svr': (SVR(),
            {'C': [0.01, 0.05, 0.1, 1]}),
    'rfr': (RandomForestRegressor(),
           {'max_depth': [5, 10, 50, 100, 200, 500]})}

# we will collect the average of the scores on the 3 outer folds in this dictionary
# with keys given by the names of the models in `models_and_parameters`
average_scores_across_outer_folds_for_each_model = dict()

# find the model with the best generalization error
for name, (model, params) in models_and_parameters.items():
    # this object is a regressor that also happens to choose
    # its hyperparameters automatically using `inner_cv`
    regressor_that_optimizes_its_hyperparams = GridSearchCV(
        estimator=model, param_grid=params,
        cv=inner_cv, scoring='neg_mean_squared_error')

    # estimate generalization error on the 3-fold splits of the data
    scores_across_outer_folds = cross_val_score(
        regressor_that_optimizes_its_hyperparams,
        X, y, cv=outer_cv, scoring='neg_mean_squared_error')

    # get the mean MSE across each of outer_cv's 3 folds
    average_scores_across_outer_folds_for_each_model[name] = np.mean(scores_across_outer_folds)
    error_summary = 'Model: {name}\nMSE in the 3 outer folds: {scores}.\nAverage error: {avg}'
    print(error_summary.format(
        name=name, scores=scores_across_outer_folds,
        avg=np.mean(scores_across_outer_folds)))
    print()

print('Average score across the outer folds: ',
      average_scores_across_outer_folds_for_each_model)

many_stars = '\n' + '*' * 100 + '\n'
print(many_stars + 'Now we choose the best model and refit on the whole dataset' + many_stars)

best_model_name, best_model_avg_score = max(
    average_scores_across_outer_folds_for_each_model.items(),
    key=(lambda name_averagescore: name_averagescore[1]))

# get the best model and its associated parameter grid
best_model, best_model_params = models_and_parameters[best_model_name]

# now we refit this best model on the whole dataset so that we can start
# making predictions on other data, and now we have a reliable estimate of
# this model's generalization error and we are confident this is the best model
# among the ones we have tried
final_regressor = GridSearchCV(best_model, best_model_params, cv=inner_cv)
final_regressor.fit(X, y)

print('Best model: \n\t{}'.format(best_model), end='\n\n')
print('Estimation of its generalization error (negative mean squared error):\n\t{}'.format(
    best_model_avg_score), end='\n\n')
print('Best parameter choice for this model: \n\t{params}'
      '\n(according to cross-validation `{cv}` on the whole dataset).'.format(
      params=final_regressor.best_params_, cv=inner_cv))

Model: svr
MSE in the 3 outer folds: [-35614.04476457 -35178.32923572 -32425.78493914].
Average error: -34406.05297980857

Model: rf
MSE in the 3 outer folds: [-8666.61697967 -8442.3388464  -8122.1276498 ].
Average error: -8410.361158623991

Average score across the outer folds:  {'svr': -34406.05297980857, 'rf': -8410.361158623991}

****************************************************************************************************
Now we choose the best model and refit on the whole dataset
****************************************************************************************************

Best model: 
	RandomForestRegressor()

Estimation of its generalization error (negative mean squared error):
	-8410.361158623991

Best parameter choice for this model: 
	{'max_depth': 50}
(according to cross-validation `KFold(n_splits=3, random_state=None, shuffle=False)` on the whole dataset).


# References

1. [Model selection done right: A gentle introduction to nested cross-validation](https://ploomber.io/blog/nested-cv/).

2. [Which is the final model from Nested Cross Validation: Accuracy or Frequency?](https://datascience.stackexchange.com/questions/116311/which-is-the-final-model-from-nested-cross-validation-accuracy-or-frequency)

3. [What is the correct procedure for nested cross-validation?](https://stackoverflow.com/questions/64238730/what-is-the-correct-procedure-for-nested-cross-validation)

4. [Nested Cross Validation (Cynthia Rudin)](https://youtu.be/az60jS7MQhU?list=PLNeXFnYrCJneoY_rKtWJy833YiMrCRi5f)

5. [Nested cross-validation and selecting the best regression model - is this the right SKLearn process?](https://datascience.stackexchange.com/questions/13185/nested-cross-validation-and-selecting-the-best-regression-model-is-this-the-ri)

6. [Model evaluation, model selection, and algorithm selection in machine learning](https://sebastianraschka.com/blog/2016/model-evaluation-selection-part1.html)
