# Credits

The code in this notebook is adapted from the [Ploomber blog post on nested cross-validation](https://ploomber.io/blog/nested-cv/).

# Testing on the training data

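**DO NOT DO THIS**: it is methodologically wrong! Evaluating a model on the same data it was trained on gives an overly optimistic estimate of how it performs on unseen data.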

In [1]:
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()

# Feature matrix and class labels of the full iris dataset
X, y = iris.data, iris.target

# The model is fitted on every available sample: X is our training data
clf = RandomForestClassifier(n_estimators=2, random_state=0).fit(X, y)

# Predicting on X again scores the model on samples it has already seen,
# so the resulting accuracy is an overly optimistic estimate!
y_pred = clf.predict(X)
acc = accuracy_score(y_true=y, y_pred=y_pred)

print(f'Accuracy: {acc:.2f}')

Accuracy: 0.97


## Two-way holdout

In [2]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for testing; the remainder is used for training
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Fit on the training portion only
clf = RandomForestClassifier(n_estimators=2, random_state=0).fit(X_train, y_train)

# Score on the held-out samples, which the model never saw while fitting
y_pred = clf.predict(X_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f'Accuracy: {acc:.2f}')

Accuracy: 0.91


# k-fold cross-validation

In [3]:
from sklearn.model_selection import cross_validate
import timeit

def do_cross_validation(clf, print_model=False, print_duration=False, cv=3):
    """Cross-validate ``clf`` on the notebook-level ``X``/``y`` and print a summary.

    The summary shows the accuracy of every fold and their mean. Nothing is
    returned; the report is written with ``print``.

    Parameters
    ----------
    clf : estimator
        Estimator handed to ``cross_validate``.
    print_model : bool, default False
        If True, include the estimator's repr in the report.
    print_duration : bool, default False
        If True, include the time (in seconds) spent inside ``cross_validate``.
    cv : default 3
        Forwarded as the ``cv`` argument of ``cross_validate``.

    Notes
    -----
    ``X`` and ``y`` are read from the enclosing (notebook) namespace, so the
    cell defining them must have been executed first.
    """
    start = timeit.default_timer()
    results = cross_validate(clf, X, y, scoring='accuracy', cv=cv)
    # Stop the clock right away so message formatting is not included.
    duration = timeit.default_timer() - start

    fold_scores = results["test_score"]
    scores = ' + '.join(f'{s:.2f}' for s in fold_scores)
    mean_ = fold_scores.mean()
    # Derive the divisor from the scores so the text always matches `cv`.
    msg = f'Cross-validated accuracy: ({scores}) / {len(fold_scores)} = {mean_:.2f}'

    if print_model:
        msg = f'\nClassifier: {clf}\n{msg}\n'

    if print_duration:
        # The model block already starts with a newline; without it the
        # duration would run straight into the accuracy line.
        separator = '' if print_model else '\n'
        msg = f'Duration: {duration}{separator}{msg}\n'

    print(msg)

In [4]:
# Same small random forest as before, now scored with cross-validation
clf = RandomForestClassifier(n_estimators=2, random_state=0)
do_cross_validation(clf, print_model=True, print_duration=True)

Duration: 0.04880734300240874
Classifier: RandomForestClassifier(n_estimators=2, random_state=0)
Cross-validated accuracy: (0.98 + 0.92 + 0.96) / 3 = 0.95




## Applying cross-validation for model selection

In [5]:
from sklearn.svm import SVC

# No timer is started here: do_cross_validation measures its own duration.
svc = SVC(random_state=0)
print('Default value for kernel: ', svc.kernel)
do_cross_validation(svc, True, True)

Default value for kernel:  rbf
Duration: 0.036153421009657905
Classifier: SVC(random_state=0)
Cross-validated accuracy: (0.96 + 0.98 + 0.94) / 3 = 0.96




In [6]:
# Candidate models to compare, evaluated in the listed order
candidates = (
    SVC(kernel='linear', random_state=0),
    SVC(kernel='poly', random_state=0),
    RandomForestClassifier(n_estimators=2, random_state=0),
    RandomForestClassifier(n_estimators=5, random_state=0),
)

for candidate in candidates:
    do_cross_validation(candidate, print_model=True)


Classifier: SVC(kernel='linear', random_state=0)
Cross-validated accuracy: (1.00 + 1.00 + 0.98) / 3 = 0.99


Classifier: SVC(kernel='poly', random_state=0)
Cross-validated accuracy: (0.98 + 0.94 + 0.98) / 3 = 0.97


Classifier: RandomForestClassifier(n_estimators=2, random_state=0)
Cross-validated accuracy: (0.98 + 0.92 + 0.96) / 3 = 0.95


Classifier: RandomForestClassifier(n_estimators=5, random_state=0)
Cross-validated accuracy: (0.98 + 0.94 + 0.94) / 3 = 0.95



# Nested cross-validation

In [9]:
from sklearn.model_selection import GridSearchCV

# No timers are started here: do_cross_validation measures its own duration.

# random forest inner loop: the grid search chooses n_estimators
clf_grid = GridSearchCV(RandomForestClassifier(random_state=0), param_grid={'n_estimators': [2, 5]})
# random forest outer loop: cross-validate the grid search itself
do_cross_validation(clf_grid, print_model=True, print_duration=True)

# svc inner loop: the grid search chooses the kernel
svc_grid = GridSearchCV(SVC(random_state=0), param_grid={'kernel': ['linear', 'poly']})
# svc outer loop: cross-validate the grid search itself
do_cross_validation(svc_grid, print_model=True, print_duration=True)

Duration: 0.5532464570133016
Classifier: GridSearchCV(estimator=RandomForestClassifier(random_state=0),
             param_grid={'n_estimators': [2, 5]})
Cross-validated accuracy: (0.98 + 0.92 + 0.96) / 3 = 0.95


Duration: 0.14918377198046073
Classifier: GridSearchCV(estimator=SVC(random_state=0),
             param_grid={'kernel': ['linear', 'poly']})
Cross-validated accuracy: (1.00 + 0.94 + 0.98) / 3 = 0.97


