# Model selection and evaluation

- Adapted for AI Black Belt - Yellow (June 2019).
- [Tutorial](https://github.com/amueller/ml-workshop-1-of-4/) created by Andreas Mueller (2019). MIT License.

---

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
# Load the digits dataset and preview its first sample as an image.
digits = load_digits()
# Each row of `digits.data` is a flat vector; view the first one as an 8x8 grid.
first_image = np.reshape(digits.data[0], (8, 8))
plt.imshow(first_image)

## Threefold split

In [None]:
# Threefold split: first hold out a test set, then carve a validation set out
# of the remainder. Both splits are seeded so that Restart & Run All reproduces
# the same partitions (and therefore the same scores) every time.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    digits.data, digits.target, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, random_state=0)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def _validation_score(k):
    """Fit a k-NN classifier with ``k`` neighbors on the training split and
    return its score on the validation split."""
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return model.score(X_val, y_val)


# Candidate neighbor counts: the odd values 1, 3, ..., 13.
neighbors = np.arange(1, 15, 2)
# One validation score per candidate, in the same order as `neighbors`.
val_scores = [_validation_score(k) for k in neighbors]

In [None]:
# Select the candidate with the highest validation score (argmax keeps the
# first one on ties), then report the score and the chosen value.
best_n_neighbors = neighbors[np.argmax(val_scores)]
best_val_score = np.asarray(val_scores).max()
print("best validation score: {:.3f}".format(best_val_score))
print("best n_neighbors:", best_n_neighbors)

In [None]:
# Refit with the selected n_neighbors on train + validation data combined,
# then evaluate once on the held-out test set.
knn = KNeighborsClassifier(n_neighbors=best_n_neighbors).fit(X_trainval, y_trainval)
test_score = knn.score(X_test, y_test)
print("test-set score: {:.3f}".format(test_score))

## Cross-validation

In [None]:
# Fresh train/test split for the cross-validation section (this rebinds the
# X_train/X_test/y_train/y_test names from the threefold split above).
# Seeded so the reported CV and grid-search scores are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, random_state=0)

In [None]:
from sklearn.model_selection import cross_val_score
# 5-fold cross-validation of a default k-NN classifier on the training split.
default_knn = KNeighborsClassifier()
scores = cross_val_score(default_knn, X_train, y_train, cv=5)

In [None]:
# Show the values returned by the cross_val_score call above (run with cv=5).
scores

In [None]:
# Average of the cross-validation scores.
np.asarray(scores).mean()

<div class="alert alert-success">
    <b>EXERCISE</b>:

Use <code>cross_val_score</code> within a loop to determine the best value of `n_neighbors`.
</div>

## Grid searches

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Logarithmically spaced candidates: C in 1e-3 .. 1e2, gamma in 1e-5 .. 1e-1.
c_exponents = np.arange(-3, 3)
gamma_exponents = np.arange(-5, 0)
param_grid = {
    'C': np.power(10., c_exponents),
    'gamma': np.power(10., gamma_exponents),
}

# Print small values in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
print(param_grid)

In [None]:
# Exhaustive search over every (C, gamma) pair in param_grid, scored with
# 5-fold cross-validation; verbose=3 logs progress for the individual fits.
grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    cv=5,
    verbose=3,
)

A GridSearchCV object behaves just like a normal classifier.

In [None]:
# Run the search on the training split only; the test split is not passed here.
grid_search.fit(X_train, y_train)

In [None]:
# Predict labels for the held-out test samples with the fitted search object.
grid_search.predict(X_test)

In [None]:
# Score the fitted search object on the held-out test set.
grid_search.score(X_test, y_test)

In [None]:
# Parameter combination selected by the search.
grid_search.best_params_

In [None]:
# Score recorded by the search for the selected combination.
# NOTE(review): presumably the mean cross-validated score, not the test-set
# score shown by grid_search.score(X_test, y_test) — confirm against the docs.
grid_search.best_score_

In [None]:
# Estimator object chosen by the search.
grid_search.best_estimator_

In [None]:
# Tabulate the raw search results, one column per key of cv_results_.
scores = pd.DataFrame.from_dict(grid_search.cv_results_)
scores

In [None]:
# Heatmap of mean cross-validated score over the (C, gamma) grid.
#
# The grid dimensions are taken from param_grid instead of being hard-coded,
# so the plot stays correct if the candidate lists above are edited.
# The row-major reshape relies on the search enumerating candidates with keys
# in sorted order ('C' before 'gamma') and the last key varying fastest, which
# puts C on the rows and gamma on the columns.
n_C = len(param_grid['C'])
n_gamma = len(param_grid['gamma'])

scores = grid_search.cv_results_['mean_test_score']
scores = np.array(scores).reshape(n_C, n_gamma)

plt.matshow(scores)
plt.xlabel('gamma')
plt.ylabel('C')
plt.colorbar()
# Label each tick with the actual parameter value rather than its index.
plt.xticks(np.arange(n_gamma), param_grid['gamma'])
plt.yticks(np.arange(n_C), param_grid['C'])
plt.show()

<div class="alert alert-success">
    <b>EXERCISE</b>:

Use <code>GridSearchCV</code> to adjust `n_neighbors`.
</div>