In [1]:
from sklearn.datasets import fetch_openml
import numpy as np

# MNIST

In [2]:
def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]

mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings

In [3]:
X, y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [4]:
y.shape

(70000,)

In [5]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# k-nearest neighbors

In [6]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

param_grid = [{'weights': ["uniform", "distance"], 'n_neighbors': [3, 4, 5]}]

knn_clf = KNeighborsClassifier()
grid_search = GridSearchCV(knn_clf, param_grid, cv=5, verbose=3, n_jobs=-1)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  30 | elapsed: 48.4min remaining: 32.3min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 65.7min finished


GridSearchCV(cv=5, estimator=KNeighborsClassifier(), n_jobs=-1,
             param_grid=[{'n_neighbors': [3, 4, 5],
                          'weights': ['uniform', 'distance']}],
             verbose=3)

In [7]:
grid_search.best_estimator_

KNeighborsClassifier(n_neighbors=4, weights='distance')

In [8]:
grid_search.best_score_

0.9716166666666666

In [9]:
cvres = grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(np.sqrt(-mean_score), params)

nan {'n_neighbors': 3, 'weights': 'uniform'}
nan {'n_neighbors': 3, 'weights': 'distance'}
nan {'n_neighbors': 4, 'weights': 'uniform'}
nan {'n_neighbors': 4, 'weights': 'distance'}
nan {'n_neighbors': 5, 'weights': 'uniform'}
nan {'n_neighbors': 5, 'weights': 'distance'}


In [10]:
from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

0.9714

# Precision vs Recall

In [14]:
from sklearn.metrics import precision_score, recall_score

precision_score(y_test, y_pred, average=None)

array([0.973     , 0.96834902, 0.98417409, 0.96819085, 0.97535934,
       0.96312849, 0.97828335, 0.95945946, 0.98818475, 0.95746785])

In [15]:
recall_score(y_test, y_pred, average=None)

array([0.99285714, 0.99735683, 0.96414729, 0.96435644, 0.96741344,
       0.96636771, 0.9874739 , 0.96692607, 0.94455852, 0.95936571])