## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()

## MNIST Dataset

In [2]:
from sklearn.datasets import fetch_openml
mnist= fetch_openml('mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'details', 'categories', 'url'])

In [3]:
X, y = mnist['data'], mnist['target']

In [4]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

## KNN Classifier

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [6]:
from scipy.stats import randint

param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

knn = KNeighborsClassifier()

In [7]:
grid_search = GridSearchCV(knn, param_grid, cv=5, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] n_neighbors=3, p=1, weights=uniform .............................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  n_neighbors=3, p=1, weights=uniform, score=0.9647646813827572, total= 8.6min
[CV] n_neighbors=3, p=1, weights=uniform .............................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 42.5min remaining:    0.0s


[CV]  n_neighbors=3, p=1, weights=uniform, score=0.9642559573404432, total= 8.6min
[CV] n_neighbors=3, p=1, weights=uniform .............................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 84.9min remaining:    0.0s


[CV]  n_neighbors=3, p=1, weights=uniform, score=0.9621666666666666, total= 8.6min
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV]  n_neighbors=3, p=1, weights=uniform, score=0.9615737267650246, total= 8.5min
[CV] n_neighbors=3, p=1, weights=uniform .............................
[CV]  n_neighbors=3, p=1, weights=uniform, score=0.9631543847949317, total= 8.6min
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV]  n_neighbors=3, p=1, weights=distance, score=0.9654310703873387, total= 8.6min
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV]  n_neighbors=3, p=1, weights=distance, score=0.9663389435094151, total= 8.6min
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV]  n_neighbors=3, p=1, weights=distance, score=0.9633333333333334, total= 8.7min
[CV] n_neighbors=3, p=1, weights=distance ............................
[CV]  n_neighbors=3, p=1, weights=distance, score=0.9634908727181796, tot

[CV]  n_neighbors=7, p=2, weights=distance, score=0.9704215964005999, total= 9.7min
[CV] n_neighbors=7, p=2, weights=distance ............................
[CV]  n_neighbors=7, p=2, weights=distance, score=0.96725, total= 9.7min
[CV] n_neighbors=7, p=2, weights=distance ............................
[CV]  n_neighbors=7, p=2, weights=distance, score=0.9687421855463866, total= 9.7min
[CV] n_neighbors=7, p=2, weights=distance ............................
[CV]  n_neighbors=7, p=2, weights=distance, score=0.969406468822941, total= 9.7min


[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 2708.8min finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=3)

In [8]:
grid_search.best_params_

{'n_neighbors': 3, 'p': 2, 'weights': 'distance'}

In [9]:
grid_search.best_score_

0.9711166666666666

In [10]:
from sklearn.metrics import accuracy_score

y_pred = grid_search.predict(X_test)
accuracy_score(y_test, y_pred)

0.9717

## Data Augmentation

In [11]:
from scipy.ndimage.interpolation import shift

def shift_image(image, dx, dy):
    image = image.reshape(28, 28)
    shifted_image = shift(image, [dy, dx], cval=0, mode="constant")
    return shifted_image.reshape([-1])

In [13]:
X_train_augmented = [image for image in X_train]
y_train_augmented = [image for image in y_train]

for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train, y_train):
        X_train_augmented.append(shift_image(image, dx, dy))
        y_train_augmented.append(label)
        
X_train_augmented = np.array(X_train_augmented)
y_train_augmented = np.array(y_train_augmented)

In [14]:
shuffle_idx = np.random.permutation(len(X_train_augmented))
X_train_augmented = X_train_augmented[shuffle_idx]
y_train_augmented = y_train_augmented[shuffle_idx]

In [15]:
knn_aug = KNeighborsClassifier(**grid_search.best_params_)

knn_aug.fit(X_train_augmented, y_train_augmented)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=3, p=2,
           weights='distance')

In [16]:
y_pred_aug = knn_aug.predict(X_test)
accuracy_score(y_test, y_pred_aug)

0.9763