In [1]:
# Notebook setup (adapted from the book's boilerplate): interpreter/library
# version checks, common imports, RNG seeding, matplotlib defaults and the
# directory that saved figures are written to.
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
# NOTE(review): this compares version *strings* lexicographically, not
# numerically (e.g. "0.3" >= "0.20" evaluates to True).
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# Seed NumPy's global RNG to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures: render inline and enlarge axis/tick label fonts
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: ./images/classification, created if missing
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        Base file name (without extension) of the saved figure.
    tight_layout : bool
        When true, call ``plt.tight_layout()`` before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    """
    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

In [2]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML and list the
# fields available on the returned bunch.
mnist = fetch_openml(name='mnist_784', version=1)
mnist.keys()

dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [3]:
# Convert to plain NumPy arrays. Depending on the scikit-learn version,
# fetch_openml may return pandas objects, and later cells iterate over the
# samples row by row and call .reshape on each one, which only works on
# arrays (iterating a DataFrame yields column labels, not rows).
X, y = np.asarray(mnist["data"]), np.asarray(mnist["target"])

# The first 60,000 samples are used for training, the remainder for testing.
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Base estimator (k-nearest neighbours, as recommended by the exercise);
# the hyperparameters below are set by the search.
kNClassifier = KNeighborsClassifier()

# Candidate hyperparameters: 2 weightings x 3 neighbourhood sizes = 6 combinations.
parameterGrid = [
    {
        'weights': ["uniform", "distance"],
        'n_neighbors': [3, 4, 5],
    }
]

# 5-fold cross-validation over every combination, with verbose progress logging.
gridSearch = GridSearchCV(
    estimator=kNClassifier,
    param_grid=parameterGrid,
    cv=5,
    verbose=3,
)

# Run the search on the training set (this is the actual training step).
gridSearch.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] n_neighbors=3, weights=uniform ..................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# A notebook cell only echoes its final expression, so show the best
# hyperparameters and the matching cross-validation score together as a tuple.
gridSearch.best_params_, gridSearch.best_score_

In [None]:

from sklearn.metrics import accuracy_score

# Score the fitted grid search on the held-out test set. The search object is
# named `gridSearch`; the previous `grid_search` was undefined (NameError).
y_pred = gridSearch.predict(X_test)
accuracy_score(y_test, y_pred)

In [None]:
# Import from the public scipy.ndimage namespace; the
# scipy.ndimage.interpolation submodule is deprecated.
from scipy.ndimage import shift

In [None]:
def myImageShift(image, xChange, yChange):
    """Return a flattened 28x28 image translated by whole or fractional pixels.

    Parameters
    ----------
    image : array-like
        784 values that are reshaped to a 28x28 image.
    xChange : number
        Offset along the horizontal axis (columns); positive values move the
        content toward higher column indices.
    yChange : number
        Offset along the vertical axis (rows); positive values move the
        content toward higher row indices.

    Returns
    -------
    numpy.ndarray
        The shifted image flattened back to one dimension. Pixels shifted in
        from outside the frame are filled with 0.
    """
    # Accept any array-like row (list, pandas Series, ...), not only ndarrays.
    image = np.asarray(image).reshape((28, 28))
    # scipy's shift takes one offset per array axis, in axis order:
    # axis 0 is rows (y) and axis 1 is columns (x), hence [yChange, xChange].
    newImage = shift(image, [yChange, xChange], cval=0, mode="constant")
    return newImage.reshape([-1])

In [None]:
# Work on plain arrays: iterating a pandas DataFrame yields column labels
# rather than sample rows, which would break the per-image loop below.
X_train_rows = np.asarray(X_train)
y_train_labels = np.asarray(y_train)

# Start the augmented set with the original training samples...
XAugment = [image for image in X_train_rows]
yAugment = [label for label in y_train_labels]

# ...then add four copies of every image, each shifted by one pixel in a
# different direction, keeping the original label.
for dx, dy in ((1, 0), (-1, 0), (0, 1), (0, -1)):
    for image, label in zip(X_train_rows, y_train_labels):
        XAugment.append(myImageShift(image, dx, dy))
        yAugment.append(label)

XAugment = np.array(XAugment)
yAugment = np.array(yAugment)

# Train a fresh classifier on the augmented data, reusing the best
# hyperparameters found by the grid search.
anotherKNClassifier = KNeighborsClassifier(**gridSearch.best_params_)
anotherKNClassifier.fit(XAugment, yAugment)

# Score the augmented model's own predictions. The previous code called an
# undefined `accuracy` and passed the earlier model's `y_pred`.
yPrediction = anotherKNClassifier.predict(np.asarray(X_test))
accuracy_score(y_test, yPrediction)