**Chapter 3 – Classification**

_This notebook contains all the sample code and solutions to the exercises in chapter 3._

# Setup

First, let's make sure this notebook works well in both python 2 and 3, import a few common modules, ensure MatplotLib plots figures inline and prepare a function to save the figures:

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

# MNIST

**Warning**: `fetch_mldata()` is deprecated since Scikit-Learn 0.20. You should use `fetch_openml()` instead. However, it returns the unsorted MNIST dataset, whereas `fetch_mldata()` returned the dataset sorted by target (the training set and the test test were sorted separately). In general, this is fine, but if you want to get the exact same results as before, you need to sort the dataset using the following function:

In [2]:
def sort_by_target(mnist):
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]

In [3]:
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    mnist.target = mnist.target.astype(np.int8) # fetch_openml() returns targets as strings
    sort_by_target(mnist) # fetch_openml() returns an unsorted dataset
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata('MNIST original')
mnist["data"], mnist["target"]

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]),
 array([0, 0, 0, ..., 9, 9, 9], dtype=int8))

In [4]:
mnist.data.shape

(70000, 784)

In [5]:
X, y = mnist["data"], mnist["target"]
X.shape

(70000, 784)

In [6]:
y.shape

(70000,)

In [7]:
def plot_digit(data):
    image = data.reshape(28, 28)
    plt.imshow(image, cmap = mpl.cm.binary,
               interpolation="nearest")
    plt.axis("off")

In [8]:
# EXTRA
def plot_digits(instances, images_per_row=10, **options):
    size = 28
    images_per_row = min(len(instances), images_per_row)
    images = [instance.reshape(size,size) for instance in instances]
    n_rows = (len(instances) - 1) // images_per_row + 1
    row_images = []
    n_empty = n_rows * images_per_row - len(instances)
    images.append(np.zeros((size, size * n_empty)))
    for row in range(n_rows):
        rimages = images[row * images_per_row : (row + 1) * images_per_row]
        row_images.append(np.concatenate(rimages, axis=1))
    image = np.concatenate(row_images, axis=0)
    plt.imshow(image, cmap = mpl.cm.binary, **options)
    plt.axis("off")

In [9]:
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

In [10]:
import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


In [11]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone
from sklearn.model_selection import cross_val_score
from sklearn.base import BaseEstimator
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve

In [12]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision", linewidth=2)
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall", linewidth=2)
    plt.xlabel("Threshold", fontsize=16)
    plt.legend(loc="upper left", fontsize=16)
    plt.ylim([0, 1])

In [13]:
def plot_precision_vs_recall(precisions, recalls):
    plt.plot(recalls, precisions, "b-", linewidth=2)
    plt.xlabel("Recall", fontsize=16)
    plt.ylabel("Precision", fontsize=16)
    plt.axis([0, 1, 0, 1])

In [14]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate', fontsize=16)
    plt.ylabel('True Positive Rate', fontsize=16)

**Note**: we set `n_estimators=10` to avoid a warning about the fact that its default value will be set to 100 in Scikit-Learn 0.22.

In [15]:
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))

In [16]:

def plot_confusion_matrix(matrix):
    """If you prefer color and a colorbar"""
    fig = plt.figure(figsize=(8,8))
    ax = fig.add_subplot(111)
    cax = ax.matshow(matrix)
    fig.colorbar(cax)

In [17]:
# from sklearn.model_selection import GridSearchCV
# param_grid = [
#     {'n_neighbors': [3, 4, 5], 'weights': ['distance', 'uniform']}
# ]
# # 
from sklearn.neighbors import KNeighborsClassifier
# neigh = KNeighborsClassifier()
# # 
# grid_search = GridSearchCV(neigh, param_grid, cv=2, n_jobs=-1, verbose=100)
# grid_search.fit(X_train, y_train)

In [18]:
# grid_search.best_params_
# 

In [19]:
# grid_search.best_score_



In [20]:

from sklearn.metrics import accuracy_score

# y_pred = grid_search.predict(X_test)
neigh = KNeighborsClassifier(n_neighbors=4, weights='distance', n_jobs=-1)
# neigh.fit(X_train, y_train)
# y_pred = neigh.predict(X_test)
# accuracy_score(y_test, y_pred)

In [21]:
# Write a function that can shift an MNIST image in any direction (left, right, up, or
# down) by one pixel. Then, for each image in the training set, create four shifted
# copies (one per direction) and add them to the training set. Finally, train your best
# model on this expanded training set and measure its accuracy on the test set.

#You can use the shift() function from the scipy.ndimage.interpolation module. For example,
# shift(image, [2, 1], cval=0) shifts the image 2 pixels down and 1 pixel to the right.

from scipy.ndimage.interpolation import shift

def shift_arr(arr, reshape_value = 28):
    arr2 = arr.reshape(reshape_value, reshape_value)
    return [
        shift(arr2, [1, 0]), #down
        shift(arr2, [0, 1]), #right
        shift(arr2, [-1, 0]),#up
        shift(arr2, [0, -1]) #left
    ]

In [22]:
my_arr = [1, 2, 3, 4]

In [23]:
my_arr0 = np.array(my_arr).reshape(2, 2).tolist()
my_arr0

[[1, 2], [3, 4]]

In [24]:
# for img in X_train:
#     shifted = shift_arr(img)
    
my_arr1 = np.array(my_arr0)
my_arr1

array([[1, 2],
       [3, 4]])

In [25]:
my_arr2 = shift_arr(np.array([1, 2, 3, 4]), 2)
my_arr2

[array([[0, 0],
        [1, 2]]), array([[0, 1],
        [0, 3]]), array([[3, 4],
        [0, 0]]), array([[2, 0],
        [4, 0]])]

In [26]:
#my_arr2.shape

In [27]:
my_arr3 = np.array([my_arr0])
my_arr3

array([[[1, 2],
        [3, 4]]])

In [28]:
my_arr3.shape

(1, 2, 2)

In [29]:
my_arr4 = [my_arr1] + my_arr2
my_arr4
#my_arr2.shape()

[array([[1, 2],
        [3, 4]]), array([[0, 0],
        [1, 2]]), array([[0, 1],
        [0, 3]]), array([[3, 4],
        [0, 0]]), array([[2, 0],
        [4, 0]])]

In [30]:
my_arr1.flatten()


array([1, 2, 3, 4])

In [31]:
my_arr5 = np.array(my_arr4)
my_arr5


array([[[1, 2],
        [3, 4]],

       [[0, 0],
        [1, 2]],

       [[0, 1],
        [0, 3]],

       [[3, 4],
        [0, 0]],

       [[2, 0],
        [4, 0]]])

In [32]:
my_arr6 = map(lambda e: e.flatten(), my_arr4)
my_arr7 = list(my_arr6)
my_arr7

[array([1, 2, 3, 4]),
 array([0, 0, 1, 2]),
 array([0, 1, 0, 3]),
 array([3, 4, 0, 0]),
 array([2, 0, 4, 0])]

In [33]:
my_arr8 = np.array(my_arr7)
my_arr8

array([[1, 2, 3, 4],
       [0, 0, 1, 2],
       [0, 1, 0, 3],
       [3, 4, 0, 0],
       [2, 0, 4, 0]])

In [34]:
np.array([[[1,2], [3,4]], [[5,6], [7,8]]]).flatten()
#-> need [[1,2], [3,4], [5,6], [7,8]]

array([1, 2, 3, 4, 5, 6, 7, 8])

In [35]:
[[1,2], [3,4]] + [[5,6], [7,8]]

[[1, 2], [3, 4], [5, 6], [7, 8]]

In [36]:
from functools import reduce
list(reduce(lambda e1, e2: e1 + e2, [[[1,2], [3,4]], [[5,6], [7,8]]]))

[[1, 2], [3, 4], [5, 6], [7, 8]]

In [37]:
my_arr8.tolist()

[[1, 2, 3, 4], [0, 0, 1, 2], [0, 1, 0, 3], [3, 4, 0, 0], [2, 0, 4, 0]]

In [38]:
def shift_arr2(arr, reshape_value = 28):
    arr0 = np.array(arr)
    arr1 = arr0.reshape(reshape_value, reshape_value)
    arr2 = [arr1] + shift_arr(arr0, reshape_value)
    arr3 = map(lambda e: e.flatten(), arr2)
    arr4 = list(arr3)
    arr5 = np.array(arr4).tolist()
    return arr5

In [39]:
my_arr9 = shift_arr2([1, 2, 3, 4], 2)
my_arr9


[[1, 2, 3, 4], [0, 0, 1, 2], [0, 1, 0, 3], [3, 4, 0, 0], [2, 0, 4, 0]]

In [40]:
X_train2 = X_train[:3]
X_train2

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [41]:
X_train2.shape

(3, 784)

In [42]:
X_train3 = np.array(list(map(shift_arr2, X_train2.tolist())))
Xt3size = X_train3.shape
Xt3size
# list(map(shift_arr2, X_train2.tolist()))

(3, 5, 784)

In [43]:
X_train4 = reduce(lambda e1, e2: e1 + e2, X_train3.tolist())
X_t4_size = np.array(X_train4).shape
X_t4_size

(15, 784)

In [44]:
X_t5 = np.array(X_train4).reshape(Xt3size[0] * Xt3size[1], Xt3size[2])
X_t5

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
        -3.05567807e-23,  1.86675009e-24,  1.99667477e-24],
       [ 0.00000000e+00, -1.90074332e-26, -3.28220186e-25, ...,
         1.95607738e-25, -1.75216509e-24,  4.02402892e-25],
       ...,
       [ 0.00000000e+00,  4.10332167e-24,  9.33817199e-24, ...,
        -7.67613468e-23, -1.45588858e-23, -1.55499814e-23],
       [-7.74418095e-24, -1.28312673e-23, -7.10519895e-24, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 9.33817199e-24,  7.76579986e-23, -1.47916599e-22, ...,
        -1.55499814e-23, -9.26797211e-24,  0.00000000e+00]])

In [45]:
X_t5.shape

(15, 784)

In [46]:
X_train_aug0 = list(map(shift_arr2, X_train.tolist()))
X_train_aug1 = np.array(X_train_aug0)
X1_size = X_train_aug1.shape
X_train_aug = X_train_aug1.reshape(X1_size[0] * X1_size[1], X1_size[2])
X_train_aug.shape

(300000, 784)

In [47]:
my_arr10 = [1,2,3]
aug_size = 5
my_arr11 = np.array(list(map(lambda e: [e] * aug_size, my_arr10)))
my_arr11

array([[1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2],
       [3, 3, 3, 3, 3]])

In [48]:
my_arr11.reshape(aug_size * my_arr10.__len__())

array([1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3])

In [49]:
Y_train_aug = np.array(list(map(lambda e: [e] * aug_size, y_train))).reshape(aug_size * y_train.shape[0])
Y_train_aug.shape

(300000,)

In [50]:
neigh.fit(X_train_aug, Y_train_aug)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=-1, n_neighbors=4, p=2,
                     weights='distance')

In [51]:
y_pred = neigh.predict(X_test)

In [52]:
accuracy_score(y_test, y_pred)



0.9763