In [1]:
from sklearn.datasets import fetch_openml

# Fetch OpenML dataset 'mnist_784' (version 1); as_frame=False asks for
# array outputs instead of a pandas DataFrame.
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
X = mnist['data']
Y = mnist['target']

In [2]:
import numpy as np
from scipy.ndimage.interpolation import shift

def add_shifts(X, Y, width, height, distance = 1):
    """Augment a set of flattened images with every integer translation up to `distance`.

    Each row of X is treated as a flattened (height, width) image and is
    replaced by (2 * distance + 1) ** 2 copies, one per (vertical, horizontal)
    offset with both components in [-distance, distance]. The zero offset is
    included, so the original image is one of the copies.

    Parameters
    ----------
    X : 2-D array, one flattened image of length height * width per row.
    Y : 1-D array of labels, one per row of X.
    width, height : image dimensions used to reshape each row.
    distance : maximum shift, in pixels, along each axis (default 1).

    Returns
    -------
    (expanded_X, expanded_Y) : the shifted images and their repeated labels.
        Copies of the same source image are contiguous, ordered row-major
        over the offsets from (-distance, -distance) to (distance, distance).
    """
    side = (2 * distance) + 1
    area = side ** 2
    # np.repeat keeps all copies of a given image next to each other.
    expanded_X = np.repeat(X, area, axis=0)
    expanded_Y = np.repeat(Y, area)
    for index in range(len(expanded_X)):
        # Position of this copy inside the side x side grid of offsets.
        grid_row, grid_column = divmod(index % area, side)
        # Centre the grid on zero. Subtracting `distance` (not a fixed 1) keeps
        # the offsets symmetric for every value of `distance`.
        vertical = grid_row - distance
        horizontal = grid_column - distance
        image = expanded_X[index].reshape(height, width)
        # order=0 with integer offsets moves pixels without interpolation;
        # pixels vacated by the shift are filled with cval=0.
        shifted_image = shift(image, [vertical, horizontal], cval=0, order=0, prefilter=False)
        expanded_X[index] = shifted_image.reshape(height * width)
    return expanded_X, expanded_Y

In [3]:
# First 60000 rows are used for training, the remainder for testing.
X_train, Y_train = X[:60000], Y[:60000]
X_test, Y_test = X[60000:], Y[60000:]
# Default distance=1 produces 9 shifted copies (including the unshifted one)
# of every 28x28 training image.
X_train_augmented, Y_train_augmented = add_shifts(X_train, Y_train, width=28, height=28)
X_train_augmented.shape, Y_train_augmented.shape  # ((540000, 784), (540000,))


((540000, 784), (540000,))

In [5]:
# Split the augmented training set into the part used to fit the level-0
# models and a hold-out part used to fit the blender. The split point 459000
# is a multiple of 9, and add_shifts keeps the 9 copies of each image
# contiguous, so no source image has copies on both sides of the split.
X_train_level0, Y_train_level0 = X_train_augmented[:459000], Y_train_augmented[:459000]
X_holdout, Y_holdout = X_train_augmented[459000:], Y_train_augmented[459000:]

In [23]:
from sklearn.neighbors import KNeighborsClassifier

# Level-0 model 1: k-nearest neighbours with default hyperparameters,
# fitted on the level-0 portion of the augmented data.
knn = KNeighborsClassifier(n_jobs=-1)
knn.fit(X=X_train_level0, y=Y_train_level0)

KNeighborsClassifier(n_jobs=-1)

In [22]:
from sklearn.ensemble import RandomForestClassifier

# Level-0 model 2: random forest with a fixed seed,
# fitted on the level-0 portion of the augmented data.
forest = RandomForestClassifier(random_state=42, n_jobs=-1)
forest.fit(X=X_train_level0, y=Y_train_level0)

RandomForestClassifier(n_jobs=-1, random_state=42)

In [25]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Level-0 model 3: standardise the pixel features, then fit a degree-2
# polynomial-kernel SVM.
# C and gamma are the best hyperparameters found in the SVM Tuning notebook
# on the unaugmented data.
# probability=True enables predict_proba, which the soft-voting ensemble
# later in this notebook relies on.
svc = Pipeline([
    ('scaler', StandardScaler()),
    ('svc', SVC(kernel='poly', degree=2, C=0.17670169402947947,
    gamma=0.012606912518374066, probability=True, random_state=42))
])
# Fit the pipeline under the name it was bound to (`svc`); every later cell
# refers to it by that name.
svc.fit(X_train_level0, Y_train_level0)

Pipeline(steps=[('scaler', StandardScaler()),
                ('svc',
                 SVC(C=0.17670169402947947, degree=2,
                     gamma=0.012606912518374066, kernel='poly',
                     random_state=42))])

In [32]:
# Level-0 models whose hold-out predictions become the blender's input features.
level_0 = [knn, forest, svc]
# One row per hold-out sample and one column per level-0 model. The shape is
# derived from the data and the model list instead of being hard-coded, so it
# stays correct if the split point or the set of models changes.
predictions_matrix = np.zeros((len(X_holdout), len(level_0)))
for i, model in enumerate(level_0):
    # Predicted labels are written into a float array, hence 7., 8., ... below.
    predictions_matrix[:, i] = model.predict(X_holdout)
predictions_matrix
# array([[7., 7., 7.],
#       [7., 7., 7.],
#       [7., 7., 7.],
#       ...,
#       [8., 8., 8.],
#       [8., 8., 8.],
#       [8., 8., 8.]])

array([[7., 7., 7.],
       [7., 7., 7.],
       [7., 7., 7.],
       ...,
       [8., 8., 8.],
       [8., 8., 8.],
       [8., 8., 8.]])

In [39]:
# Sanity check: one row per hold-out sample, one column per level-0 model.
predictions_matrix.shape # (81000, 3)

(81000, 3)

In [35]:
from sklearn.linear_model import LogisticRegression

# Blender (level 1): logistic regression fitted on the level-0 models'
# hold-out predictions.
# NOTE(review): the features are predicted class labels stored as floats, so
# the linear model sees them as magnitudes; feeding predict_proba outputs
# instead may suit a linear blender better - worth trying.
lr = LogisticRegression(solver='saga', max_iter=1000, n_jobs=-1)
lr.fit(X=predictions_matrix, y=Y_holdout)

LogisticRegression(max_iter=1000, n_jobs=-1, solver='saga')

In [43]:
from sklearn.metrics import accuracy_score

# Build the blender's test-set features the same way as for the hold-out set:
# one column of predicted labels per level-0 model. The shape is derived from
# X_test and level_0 rather than hard-coded.
predictions_matrix_test = np.zeros((len(X_test), len(level_0)))
for i, model in enumerate(level_0):
    predictions_matrix_test[:, i] = model.predict(X_test)
# The blender turns the three level-0 predictions into the final prediction.
predictions = lr.predict(predictions_matrix_test)
accuracy = accuracy_score(Y_test, predictions)
accuracy # 0.9678
# the blender underperforms the individual models

0.9678

In [48]:
# Test-set accuracy of k-NN on its own, for comparison with the blender.
knn_pred = knn.predict(X_test)
knn_accuracy = accuracy_score(y_true=Y_test, y_pred=knn_pred)
knn_accuracy  # 0.974

0.974

In [49]:
# Test-set accuracy of the random forest on its own, for comparison with the blender.
forest_pred = forest.predict(X_test)
forest_accuracy = accuracy_score(y_true=Y_test, y_pred=forest_pred)
forest_accuracy  # 0.9802

0.9802

In [52]:
# Test-set accuracy of the SVM pipeline on its own, for comparison with the blender.
svc_pred = svc.predict(X_test)
svc_accuracy = accuracy_score(y_true=Y_test, y_pred=svc_pred)
svc_accuracy  # 0.9839

0.9839

In [80]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees classifier with a fixed seed; note that this fit uses the
# unaugmented training set (X_train), not the augmented one.
extra = ExtraTreesClassifier(random_state=42, n_jobs=-1)
extra.fit(X=X_train, y=Y_train)

ExtraTreesClassifier(n_jobs=-1, random_state=42)

In [88]:
# The blender was disappointing, so let's try a plain voting classifier,
# add an extra-trees classifier for more diversity, and
# train on the augmented data from the Translation notebook.
from sklearn.ensemble import VotingClassifier

# Soft voting over the four models; the ensemble is fitted on the full
# augmented training set and scored on the test set.
ensemble = VotingClassifier(
    estimators=[
        ('1', knn),
        ('2', forest),
        ('3', extra),
        ('4', svc),
    ],
    voting='soft',
    n_jobs=-1,
)
ensemble.fit(X_train_augmented, Y_train_augmented)
soft_pred = ensemble.predict(X_test)
soft_accuracy = accuracy_score(y_true=Y_test, y_pred=soft_pred)
soft_accuracy  # 0.9891 This is the best performing model

0.9891

In [90]:
# Re-score the already-fitted ensemble with hard voting; no refit happens here.
# This changes `ensemble` in place, so any cell run afterwards sees
# voting='hard'.
ensemble.voting = 'hard'
hard_pred = ensemble.predict(X_test)
hard_accuracy = accuracy_score(y_true=Y_test, y_pred=hard_pred)
hard_accuracy  # 0.9861

0.9861