# MNIST

## Load MNIST dataset

In [25]:
import numpy as np

In [26]:
from sklearn.datasets import fetch_openml

# Fetch version 1 of the "mnist_784" dataset from OpenML.
mnist = fetch_openml(name="mnist_784", version=1)

In [27]:
# Pull the feature matrix and the label vector out of the fetched dataset.
X = mnist["data"]
y = mnist["target"]

# Report both shapes, one per line (features first, then labels).
print(X.shape, y.shape, sep="\n")

(70000, 784)
(70000,)


In [28]:
# Hold-out split by position: the first 60000 rows are the training set and
# the remainder is the test set. Labels are cast to unsigned 8-bit integers
# as part of the same step.
X_train, y_train = X[:60000], y[:60000].astype(np.uint8)
X_test, y_test = X[60000:], y[60000:].astype(np.uint8)

In [29]:
# automate the preprocessing steps with scikit-learn's Pipeline
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# One-step preprocessing pipeline: min-max scaling of every feature.
mnist_pipeline = Pipeline(steps=[("min_max_scaler", MinMaxScaler())])

In [30]:
# Fit the preprocessing pipeline on the training features and replace X_train
# with the transformed result.
# NOTE(review): X_train is overwritten here, so the untransformed training
# features are no longer reachable under this name after the cell runs.
X_train = mnist_pipeline.fit_transform(X_train)

In [31]:
# View the first training sample as a 28x28 grid.
some_digit = np.reshape(X_train[0], (28, 28))

[[0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.        ]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.    

In [32]:
import matplotlib.pyplot as plt
import copy

# implement utility function for viewing image
def show_mnist_784(mnist_digit):
    """Display a single MNIST digit as a 28x28 image.

    Parameters
    ----------
    mnist_digit : array-like
        Pixel values of one digit, either flat (784 values) or already
        shaped (28, 28). The caller's object is not modified.

    Raises
    ------
    ValueError
        If the input does not contain exactly 784 values (raised by reshape).
    """
    # reshape() returns a new array object instead of reshaping in place, so
    # its result has to be kept. It is a no-op for input that is already
    # 28x28, which removes the need for a shape check, and it never alters
    # the caller's array, which removes the need for a defensive deep copy.
    digit = np.asarray(mnist_digit).reshape(28, 28)
    plt.imshow(digit, cmap="binary")
    plt.axis("off")
    plt.show()

In [33]:
# use KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Unfitted k-nearest-neighbours classifier; no hyperparameters are overridden,
# so the library defaults apply.
knc = KNeighborsClassifier()

In [0]:
# investigate cross validation score of the model
from sklearn.model_selection import cross_val_score

# Digit recognition is a multi-class classification task, so each of the 5
# folds is scored by accuracy (fraction of correctly classified samples).
# A regression metric such as "neg_mean_squared_error" would treat the class
# labels as ordinal quantities (mistaking a 1 for a 9 would be penalised far
# more than mistaking it for a 2), which is not meaningful for class labels.
# NOTE(review): scores are now in [0, 1] with higher = better; any later cell
# that negated or square-rooted the old negative-MSE values must be updated.
knc_scores = cross_val_score(knc, X_train, y_train, cv=5, scoring="accuracy")