## Building a classifier for the MNIST dataset using KNeighborsClassifier that achieves over 97% accuracy on the test set

In [1]:
from sklearn.datasets import fetch_openml

# Fetch the "mnist_784" dataset from OpenML; as_frame=False asks for NumPy
# arrays rather than pandas objects.
mnist = fetch_openml(name="mnist_784", as_frame=False)

# NOTE(review): echoing the whole Bunch prints every array and all feature
# names; `mnist.keys()` would give a much shorter overview.
mnist

{'data': array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 'target': array(['5', '0', '4', ..., '4', '5', '6'], dtype=object),
 'frame': None,
 'categories': {'class': ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']},
 'feature_names': ['pixel1',
  'pixel2',
  'pixel3',
  'pixel4',
  'pixel5',
  'pixel6',
  'pixel7',
  'pixel8',
  'pixel9',
  'pixel10',
  'pixel11',
  'pixel12',
  'pixel13',
  'pixel14',
  'pixel15',
  'pixel16',
  'pixel17',
  'pixel18',
  'pixel19',
  'pixel20',
  'pixel21',
  'pixel22',
  'pixel23',
  'pixel24',
  'pixel25',
  'pixel26',
  'pixel27',
  'pixel28',
  'pixel29',
  'pixel30',
  'pixel31',
  'pixel32',
  'pixel33',
  'pixel34',
  'pixel35',
  'pixel36',
  'pixel37',
  'pixel38',
  'pixel39',
  'pixel40',
  'pixel41',
  'pixel42',
  'pixel43',
  'pixel44',
  'pixel45',
  'pixel46

In [2]:
# Feature matrix (pixel values) and label vector, pulled out of the Bunch.
X = mnist.data
y = mnist.target

In [3]:
# Sanity check: (n_samples, n_features) of the full feature matrix.
X.shape

(70000, 784)

In [4]:
# Sanity check: the label vector should have one entry per row of X.
y.shape

(70000,)

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for final evaluation; the fixed seed makes the
# shuffle reproducible.
# NOTE(review): no `stratify=` is passed, so class proportions in the two
# parts are not guaranteed to match — confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Training features: expect 80% of the rows of X.
X_train.shape

(56000, 784)

In [7]:
# Test features: expect the remaining 20% of the rows of X.
X_test.shape

(14000, 784)

In [8]:
# Training labels: length should match the row count of X_train.
y_train.shape

(56000,)

In [9]:
# Test labels: length should match the row count of X_test.
y_test.shape

(14000,)

In [27]:
# Flatten every sample into a single feature vector: the first axis (one row
# per sample) is kept and all remaining axes are collapsed into one.
# NOTE(review): the shapes displayed earlier are already (n_samples, 784), so
# this looks like a no-op kept for safety — confirm.
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)

In [28]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, then apply that
# same mapping to both splits so no test-set statistics enter preprocessing.
# NOTE(review): MinMaxScaler rescales each pixel column independently; a
# global division by 255 would keep relative pixel intensities — worth comparing.
# NOTE(review): this cell's execution counter (28) is later than the
# grid-search fit cell's (17); re-run top to bottom to confirm the recorded
# fit used these arrays.
scaler = MinMaxScaler().fit(X_train_flat)
X_trained_scaled = scaler.transform(X_train_flat)
X_test_scaled = scaler.transform(X_test_flat)

In [15]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Base estimator; its hyperparameters are filled in by the grid search.
knn_clf = KNeighborsClassifier()

# Candidate settings: 3 neighbour counts x 2 weighting schemes x 2 Minkowski
# exponents (p=1 Manhattan, p=2 Euclidean) = 12 combinations.
param_grid = dict(
    n_neighbors=[3, 5, 7],
    weights=["uniform", "distance"],
    algorithm=["auto"],
    p=[1, 2],
)

In [16]:
# 3-fold cross-validated search over param_grid, ranked by accuracy;
# n_jobs=-1 lets scikit-learn use all available CPU cores.
grid_search = GridSearchCV(
    estimator=knn_clf,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

In [17]:
# Run the search on the scaled training split; the test split is not used here.
grid_search.fit(X_trained_scaled, y_train)

In [18]:
# Winning hyperparameter combination, then its mean cross-validated accuracy.
print(grid_search.best_params_, grid_search.best_score_, sep="\n")

{'algorithm': 'auto', 'n_neighbors': 3, 'p': 2, 'weights': 'distance'}
0.9696785589727869


In [21]:
# Accuracy of the best model found by the search, measured on the held-out
# (scaled) test split.
test_accuracy = grid_search.best_estimator_.score(X_test_scaled, y_test)
test_accuracy

0.9728571428571429

In [29]:
# Positions (within the test split) of the samples the best model gets wrong.
# This is an index array, not a count; its length is the number of errors.
import numpy as np

y_pred = grid_search.best_estimator_.predict(X_test_scaled)
misclassified = np.flatnonzero(y_pred != y_test)
misclassified

array([   49,   183,   240,   241,   256,   313,   385,   431,   485,
         565,   605,   631,   672,   699,   756,   758,   760,   777,
         830,   868,  1026,  1106,  1120,  1148,  1163,  1194,  1214,
        1251,  1297,  1341,  1342,  1351,  1364,  1480,  1487,  1516,
        1528,  1611,  1650,  1658,  1670,  1707,  1714,  1716,  1718,
        1737,  1744,  1790,  1826,  1861,  1896,  1903,  1918,  1924,
        1965,  1987,  2023,  2025,  2030,  2044,  2067,  2077,  2108,
        2139,  2141,  2201,  2299,  2327,  2338,  2396,  2420,  2467,
        2481,  2482,  2511,  2551,  2602,  2607,  2617,  2622,  2651,
        2666,  2689,  2724,  2757,  2834,  2857,  2896,  2934,  2986,
        2993,  3031,  3044,  3096,  3101,  3120,  3207,  3258,  3295,
        3297,  3352,  3390,  3403,  3446,  3475,  3534,  3535,  3581,
        3596,  3731,  3732,  3760,  3762,  3779,  3827,  3876,  3998,
        4023,  4087,  4107,  4141,  4155,  4167,  4169,  4203,  4228,
        4245,  4280,

In [24]:
# Length of the index array = number of misclassified test samples.
misclassified.shape

(380,)