# Handwritten Digit Recognition - MNIST Dataset
## K-Nearest-Neighbour

In [30]:
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.metrics import accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, scale

### Loading Dataset

In [10]:
path_train_data = './archive/mnist_train.csv'
path_test_data = './archive/mnist_test.csv'


def load_data(train_path=None, test_path=None):
    """Read the MNIST train and test CSV files into DataFrames.

    Args:
        train_path: CSV file holding the training split. Defaults to the
            module-level ``path_train_data``.
        test_path: CSV file holding the test split. Defaults to the
            module-level ``path_test_data``.

    Returns:
        A ``(df_train, df_test)`` tuple of pandas DataFrames.
    """
    # Resolve the defaults at call time so that reassigning the module-level
    # paths (e.g. from another notebook cell) still takes effect.
    if train_path is None:
        train_path = path_train_data
    if test_path is None:
        test_path = path_test_data

    return pd.read_csv(train_path), pd.read_csv(test_path)

In [None]:
# Read both CSV splits; load_data() returns them as a (train, test) pair.
df_train, df_test = load_data()

### Class Distribution

In [None]:
# Number of training rows per label, indexed by the label value.
classes = df_train.groupby('label')['label'].count()

# Pie chart of the per-label share, each wedge annotated with its percentage.
plt.pie(x=classes, labels=classes.index, autopct='%.2f%%')
plt.show()

## Preprocessing

In [14]:
# Fit the standardisation on the training features only and reuse the same
# per-column mean/std for the test features. Scaling each split with its own
# statistics would leave the two sets in slightly different feature spaces.
scaler = StandardScaler()
x_train_scale = scaler.fit_transform(df_train.loc[:, df_train.columns != 'label'])
x_test_scale = scaler.transform(df_test.loc[:, df_test.columns != 'label'])

In [18]:
# Target vectors: the 'label' column of each split.
y_train, y_test = df_train['label'], df_test['label']

### Dimensionality Reduction using PCA

In [22]:
def ipca_analysis(x_train, x_test, n_components, batch_size):
    """Reduce both splits with an IncrementalPCA fitted on the training data.

    Args:
        x_train: Training features; used to fit the decomposition.
        x_test: Test features; only transformed, never fitted on.
        n_components: Number of components to keep.
        batch_size: Batch size handed to ``IncrementalPCA``.

    Returns:
        A ``(x_train_pca, x_test_pca)`` tuple with the projected splits.
    """
    reducer = IncrementalPCA(batch_size=batch_size, n_components=n_components)
    train_reduced = reducer.fit_transform(x_train)
    return train_reduced, reducer.transform(x_test)

In [23]:
def pca_analysis(x_train, x_test, n_components):
    """Reduce both splits with a whitened PCA fitted on the training data.

    Args:
        x_train: Training features; used to fit the decomposition.
        x_test: Test features; only transformed, never fitted on.
        n_components: Number of components to keep.

    Returns:
        A ``(x_train_pca, x_test_pca)`` tuple with the projected splits.
    """
    reducer = PCA(whiten=True, n_components=n_components)
    train_reduced = reducer.fit_transform(x_train)
    return train_reduced, reducer.transform(x_test)

In [None]:
# Project both scaled splits onto 50 principal components. The results go
# into x_train_pca / x_test_pca; the label vectors y_train / y_test must stay
# untouched because the classifier cell below uses all four names.
x_train_pca, x_test_pca = pca_analysis(x_train_scale, x_test_scale, 50)

## KNN

In [34]:
# Fit an 11-neighbour classifier on the reduced training split (fit() returns
# the estimator itself), then predict labels for the reduced test split.
knn = KNeighborsClassifier(n_neighbors=11).fit(x_train_pca, y_train)
y_predicted = knn.predict(x_test_pca)

In [29]:
# Overall accuracy of the predictions against the test labels.
accuracy = accuracy_score(y_test, y_predicted)
print('Model accuracy : ', accuracy)

# Macro-averaged precision, recall and F-score.
metric_name = ['precision', 'recall', 'fscore']
metric_score = precision_recall_fscore_support(y_test, y_predicted, average='macro')

# zip() stops after the three named metrics, so any further entry in
# metric_score is not printed.
for name, score in zip(metric_name, metric_score):
    print('Model ' + name + ' : ' + str(score))

### Accuracy and Results

In [28]:
# Confusion matrix of the predictions: rows are the true labels, columns the
# predicted labels.
cmat = confusion_matrix(y_test, y_predicted)

fig, axs = plt.subplots(nrows=1, figsize=(15, 9))
# fmt='d' prints the integer counts verbatim; the default annotation format
# abbreviates larger counts to scientific notation. Passing ax explicitly
# ties the heatmap to the figure created above.
sns.heatmap(cmat, annot=True, fmt='d', ax=axs)
axs.set_xlabel('Predicted label')
axs.set_ylabel('True label')