# MNIST classification with kNN classifier

## 1. Load MNIST dataset and prepare data

In [1]:
import pandas as pd
# Only the first rows of each CSV are used in this notebook.
N_TRAIN_SAMPLES = 1000
N_TEST_SAMPLES = 100

# `nrows` stops parsing after the requested number of data rows, instead of
# reading the whole file and slicing it afterwards.
train = pd.read_csv("./data/archive/mnist_train.csv", nrows=N_TRAIN_SAMPLES)
test = pd.read_csv("./data/archive/mnist_test.csv", nrows=N_TEST_SAMPLES)

# Split each frame into the pixel columns (features) and the `label` column (target).
X_train = train.drop(['label'], axis=1).values
y_train = train['label'].values

X_test = test.drop(['label'], axis=1).values
y_test = test['label'].values

# Labels are kept as column vectors of shape (n_samples, 1).
y_train = y_train.reshape(train.shape[0], 1)
y_test = y_test.reshape(test.shape[0], 1)

## 2. kNN classifier implementation

In [2]:
from knn import OwnKNeighborsClassifier

## 3. Check correctness (compare with existing implementations)

In [3]:
from sklearn.metrics import classification_report, confusion_matrix
# Number of neighbours passed to both the custom and the sklearn classifier in section 3.
K = 5

#### 3.1 our implementation

In [9]:
# Fit the custom classifier on the raw pixels and predict the test digits.
our_knn_model = OwnKNeighborsClassifier(K)
our_knn_model.fit(X_train, y_train)
own_predictions = our_knn_model.predict(X_test)

# Confusion matrix first, then the per-class precision/recall report.
own_confusion = confusion_matrix(y_test, own_predictions)
own_report = classification_report(y_test, own_predictions)
print(own_confusion)
print(own_report)


100%|██████████| 100/100 [00:01<00:00, 99.97it/s]


[[ 8  0  0  0  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0  0  0  0]
 [ 0  1  6  0  1  0  0  0  0  0]
 [ 0  0  0  9  0  1  0  0  0  1]
 [ 0  0  0  0 10  0  0  0  0  4]
 [ 0  1  0  1  1  4  0  0  0  0]
 [ 1  0  0  0  0  0  9  0  0  0]
 [ 0  0  0  0  0  0  0 14  0  1]
 [ 0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  2  0  9]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.88      1.00      0.93        14
           2       1.00      0.75      0.86         8
           3       0.90      0.82      0.86        11
           4       0.83      0.71      0.77        14
           5       0.80      0.57      0.67         7
           6       1.00      0.90      0.95        10
           7       0.88      0.93      0.90        15
           8       1.00      1.00      1.00         2
           9       0.60      0.82      0.69        11

    accuracy                           0.85       100
   macro avg       

#### 3.2 sklearn implementation

In [11]:
from sklearn.neighbors import KNeighborsClassifier
# Reference implementation with the same K, fitted on the same raw pixels.
sklearn_knn_model = KNeighborsClassifier(n_neighbors=K)
# y_train is stored as a column vector of shape (n_samples, 1). sklearn expects a
# 1-D label array and emits a DataConversionWarning otherwise, so flatten it here.
sklearn_knn_model.fit(X_train, y_train.ravel())
y = sklearn_knn_model.predict(X_test)

print(confusion_matrix(y_test, y))
print(classification_report(y_test, y))

[[ 8  0  0  0  0  0  0  0  0  0]
 [ 0 14  0  0  0  0  0  0  0  0]
 [ 0  1  6  0  1  0  0  0  0  0]
 [ 0  1  1  8  0  0  0  0  0  1]
 [ 0  0  0  0 11  0  0  0  0  3]
 [ 0  1  0  1  1  4  0  0  0  0]
 [ 1  0  0  0  1  0  8  0  0  0]
 [ 0  0  0  0  0  0  0 14  0  1]
 [ 0  0  0  0  0  0  0  0  2  0]
 [ 0  0  0  0  0  0  0  2  0  9]]
              precision    recall  f1-score   support

           0       0.89      1.00      0.94         8
           1       0.82      1.00      0.90        14
           2       0.86      0.75      0.80         8
           3       0.89      0.73      0.80        11
           4       0.79      0.79      0.79        14
           5       1.00      0.57      0.73         7
           6       1.00      0.80      0.89        10
           7       0.88      0.93      0.90        15
           8       1.00      1.00      1.00         2
           9       0.64      0.82      0.72        11

    accuracy                           0.84       100
   macro avg       

  sklearn_knn_model.fit(X_train, y_train)


## 4. Define features and distance functions

In [4]:
from hog import HogFeatureLoader
from histagram import HistFeatureLoader
from nn import NnFeatureLoader

# One loader per alternative feature representation compared against raw pixels.
# Each loader is used below through save_features(...) and load_features(...).
hog = HogFeatureLoader()
hist = HistFeatureLoader()
# NOTE(review): presumably the VGG-16 based features discussed in the conclusions — confirm in nn.py.
nn = NnFeatureLoader()

In [13]:
# Run the HOG loader's save_features over both splits (test first, then train).
for split_name, split_pixels in (("test", X_test), ("train", X_train)):
    hog.save_features(split_pixels, split_name)

In [14]:
# Run the histogram loader's save_features over both splits (test first, then train).
for split_name, split_pixels in (("test", X_test), ("train", X_train)):
    hist.save_features(split_pixels, split_name)

In [19]:
# Run the NN loader's save_features over both splits (test first, then train).
for split_name, split_pixels in (("test", X_test), ("train", X_train)):
    nn.save_features(split_pixels, split_name)

In [5]:
from knn import euclidian_distance, manhattan_distance

def _feature_set(x_train_features, x_test_features):
    """Bundle one feature representation with the shared MNIST labels.

    Args:
        x_train_features: training samples in the chosen representation.
        x_test_features: test samples in the same representation.

    Returns:
        A dict with the keys "x_train", "y_train", "x_test" and "y_test";
        the labels are always the notebook-level `y_train` / `y_test`.
    """
    return {
        "x_train": x_train_features,
        "y_train": y_train,
        "x_test": x_test_features,
        "y_test": y_test,
    }


pixels_feature = _feature_set(X_train, X_test)
hog_feature = _feature_set(hog.load_features("train"), hog.load_features("test"))
hist_feature = _feature_set(hist.load_features("train"), hist.load_features("test"))
# NN features are left out of the search and evaluated separately in section 6.3
# because they take up too much memory. Template if they are ever added back
# (note that x_test must load the "test" split):
# nn_feature = _feature_set(nn.load_features("train"), nn.load_features("test"))

# NOTE: `K` was the scalar 5 in section 3; from here on it is the list of candidate
# values, so re-running the section 3 cells after this one would pass a list as the
# neighbour count.
K = [3, 5, 7, 9]
# Each entry pairs a display name with the object handed to the search.
dist_functions = [["euclidian", euclidian_distance], ["manhattan", manhattan_distance]]
features = [["pixels", pixels_feature], ["hog", hog_feature], ["hist", hist_feature]]

## 5. Find hyper-parameters

In [6]:
from hyper_param_search import GridSearch, RandomSearch
# Both strategies receive the same search space; each result is later iterated
# with .items() in section 6.
# NOTE(review): no seed is visible for RandomSearch — its results may vary between runs; confirm.
search_space = (dist_functions, features, K)
grid_search_result = GridSearch().find(*search_space)
random_search_result = RandomSearch().find(*search_space)

100%|██████████| 100/100 [00:01<00:00, 86.55it/s]
100%|██████████| 100/100 [00:01<00:00, 83.50it/s]
100%|██████████| 100/100 [00:01<00:00, 90.74it/s]
100%|██████████| 100/100 [00:01<00:00, 92.32it/s]
100%|██████████| 100/100 [00:00<00:00, 168.62it/s]
100%|██████████| 100/100 [00:00<00:00, 205.06it/s]
100%|██████████| 100/100 [00:00<00:00, 206.10it/s]
100%|██████████| 100/100 [00:00<00:00, 206.51it/s]
100%|██████████| 100/100 [00:00<00:00, 238.08it/s]
100%|██████████| 100/100 [00:00<00:00, 224.57it/s]
100%|██████████| 100/100 [00:00<00:00, 185.30it/s]
100%|██████████| 100/100 [00:00<00:00, 183.60it/s]
100%|██████████| 100/100 [00:01<00:00, 84.57it/s]
100%|██████████| 100/100 [00:01<00:00, 85.57it/s]
100%|██████████| 100/100 [00:01<00:00, 86.01it/s]
100%|██████████| 100/100 [00:01<00:00, 84.38it/s]
100%|██████████| 100/100 [00:00<00:00, 151.73it/s]
100%|██████████| 100/100 [00:00<00:00, 147.42it/s]
100%|██████████| 100/100 [00:00<00:00, 146.53it/s]
100%|██████████| 100/100 [00:00<00:00, 

## 6. Results
#### 6.1 Grid search results

In [7]:
# Each entry prints as (accuracy, [distance name, feature name, k]).
for accuracy, params in grid_search_result.items():
    print((accuracy, params))

(0.96, ['euclidian', 'hog', 3])
(0.95, ['manhattan', 'hog', 7])
(0.91, ['manhattan', 'hog', 9])
(0.86, ['euclidian', 'pixels', 9])
(0.85, ['manhattan', 'pixels', 7])
(0.84, ['manhattan', 'pixels', 3])
(0.83, ['manhattan', 'pixels', 9])
(0.82, ['euclidian', 'pixels', 3])
(0.14, ['manhattan', 'hist', 9])
(0.08, ['manhattan', 'hist', 5])


#### 6.2 Random search results

In [8]:
# Each entry prints as (accuracy, [distance name, feature name, k]).
for accuracy, params in random_search_result.items():
    print((accuracy, params))

(0.96, ['euclidian', 'hog', 3])
(0.95, ['manhattan', 'hog', 3])
(0.91, ['euclidian', 'hog', 9])
(0.83, ['manhattan', 'pixels', 9])
(0.14, ['manhattan', 'hist', 7])
(0.08, ['euclidian', 'hist', 3])


#### 6.3 Special test with NN feature (because they take up too much memory)

In [13]:
# Load the stored NN features for both splits (train first, then test).
x_train, x_test = [nn.load_features(split_name) for split_name in ("train", "test")]

In [16]:
# Evaluate the NN features with 3 neighbours and the Euclidean distance function.
nn_knn_model = OwnKNeighborsClassifier(3, euclidian_distance)
nn_knn_model.fit(x_train, y_train)
nn_accuracy = nn_knn_model.score(x_test, y_test)
print(f"Accuracy: {nn_accuracy}")

100%|██████████| 100/100 [00:04<00:00, 21.96it/s]


Accuracy: 0.55


### The best model parameters:
* k: 3
* Distance function: euclidean
* Features: hog

### The best model's test accuracy (on the full dataset):

In [1]:
from best_model import best_score

# Long-running: the recorded run below iterated over 10,000 samples and took about an hour.
best_score()

100%|██████████| 10000/10000 [59:45<00:00,  2.79it/s] 


Accuracy: 0.9645


###  Why such model parameters are the best?
The tiny dataset used for the search (1,000 training and 100 test samples) may be the cause.

- Classification usually needs a larger K value; a common rule of thumb is the square root of the number of samples. In this case K=3 may work best because the distances are well behaved and samples of the same class lie close to each other.
- Euclidean distance is a good, basic way to compare vectors.
- HOG features summarize the digits well.

What did not work:
- The histogram loses information, which led to much worse results.
- The NN features may not work for several reasons:
    - VGG-16 was trained on 3-channel RGB images, while the MNIST digit data is 1-channel grayscale.
    - VGG-16 was trained for 1000-class classification, while this task has only 10 classes.
    - Something could have been missed in the image preprocessing.

