In [1]:
import pickle, gzip
import numpy as np
 
###############################################################################
## load data
 
f = gzip.open('mnist_10000.pkl.gz', 'rb')
trainData, trainLabels, valData, valLabels, testData, testLabels = pickle.load(f,encoding='latin1')
f.close()
 
print("training data points: {}".format(len(trainLabels)))
print("validation data points: {}".format(len(valLabels)))
print("testing data points: {}".format(len(testLabels)))

training data points: 10000
validation data points: 2000
testing data points: 2000


In [2]:
# import cv2
# image = trainData[0]
# image = image.reshape((28, 28))
# cv2.imshow("Image", image)
# cv2.waitKey(0) # press enter to go to next!

In [3]:
# knn from scratch

import operator
from collections import Counter
import math

def distance(instance1, instance2):
    distance = 0
    for x in range(len(instance1)):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def knn(k, train, test, trainlabel):
    res = []
    for i in range(len(test)):
        distances = []
        for j in range(len(train)):
            distances.append((trainlabel[i], distance(train[j], test[i])))
        distances.sort(key=operator.itemgetter(1))
        neighbors = [distances[x][0] for x in range(k)]
        res.append(neighbors)
    return res

def pred_labels(out):
    labels = []
    for i in out:
        dic = Counter(i)
        most_com = dic.most_common(1)[0][0]
        labels.append(most_com)
    return labels

def score(pred, actual):
    count = sum([1 for i in range(len(pred)) if pred[i] == actual[i]])
    return (count / len(pred)) * 100

In [4]:
# scikit-learn approach

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

def sklearn_knn(X_train, y_train, X_test, y_test, k):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    acc = accuracy_score(y_pred, y_test)
    report = classification_report(y_pred, y_test)
    matrix = confusion_matrix(y_pred, y_test)
    yield acc
    yield report
    yield matrix

In [5]:
accuracy, f1, matrix = sklearn_knn(trainData, trainLabels, valData, valLabels, 3)

In [6]:
print(accuracy)

0.953


In [7]:
print(f1)

             precision    recall  f1-score   support

          0       1.00      0.97      0.98       207
          1       1.00      0.91      0.95       224
          2       0.94      0.98      0.96       180
          3       0.94      0.93      0.93       201
          4       0.93      0.96      0.94       181
          5       0.92      0.97      0.94       181
          6       0.99      1.00      0.99       201
          7       0.96      0.95      0.95       224
          8       0.93      0.98      0.95       205
          9       0.93      0.91      0.92       196

avg / total       0.95      0.95      0.95      2000



In [8]:
print(matrix)

[[201   0   0   0   0   0   2   0   1   3]
 [  0 204   5   0   2   1   1   4   5   2]
 [  1   0 176   1   0   1   0   1   0   0]
 [  0   0   2 187   0   6   0   0   4   2]
 [  0   0   0   0 173   2   0   2   1   3]
 [  0   0   0   4   0 175   0   0   2   0]
 [  0   0   0   0   0   1 200   0   0   0]
 [  0   1   3   1   0   1   0 212   2   4]
 [  0   0   1   3   0   1   0   0 200   0]
 [  0   0   0   3  11   2   0   1   1 178]]
