In [3]:
import pandas as pd
import numpy as np
from scipy.stats import mode

In [4]:
# Load the four header-less TinyMNIST CSV files as NumPy arrays, in the order
# train data, train labels, test data, test labels.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_path, header=None).values
    for csv_path in (
        'TinyMNIST/trainData.csv',
        'TinyMNIST/trainLabels.csv',
        'TinyMNIST/testData.csv',
        'TinyMNIST/testLabels.csv',
    )
)

According to the slides, a reasonable estimate of the posterior probability is

$P_n(w_i|x) = \frac{k_i}{k}$

So there is no need to estimate the probabilities explicitly: picking the class with the largest $k_i$ is the same as taking a majority vote among the $k$ nearest neighbors.

In [24]:

def KNN_nomral_predict(X_test, X_train, k=5, train_labels=None, test_labels=None):
    """Classify test samples by majority vote among their k nearest training samples.

    Neighbours are ranked by squared Euclidean distance. Alongside each
    prediction the function reports the k-NN posterior estimate ``k_i / k``
    for the sample's true class, i.e. the fraction of its neighbours that
    carry the true label.

    Parameters
    ----------
    X_test : array-like, shape (n_test, n_features)
        Samples to classify.
    X_train : array-like, shape (n_train, n_features)
        Reference samples.
    k : int, default 5
        Number of neighbours to consult. Values larger than ``n_train`` use
        every training sample.
    train_labels : array-like, shape (n_train,) or (n_train, 1), optional
        One label per training sample. When omitted, the notebook-level
        ``y_train`` is used so existing three-argument calls keep working.
    test_labels : array-like, shape (n_test,) or (n_test, 1), optional
        True label of each test sample. When omitted, the notebook-level
        ``y_test`` is used.

    Returns
    -------
    predicts : list
        Predicted label per test sample; vote ties go to the smallest label.
        If ``train_labels`` is 2-D (a label column), every entry is a
        length-1 array so that comparing the list with an ``(n_test, 1)``
        label column stays elementwise; otherwise entries are scalars.
    probs : list of float
        Per test sample, the fraction of its neighbours whose label equals
        the sample's true label.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or a label array does not contain exactly
        one label per sample.
    """
    if k < 1:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))

    # Fall back to the notebook globals only when labels are not passed in.
    if train_labels is None:
        train_labels = y_train
    if test_labels is None:
        test_labels = y_test

    # Work in float so the subtraction below cannot wrap around on unsigned
    # integer pixel data.
    X_train = np.asarray(X_train, dtype=float)
    X_test = np.asarray(X_test, dtype=float)
    train_labels = np.asarray(train_labels)
    test_labels = np.asarray(test_labels)

    # Remember the caller's label layout, then vote on flat 1-D labels.
    column_labels = train_labels.ndim > 1
    flat_train = train_labels.ravel()
    flat_test = test_labels.ravel()
    if len(flat_train) != len(X_train):
        raise ValueError('train_labels must hold exactly one label per row of X_train')
    if len(flat_test) != len(X_test):
        raise ValueError('test_labels must hold exactly one label per row of X_test')

    predicts = []
    probs = []
    for x, true_label in zip(X_test, flat_test):
        sq_dist = np.sum((X_train - x) ** 2, axis=1)
        kn_idx = np.argsort(sq_dist)[:k]
        neighbour_labels = flat_train[kn_idx]

        # Compare first, then average: this is k_i / k for the true class.
        probs.append((neighbour_labels == true_label).mean())

        # np.unique returns sorted classes and argmax takes the first maximum,
        # so ties resolve to the smallest label.
        classes, counts = np.unique(neighbour_labels, return_counts=True)
        winner = classes[np.argmax(counts)]
        predicts.append(np.array([winner]) if column_labels else winner)
    return predicts, probs

In [32]:
# Sweep the neighbourhood size from a single neighbour up to the whole training set.
# The labels are flattened once: comparing a list of predictions directly with the
# (n, 1) label column can silently broadcast to an (n, n) matrix and yield a
# meaningless CCR when the predictions are scalars.
true_labels = np.ravel(y_test)
for k in [1, 2, 5, 10, 50, 100, len(X_train)]:
    predicts, probs = KNN_nomral_predict(X_test, X_train, k)
    # CCR: share of test samples whose predicted label equals the true label.
    ccr = (np.ravel(predicts) == true_labels).mean()
    print('K:', k, end=' ')
    print('CCR:', ccr, end=' ')
    # 1 - average fraction of the k neighbours that carry the true label.
    print('Error rate:', 1 - np.mean(probs))

K: 1 CCR: 0.9184 Error rate: 0.0816
K: 2 CCR: 0.9004 Error rate: 0.0978
K: 5 CCR: 0.9084 Error rate: 0.12824000000000002
K: 10 CCR: 0.9012 Error rate: 0.15603999999999996
K: 50 CCR: 0.8584 Error rate: 0.27656000000000003
K: 100 CCR: 0.828 Error rate: 0.35682400000000003
K: 5000 CCR: 0.1148 Error rate: 0.8997868
