In [1]:
import numpy as np
from scipy import spatial
from scipy import stats
from keras.datasets import cifar10
import math

In [2]:
(Xtr, Ytr), (Xte, Yte) = cifar10.load_data()

print(Xtr.shape)
print(Ytr.shape)
print(Xte.shape)
print(Yte.shape)

(50000, 32, 32, 3)
(50000, 1)
(10000, 32, 32, 3)
(10000, 1)


In [3]:
Xtr = Xtr.reshape((len(Xtr), -1))
Ytr = Ytr.flatten()
Xte = Xte.reshape((len(Xte), -1))
Yte = Yte.flatten()


Xtr = Xtr[::10]
Ytr = Ytr[::10]
Xte = Xte[::10]
Yte = Yte[::10]

print(Xtr.shape)
print(Ytr.shape)
print(Xte.shape)
print(Yte.shape)

(5000, 3072)
(5000,)
(1000, 3072)
(1000,)


In [4]:
class KNN:
    def __init__(self, train_X, train_Y, k=10):
        self.train_X = train_X
        self.train_Y = train_Y
        self.k = k


    def predict(self, X, metric=None):
        if metric is None:
            metric = self.L2

        dist = metric(X)
        ind = np.argsort(dist)
        ind = ind[:, :self.k]

        classes = self.train_Y[ind]
        classes = stats.mode(classes, axis=1).mode.flatten()

        return classes

    def L2(self, X):
         return spatial.distance.cdist(self.train_X, X).T

    def L1(self, X):
          return spatial.distance.cdist(self.train_X, X, metric='cityblock').T

In [5]:
knn = KNN(Xtr, Ytr)
Ypr = knn.predict(Xte)
print(f'Accuracy with the L2 metric = {np.mean(Ypr == Yte)}')

Accuracy with the L2 metric = 0.274


In [6]:
knn = KNN(Xtr, Ytr)
Ypr = knn.predict(Xte, metric=knn.L1)
print(f'Accuracy with the L1 metric = {np.mean(Ypr == Yte)}')

Accuracy with the L1 metric = 0.297


In [7]:
class kFold():
    def __init__(self, k):
        self.k = k

    def split(self, X, Y):
        p = np.random.permutation(len(X))
        X, Y = X[p], Y[p]

        split_len = math.ceil(len(X) / self.k)

        for i in range(self.k):
            yield np.vstack((X[0:i*split_len], X[min((i+1)*split_len, len(X)):])), \
                  np.hstack((Y[0:i*split_len], Y[min((i+1)*split_len, len(Y)):])), \
                  X[i*split_len: min((i+1)*split_len, len(X))], \
                  Y[i*split_len: min((i+1)*split_len, len(Y))]

In [8]:
k_fold = kFold(4)
accuracies = {}

for k in [1,3,5,7]:
    print(f"k-NearestNeighbors for k = {k}")

    fold_accuracies = []
    for i, (Xtr_fold, Ytr_fold, Xva_fold, Yva_fold) in enumerate(k_fold.split(Xtr, Ytr)):
        knn = KNN(Xtr_fold, Ytr_fold, k=k)
        Ypr_fold = knn.predict(Xva_fold)
        curr_accuracy = np.mean(Ypr_fold == Yva_fold)
        fold_accuracies.append(curr_accuracy)

        print(f"fold = {i}, accuracy = {curr_accuracy}")
    

    accuracies[k] = np.mean(fold_accuracies)
    print(f"k = {k}, accuracy = {accuracies[k]}\n")
  

k_selected = max(accuracies, key=accuracies.get)

knn = KNN(Xtr, Ytr, k=k_selected)
Ypr = knn.predict(Xte)
final_accuracy = np.mean(Ypr == Yte)

print(f"\nSelected k for k-NearestNeighbors = {k_selected}, accuracy = {final_accuracy}")

k-NearestNeighbors for k = 1
fold = 0, accuracy = 0.2304
fold = 1, accuracy = 0.2336
fold = 2, accuracy = 0.2648
fold = 3, accuracy = 0.24
k = 1, accuracy = 0.24219999999999997

k-NearestNeighbors for k = 3
fold = 0, accuracy = 0.2512
fold = 1, accuracy = 0.2304
fold = 2, accuracy = 0.2608
fold = 3, accuracy = 0.2344
k = 3, accuracy = 0.24419999999999997

k-NearestNeighbors for k = 5
fold = 0, accuracy = 0.2744
fold = 1, accuracy = 0.2488
fold = 2, accuracy = 0.2416
fold = 3, accuracy = 0.2552
k = 5, accuracy = 0.255

k-NearestNeighbors for k = 7
fold = 0, accuracy = 0.2536
fold = 1, accuracy = 0.26
fold = 2, accuracy = 0.2584
fold = 3, accuracy = 0.2472
k = 7, accuracy = 0.2548


Selected k for k-NearestNeighbors = 5, accuracy = 0.273
