 k-Nearest Neighbor

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os

In [None]:
def load_batch(file_path):
    """Read one pickled CIFAR batch file and return its images and labels.

    The file is unpickled with ``encoding='bytes'``, so the dictionary keys
    come back as ``bytes``; they are decoded to ``str`` before lookup.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so only
    point this at batch files obtained from a trusted source.

    Args:
        file_path: Path of the pickled batch file.

    Returns:
        A ``(data, labels)`` tuple. ``data`` is the ``'data'`` entry reshaped
        to ``(N, 3, 32, 32)`` and then transposed to ``(N, 32, 32, 3)``;
        ``labels`` is the ``'labels'`` entry, returned unchanged.
    """
    with open(file_path, 'rb') as batch_file:
        raw_batch = pickle.load(batch_file, encoding='bytes')

    # Keys are bytes because of encoding='bytes'; convert them to str.
    batch = {key.decode('utf8'): value for key, value in raw_batch.items()}

    images = batch['data']
    batch_labels = batch['labels']

    # Each row is stored channel-first; move the channel axis to the end.
    num_images = images.shape[0]
    images = images.reshape(num_images, 3, 32, 32).transpose(0, 2, 3, 1)
    return images, batch_labels


def load_data(path):
    """Load the five training batches and the test batch found under ``path``.

    Training batches are read from ``data_batch_1`` .. ``data_batch_5`` and
    copied into preallocated ``uint8`` arrays, 10000 samples per batch. The
    test set is read from ``test_batch``.

    Args:
        path: Directory that contains the batch files.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` where the image arrays are
        whatever ``load_batch`` returns for the images and the label arrays
        are reshaped into a single column.
    """
    batch_size = 10000
    num_train_samples = 50000

    x_train_local = np.empty((num_train_samples, 32, 32, 3), dtype='uint8')
    y_train_local = np.empty((num_train_samples,), dtype='uint8')

    for batch_index in range(5):
        # Rows of the preallocated arrays that this batch fills.
        rows = slice(batch_index * batch_size, (batch_index + 1) * batch_size)
        batch_file_path = os.path.join(path, 'data_batch_' + str(batch_index + 1))
        batch_images, batch_labels = load_batch(batch_file_path)
        x_train_local[rows, :, :, :] = batch_images
        y_train_local[rows] = batch_labels

    x_test_local, y_test_local = load_batch(os.path.join(path, 'test_batch'))

    # Labels become column vectors: one row per sample.
    y_train_local = np.reshape(y_train_local, (len(y_train_local), 1))
    y_test_local = np.reshape(y_test_local, (len(y_test_local), 1))

    return (x_train_local, y_train_local), (x_test_local, y_test_local)

In [None]:
def rgb2gray(rgb):
    """Collapse the colour channels of an image into a single gray channel.

    The first three entries along the last axis are combined as a weighted
    sum with weights 0.2989, 0.5870 and 0.1140; any further entries on that
    axis are ignored.
    """
    channel_weights = [0.2989, 0.5870, 0.1140]
    colour_channels = rgb[..., :3]
    return np.dot(colour_channels, channel_weights)

In [None]:
def euclidean_distance(image1, image2):
    """Return the Euclidean (L2) distance between two images.

    Both images are converted with ``rgb2gray`` first; the result is the
    square root of the summed squared differences of the converted values.
    """
    difference = rgb2gray(image1) - rgb2gray(image2)
    return np.sqrt(np.sum(difference ** 2))

In [None]:
def manhattan_distance(image1, image2):
    """Return the Manhattan (L1) distance between two images.

    Both images are converted with ``rgb2gray`` first; the result is the sum
    of the absolute differences of the converted values.
    """
    difference = rgb2gray(image1) - rgb2gray(image2)
    return np.abs(difference).sum()

In [None]:
# Locations of the five training batch files and the test batch file.
path1 = 'cifar-10-batches-py/data_batch_1'
path2 = 'cifar-10-batches-py/data_batch_2'
path3 = 'cifar-10-batches-py/data_batch_3'
path4 = 'cifar-10-batches-py/data_batch_4'
path5 = 'cifar-10-batches-py/data_batch_5'
path6 = 'cifar-10-batches-py/test_batch'

# Only the first training batch is loaded in this cell.
x_batch, y_batch = load_batch(path1)

# Turn the labels into a column: one row per sample.
y_batch = np.asarray(y_batch).reshape(len(y_batch), 1)

print("Train data (x_data): ", x_batch.shape)
print("Train labels (y_data): ", y_batch.shape)

In [None]:
# Load the complete train/test split from the local batch files.
path = 'cifar-10-batches-py'
train_split, test_split = load_data(path)
x_train, y_train = train_split
x_test, y_test = test_split

# Report the shape of every array that was loaded.
print("Train data (x_train): ", x_train.shape)
print("Train labels (y_train): ", y_train.shape)
print("Test data (x_test): ", x_test.shape)
print("Test labels (y_test): ", y_test.shape)

In [None]:
# Number of neighbours that vote on each prediction.
k = 7
# Number of test samples to classify; use x_test.shape[0] for the full test set.
num_samples = 1000
predictions = np.zeros(num_samples)

# Convert to grayscale once, instead of once per (test, train) pair, and
# flatten every image into a row so distances can be computed in one shot.
train_gray = rgb2gray(x_batch).reshape(x_batch.shape[0], -1)
test_gray = rgb2gray(x_test[:num_samples]).reshape(num_samples, -1)

# Flat integer label vector, so labels can be used directly as vote indices
# whether y_batch is a plain sequence or a column array.
train_labels = np.ravel(y_batch).astype(int)

# i for each test sample
for i in range(num_samples):
    # Euclidean distances from this test sample to EVERY sample in the train
    # batch (same formula as euclidean_distance, applied row-wise).
    distances = np.sqrt(np.sum((train_gray - test_gray[i]) ** 2, axis=1))
    # train indices ordered from nearest to farthest
    min_distance_ids = np.argsort(distances)

    # count the votes of the k nearest neighbours, one bin per class label
    label_count = np.bincount(train_labels[min_distance_ids[:k]], minlength=10)
    # the most recurring label is the prediction for this sample
    # (ties resolve to the lowest label, as argmax returns the first maximum)
    predictions[i] = np.argmax(label_count)


# y_test is a column of shape (N, 1); flatten it so the comparison is
# element-wise instead of broadcasting to a (num_samples, num_samples) matrix.
accuracy = np.mean(predictions == np.ravel(y_test[:num_samples]))
print(accuracy)