In [123]:
import pickle
import numpy as np
import os
import platform
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [124]:
class KNearestNeighbor(object):
    """k-nearest-neighbour classifier based on Euclidean (L2) distance.

    Training only stores the data; distances and votes are computed at
    prediction time.
    """

    def __init__(self):
        pass

    def train(self, X, y):
        """Store the training set.

        Args:
            X: 2-D array with one flattened sample per row.
            y: 1-D array where y[i] is the label of X[i]. Labels must be
               non-negative integers because voting uses np.bincount.
        """
        self.train_images = X
        self.train_labels = y

    def predict(self, X, k=1, num_of_loops=0):
        """Predict a label for every row of X.

        Args:
            X: 2-D array of test samples, same number of columns as the
               training data.
            k: number of nearest neighbours that vote for the label.
            num_of_loops: implementation selector; only 0 (the fully
               vectorised distance computation) is supported.

        Returns:
            1-D float array with one predicted label per test sample.

        Raises:
            ValueError: if num_of_loops is not 0, or if k is smaller than 1.
        """
        # Bug fix: the original tested the undefined name `num_loops`, which
        # raised NameError on every call.
        if num_of_loops == 0:
            dists = self.compute_distances(X)
        else:
            raise ValueError('Invalid value %d for num_loops' % num_of_loops)
        return self.predict_labels(dists, k=k)

    def compute_distances(self, X):
        """Return the Euclidean distance between every test/train pair.

        Uses the expansion ||x - t||^2 = ||x||^2 + ||t||^2 - 2 * x.t so that
        no Python-level loop is needed.

        Args:
            X: 2-D array of test samples.

        Returns:
            Array of shape (num_test, num_train); entry [i, j] is the
            distance between test sample i and training sample j.
        """
        train_sq = np.sum(np.square(self.train_images), axis=1)
        test_sq = np.sum(np.square(X), axis=1)[:, np.newaxis]
        sq_dists = train_sq + test_sq - 2 * np.dot(X, self.train_images.T)
        # Floating-point cancellation can leave tiny negative values for
        # (near-)identical samples; clamp them so sqrt never yields NaN.
        sq_dists = np.maximum(sq_dists, 0)
        return np.sqrt(sq_dists)

    def predict_labels(self, dists, k=1):
        """Vote among the k nearest training samples for each test sample.

        Args:
            dists: array of shape (num_test, num_train) as produced by
               compute_distances.
            k: number of neighbours that vote; if k exceeds the number of
               training samples, all of them vote.

        Returns:
            1-D float array of predicted labels. Ties are resolved in favour
            of the smallest label (np.argmax returns the first maximum).

        Raises:
            ValueError: if k is smaller than 1.
        """
        if k < 1:
            # A negative k would silently slice the wrong neighbours.
            raise ValueError('k must be a positive integer, got %r' % (k,))
        num_of_test = dists.shape[0]
        y_pred = np.zeros(num_of_test)
        for i in range(num_of_test):
            nearest = np.argsort(dists[i])[:k]
            closest_y = self.train_labels[nearest]
            y_pred[i] = np.argmax(np.bincount(closest_y))
        return y_pred

In [125]:
def load_pickle(f):
    """Unpickle an object from the open binary file `f`.

    On Python 3 the payload is decoded with 'latin1'; on Python 2 the plain
    loader is used.

    Note: unpickling can execute arbitrary code, so only use this on files
    from a trusted source.

    Raises:
        ValueError: if the interpreter's major version is neither 2 nor 3.
    """
    py_version = platform.python_version_tuple()
    major = py_version[0]
    if major == '3':
        return pickle.load(f, encoding='latin1')
    if major == '2':
        return pickle.load(f)
    raise ValueError("invalid python version: {}".format(py_version))

def load_CIFAR_batch(filename):
  """Load one CIFAR batch file.

  Args:
      filename: path of a pickled batch whose dict has a 'data' array and a
          'labels' sequence.

  Returns:
      (X, Y): X is a float array of shape (N, 32, 32, 3) and Y is an array
      holding the N labels.
  """
  with open(filename, 'rb') as f:
    datadict = load_pickle(f)
  # The file is no longer needed once the dict is in memory.
  X = datadict['data']
  Y = datadict['labels']
  # -1 infers the number of images from the data instead of assuming exactly
  # 10000 per batch; the transpose moves the channel axis last.
  X = X.reshape(-1, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
  Y = np.array(Y)
  return X, Y

def load_CIFAR10(ROOT):
  """Load the five training batches and the test batch found under ROOT.

  Reads 'data_batch_1' .. 'data_batch_5' and 'test_batch' with
  load_CIFAR_batch and concatenates the training batches in order.

  Returns:
      (Xtr, Ytr, Xte, Yte): training images, training labels, test images
      and test labels.
  """
  batch_paths = [os.path.join(ROOT, 'data_batch_%d' % (b, )) for b in range(1, 6)]
  batches = [load_CIFAR_batch(path) for path in batch_paths]
  Xtr = np.concatenate([images for images, _ in batches])
  Ytr = np.concatenate([labels for _, labels in batches])
  Xte, Yte = load_CIFAR_batch(os.path.join(ROOT, 'test_batch'))
  return Xtr, Ytr, Xte, Yte

In [126]:
# Folder containing the CIFAR-10 batch files read by load_CIFAR10.
cifar10_dir = '../input/cifarabid/cifar-10-batches-py/'
(train_images, train_labels,
 test_images, test_labels) = load_CIFAR10(cifar10_dir)

In [127]:
# Quick sanity check: show the shape of each loaded array.
for caption, array in (
    ('Training data shape: ', train_images),
    ('Training labels shape: ', train_labels),
    ('Test data shape: ', test_images),
    ('Test labels shape: ', test_labels),
):
    print(caption, array.shape)

In [128]:
# Show a grid of random training images: one column per class,
# `samples_per_class` rows, with the class name above the first row.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_of_classes = len(classes)
samples_per_class = 8
plt.figure(figsize = (20, 20))
for class_id, class_name in enumerate(classes):
    candidates = np.flatnonzero(train_labels == class_id)
    chosen = np.random.choice(candidates, samples_per_class, replace=False)
    for row, image_idx in enumerate(chosen):
        # Subplot indices are 1-based and run row by row.
        plt.subplot(samples_per_class, num_of_classes, row * num_of_classes + class_id + 1)
        plt.imshow(train_images[image_idx].astype('uint8'))
        plt.axis('off')
        if row == 0:
            plt.title(class_name)
plt.show()

In [129]:
# Keep only the first samples of each split so the k-NN experiments run faster.
num_of_training, num_of_test = 8000, 800

mask1 = list(range(num_of_training))
mask2 = list(range(num_of_test))

train_images, train_labels = train_images[mask1], train_labels[mask1]
test_images, test_labels = test_images[mask2], test_labels[mask2]

In [130]:
# Flatten every image into one row so each split becomes a 2-D matrix
# (number of samples x values per image).
train_images = train_images.reshape(train_images.shape[0], -1)
test_images = test_images.reshape(test_images.shape[0], -1)
print(train_images.shape, test_images.shape)

In [131]:
# k-fold cross-validation over several values of k.
# k_to_accuracies[k] ends up holding one accuracy per fold, in fold order.
num_of_folds = 5
k_choices = [1, 4, 7, 9, 10, 14, 16, 17, 60, 100]

train_images_folds = np.array_split(train_images, num_of_folds)
train_labels_folds = np.array_split(train_labels, num_of_folds)
k_to_accuracies = {k: [] for k in k_choices}

for num_knn in range(num_of_folds):
    # Fold `num_knn` is held out for validation; the remaining folds train.
    # Dedicated names are used so the notebook-level train/test arrays are
    # not overwritten and the cell can be re-run safely.
    val_images = train_images_folds[num_knn]
    val_labels = train_labels_folds[num_knn]
    # Concatenating a filtered list also works when folds differ in size,
    # which np.delete on a list of arrays does not.
    fold_train_images = np.concatenate(
        [fold for j, fold in enumerate(train_images_folds) if j != num_knn], axis=0)
    fold_train_labels = np.concatenate(
        [fold for j, fold in enumerate(train_labels_folds) if j != num_knn], axis=0)

    classifier = KNearestNeighbor()
    classifier.train(fold_train_images, fold_train_labels)
    # The distance matrix depends only on the fold, so compute it once and
    # reuse it for every k.
    dists = classifier.compute_distances(val_images)

    for k in k_to_accuracies:
        y_val_pred = classifier.predict_labels(dists, k)
        num_correct = np.sum(y_val_pred == val_labels)
        # Divide by the size of the held-out fold, not by the unrelated
        # size of the separate test set.
        accuracy = float(num_correct) / len(val_labels)
        k_to_accuracies[k].append(accuracy)

In [132]:
# Plot the cross-validation results: one point per fold for each k, plus the
# mean accuracy per k with standard-deviation error bars.
# The x values come from the same sorted keys as the statistics, so they stay
# aligned even if k_choices is unsorted or contains duplicates.
sorted_ks = sorted(k_to_accuracies)

for k in sorted_ks:
    accuracies = k_to_accuracies[k]
    plt.scatter([k] * len(accuracies), accuracies)

accuracies_mean = np.array([np.mean(k_to_accuracies[k]) for k in sorted_ks])
accuracies_std = np.array([np.std(k_to_accuracies[k]) for k in sorted_ks])
plt.errorbar(sorted_ks, accuracies_mean, yerr=accuracies_std)
plt.title('Cross-validation on k')
plt.xlabel('k')
plt.ylabel('Cross-validation accuracy')
plt.show()