In [None]:
import tensorflow as tf
from keras.datasets import cifar10
import numpy as np

In [None]:
# Load CIFAR-10 through keras: (tX, ty) is the training split, (vX, vy) the test split.
(tX, ty), (vX, vy) = cifar10.load_data()

# Report the array shapes of both splits: training set first, then test set.
for caption, array in (
  ("x_train shape:", tX),
  ("y_train shape:", ty),
  ("x_test shape:", vX),
  ("y_test shape:", vy),
):
  print(caption, array.shape)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
x_train shape: (50000, 32, 32, 3)
y_train shape: (50000, 1)
x_test shape: (10000, 32, 32, 3)
y_test shape: (10000, 1)


In [None]:
# Flatten every image into a single row vector, keeping one row per sample.
tX_1d = np.reshape(tX, (len(tX), -1))
vX_1d = np.reshape(vX, (len(vX), -1))

In [None]:
# Sanity check: the flattened training matrix should have one row per image.
print(tX_1d.shape)

(50000, 3072)


In [None]:
# My aim is to build a Nearest Neighbor model without using sklearn or TensorFlow, to demonstrate
# a first method for classifying images.

class NearestNeighbor(object):
  """Brute-force 1-nearest-neighbour classifier using the L1 (Manhattan) distance.

  "Training" only memorises the data. Each prediction compares the query row
  against every stored training row and copies the label of the closest one.
  """

  def __init__(self):
    pass

  def train(self, tX, ty):
    """Memorise the training set.

    Args:
      tX: 2-D array of shape (N, D), one flattened example per row.
      ty: labels of the N examples, shape (N,) or (N, 1).
    """
    self.tX = tX
    self.ty = ty

  def predict(self, vX):
    """Predict a label for every row of vX.

    Args:
      vX: 2-D array of shape (M, D), one flattened example per row.

    Returns:
      1-D array of length M with the same dtype as the training labels;
      entry i is the label of the training row with the smallest L1
      distance to vX[i] (the first such row on ties, per np.argmin).
    """
    self.vX = vX

    train = np.asarray(self.tX)
    queries = np.asarray(vX)
    # Labels may arrive as a column vector of shape (N, 1); flatten them so
    # that labels[k] is a scalar and the result is a plain 1-D array.
    labels = np.ravel(self.ty)

    # Unsigned data (e.g. uint8 pixels) wraps around on subtraction
    # (10 - 200 == 66 in uint8) and np.abs cannot undo that, which silently
    # corrupts the distances. Promote once, outside the loop, to a signed
    # dtype wide enough for the differences (uint8 -> int16).
    if train.dtype.kind == 'u' or queries.dtype.kind == 'u':
      signed = np.result_type(train.dtype, queries.dtype, np.int8)
      train = train.astype(signed)
      queries = queries.astype(signed)

    # lets make sure that the output type matches the label type
    pred = np.zeros(queries.shape[0], dtype = labels.dtype)

    # loop over all test rows
    for i in range(queries.shape[0]):
      # find the nearest training image to the i'th test image
      # using the L1 distance (sum of absolute value differences)
      L1_distance = np.sum(np.abs(train - queries[i,:]), axis = 1)
      min_index = np.argmin(L1_distance) # get the index with smallest distance
      pred[i] = labels[min_index] # predict the label of the nearest example

    return pred

In [None]:
model = NearestNeighbor() # create a Nearest Neighbor classifier instance
model.train(tX_1d, ty) # train the classifier on the flattened training images and labels
pred = model.predict(vX_1d) # predict labels on the test images (1-D array, one per image)
# Print the classification accuracy: the fraction of test images whose
# predicted label matches the true label.
# vy has shape (10000, 1) while pred is 1-D, so vy is flattened first;
# comparing them directly would broadcast to a (10000, 10000) matrix and
# report a meaningless number.
print('accuracy: %f' % ( np.mean(pred == vy.ravel()) ))

# NOTE: this cell was intentionally not run, so no accuracy output is recorded.

In [None]:
# We could use the L2 distance instead; only the distance computation needs to change:
# L2_distance = np.sqrt(np.sum(np.square(self.tX - vX[i,:]), axis=1))

In [2]:
# Hyperparameter tuning

# How do we find the right value of k? If you use the test set to tune your hyperparameters, you would be
# overfitting to the test data. A better method is to split the data into 3 sets: train, validation, and test.

# So, in this dataset, take the first 1,000 training images for validation and the remaining 49,000 for training.
# Tune the hyperparameters on the validation set and evaluate the final model on the test set.

In [None]:
# Cross-validation

# If the training data is small, we can split it into multiple folds and use each fold in turn as
# the validation set, then average the performance across the folds.