In [None]:
# Run some setup code for this notebook.

import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt

# This is a bit of magic to make matplotlib figures appear inline in the
# notebook rather than in a new window.
%matplotlib inline
# Defaults applied to every figure drawn in this notebook: figure size,
# image interpolation mode, and image colormap.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [None]:
# Read the raw CIFAR-10 data from disk and unpack it into the train/test
# image and label arrays.
cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
cifar10_splits = load_CIFAR10(cifar10_dir)
X_train, y_train, X_test, y_test = cifar10_splits

In [None]:
# Print the shape of the training image array as returned by load_CIFAR10.
print(X_train.shape)

In [None]:
# Draw a grid of randomly chosen training images: one column per class,
# samples_per_class rows, with the class name above the top image.
classes = ['plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
num_classes = len(classes)
samples_per_class = 5
for y, cls in enumerate(classes):
    class_idxs = np.flatnonzero(y_train == y)
    chosen = np.random.choice(class_idxs, samples_per_class, replace=False)
    for row, idx in enumerate(chosen):
        # Subplot slots are numbered row by row starting at 1, so class y
        # always lands in column y of the current row.
        plt.subplot(samples_per_class, num_classes, row * num_classes + y + 1)
        plt.imshow(X_train[idx].astype('uint8'))
        plt.axis('off')
        if row == 0:
            plt.title(cls)

In [None]:
# Sizes of the four splits carved out of the raw data.
num_training = 49000
num_validation = 1000
num_test = 1000
num_dev = 500

# Validation set: the num_validation points that follow the first
# num_training points of the original training set.
mask = np.arange(num_training, num_training + num_validation)
X_val, y_val = X_train[mask], y_train[mask]

# Training set: the first num_training points of the original training set.
mask = np.arange(num_training)
X_train, y_train = X_train[mask], y_train[mask]

# Development set: num_dev points drawn without replacement from the
# (already reduced) training set.
mask = np.random.choice(num_training, num_dev, replace=False)
X_dev, y_dev = X_train[mask], y_train[mask]

# Test set: the first num_test points of the original test set.
mask = np.arange(num_test)
X_test, y_test = X_test[mask], y_test[mask]

In [None]:
# Flatten every example into a single row, keeping one row per example.
X_train = X_train.reshape(X_train.shape[0], -1)
X_val = X_val.reshape(X_val.shape[0], -1)
X_test = X_test.reshape(X_test.shape[0], -1)
X_dev = X_dev.reshape(X_dev.shape[0], -1)

In [None]:
# Per-feature mean over the training rows, displayed as a 32x32x3 image.
mean_image = X_train.mean(axis=0)
plt.figure(figsize=(4, 4))
mean_thumbnail = mean_image.reshape(32, 32, 3).astype('uint8')
plt.imshow(mean_thumbnail)

In [None]:
# Subtract the training-set mean from every split, modifying each array in
# place (the augmented assignment writes into the existing array).
for split in (X_train, X_val, X_test, X_dev):
    split -= mean_image

In [None]:
# Append a constant column of ones to the right of every split.
X_train = np.concatenate([X_train, np.ones((X_train.shape[0], 1))], axis=1)
X_val = np.concatenate([X_val, np.ones((X_val.shape[0], 1))], axis=1)
X_test = np.concatenate([X_test, np.ones((X_test.shape[0], 1))], axis=1)
X_dev = np.concatenate([X_dev, np.ones((X_dev.shape[0], 1))], axis=1)

In [None]:
# Display the current shape of the training matrix.
X_train.shape

In [None]:
# Print the shapes of the train, validation, test and dev matrices on one line.
print(*(split.shape for split in (X_train, X_val, X_test, X_dev)))

In [None]:
from cs231n.classifiers.linear_svm import svm_loss_naive
import time

# Time one evaluation of the loop-based SVM loss on the dev set, starting
# from small random weights (weight creation is included in the timing).
tic = time.time()
W = 0.0001 * np.random.rand(X_dev.shape[1], 10)
loss, grad = svm_loss_naive(W, X_dev, y_dev, 0.0001)
toc = time.time() - tic  # elapsed seconds, not a timestamp
report = "It took {} seconds, with a loss of {:.2f}"
print(report.format(toc, loss))

In [None]:
# Once you've implemented the gradient, recompute it with the code below
# and gradient check it with the function we provided for you.

# Imported in this cell so that it runs on a fresh kernel: the only other
# import of svm_loss_vectorized sits in a later cell, which made this cell
# fail with a NameError under "Restart & Run All".
from cs231n.classifiers.linear_svm import svm_loss_vectorized
from cs231n.gradient_check import grad_check_sparse

# Compute the loss and its gradient at W, with regularization turned off.
loss, grad = svm_loss_vectorized(W, X_dev, y_dev, 0.0)

# Numerically compute the gradient along several randomly chosen dimensions, and
# compare them with your analytically computed gradient. The numbers should match
# almost exactly along all dimensions.
f = lambda w: svm_loss_vectorized(w, X_dev, y_dev, 0.0)[0]
grad_numerical = grad_check_sparse(f, W, grad)

# Do the gradient check once again with regularization turned on
# (you didn't forget the regularization gradient, did you?).
loss, grad = svm_loss_vectorized(W, X_dev, y_dev, 1e2)
f = lambda w: svm_loss_vectorized(w, X_dev, y_dev, 1e2)[0]
grad_numerical = grad_check_sparse(f, W, grad)

# Inline Question 1:
It is possible that once in a while a dimension in the gradcheck will not match exactly. What could such a discrepancy be caused by? Is it a reason for concern? What is a simple example in one dimension where a gradient check could fail? *Hint: the SVM loss function is not strictly speaking differentiable*

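Each hinge term max(0, scores[j] - correct_score + 1) has a derivative (with respect to its argument) of 1 when the margin is positive and 0 when it is negative, but exactly at scores[j] - correct_score + 1 = 0 there is a kink and the derivative is undefined. Our implementation treats that case as inactive, so it adds neither X[i] to column j nor an extra -X[i] to the correct column. The numerical gradient, however, nudges the weights across the kink and can see a slope of 1. So it WOULD include that extra X[i] / -X[i] contribution in the gradient, while the analytic one would not.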

One example could be 
x = [1, -2]
W = [[-1, 2],
     [0, 1]]

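In this case, x dot W[:, 0] (the first column of W, i.e. the score of class 0) is -1 and x dot W[:, 1] (the score of class 1) is 0, so with class 1 as the correct class the margin is -1 - 0 + 1 = 0.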
This would cause the max(0, 0) to be non-differentiable at that point. 

Gradcheck would calculate ( f(W + h) - f(W) ) / h, nudging one entry of W at a time. Take the top-left entry of W, the weight that multiplies x[0] = 1 in the score of class 0. Increasing it by a nudge h changes that score to (-1 + h)\*1 + 0\*(-2) = -1 + h, so the margin becomes (-1 + h) - 0 + 1 = h, which is > 0. (Nudging the entry that multiplies x[1] = -2 instead gives a score of -1 - 2h and a margin of -2h < 0, so the loss stays at 0 and nothing interesting happens.)

So for the top-left entry, the loss function at f(W) gives max(0, 0) = 0 and at f(W + h) gives max(0, h) = h, so we have h/h = 1.

In the case of our "analytical" gradient, we would just have a margin of -1 - 0 + 1 = 0, which is not > 0, so we don't actually do anything to our zero gradient.

Overall, we would just have our numerical be 1 and our analytical be 0. But alas, there is no need to despair!
This would only happen if a margin was within one nudge of the kink (0 <= margin < h), meaning this is very unlikely to happen! So code on dear child.

In [274]:
from cs231n.classifiers.linear_svm import svm_loss_vectorized

# Loop-based implementation.
tic = time.time()
loss_naive, grad_naive = svm_loss_naive(W, X_dev, y_dev, 1e-5)
toc = time.time()
print ('Naive loss: %e computed in %fs' % (loss_naive, toc - tic))

# Vectorized implementation on the same weights, data and regularization.
tic = time.time()
loss_vectorized, _ = svm_loss_vectorized(W, X_dev, y_dev, 1e-5)
toc = time.time()
print ('Vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))

# The losses should match but your vectorized implementation should be much faster.
loss_gap = loss_naive - loss_vectorized
print ('difference: %f' % loss_gap)

Naive loss: 8.991033e+00 computed in 0.208981s
Vectorized loss: 8.991033e+00 computed in 0.020474s
difference: -0.000000


In [276]:
# Complete the implementation of svm_loss_vectorized, and compute the gradient
# of the loss function in a vectorized way.

# Both implementations should produce the same gradient, with the vectorized
# one still being much faster.

# Loop-based gradient.
tic = time.time()
_, grad_naive = svm_loss_naive(W, X_dev, y_dev, 1e-5)
toc = time.time()
print ('Naive loss and gradient: computed in %fs' % (toc - tic))

# Vectorized gradient.
tic = time.time()
_, grad_vectorized = svm_loss_vectorized(W, X_dev, y_dev, 1e-5)
toc = time.time()
print ('Vectorized loss and gradient: computed in %fs' % (toc - tic))

# A loss is a single number and easy to compare directly. A gradient is a
# matrix, so the two are compared through the Frobenius norm of their
# element-wise difference.
grad_gap = grad_naive - grad_vectorized
difference = np.linalg.norm(grad_gap, ord='fro')
print ('difference: %f' % difference)

Naive loss and gradient: computed in 16.768927s
Vectorized loss and gradient: computed in 0.418776s
difference: 0.000000


With the other method:
Naive loss and gradient: computed in 14.499156s
Vectorized loss and gradient: computed in 0.339304s

With my method:
Naive loss and gradient: computed in 14.800224s
Vectorized loss and gradient: computed in 0.283880s
    
So my method is slightly faster.
But why?


In [None]:
# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). Experiment with different ranges for both; with care, a
# classification accuracy of about 0.4 on the validation set is reachable.
learning_rates = [1e-7]
regularization_strengths = np.arange(1e5, 1e6, 6e4)
print("Reg strengths are ", regularization_strengths)

# results maps (learning_rate, regularization_strength) tuples to
# (training_accuracy, validation_accuracy) tuples, where an accuracy is the
# fraction of data points that are correctly classified.
results = {}
# best_val: highest validation accuracy seen so far.
# best_svm: the LinearSVM object that achieved it.
best_val, best_svm = -1, None

################################################################################
# TODO:                                                                        #
# Write code that chooses the best hyperparameters by tuning on the validation #
# set. For each combination of hyperparameters, train a linear SVM on the      #
# training set, compute its accuracy on the training and validation sets, and  #
# store these numbers in the results dictionary. In addition, store the best   #
# validation accuracy in best_val and the LinearSVM object that achieves this  #
# accuracy in best_svm.                                                        #
#                                                                              #
# Hint: You should use a small value for num_iters as you develop your         #
# validation code so that the SVMs don't take much time to train; once you are #
# confident that your validation code works, you should rerun the validation   #
# code with a larger value for num_iters.                                      #
################################################################################
# Imported here so the search runs on a fresh kernel (the only other import
# of LinearSVM is in a later cell).
from cs231n.classifiers import LinearSVM

# Grid search: train one SVM per (learning rate, regularization strength)
# pair and score it on the training and validation sets only. The test set
# is deliberately not touched while tuning.
for lr in learning_rates:
    for rs in regularization_strengths:
        svm = LinearSVM()
        loss_hist = svm.train(X_train, y_train, learning_rate=lr, reg=rs,
                              num_iters=150, verbose=True)
        # Accuracy = fraction of predictions that equal the labels.
        train_acc = np.mean(svm.predict(X_train) == y_train)
        val_acc = np.mean(svm.predict(X_val) == y_val)
        results[(lr, rs)] = (train_acc, val_acc)
        # Keep the model with the HIGHEST validation accuracy; best_val
        # starts at -1, so the first model always becomes the incumbent.
        if val_acc > best_val:
            best_val = val_acc
            best_svm = svm
            best_lr = lr
            best_rs = rs
################################################################################
#                              END OF YOUR CODE                                #
################################################################################
    
# Print out results in sorted (learning rate, regularization strength) order,
# folding every validation accuracy into the running maximum best_val.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    if best_val < val_accuracy:
        best_val = val_accuracy
    print ('lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy))

print ('best validation accuracy achieved during cross-validation: %f' % best_val)

In [None]:
# In the file linear_classifier.py, implement SGD in the function
# LinearClassifier.train() and then run it with the code below.
from cs231n.classifiers import LinearSVM

svm = LinearSVM()
sgd_settings = dict(learning_rate=1e-7, reg=5e4, num_iters=1000, verbose=True)

# Only the train() call itself is timed.
tic = time.time()
loss_hist = svm.train(X_train, y_train, **sgd_settings)
toc = time.time()
print ('That took %fs' % (toc - tic))

In [None]:
# Plot the values returned by the most recent svm.train call against their index.
plt.plot(loss_hist)
plt.ylabel("error")
plt.xlabel("iteration number")
plt.show()

In [None]:
# Report accuracy on the training and validation sets, both as a percentage
# and as raw "correct" / "total" counts.
# np.mean of the boolean match array is a fraction in [0, 1]; it is scaled by
# 100 so the printed number agrees with the "%" sign in the message.
y_pred_train = svm.predict(X_train)
print("The percentage correct on training set: {:.2f}%".format(100.0 * np.mean(y_pred_train == y_train)))
print(np.sum(y_pred_train == y_train))
print(len(y_train))
y_pred_val = svm.predict(X_val)
print("The percentage correct on validation set: {:.2f}%".format(100.0 * np.mean(y_pred_val == y_val)))
print(np.sum(y_pred_val == y_val))
print(len(y_val))

In [None]:
# Write the LinearSVM.predict function, then score the current model on both
# the training and the validation set (accuracy = fraction of matching labels).
y_train_pred = svm.predict(X_train)
train_matches = (y_train == y_train_pred)
print ('training accuracy: %f' % train_matches.mean())
y_val_pred = svm.predict(X_val)
val_matches = (y_val == y_val_pred)
print ('validation accuracy: %f' % val_matches.mean())

In [None]:
# Retrain a fresh SVM with hard-coded hyperparameters and a longer schedule
# (1500 iterations).
best_lr, best_rs = 1e-7, 2.6e5

svm = LinearSVM()
final_settings = dict(learning_rate=best_lr, reg=best_rs,
                      num_iters=1500, verbose=True)
loss_hist = svm.train(X_train, y_train, **final_settings)


In [None]:
# Score the retrained model on the held-out test split, so that the test_*
# names describe what they actually hold.
test_pred = svm.predict(X_test)
# Fraction of correct predictions (an accuracy, despite the "loss" name).
test_loss = np.mean(test_pred == y_test)

In [None]:
# Display the stored fraction of correct predictions (named test_loss, but it
# is an accuracy: the mean of a prediction == label comparison).
test_loss

In [None]:
# Plot the search results: every point is one (learning rate, regularization
# strength) key of `results`, placed on log10 axes and colored by one of the
# two accuracies stored for that key.
import math
x_scatter = [math.log10(x[0]) for x in results]
y_scatter = [math.log10(x[1]) for x in results]
marker_width = 100

# Top panel: first accuracy stored for each setting.
colors = [results[x][0] for x in results]
plt.subplot(2, 1, 1)
plt.scatter(x_scatter, y_scatter, marker_width, c = colors)
plt.colorbar()
plt.title("Test data results")
plt.xlabel("log learning rate")
plt.ylabel("log regularization strength")

# Bottom panel: second accuracy stored for each setting (validation).
colors = [results[x][1] for x in results]
plt.subplot(2, 1, 2)
plt.scatter(x_scatter, y_scatter, marker_width, c = colors)
plt.colorbar()
plt.title("Validation data accuracy")
plt.xlabel("log learning rate")
plt.ylabel("log regularization strength")

# A single show() at the very end keeps both panels in the same figure; a
# show() between the panels would push the second one into a new figure.
plt.show()
In [None]:
import math

# One point per (learning rate, regularization strength) key, on log10 axes.
x_scatter = [math.log10(key[0]) for key in results]
y_scatter = [math.log10(key[1]) for key in results]

marker_size = 100  # default size of markers is 20

# Top panel: training accuracy (results[key][0]).
# Bottom panel: validation accuracy (results[key][1]).
panels = [(0, 'CIFAR-10 training accuracy'),
          (1, 'CIFAR-10 validation accuracy')]
for panel_number, (column, panel_title) in enumerate(panels, start=1):
    colors = [results[key][column] for key in results]
    plt.subplot(2, 1, panel_number)
    plt.scatter(x_scatter, y_scatter, marker_size, c=colors)
    plt.colorbar()
    plt.xlabel('log learning rate')
    plt.ylabel('log regularization strength')
    plt.title(panel_title)
plt.show()

In [273]:
# Display the current shape of the training matrix.
X_train.shape

(49000, 3073)

In [None]:
#visualize the weights by taking the "average" picture for each class
w = 