In [1]:
#In this notebook, we'll implement an SVM and use it to classify the CIFAR dataset.

import pickle
import numpy as np

#Let's load our data - this is identical to what we did in the knn notebook.
#NOTE(security): pickle.load can execute arbitrary code while deserializing, so only
#run this cell against a pickle file that comes from a source you trust.
with open("testTrainLab1.pickle", "rb") as f:
    labData = pickle.load(f)
#Flatten every sample into a single row: the first axis (the number of samples) is kept,
#and -1 lets numpy work out the row length from the remaining dimensions.
X_train = np.reshape(labData["X_train"], (labData["X_train"].shape[0], -1))
X_test = np.reshape(labData["X_test"], (labData["X_test"].shape[0], -1))
#The labels are used exactly as they are stored in the pickle.
y_train = labData["y_train"]
y_test = labData["y_test"]



In [2]:
#Let's cut our data up into training, validation, and testing subsets.
#As a good practice, we're also going to create a small dataset to use for
#code development - otherwise we have to wait for a long time before errors
#become evident!

#Remember, CIFAR10 has 50,000 training cases, and 10,000 testing cases.
trainingSize = 40000
#NOTE: validationSize and testingSize are not used by the split below - the validation
#set is every training case from index trainingSize up to 50,000 (10,000 cases).
validationSize = 5000
testingSize = 5000

#You can make the dev size smaller or bigger, depending on how powerful your computer is:
devSize = 1000

#Always split from the complete training data stored in labData, never from X_train itself.
#X_train and y_train are re-assigned further down, so slicing them directly would make this
#cell fail the second time it is run. Rebuilding the full arrays here keeps the cell
#re-runnable while still ending with the usual X_train / y_train names.
X_trainFull = np.reshape(labData["X_train"], (labData["X_train"].shape[0], -1))
y_trainFull = labData["y_train"]

#CIFAR batches are random, so we can just take sequential cases.  Here is our validation set,
#taking the last 10,000 observations (indices 40000 - 49999 are in the mask)
valMask = range(trainingSize, 50000)
X_val = X_trainFull[valMask]
y_val = y_trainFull[valMask]

#This will now be a 10,000 by 3,072 shape array - representing 10,000 images with
#3,072 pixel values each (32*32*3, for three bands of color)
print(X_val.shape)

#Training - here we take the first 40k (leaving the validation data out of our training data).
#The names X_train and y_train are kept so the code stays broadly consistent with most
#examples you will find.
trainMask = range(0, trainingSize)
X_train = X_trainFull[trainMask]
y_train = y_trainFull[trainMask]

print(X_train.shape)

#Development set - just a tiny copy of some of the training data to use while writing code.
devMask = range(0, devSize)
X_dev = X_train[devMask]
y_dev = y_train[devMask]

print(X_dev.shape)


(10000, 3072)
(40000, 3072)
(1000, 3072)


In [129]:
#Now we're going to write a simple SVM classifier
#In this first case, we won't have any optimization -
#We will write a function that takes 
#X (images), y (labels), W (weights), and an epsilon value (e) as inputs,
#and then informs us what the data loss would be:

def simpleSVM(X, y, W, e):
    """Evaluate the multiclass SVM (hinge) data loss of weights W, without optimization.

    Parameters:
        X: 2D array of samples, one row per observation.
        y: integer class label for each row of X, used as an index into the class scores.
        W: 2D weight array with one column per class.
        e: margin (epsilon) added to each incorrect-class score difference.

    Returns:
        dict with "dataLoss" (the per-sample losses averaged over all samples) and
        "percentCorrect" (the fraction of samples whose true class has the top score).
    """
    numSamples = X.shape[0]
    numClasses = W.shape[1]

    #One hinge-loss total per sample, plus a running count of correct predictions.
    perSampleLoss = []
    numCorrect = 0

    for idx in range(numSamples):
        label = y[idx]
        classScores = X[idx].dot(W)
        targetScore = classScores[label]

        #A sample counts as correct when no class scores higher than the true class
        #(a tie with the true class therefore still counts as correct).
        if targetScore == np.max(classScores):
            numCorrect += 1

        #Sum the margin violations over every class except the true one.
        margins = (
            max(classScores[k] - targetScore + e, 0)
            for k in range(numClasses)
            if k != label
        )
        perSampleLoss.append(sum(margins))

    #Average loss across all samples, along with the fraction correct for interpretation.
    meanLoss = np.sum(perSampleLoss) / numSamples
    accuracy = numCorrect / numSamples
    return {"dataLoss": meanLoss, "percentCorrect": accuracy}


#Build an entirely random weight matrix to try the loss function out.
#Its shape is 3072 x 10 - one weight for every pixel value, for every class.
#Random weights will (obviously) classify pretty badly; that is fine for now,
#because improving them is the job of the optimization step we add later.
W = 0.0001 * np.random.randn(3072, 10)

dataLoss = simpleSVM(X=X_dev, y=y_dev, W=W, e=1.0)
print(dataLoss)


{'dataLoss': 10.42042989605346, 'percentCorrect': 0.127}


In [131]:
#Now, let's do the same thing, but add a regularization term in.  We'll use an L2 norm in this example.

#First, we add the lambda term (the parameter l) to determine the tradeoff between
#our data and regularization loss.
def simpleSVMWithReg(X, y, W, e, l):
    """Evaluate the multiclass SVM (hinge) loss of weights W with L2 regularization.

    Parameters:
        X: 2D array of samples, one row per observation.
        y: integer class label for each row of X, used as an index into the class scores.
        W: 2D weight array with one column per class.
        e: margin (epsilon) added to each incorrect-class score difference.
        l: weight (lambda) applied to the regularization loss.

    Returns:
        dict with "dataLoss" (mean hinge loss per sample), "regLoss" (sum of the squared
        weights), "totalLoss" (dataLoss + l * regLoss) and "percentCorrect" (the fraction
        of samples whose true class has the top score).
    """
    sampleCount = X.shape[0]
    classCount = W.shape[1]
    sampleLosses = []
    hits = 0

    for row in range(sampleCount):
        target = y[row]
        rowScores = X[row].dot(W)
        targetScore = rowScores[target]

        #Ties with the true class still count as a correct prediction.
        if targetScore == np.max(rowScores):
            hits += 1

        #Accumulate the margin violation of every class other than the true one.
        rowLoss = 0
        for cls in range(classCount):
            if cls == target:
                continue
            rowLoss = max(0, (rowScores[cls] - targetScore + e)) + rowLoss
        sampleLosses.append(rowLoss)

    #Data loss: the per-sample losses averaged over all samples, as before.
    dataLoss = np.sum(sampleLosses) / sampleCount

    #Regularization loss: the sum of the squared weights.
    regLoss = np.sum(W * W)

    #Total loss: lambda sets how much the regularization term counts.
    totalLoss = dataLoss + (l * regLoss)

    return {
        "dataLoss": dataLoss,
        "regLoss": regLoss,
        "totalLoss": totalLoss,
        "percentCorrect": hits / sampleCount,
    }

#Draw a fresh set of small random weights, then report the data, regularization,
#and total loss on the development subset.
W = 0.0001 * np.random.randn(3072, 10)

dataLoss = simpleSVMWithReg(X=X_dev, y=y_dev, W=W, e=1.0, l=1.0)
print(dataLoss)


{'dataLoss': 10.874920734785697, 'regLoss': 0.00030475322446098924, 'totalLoss': 10.875225488010157, 'percentCorrect': 0.089}


In [142]:
#The above does what we need, but it's very inefficient due to our use of
#for loops.  The implementation below computes the same data loss with vectorized
#operations, and keeps the for-loop version alongside it so the two can be compared.

def simpleSVMEffecient(X, y, W, e, l):
    """Compute the multiclass SVM (hinge) loss twice: vectorized, and with for loops.

    Both implementations run on the same inputs so their results can be compared.
    Intermediate values for observation 20 are printed by each implementation; those
    prints are skipped when X has 20 or fewer rows, so small inputs work too.

    Parameters:
        X: 2D array of samples, one row per observation.
        y: integer class label for each row of X, used as a column index into the scores.
        W: 2D weight array with one column per class.
        e: margin (epsilon) added to each incorrect-class score difference.
        l: weight (lambda) applied to the regularization loss.

    Returns:
        dict with "dataLoss_vectorized" and "dataLoss_forLoop" (mean hinge loss per sample
        from each implementation), "regLoss" (sum of the squared weights), "totalLoss"
        (dataLoss_forLoop + l * regLoss) and "percentCorrect" (the fraction of samples
        whose true class has the top score; ties count as correct).
    """
    #Observation whose intermediate values are printed for a side-by-side comparison.
    debugObs = 20

    loopLosses = []
    correct = 0

    #Here, we compute scores for the entire set of observations at once
    #instead of looping over them: one row per observation, one column per class.
    scores = X.dot(W)
    countTrainSamples = scores.shape[0]
    countClasses = scores.shape[1]

    #Only print the example observation when the input actually contains it.
    showDebug = countTrainSamples > debugObs

    #Now we need to look up the true class score for everything - i.e., if y = Cat, we need
    #the score for cat.  y holds one class index per row, so pairing each row number with its
    #label picks out the column y for every row in one indexing operation.
    rowIndex = np.arange(countTrainSamples)
    trueClassScores = scores[rowIndex, y]
    print("====Vectorized Implementation====")
    if showDebug:
        print("True class score for observation 20: " + str(trueClassScores[debugObs]))

    #Subtract the true class score from every element in that observation's row of scores.
    #Adding a trailing axis turns the 1D vector into a column, so numpy broadcasts it across
    #the class columns (this replaces np.matrix, which numpy no longer recommends).
    loss_ij = np.maximum(0, (scores - trueClassScores[:, np.newaxis]) + e)

    if showDebug:
        print("CIFAR Class 1 Score for Observation 20: " + str(scores[debugObs, 0]))
        #This should be equivalent to scores[20,0] - trueClassScores[20] + e, floored at 0.
        print("Loss for first contrast of 20th observation: " + str(loss_ij[debugObs, 0]))

    #Remove the cases where we compare the true class to the true class
    #(we only want the other classes for the SVM loss).
    loss_ij[rowIndex, y] = 0

    #Note - this print is unnecessary, but helpful to illustrate what's going on.
    if showDebug:
        print("Total loss for 20th observation:" + str(np.sum(loss_ij[debugObs])))

    #Calculate the mean data loss
    dataLossNew = np.sum(loss_ij) / countTrainSamples

    #The loop below is identical to what we calculated before:
    print("\n\n====For Loop Implementation====")
    for i in range(countTrainSamples):
        #A separate name keeps the vectorized score matrix above from being overwritten.
        scores_i = X[i].dot(W)
        trueClassScore = scores_i[y[i]]

        #For comparison to vectorized implementation:
        if i == debugObs:
            print("True class score for observation 20: " + str(trueClassScore))

        if trueClassScore == np.max(scores_i):
            correct = correct + 1

        loss_i = 0
        for j in range(countClasses):
            #For comparison to vectorized implementation:
            if i == debugObs and j == 0:
                print("CIFAR Class 1 Score for Observation 20: " + str(scores_i[j]))
                print("Loss for first contrast of 20th observation: " + str(max(0, scores_i[j] - trueClassScore + e)))

            if j != y[i]:
                loss_i = max(0, (scores_i[j] - trueClassScore + e)) + loss_i

        #For comparison to vectorized implementation:
        if i == debugObs:
            print("Total loss for 20th observation: " + str(loss_i))

        loopLosses.append(loss_i)

    #Here is our data loss function, as before:
    dataLoss_old = np.sum(loopLosses) / countTrainSamples

    #Regularization Loss
    regLoss = np.sum(W * W)

    #Total Loss (built from the for-loop data loss)
    totalLoss = dataLoss_old + (l * regLoss)
    return {
        "dataLoss_vectorized": dataLossNew,
        "dataLoss_forLoop": dataLoss_old,
        "regLoss": regLoss,
        "totalLoss": totalLoss,
        "percentCorrect": correct / countTrainSamples,
    }

#Draw a fresh set of small random weights, run both implementations on the development
#subset, and print the summary below the side-by-side comparison output.
W = 0.0001 * np.random.randn(3072, 10)

dataLoss = simpleSVMEffecient(X=X_dev, y=y_dev, W=W, e=1.0, l=1.0)
print("\n\n")
print(dataLoss)

====Vectorized Implementation====
True class score for observation 20: -1.600041308191975
CIFAR Class 1 Score for Observation 20: -0.7394481065723721
Loss for first contrast of 20th observation: 1.860593201619603
Total loss for 20th observation:21.780492802827126


====For Loop Implementation====
True class score for observation 20: -1.6000413081919749
CIFAR Class 1 Score for Observation 20: -0.7394481065723724
Loss for first contrast of 20th observation: 1.8605932016196025
Total loss for 20th observation: 21.780492802827123



{'dataLoss_vectorized': 10.429355366253667, 'dataLoss_forLoop': 10.429355366253667, 'regLoss': 0.0003060647029503595, 'totalLoss': 10.429661430956617, 'percentCorrect': 0.102}
