In [18]:
# Softmax function
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt

from __future__ import print_function

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest' # default interpolation mode for displayed images
plt.rcParams['image.cmap'] = 'gray' # default colormap for displayed images

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [19]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500):
    """
    Read CIFAR-10 from disk and turn it into flat, mean-centred design
    matrices (each with a trailing column of ones) for a linear classifier.

    Splits, all taken from the arrays returned by load_CIFAR10:
    - train: the first num_training training examples
    - val:   the following num_validation training examples
    - test:  the first num_test test examples
    - dev:   num_dev examples drawn without replacement from the train split

    The mean image is computed on the train split only and subtracted from
    every split.

    Returns the tuple
    (X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev).
    """
    def _as_rows(images):
        # One row per example; all remaining axes are flattened.
        return np.reshape(images, (images.shape[0], -1))

    def _with_bias(rows):
        # Append a constant 1 feature so the bias can live inside W.
        return np.hstack([rows, np.ones((rows.shape[0], 1))])

    all_train_X, all_train_y, all_test_X, all_test_y = load_CIFAR10(
        'cs231n/datasets/cifar-10-batches-py')

    # Index arrays (rather than slices) so that every split is its own copy
    # and the in-place centring below cannot touch the raw arrays.
    val_idx = np.arange(num_training, num_training + num_validation)
    train_idx = np.arange(num_training)
    test_idx = np.arange(num_test)

    X_val, y_val = all_train_X[val_idx], all_train_y[val_idx]
    X_train, y_train = all_train_X[train_idx], all_train_y[train_idx]
    X_test, y_test = all_test_X[test_idx], all_test_y[test_idx]

    # Small random subset of the training split for quick experiments.
    dev_idx = np.random.choice(num_training, num_dev, replace=False)
    X_dev, y_dev = X_train[dev_idx], y_train[dev_idx]

    # Flatten, then zero-centre every split with the training mean image.
    flat_splits = [_as_rows(split) for split in (X_train, X_val, X_test, X_dev)]
    mean_image = np.mean(flat_splits[0], axis=0)
    for rows in flat_splits:
        rows -= mean_image

    X_train, X_val, X_test, X_dev = [_with_bias(rows) for rows in flat_splits]

    return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev


# Drop arrays left over from a previous run of this cell so the data is not
# held in memory twice while it is being reloaded.
try:
    del X_train, y_train
    del X_test, y_test
    print('Clear previously loaded data.')
except NameError:
    # Nothing was loaded yet (e.g. first run on a fresh kernel). Only
    # NameError is expected here; anything else should surface, not be hidden.
    pass

# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('dev data shape: ', X_dev.shape)
print('dev labels shape: ', y_dev.shape)

Clear previously loaded data.
Train data shape:  (49000, 3073)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3073)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3073)
Test labels shape:  (1000,)
dev data shape:  (500, 3073)
dev labels shape:  (500,)


In [55]:
import numpy as np
from random import shuffle

def softmax_loss_naive(W, X, y, reg):
    """
    Softmax loss function, naive implementation (explicit loop over examples).

    Inputs have dimension D, there are C classes, and we operate on minibatches
    of N examples.

    Inputs:
    - W: A numpy array of shape (D, C) containing weights.
    - X: A numpy array of shape (N, D) containing a minibatch of data.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c means
      that X[i] has label c, where 0 <= c < C.
    - reg: (float) regularization strength

    Returns a tuple of:
    - loss: cross-entropy loss averaged over the minibatch plus the L2 penalty
      0.5 * reg * sum(W * W)
    - dW: gradient of the loss with respect to W; an array of the same shape as W
    """
    num_examples = X.shape[0]
    example_losses = np.zeros(num_examples)
    dW = np.zeros(W.shape)

    for i in range(num_examples):
        # Class scores for one example, shape (C,).
        scores = X[i].dot(W)

        # Shift so the largest score is 0; exp() can then no longer overflow.
        shifted = scores - np.max(scores)

        # Stay in log space: log p_c = s_c - log(sum_k exp(s_k)). Taking
        # log(exp(s_c) / sum) instead yields log(0) = -inf whenever exp(s_c)
        # underflows, which turns the whole loss into inf.
        log_probs = shifted - np.log(np.sum(np.exp(shifted)))
        example_losses[i] = -log_probs[y[i]]

        # d(loss_i)/d(scores) = softmax probabilities minus one-hot(y[i]).
        dscores = np.exp(log_probs)
        dscores[y[i]] -= 1

        # Back-propagate through scores = X[i].dot(W).
        dW += np.outer(X[i], dscores)

    # Average the data term over the minibatch, then add the regularization.
    loss = np.sum(example_losses) / num_examples
    loss += 0.5 * reg * np.sum(W * W)

    dW /= num_examples
    dW += reg * W

    return loss, dW


def softmax_loss_vectorized(W, X, y, reg):
    """
    Softmax loss function, vectorized version (no explicit Python loops).

    Inputs and outputs are the same as softmax_loss_naive:
    - W: weights, shape (D, C)
    - X: minibatch of data, shape (N, D)
    - y: integer labels, shape (N,), with 0 <= y[i] < C
    - reg: (float) regularization strength

    Returns (loss, dW): the minibatch-averaged cross-entropy loss plus
    0.5 * reg * sum(W * W), and its gradient with respect to W (same shape as W).
    """
    num_examples = X.shape[0]
    rows = np.arange(num_examples)

    # Class scores, shape (N, C), shifted per row so the maximum is 0 and
    # exp() cannot overflow.
    scores = X.dot(W)
    shifted = scores - np.max(scores, axis=1, keepdims=True)

    # Log-probabilities via log-sum-exp. Computing log(exp(s) / sum) instead
    # gives log(0) = -inf (and an infinite loss) when exp(s) underflows.
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))

    # Average negative log-likelihood of the correct classes + L2 penalty.
    loss = -np.sum(log_probs[rows, y]) / num_examples
    loss += 0.5 * reg * np.sum(W * W)

    # Gradient w.r.t. the scores: softmax probabilities minus one-hot labels.
    dscores = np.exp(log_probs)
    dscores[rows, y] -= 1

    # Back-propagate into W, average over the minibatch, add the L2 gradient.
    dW = X.T.dot(dscores) / num_examples
    dW += reg * W

    return loss, dW


In [58]:
# Sanity-check softmax_loss_naive on the dev split with tiny random weights.
# The course layout keeps this function in cs231n/classifiers/softmax.py; the
# import below is disabled, so the definition from this notebook is the one
# being exercised.

#from cs231n.classifiers.softmax import softmax_loss_naive
import time

# Random softmax weight matrix, scaled down so all scores start near zero.
W = 0.0001 * np.random.randn(3073, 10)
loss, grad = softmax_loss_naive(W, X_dev, y_dev, reg=0.0)

# Rough sanity check: the loss should come out close to -log(0.1).
print('loss: %f' % loss)
print('sanity check: %f' % (-np.log(0.1)))

loss: 2.306529
sanity check: 2.302585


In [None]:

Inline Question 1:

Why do we expect our loss to be close to -log(0.1)?

Answer :- CIFAR-10 has 10 classes, and the weights are initialised to very small random values, so every class
          score is close to zero and the softmax assigns each class a probability of roughly 1 / 10 = 0.1.
          The loss is the average negative log of the (normalised) probability given to the correct class,
          so it should be close to -log(0.1) ≈ 2.303.

For details, see "https://cs231n.github.io/neural-networks-case-study/"



In [59]:
# Gradient check for softmax_loss_naive. As was done for the SVM, the analytic
# gradient is compared against the numerical estimate from grad_check_sparse;
# the two should be close. The check runs first without regularization and
# then again with it.
from cs231n.gradient_check import grad_check_sparse

for check_reg in (0.0, 5e1):
    loss, grad = softmax_loss_naive(W, X_dev, y_dev, check_reg)
    f = lambda w: softmax_loss_naive(w, X_dev, y_dev, check_reg)[0]
    grad_numerical = grad_check_sparse(f, W, grad, 10)

numerical: 1.989482 analytic: 1.989482, relative error: 1.854778e-08
numerical: 0.264869 analytic: 0.264869, relative error: 5.337404e-07
numerical: 2.741553 analytic: 2.741553, relative error: 2.013508e-08
numerical: -2.176560 analytic: -2.176560, relative error: 7.787166e-09
numerical: -0.722123 analytic: -0.722123, relative error: 5.315998e-10
numerical: 0.177662 analytic: 0.177662, relative error: 5.013167e-08
numerical: -0.001859 analytic: -0.001859, relative error: 1.787153e-05
numerical: 0.609591 analytic: 0.609591, relative error: 1.827300e-09
numerical: -2.253090 analytic: -2.253090, relative error: 4.593566e-09
numerical: 1.672861 analytic: 1.672861, relative error: 2.645436e-08
numerical: -3.915631 analytic: -3.915631, relative error: 2.840122e-08
numerical: 3.646172 analytic: 3.646172, relative error: 1.786551e-09
numerical: 1.463997 analytic: 1.463997, relative error: 3.206344e-08
numerical: -0.265475 analytic: -0.265475, relative error: 2.375886e-08
numerical: 1.431715 an

In [40]:
# NOTE(review): this whole cell is a single triple-quoted string literal, so
# the code inside it is never executed. It appears to be an alternative draft
# of softmax_loss_vectorized kept for reference -- confirm whether it is still
# needed.
"""
import numpy as np
def softmax_loss_vectorized(W, X, y, reg):
    # initialize loss and gradient to zero
    loss = 0.0
    dW = np.zeros_like(W)
    
    # compute score vector
    num_train = X.shape[0]
    score = X.dot(W)
    # max of every sample
    score -= np.max(score, axis=1, keepdims=True)
    sum_score = np.sum(np.exp(score), axis=1, keepdims=True)
    p = np.exp(score) / sum_score
    loss = np.sum(-np.log(p[np.arange(num_train), y]))
              
    ind = np.zeros_like(p)
    ind[np.arange(num_train), y] = 1
    dW = X.T.dot(p - ind)

    loss /= num_train
    loss += 0.5 * reg * np.sum(W * W)
    dW /= num_train
    dW += reg*W
    
    return loss, dW
"""

In [60]:
# Run the naive and the vectorized softmax loss on identical inputs. Both
# should produce the same loss and gradient; the vectorized version is
# expected to be much faster.
tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, reg=5e-6)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))

#from cs231n.classifiers.softmax import softmax_loss_vectorized
tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, reg=5e-6)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))

# Agreement check: absolute gap between the two losses, and (as for the SVM)
# the Frobenius norm of the gap between the two gradients.
grad_gap = grad_naive - grad_vectorized
grad_difference = np.linalg.norm(grad_gap, ord='fro')
print('Loss difference: %f' % abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)

naive loss: 2.306529e+00 computed in 0.012706s
vectorized loss: 2.306529e+00 computed in 0.010864s
Loss difference: 0.000000
Gradient difference: 0.000000


In [68]:
# Use the validation set to tune hyperparameters (regularization strength and
# learning rate). You should experiment with different ranges for the learning
# rates and regularization strengths; if you are careful you should be able to
# get a classification accuracy of over 0.35 on the validation set.

# Softmax is not defined anywhere in this notebook, so it must be imported
# here. With the import commented out this cell only ran when the name happened
# to be left over in the kernel, and raised NameError after a restart.
from cs231n.classifiers import Softmax
# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'NoneType' object has no attribute 'shape'". Nothing in this
# cell produces that; it presumably originates inside Softmax.train in the
# cs231n package (not shown here) -- confirm that module is fully implemented.
# NOTE(review): Softmax presumably uses the loss defined in
# cs231n/classifiers/softmax.py rather than the functions defined in this
# notebook -- verify both are in sync.

results = {}          # (learning_rate, reg) -> (train_accuracy, val_accuracy)
best_val = -1         # highest validation accuracy seen so far
best_softmax = None   # classifier that achieved best_val
learning_rates = [1e-7, 5e-7]
# NOTE(review): 1e8 is several orders of magnitude above the other strength;
# if training applies a standard L2 weight-decay step this setting will
# presumably diverge -- confirm the value is intended.
regularization_strengths = [5e4, 1e8]

################################################################################
# Use the validation set to set the learning rate and regularization strength. #
# This is identical to the validation that was done for the SVM; the best      #
# trained softmax classifier is saved in best_softmax.                         #
################################################################################
for lr in learning_rates:
    for reg in regularization_strengths:
        # Train a fresh classifier for every (learning rate, reg) pair.
        model = Softmax()
        model.train(X_train, y_train, learning_rate=lr, reg=reg,
                    num_iters=400, verbose=False)

        # Fraction of correctly predicted labels on each split.
        y_train_pred = model.predict(X_train)
        train_acc = np.mean(y_train == y_train_pred)

        y_val_pred = model.predict(X_val)
        val_acc = np.mean(y_val == y_val_pred)

        results[(lr, reg)] = (train_acc, val_acc)

        # Model selection uses validation accuracy only.
        if val_acc > best_val:
            best_val = val_acc
            best_softmax = model
################################################################################
#                              END OF YOUR CODE                                #
################################################################################

# Print out results.
for lr, reg in sorted(results):
    train_accuracy, val_accuracy = results[(lr, reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, train_accuracy, val_accuracy))

print('best validation accuracy achieved during cross-validation: %f' % best_val)

AttributeError: 'NoneType' object has no attribute 'shape'