In [1]:
import random
import numpy as np
from cs231n.data_utils import load_CIFAR10
import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading extenrnal modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

In [2]:
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000, num_dev=500):
  """
  Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
  it for the linear classifier. These are the same steps as we used for the
  SVM, but condensed to a single function.  
  """
  # Load the raw CIFAR-10 data
  cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
  X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
  
  # subsample the data
  mask = range(num_training, num_training + num_validation)
  X_val = X_train[mask]
  y_val = y_train[mask]
  mask = range(num_training)
  X_train = X_train[mask]
  y_train = y_train[mask]
  mask = range(num_test)
  X_test = X_test[mask]
  y_test = y_test[mask]
  mask = np.random.choice(num_training, num_dev, replace=False)
  X_dev = X_train[mask]
  y_dev = y_train[mask]
  
  # Preprocessing: reshape the image data into rows
  X_train = np.reshape(X_train, (X_train.shape[0], -1))
  X_val = np.reshape(X_val, (X_val.shape[0], -1))
  X_test = np.reshape(X_test, (X_test.shape[0], -1))
  X_dev = np.reshape(X_dev, (X_dev.shape[0], -1))
  
  # Normalize the data: subtract the mean image
  mean_image = np.mean(X_train, axis = 0)
  X_train -= mean_image
  X_val -= mean_image
  X_test -= mean_image
  X_dev -= mean_image
  
  # add bias dimension and transform into columns
  X_train = np.hstack([X_train, np.ones((X_train.shape[0], 1))])
  X_val = np.hstack([X_val, np.ones((X_val.shape[0], 1))])
  X_test = np.hstack([X_test, np.ones((X_test.shape[0], 1))])
  X_dev = np.hstack([X_dev, np.ones((X_dev.shape[0], 1))])
  
  return X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev


# Invoke the above function to get our data.
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = get_CIFAR10_data()
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
print('dev data shape: ', X_dev.shape)
print('dev labels shape: ', y_dev.shape)

Train data shape:  (49000, 3073)
Train labels shape:  (49000,)
Validation data shape:  (1000, 3073)
Validation labels shape:  (1000,)
Test data shape:  (1000, 3073)
Test labels shape:  (1000,)
dev data shape:  (500, 3073)
dev labels shape:  (500,)


In [3]:
class LinearClassifier(object):

  def __init__(self):
    self.W = None

  def train(self, X, y, learning_rate=1e-3, reg=1e-5, num_iters=100,
            batch_size=200, verbose=False):
    """
    Train this linear classifier using stochastic gradient descent.

    Inputs:
    - X: A numpy array of shape (N, D) containing training data; there are N
      training samples each of dimension D.
    - y: A numpy array of shape (N,) containing training labels; y[i] = c
      means that X[i] has label 0 <= c < C for C classes.
    - learning_rate: (float) learning rate for optimization.
    - reg: (float) regularization strength.
    - num_iters: (integer) number of steps to take when optimizing
    - batch_size: (integer) number of training examples to use at each step.
    - verbose: (boolean) If true, print progress during optimization.

    Outputs:
    A list containing the value of the loss function at each training iteration.
    """
    num_train, dim = X.shape
    num_classes = np.max(y) + 1 # assume y takes values 0...K-1 where K is number of classes
    if self.W is None:
      # lazily initialize W
      self.W = 0.001 * np.random.randn(dim, num_classes)

    # Run stochastic gradient descent to optimize W
    loss_history = []
    for it in range(num_iters):
      X_batch = None
      y_batch = None

      #########################################################################
      # TODO:                                                                 #
      # Sample batch_size elements from the training data and their           #
      # corresponding labels to use in this round of gradient descent.        #
      # Store the data in X_batch and their corresponding labels in           #
      # y_batch; after sampling X_batch should have shape (dim, batch_size)   #
      # and y_batch should have shape (batch_size,)                           #
      #                                                                       #
      # Hint: Use np.random.choice to generate indices. Sampling with         #
      # replacement is faster than sampling without replacement.              #
      #########################################################################
      mask = np.random.choice(num_train, num_iters, replace = True)
      X_batch = X[mask]
      y_batch = y[mask]
#      pass
      #########################################################################
      #                       END OF YOUR CODE                                #
      #########################################################################

      # evaluate loss and gradient
      loss, grad = self.loss(X_batch, y_batch, reg)
      loss_history.append(loss)

      # perform parameter update
      #########################################################################
      # TODO:                                                                 #
      # Update the weights using the gradient and the learning rate.          #
      #########################################################################
      self.W = self.W - learning_rate * grad
#      pass
      #########################################################################
      #                       END OF YOUR CODE                                #
      #########################################################################

      if verbose and it % 100 == 0:
        print ('iteration %d / %d: loss %f' % (it, num_iters, loss))

    return loss_history,self.W

  def predict(self, X):
    """
    Use the trained weights of this linear classifier to predict labels for
    data points.

    Inputs:
    - X: D x N array of training data. Each column is a D-dimensional point.

    Returns:
    - y_pred: Predicted labels for the data in X. y_pred is a 1-dimensional
      array of length N, and each element is an integer giving the predicted
      class.
    """
#    X = X.T
    y_pred = np.zeros(X.shape[0])
    ###########################################################################
    # TODO:                                                                   #
    # Implement this method. Store the predicted labels in y_pred.            #
    ###########################################################################
#    scores = X.T.dot(self.W)
    scores = X.dot(self.W)
#    np.argsort(scores,axis=1)[:,-1] get the max value of the index every row(把每一行的最大值的下标取到)
    y_pred = np.argsort(scores, axis=1)[:, -1]
#    pass
    ###########################################################################
    #                           END OF YOUR CODE                              #
    ###########################################################################
    return y_pred
  
  def loss(self, X_batch, y_batch, reg):
    """
    Compute the loss function and its derivative. 
    Subclasses will override this.

    Inputs:
    - X_batch: A numpy array of shape (N, D) containing a minibatch of N
      data points; each point has dimension D.
    - y_batch: A numpy array of shape (N,) containing labels for the minibatch.
    - reg: (float) regularization strength.

    Returns: A tuple containing:
    - loss as a single float
    - gradient with respect to self.W; an array of the same shape as W
    """
    pass


class LinearSVM(LinearClassifier):
  """ A subclass that uses the Multiclass SVM loss function """

  def loss(self, X_batch, y_batch, reg):
    return svm_loss_vectorized(self.W, X_batch, y_batch, reg)


class Softmax(LinearClassifier):
  """ A subclass that uses the Softmax + Cross-entropy loss function """

  def loss(self, X_batch, y_batch, reg):
    return softmax_loss_vectorized(self.W, X_batch, y_batch, reg)



def softmax_loss_naive(W, X, y, reg):
    """
    W.shape = (D, C)
    X.shape = (N, D)
    y.shape = (N,)
    reg 是正则化的超参数
    """
    # 初始化loss和gradient
    loss = 0.0
    dW = np.zeros_like(W) # shape is (D, C)
    # 训练数
    num_train = X.shape[0]
    # 类别数
    C = W.shape[1]
    for i in range(num_train):
        scores = np.dot(X[i], W) # shape is (C,)
        exp_scores = np.exp(scores) # shape is (C,)
        sum_exp_scores = np.sum(exp_scores) # shape (1,)
        loss += - np.sum(np.log(exp_scores / sum_exp_scores))
        for j in range(C):
            if j == y[i]:
                dW[:, y[i]] += (-1 + exp_scores[y[i]] / sum_exp_scores) * X[i]
            else:
                dW[:, j] += (exp_scores[j] / sum_exp_scores) * X[i]
    
    loss /= num_train
    dW /= num_train
    loss += reg * np.sum(W*W)
    dW += reg * W
    return loss, dW


def softmax_loss_vectorized(W, X, y, reg):
    """
    W.shape = (D, C)
    X.shape = (N, D)
    y.shape = (N,)
    reg 是正则化的超参数
    """
    # 初始化loss和gradient
    loss = 0.0
    dW = np.zeros_like(W) # shape is (D, C)
    num_train = X.shape[0]
    scores = np.dot(X, W) # shape is (N, C)
    exp_scores = np.exp(scores) #shape is (N, C)
    sum_exp_scores = np.sum(exp_scores, axis = 1, keepdims = True) # shape is (N, 1)
    loss += -np.sum(np.log(exp_scores / sum_exp_scores)) / num_train + reg * np.sum(W * W) # shape is (1,)
    margins = exp_scores / sum_exp_scores # shape is (N, C)
    margins[np.arange(num_train), y] += -1  # shape is (N, C)
    dW += np.dot(X.T, margins) / num_train + reg * W
    return loss, dW

In [4]:
import time
W = np.random.randn(3073, 10) * 0.0001

tic = time.time()
loss_naive, grad_naive = softmax_loss_naive(W, X_dev, y_dev, 0.00001)
toc = time.time()
print('naive loss: %e computed in %fs' % (loss_naive, toc - tic))

tic = time.time()
loss_vectorized, grad_vectorized = softmax_loss_vectorized(W, X_dev, y_dev, 0.00001)
toc = time.time()
print('vectorized loss: %e computed in %fs' % (loss_vectorized, toc - tic))

grad_difference = np.linalg.norm(grad_naive - grad_vectorized, ord='fro')
print('Loss difference: %f' % np.abs(loss_naive - loss_vectorized))
print('Gradient difference: %f' % grad_difference)

naive loss: 2.351919e+01 computed in 0.094080s
vectorized loss: 2.351919e+01 computed in 0.005989s
Loss difference: 0.000000
Gradient difference: 0.000000


In [7]:
results = {}
best_val = -1
best_softmax = None
learning_rates = [1e-7, 5e-6]
regularization_strengths = [5e4, 1e4]
# 双重循环学习速率和正则化两个超参数，来找到最佳的超参数
for learning_rate in learning_rates:
    for regularization_strength in regularization_strengths:
        #实例一个Softmax 分类器
        softmax = Softmax()
        # 获取训练集和验证集的精确率
        loss_hist,_ = softmax.train(X_train, y_train, learning_rate, regularization_strength, num_iters=1500, batch_size=200)
        y_train_pred = softmax.predict(X_train)
        y_val_pred = softmax.predict(X_val)
        training_accuracy = np.mean(y_train_pred == y_train)
        validation_accuracy = np.mean(y_val_pred == y_val)
        # 将对应的学习速率和正则化超参数的精确率都保存起来
        results[(learning_rate, regularization_strength)] = (training_accuracy, validation_accuracy)
        
        if validation_accuracy > best_val:
            best_val = validation_accuracy
            best_softmax = softmax
            
# Print out results.
for lr, reg in sorted(results):
    training_accuracy, validation_accuracy = results[(lr, reg)]
    print('lr %e reg %e train accuracy: %f val accuracy: %f' % (
                lr, reg, training_accuracy, validation_accuracy))
    
print('best validation accuracy achieved during cross-validation: %f' % best_val)



lr 1.000000e-07 reg 1.000000e+04 train accuracy: 0.237327 val accuracy: 0.225000
lr 1.000000e-07 reg 5.000000e+04 train accuracy: 0.386898 val accuracy: 0.403000
lr 5.000000e-06 reg 1.000000e+04 train accuracy: 0.099898 val accuracy: 0.105000
lr 5.000000e-06 reg 5.000000e+04 train accuracy: 0.099898 val accuracy: 0.105000
best validation accuracy achieved during cross-validation: 0.403000
