In [6]:
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.cnn import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient_array, eval_numerical_gradient
from cs231n.layers import *
from cs231n.fast_layers import *
from cs231n.solver import Solver

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
  """ returns relative error """
  return np.max(np.abs(x - y) / (np.maximum(1e-8, np.abs(x) + np.abs(y))))

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
data = get_CIFAR10_data()
for k, v in data.iteritems():
  print '%s: ' % k, v.shape

X_val:  (1000L, 3L, 32L, 32L)
X_train:  (49000L, 3L, 32L, 32L)
X_test:  (1000L, 3L, 32L, 32L)
y_val:  (1000L,)
y_train:  (49000L,)
y_test:  (1000L,)


# Convolutional "sandwich" layers
In the file `cs231n/layer_utils.py`.

In [8]:
from cs231n.layer_utils import conv_relu_conv_relu_pool_forward, conv_relu_conv_relu_pool_backward

x = np.random.randn(2, 3, 16, 16)
w1 = np.random.randn(3, 3, 3, 3)
b1 = np.random.randn(3,)
w2 = np.random.randn(3, 3, 3, 3)
b2 = np.random.randn(3,)
dout = np.random.randn(2, 3, 8, 8)
conv_param = {'stride': 1, 'pad': 1}
pool_param = {'pool_height': 2, 'pool_width': 2, 'stride': 2}

out, cache = conv_relu_conv_relu_pool_forward(x, w1, b1, w2, b2, conv_param, pool_param)
dx, dw1, db1, dw2, db2 = conv_relu_conv_relu_pool_backward(dout, cache)

dx_num = eval_numerical_gradient_array(lambda x: conv_relu_conv_relu_pool_forward(x, w1, b1, w2, b2, conv_param, pool_param)[0], x, dout)
dw1_num = eval_numerical_gradient_array(lambda w1: conv_relu_conv_relu_pool_forward(x, w1, b1, w2, b2, conv_param, pool_param)[0], w1, dout)
db1_num = eval_numerical_gradient_array(lambda b1: conv_relu_conv_relu_pool_forward(x, w1, b1, w2, b2, conv_param, pool_param)[0], b1, dout)
dw2_num = eval_numerical_gradient_array(lambda w2: conv_relu_conv_relu_pool_forward(x, w1, b1, w2, b2, conv_param, pool_param)[0], w2, dout)
db2_num = eval_numerical_gradient_array(lambda b2: conv_relu_conv_relu_pool_forward(x, w1, b1, w2, b2, conv_param, pool_param)[0], b2, dout)

print 'Testing conv_relu_conv_relu_pool'
print 'dx error: ', rel_error(dx_num, dx)
print 'dw1 error: ', rel_error(dw1_num, dw1)
print 'db1 error: ', rel_error(db1_num, db1)
print 'dw2 error: ', rel_error(dw2_num, dw2)
print 'db2 error: ', rel_error(db2_num, db2)

Testing conv_relu_conv_relu_pool
dx error:  5.15920401734e-08
dw1 error:  8.06346009126e-10
db1 error:  1.02967828355e-09
dw2 error:  4.32444890916e-10
db2 error:  1.66922322994e-11


## Sanity check loss
After you build a new network, one of the first things you should do is sanity check the loss. When we use the softmax loss, we expect the loss for random weights (and no regularization) to be about `log(C)` for `C` classes. When we add regularization this should go up.

In [9]:
fc_hidden_dims = [100, 50]
num_filters = [64, 32, 32, 16]
std = 1e-3
model = ConvNet(fc_hidden_dims, num_filters, filter_size=3, stride=1, input_dim=(3, 32, 32),
                 num_classes=10, use_batchnorm=False, dropout=0, weight_scale=std, reg=0.0,
                 dtype=np.float32)

print 'Testing initialization ... '
W1_std = abs(model.params['W_1'].std() - std)
b1 = model.params['b_1']
W2_std = abs(model.params['W_2'].std() - std)
b2 = model.params['b_2']
W3_std = abs(model.params['W_3'].std() - std)
b3 = model.params['b_3']
W11_std = abs(model.params['W11'].std() - std)
b11 = model.params['b11']
W21_std = abs(model.params['W21'].std() - std)
b21 = model.params['b21']
W12_std = abs(model.params['W12'].std() - std)
b12 = model.params['b12']
W22_std = abs(model.params['W22'].std() - std)
b22 = model.params['b22']

print 'parameters: ', model.params.keys()

assert W1_std < std / 10, 'First FC layer weights do not seem right'
assert np.all(b1 == 0), 'First FC layer biases do not seem right'
assert W2_std < std / 10, 'Second FC ayer weights do not seem right'
assert np.all(b2 == 0), 'Second FC layer biases do not seem right'
assert W3_std < std / 10, 'Third FC layer weights do not seem right'
assert np.all(b3 == 0), 'Third FC layer biases do not seem right'
assert W11_std < std / 10, 'First conv layer1 weights do not seem right'
assert np.all(b11 == 0), 'First conv layer1 biases do not seem right'
assert W21_std < std / 10, 'First conv layer2 weights do not seem right'
assert np.all(b21 == 0), 'First conv layer2 biases do not seem right'
assert W12_std < std / 10, 'Second conv layer1 weights do not seem right'
assert np.all(b11 == 0), 'Second conv layer1 biases do not seem right'
assert W22_std < std / 10, 'Second conv layer2 weights do not seem right'
assert np.all(b21 == 0), 'Second conv layer2 biases do not seem right'

N = 50
X = np.random.randn(N, 3, 32, 32)
y = np.random.randint(10, size=N)

scores = model.loss(X)
print 'Initial scores shape:', scores.shape, '. Should be (', N, ', 10).'

loss, grads = model.loss(X, y)
print 'Initial loss (no regularization): ', loss, '. Shoud be very close to', -np.log(1./10)

model.reg = 0.5
loss, grads = model.loss(X, y)
print 'Initial loss (with regularization): ', loss

Testing initialization ... 
parameters:  ['b11', 'W12', 'b21', 'b22', 'b12', 'W11', 'W21', 'W22', 'b_1', 'b_3', 'W_1', 'b_2', 'W_3', 'W_2']
Initial scores shape: (50L, 10L) . Should be ( 50 , 10).
Initial loss (no regularization):  2.30258509299 . Shoud be very close to 2.30258509299
Initial loss (with regularization):  2.33802254583


## Gradient check
After the loss looks reasonable, use numeric gradient checking to make sure that your backward pass is correct. When you use numeric gradient checking you should use a small amount of artifical data and a small number of neurons at each layer.

In [18]:
fc_hidden_dims = [50, 20]
num_filters = [32, 16, 16, 8]
std = 0.1

num_inputs = 2
input_dim = (3, 16, 16)
reg = 0.0
num_classes = 10

X = np.random.randn(num_inputs, *input_dim)
y = np.random.randint(num_classes, size=num_inputs)

model = ConvNet(fc_hidden_dims, num_filters, filter_size=3, stride=1, input_dim=input_dim,
                 num_classes=10, use_batchnorm=False, dropout=0, weight_scale=std, reg=0.5,
                 dtype=np.float64)
loss, grads = model.loss(X, y)
for param_name in sorted(grads):
    f = lambda _: model.loss(X, y)[0]
    param_grad_num = eval_numerical_gradient(f, model.params[param_name], verbose=False, h=1e-8)
    e = rel_error(param_grad_num, grads[param_name])
    print '%s max relative error: %e' % (param_name, rel_error(param_grad_num, grads[param_name]))
#     print param_grad_num.shape, param_grad_num
#     print grads[param_name].shape, grads[param_name]

W11 max relative error: 2.250286e-03
W12 max relative error: 5.115137e-02
W21 max relative error: 1.172350e-01
W22 max relative error: 1.648105e-03
W_1 max relative error: 2.939754e-03
W_2 max relative error: 1.068133e-03
W_3 max relative error: 1.000000e+00
b11 max relative error: 5.323990e-03
b12 max relative error: 5.918992e-05
b21 max relative error: 7.541519e-05
b22 max relative error: 3.177540e-05
b_1 max relative error: 3.911402e-04
b_2 max relative error: 1.026653e-05
b_3 max relative error: 1.550175e-06


## Overfit small data
A nice trick is to train your model with just a few training samples. You should be able to overfit small datasets, which will result in very high training accuracy and comparatively low validation accuracy.

In [19]:
num_train = 100
small_data = {
  'X_train': data['X_train'][:num_train],
  'y_train': data['y_train'][:num_train],
  'X_val': data['X_val'],
  'y_val': data['y_val'],
}

fc_hidden_dims = [50, 20]
num_filters = [32, 16, 16, 8]
input_dim = (3, 16, 16)

model = ConvNet(fc_hidden_dims, num_filters, filter_size=3, stride=1, input_dim=input_dim,
                 num_classes=10, use_batchnorm=False, dropout=0, weight_scale=1e-1, reg=0.5,
                 dtype=np.float64)

solver = Solver(model, small_data,
                num_epochs=10, batch_size=50,
                update_rule='adam',
                optim_config={
                  'learning_rate': 1e-3,
                },
                verbose=True, print_every=1)
solver.train()

plt.subplot(2, 1, 1)
plt.plot(solver.loss_history, 'o')
plt.xlabel('iteration')
plt.ylabel('loss')

plt.subplot(2, 1, 2)
plt.plot(solver.train_acc_history, '-o')
plt.plot(solver.val_acc_history, '-o')
plt.legend(['train', 'val'], loc='upper left')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.show()

ValueError: shapes (50,512) and (128,50) not aligned: 512 (dim 1) != 128 (dim 0)