In [16]:
import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.fc_net import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.solver import Solver

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots (width, height)
plt.rcParams['image.interpolation'] = 'nearest' # draw image pixels without smoothing
plt.rcParams['image.cmap'] = 'gray' # default colormap used when displaying images

# Auto-reload external modules so that edits to imported source files are
# picked up without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
# NOTE(review): the recorded output of this cell shows "The autoreload extension
# is already loaded" because the cell was re-run in a live kernel; the message
# itself suggests `%reload_ext autoreload` to avoid it.
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between ``x`` and ``y``.

    Each element contributes ``|x - y| / max(1e-8, |x| + |y|)``; the 1e-8
    floor keeps the ratio finite when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

def print_mean_std(x, axis=0):
    """Print the mean and standard deviation of ``x`` along ``axis``.

    One labelled line is printed per statistic, followed by a blank line.
    Nothing is returned.
    """
    # Each statistic is computed right before it is printed, in this order.
    for label, stat_name in (('  means: ', 'mean'), ('  stds:  ', 'std')):
        print(label, getattr(x, stat_name)(axis=axis))
    print()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Fetch the (preprocessed) CIFAR10 splits and report the shape of each array.
data = get_CIFAR10_data()
for k in data:
    v = data[k]
    print('%s: ' % k, v.shape)

X_train:  (49000, 3, 32, 32)
y_train:  (49000,)
X_val:  (1000, 3, 32, 32)
y_val:  (1000,)
X_test:  (1000, 3, 32, 32)
y_test:  (1000,)


In [65]:
# Check the training-time forward pass by checking means and variances
# of features both before and after batch normalization.

# Simulate the forward pass for a two-layer network.
# The seed is fixed and X, W1, W2 are drawn in this order, so the printed
# statistics are reproducible.
np.random.seed(231)
N, D1, D2, D3 = 200, 50, 60, 3
X = np.random.randn(N, D1)
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)
# matmul -> ReLU -> matmul (no biases); `a` has shape (N, D3).
a = np.maximum(0, X.dot(W1)).dot(W2)

# Statistics are taken along axis 0, i.e. per feature across the N samples.
print('Before batch normalization, a: ')
print_mean_std(a,axis=0)

gamma = np.ones((D3,))
beta = np.zeros((D3,))
# Means should be close to zero and stds close to one
print('After batch normalization (gamma=1, beta=0), a_norm:')
# A fresh {'mode': 'train'} dict is passed here and below, so the two calls
# do not share a bn_param object.
a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})
print_mean_std(a_norm,axis=0)

gamma = np.asarray([1.0, 2.0, 3.0])
beta = np.asarray([11.0, 12.0, 13.0])
# Now means should be close to beta and stds close to gamma
print('After batch normalization (gamma=', gamma, ', beta=', beta, '), a_norm:')
a_norm, _ = batchnorm_forward(a, gamma, beta, {'mode': 'train'})
print_mean_std(a_norm,axis=0)


Before batch normalization, a: 
  means:  [ -2.3814598  -13.18038246   1.91780462]
  stds:   [ 27.18502186  34.21455511  37.68611762]

After batch normalization (gamma=1, beta=0), a_norm:
  means:  [  4.44089210e-17   8.27116153e-17   4.46864767e-17]
  stds:   [ 0.99999999  1.          1.        ]

After batch normalization (gamma= [ 1.  2.  3.] , beta= [ 11.  12.  13.] ), a_norm:
  means:  [ 11.  12.  13.]
  stds:   [ 0.99999999  1.99999999  2.99999999]



In [66]:
# Check the test-time forward pass by running the training-time
# forward pass many times to warm up the running averages, and then
# checking the means and variances of activations after a test-time
# forward pass.

np.random.seed(231)
N, D1, D2, D3 = 200, 50, 60, 3
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)

# The same bn_param dict is passed to every call below; the recorded output
# shows it holding 'running_mean' and 'running_var' entries after the loop.
bn_param = {'mode': 'train'}
gamma = np.ones(D3)
beta = np.zeros(D3)

# Warm-up: 50 training-mode passes on freshly drawn batches; outputs are discarded.
for t in range(50):
    X = np.random.randn(N, D1)
    a = np.maximum(0, X.dot(W1)).dot(W2)
    batchnorm_forward(a, gamma, beta, bn_param)
# Debug aid: dump the accumulated state before switching modes.
print('bn_param after training: ', bn_param, '\n')

# Switch the same dict to test mode and normalize one new batch.
bn_param['mode'] = 'test'
X = np.random.randn(N, D1)
a = np.maximum(0, X.dot(W1)).dot(W2)
a_norm, _ = batchnorm_forward(a, gamma, beta, bn_param)

# Means should be close to zero and stds close to one, but will be
# noisier than training-time forward passes.
# NOTE(review): the output recorded for this cell shows stds of about 6, not
# about 1, with running_var values around 35 (roughly the square of those
# stds). That pattern is what one would see if batchnorm_forward accumulated
# the batch standard deviation into running_var instead of the variance --
# confirm in the batchnorm_forward implementation.
print('After batch normalization (test-time), a_norm:')
print_mean_std(a_norm,axis=0)

bn_param after training:  {'mode': 'train', 'running_mean': array([ -0.32415038,  18.55718135,  14.18894184]), 'running_var': array([ 34.57738618,  35.22431942,  36.07028501])} 

After batch normalization (test-time), a_norm:
  means:  [-0.23188545 -0.25906202 -0.63026885]
  stds:   [ 5.99478954  6.03037459  5.89828084]



In [161]:
# Gradient check batchnorm backward pass: compare numerical gradients with
# the analytic ones returned by batchnorm_backward.
np.random.seed(231)
N, D = 4, 5
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D)

bn_param = {'mode': 'train'}
# One-argument wrappers around the forward pass, each varying a single input
# (x, gamma or beta) while the other two come from the enclosing scope.
fx = lambda x: batchnorm_forward(x, gamma, beta, bn_param)[0]
fg = lambda g: batchnorm_forward(x, g, beta, bn_param)[0]
fb = lambda b: batchnorm_forward(x, gamma, b, bn_param)[0]

# Numerical gradients with respect to each input, using dout as the upstream
# gradient. gamma and beta are passed as copies -- presumably so the helper
# does not perturb the arrays captured by the other lambdas; verify against
# eval_numerical_gradient_array.
dx_num = eval_numerical_gradient_array(fx, x, dout)
dg_num = eval_numerical_gradient_array(fg, gamma.copy(), dout)
db_num = eval_numerical_gradient_array(fb, beta.copy(), dout)

# Analytic gradients from the cache of one forward pass.
_, cache = batchnorm_forward(x, gamma, beta, bn_param)
dx, dgamma, dbeta = batchnorm_backward(dout, cache)
# You should expect to see relative errors between 1e-13 and 1e-8.
# NOTE(review): the recorded output starts with 'xmu shape', 'dvar shape' and
# 'dxmu shape' lines that no statement in this cell prints -- presumably
# leftover debug prints inside the batchnorm implementation; confirm there.

print('dout:', dout, '\n')

print('x: ', x)
print('dx_num:', dx_num)
print('dx_ana:', dx)
print('dx error: ', rel_error(dx_num, dx),'\n')

print('gamma: ', gamma)
print('dg_num:', dg_num)
print('dg_ana:', dgamma)
print('dgamma error: ', rel_error(dg_num, dgamma),'\n')

print('beta: ', beta)
print('db_num:', db_num)
print('db_ana:', dbeta)
print('dbeta error: ', rel_error(db_num, dbeta),'\n')

xmu shape:  (4, 5)
dvar shape:  (5,)
dxmu shape:  (5,)
dout: [[ 0.27423503  0.76215717 -0.69550058  0.29214712 -0.38489942]
 [ 0.1228747  -1.42904497  0.70286283 -0.85850947 -1.14042979]
 [-1.58535997 -0.01530138 -0.32156083  0.56834936 -0.19961722]
 [ 1.27286625  1.27292534  1.58102968 -1.75626715  0.9217743 ]] 

x:  [[ 14.08971705  18.98550139   3.07047846   8.45586133  11.62637342]
 [  8.12491616  11.25101049  21.30864512   4.8723535   10.1182165 ]
 [ 10.28862305  13.47453818   7.81338135  16.76093835  18.64658296]
 [ 14.62326227  11.25950008  16.44765974  12.62223264  16.95546256]]
dx_num: [[-0.00310319  0.00305468 -0.00156246  0.17251307  0.01388029]
 [ 0.01147762 -0.10800884 -0.01112564 -0.02021632 -0.02098085]
 [-0.01682492 -0.01106847 -0.00384286  0.13581055 -0.04108612]
 [ 0.00845049  0.11602263  0.01653096 -0.2881073   0.04818669]]
dx_ana: [[-0.00310319  0.00305468 -0.00156246  0.17251307  0.01388029]
 [ 0.01147762 -0.10800884 -0.01112564 -0.02021632 -0.02098085]
 [-0.0168249

In [148]:
# Gradient check for batchnorm_backward_alt: only dx is compared against the
# numerical gradient; the other two returned values are discarded.
# Same seed and draw order as the batchnorm_backward check, so x, gamma, beta
# and dout take the same values (the recorded x matches).
np.random.seed(231)
N, D = 4, 5
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D)

bn_param = {'mode': 'train'}
# Forward pass as a function of x only; gamma, beta and bn_param are fixed.
fx = lambda x: batchnorm_forward(x, gamma, beta, bn_param)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)

# Analytic dx from the alternative backward implementation.
_, cache = batchnorm_forward(x, gamma, beta, bn_param)
dx, _, _ = batchnorm_backward_alt(dout, cache)

print('x: ', x)
print('dx_num:', dx_num)
print('dx_ana:', dx)
print('dx error: ', rel_error(dx_num, dx),'\n')

x:  [[ 14.08971705  18.98550139   3.07047846   8.45586133  11.62637342]
 [  8.12491616  11.25101049  21.30864512   4.8723535   10.1182165 ]
 [ 10.28862305  13.47453818   7.81338135  16.76093835  18.64658296]
 [ 14.62326227  11.25950008  16.44765974  12.62223264  16.95546256]]
dx_num: [[-0.00310319  0.00305468 -0.00156246  0.17251307  0.01388029]
 [ 0.01147762 -0.10800884 -0.01112564 -0.02021632 -0.02098085]
 [-0.01682492 -0.01106847 -0.00384286  0.13581055 -0.04108612]
 [ 0.00845049  0.11602263  0.01653096 -0.2881073   0.04818669]]
dx_ana: [[-0.00310319  0.00305468 -0.00156246  0.17251307  0.01388029]
 [ 0.01147762 -0.10800884 -0.01112564 -0.02021632 -0.02098085]
 [-0.01682492 -0.01106847 -0.00384286  0.13581055 -0.04108612]
 [ 0.00845049  0.11602263  0.01653096 -0.2881073   0.04818669]]
dx error:  1.66746637939e-09 



In [67]:
# Compare the two batchnorm backward implementations on the same cache:
# their gradients should agree, and the ratio of their run times is reported.
np.random.seed(231)
N, D = 100, 500
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D)

bn_param = {'mode': 'train'}
out, cache = batchnorm_forward(x, gamma, beta, bn_param)

# time.perf_counter() is the high-resolution clock intended for timing short
# intervals. time.time() is a wall clock that can be coarse enough for both
# intervals to measure as 0, which makes the ratio below meaningless or
# raises ZeroDivisionError.
t1 = time.perf_counter()
dx1, dgamma1, dbeta1 = batchnorm_backward(dout, cache)
t2 = time.perf_counter()
dx2, dgamma2, dbeta2 = batchnorm_backward_alt(dout, cache)
t3 = time.perf_counter()

print('dx difference: ', rel_error(dx1, dx2))
print('dgamma difference: ', rel_error(dgamma1, dgamma2))
print('dbeta difference: ', rel_error(dbeta1, dbeta2))
# Time of batchnorm_backward divided by time of batchnorm_backward_alt.
print('speedup: %.2fx' % ((t2 - t1) / (t3 - t2)))

dx difference:  8.46498696262e-13
dgamma difference:  0.0
dbeta difference:  0.0
speedup: 3.35x


In [190]:
# Check the training-time forward pass by checking means and variances
# of features both before and after layer normalization.

# Simulate the forward pass for a two-layer network.
# The seed is fixed and X, W1, W2 are drawn in this order, so the printed
# statistics are reproducible.
np.random.seed(231)
N, D1, D2, D3 =4, 50, 60, 3
X = np.random.randn(N, D1)
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)
# matmul -> ReLU -> matmul (no biases); `a` has shape (N, D3).
a = np.maximum(0, X.dot(W1)).dot(W2)

# Statistics here are taken along axis 1, i.e. per sample across its D3
# features, so N values are printed on each line.
print('Before layer normalization:')
print_mean_std(a,axis=1)

gamma = np.ones(D3)
beta = np.zeros(D3)
# Means should be close to zero and stds close to one
print('After layer normalization (gamma=1, beta=0)')
a_norm, _ = layernorm_forward(a, gamma, beta, {'mode': 'train'})
print_mean_std(a_norm,axis=1)

# gamma and beta are constant across features here, so each row's mean and
# std can be compared directly with those constants.
gamma = np.asarray([3.0,3.0,3.0])
beta = np.asarray([5.0,5.0,5.0])
# Now means should be close to beta and stds close to gamma
print('After layer normalization (gamma=', gamma, ', beta=', beta, ')')
a_norm, _ = layernorm_forward(a, gamma, beta, {'mode': 'train'})
print_mean_std(a_norm,axis=1)

Before layer normalization:
  means:  [-59.06673243 -47.60782686 -43.31137368 -26.40991744]
  stds:   [ 10.07429373  28.39478981  35.28360729   4.01831507]

After layer normalization (gamma=1, beta=0)
  means:  [ -4.81096644e-16   0.00000000e+00   7.40148683e-17  -5.92118946e-16]
  stds:   [ 0.99999995  0.99999999  1.          0.99999969]

After layer normalization (gamma= [ 3.  3.  3.] , beta= [ 5.  5.  5.] )
  means:  [ 5.  5.  5.  5.]
  stds:   [ 2.99999985  2.99999998  2.99999999  2.99999907]



In [204]:
# NOTE(review): mid-notebook import plus %run of another notebook. %run
# executes layers.ipynb and brings its top-level names into this kernel's
# namespace, so functions called in later cells (e.g. layernorm_forward /
# layernorm_backward) may come from that notebook rather than from the cs231n
# import at the top -- confirm which definitions are intended, and whether
# `import_ipynb` is actually needed here.
import import_ipynb
%run layers.ipynb

In [205]:
# Gradient check layernorm backward: compare numerical gradients with the
# analytic ones returned by layernorm_backward.
np.random.seed(231)
N, D = 4, 5
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D)

# An empty parameter dict is passed to every layernorm_forward call below.
ln_param = {}
# One-argument wrappers around the forward pass, each varying a single input
# (x, gamma or beta) while the other two come from the enclosing scope.
fx = lambda x: layernorm_forward(x, gamma, beta, ln_param)[0]
fg = lambda g: layernorm_forward(x, g, beta, ln_param)[0]
fb = lambda b: layernorm_forward(x, gamma, b, ln_param)[0]

# Numerical gradients with respect to each input, using dout as the upstream
# gradient. gamma and beta are passed as copies -- presumably so the helper
# does not perturb the arrays captured by the other lambdas; verify against
# eval_numerical_gradient_array.
dx_num = eval_numerical_gradient_array(fx, x, dout)
dg_num = eval_numerical_gradient_array(fg, gamma.copy(), dout)
db_num = eval_numerical_gradient_array(fb, beta.copy(), dout)

# Analytic gradients from the cache of one forward pass.
_, cache = layernorm_forward(x, gamma, beta, ln_param)
dx, dgamma, dbeta = layernorm_backward(dout, cache)

# You should expect to see relative errors between 1e-12 and 1e-8.
print('x: ', x)
print('dx_num:', dx_num)
print('dx_ana:', dx)
print('dx error: ', rel_error(dx_num, dx),'\n')

print('gamma: ', gamma)
print('dg_num:', dg_num)
print('dg_ana:', dgamma)
print('dgamma error: ', rel_error(dg_num, dgamma),'\n')

print('beta: ', beta)
print('db_num:', db_num)
print('db_ana:', dbeta)
print('dbeta error: ', rel_error(db_num, dbeta),'\n')

x:  [[ 14.08971705  18.98550139   3.07047846   8.45586133  11.62637342]
 [  8.12491616  11.25101049  21.30864512   4.8723535   10.1182165 ]
 [ 10.28862305  13.47453818   7.81338135  16.76093835  18.64658296]
 [ 14.62326227  11.25950008  16.44765974  12.62223264  16.95546256]]
dx_num: [[-0.0148552   0.01032912 -0.01190652  0.04456401 -0.02813141]
 [ 0.06974204 -0.02127583 -0.00771128 -0.04754429  0.00678935]
 [-0.01334007 -0.01950385  0.00393253  0.09003202 -0.06112062]
 [ 0.07764743  0.38964293  0.06352497 -0.56141532  0.0306    ]]
dx_ana: [[-0.0148552   0.01032912 -0.01190652  0.04456401 -0.02813141]
 [ 0.06974204 -0.02127583 -0.00771128 -0.04754429  0.00678935]
 [-0.01334007 -0.01950385  0.00393253  0.09003202 -0.06112062]
 [ 0.07764743  0.38964293  0.06352497 -0.56141532  0.0306    ]]
dx error:  2.10727914716e-09 

gamma:  [ 0.03514666  0.26207083  0.14320173  0.90101716  0.23185863]
dg_num: [ 1.45413018 -0.74806364  4.30445918  2.71523651  1.0074201 ]
dg_ana: [ 1.45413018 -0.748063