#Week 3 - Solutions

1  Coding Homework

The Glorot, Bengio paper suggests that hyperbolic tangent avoids bias issues inherent using the sigmoid function for activation. It also gives specific numeric recommendations regarding the initialization values for the weights. Implement one or the other or both of these changes, in addition to using one of the acceleration methods (AdaGrad, AdaDelt, Nesterov, Adam). How does that affect training speed and network performance. (Need some volunteers to take various combinations in order to compare.)

The code below adds NAG (Nesterov accelerated gradient) to simple multiclass regression for classifying MNIST digits.  Any of the methods suggested require expanding the list of theano shared variables that are updated iteratively.  NAG requires two new variables in addition to the update required for the weight variable.  To appreciate this look once again at the definition of Nesterov's method.  Here's a link to the definition - https://blogs.princeton.edu/imabandit/2013/04/01/acceleratedgradientdescent/ .  To determine which variable must be updated recursively look for those variables that have the variable at the next step (indicated by an index like "i + 1") on the left hand side of the equation and the variable at an earlier time step ("i") on the right hand side of the equation.  The variable $\lambda$ needs to be updated as part of the recursion.  The variable $\gamma$ does not.  $\gamma$ at an earlier time step does not appear on the left hand side of any of the set of three equations.  

In [1]:
import theano
from theano import tensor as T
import numpy as np
from mnistReader import mnist
from math import sqrt

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * 0.1
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))  #code for using Glorot init

def model(X, w):
    return T.nnet.softmax(T.dot(X, w))

trX, teX, trY, teY = mnist()

X = T.fmatrix()
Y = T.fmatrix()

w = init_weights((784, 10))
y = theano.shared(w.get_value())

lamb = theano.shared(floatX(np.array(0.0)), 'lamb')


py_x = model(X, w)
y_pred = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
gradient = T.grad(cost=cost, wrt=w)
lambNew = (1 + T.sqrt(1 + 4 * lamb * lamb)) / 2.0
yNew = w - gradient * 0.0001
gamma = (1 - lamb) / lambNew


update = [(w, (1 - gamma) * (w - gradient) + gamma * y), (y, w - gradient * 0.0001 ), (lamb, lambNew)]

train = theano.function(inputs=[X, Y], outputs=cost, updates=update, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_pred, allow_input_downcast=True)

for i in range(101):
    for start, end in zip(range(0, len(trX), 10000), range(10000, len(trX), 10000)):
        cost = train(trX[start:end], trY[start:end])
    print i, np.mean(np.argmax(teY, axis=1) == predict(teX)), lamb.get_value(), gamma.eval()

0 0.5191 3.29487967491 -0.598778605461
1 0.6897 5.94211673737 -0.764664769173
2 0.819 8.53253173828 -0.832584440708
3 0.8437 11.098241806 -0.869825780392
4 0.8845 13.6499710083 -0.893414735794
5 0.8889 16.1926727295 -0.909719645977
6 0.8986 18.7290496826 -0.921673119068
7 0.8903 21.2607440948 -0.930817067623
8 0.8964 23.7888298035 -0.9380402565
9 0.9062 26.3140525818 -0.943892002106
10 0.9103 28.836938858 -0.948729753494
11 0.9124 31.3578910828 -0.952796697617
12 0.9133 33.8772087097 -0.956263840199
13 0.9174 36.3951301575 -0.959254980087
14 0.9176 38.9118423462 -0.96186196804
15 0.9174 41.4275016785 -0.964154601097
16 0.9206 43.9422340393 -0.966186463833
17 0.9222 46.4561424255 -0.967999875546
18 0.9214 48.9693145752 -0.969628155231
19 0.9242 51.4818229675 -0.9710983634
20 0.921 53.9937324524 -0.972432553768
21 0.9242 56.5050964355 -0.973648786545
22 0.9208 59.0159683228 -0.974762022495
23 0.9229 61.526386261 -0.9757848382
24 0.9223 64.0363769531 -0.976727724075
25 0.9226 66.545989990

KeyboardInterrupt: 

Here's what the numbers were before modifications.
0 0.8864
10 0.9165
20 0.9203
30 0.9212
40 0.9219
50 0.9232
60 0.9241
70 0.9239
80 0.9244
90 0.9248
100 0.925


2 Code a standard 3 layer ANN for classifying Cifar images.

In [13]:
import theano
from theano import tensor as T
import numpy as np
from math import sqrt
from readCifar import cifar
from scipy.misc import imsave


def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # last constant is derivative of activation function
    #0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * 1.0 
    
    #code for standard initialization
    #return theano.shared(floatX(np.random.randn(*shape) * 0.01))  
    
    #code for using Glorot init
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))  

def sgd(cost, params, lr=0.005):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        updates.append([p, p - g * lr])
    return updates

def model(X, w_h, w_o):
    #code of tahn activation functions
    h = T.tanh(T.dot(X, w_h))

    #code for sigmoid activation functions
    #h = T.nnet.sigmoid(T.dot(X, w_h))
    pyx = T.nnet.softmax(T.dot(h, w_o))
    return pyx, h

xTrain, yTrain, xTest, yTest = cifar()

X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((3072, 1500))
w_o = init_weights((1500, 10))

py_x, h = model(X, w_h, w_o)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(py_x, Y))
params = [w_h, w_o]
updates = sgd(cost, params)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

for i in range(101):
    for start, end in zip(range(0, len(xTrain), 500), range(500, len(xTrain), 500)):
        cost = train(xTrain[start:end], yTrain[start:end])
        py_x, h = model(xTrain[start:end], w_h, w_o)
        print  i, np.mean(np.argmax(yTest, axis=1) == predict(xTest)), 
             T.mean(h).eval(), sqrt(T.var(h).eval())

 0 0.0917 -0.0345314503712 0.996978518224
0 0.1505 -0.0431218379887 0.996827100065
0 0.1481 -0.0353252689169 0.997241621678
0 0.1602 -0.035856107308 0.997265500098
0 0.1687 -0.0334214123161 0.997550730816
0 0.1862 -0.0297363963154 0.997656348268
0 0.1774 -0.0375645305641 0.997493373646
0 0.1876 -0.0340690500413 0.997643295499
0 0.179 -0.036625870186 0.99760639378
0 0.208 -0.0326791267619 0.997780317042
0 0.1906 -0.0322513638775 0.997826029457
0 0.1931 -0.0347155552989 0.997866143279
0 0.197 -0.032918108108 0.997825874241
0 0.1994 -0.0295494599502 0.998074421513
0 0.2092 -0.0280240990255 0.998117907955
0 0.2055 -0.0316210823904 0.998018217161
0 0.2181 -0.0198016383538 0.998407498869
0 0.2256 -0.0224201525192 0.998380051517
0 0.2146 -0.0171325031183 0.998457114357
1 0.2337 -0.0229125078968 0.998385565456
1 0.2327 -0.0237092691441 0.998377505828
1 0.2352 -0.0267260372389 0.998357433825
1 0.2239 -0.0212505994733 0.998497455977
1 0.2356 -0.0228477911209 0.998434611633
1 0.2228 -0.0200810662

KeyboardInterrupt: 

3
Build 4-layer network for classifying Cifar images. Use 10k training data (as in last lecture) to truncate the training time. 

In [None]:
import theano
from theano import tensor as T
import numpy as np
from math import sqrt
from readCifar import cifar
from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

srng = RandomStreams()

def floatX(X):
    return np.asarray(X, dtype=theano.config.floatX)

def init_weights(shape):
    (h, w) = shape
    # Glorot normalization - last factor depends on non-linearity
    # 0.25 for sigmoid and 0.1 for softmax, 1.0 for tanh or Relu
    normalizer = 2.0 * sqrt(6) / sqrt(h + w) * 2.0 
    #return theano.shared(floatX(np.random.randn(*shape) * 0.01))  #code for standard initialization
    #code for using Glorot initialization
    return theano.shared(floatX((np.random.random_sample(shape) - 0.5) * normalizer))  

def rectify(X):
    return T.maximum(X, 0.)

def softmax(X):
    e_x = T.exp(X - X.max(axis=1).dimshuffle(0, 'x'))
    return e_x / e_x.sum(axis=1).dimshuffle(0, 'x')

def RMSprop(cost, params, lr=0.001, rho=0.9, epsilon=1e-6):
    grads = T.grad(cost=cost, wrt=params)
    updates = []
    for p, g in zip(params, grads):
        acc = theano.shared(p.get_value() * 0.)
        acc_new = rho * acc + (1 - rho) * g ** 2
        gradient_scaling = T.sqrt(acc_new + epsilon)
        g = g / gradient_scaling
        updates.append((acc, acc_new))
        updates.append((p, p - lr * g))
    return updates

def dropout(X, p=0.):
    if p > 0:
        retain_prob = 1 - p
        X *= srng.binomial(X.shape, p=retain_prob, dtype=theano.config.floatX)
        X /= retain_prob
    return X

def model(X, w_h, w_h2, w_o, p_drop_input, p_drop_hidden):
    X = dropout(X, p_drop_input)
    h = rectify(T.dot(X, w_h))

    h = dropout(h, p_drop_hidden)
    h2 = rectify(T.dot(h, w_h2))

    h2 = dropout(h2, p_drop_hidden)
    py_x = softmax(T.dot(h2, w_o))
    return h, h2, py_x


X = T.fmatrix()
Y = T.fmatrix()

w_h = init_weights((3072, 1500))
w_h2 = init_weights((1500, 700))
w_o = init_weights((700, 10))

noise_h, noise_h2, noise_py_x = model(X, w_h, w_h2, w_o, 0.6, 0.6)
h, h2, py_x = model(X, w_h, w_h2, w_o, 0., 0.)
y_x = T.argmax(py_x, axis=1)

cost = T.mean(T.nnet.categorical_crossentropy(noise_py_x, Y))
params = [w_h, w_h2, w_o]
updates = RMSprop(cost, params, lr=0.0001)

train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
predict = theano.function(inputs=[X], outputs=y_x, allow_input_downcast=True)

xTrain, yTrain, xTest, yTest = cifar()
for i in range(101):
    iJump = 500
    for start, end in zip(range(0, len(xTrain), iJump), range(iJump, len(xTrain), iJump)):
        cost = train(xTrain[start:end], yTrain[start:end])
        print i, np.mean(np.argmax(yTest, axis=1) == predict(xTest))

0 0.1068
0 0.1299
0 0.1555
0 0.1527
0 0.1348
0 0.1523
0 0.146
0 0.1495
0 0.1472
0 0.1448
0 0.148
0 0.1511
0 0.1577
0 0.1675
0 0.1655
0 0.1654
0 0.1482
0 0.1494
0 0.1495
1 0.1351
1 0.1403
1 0.1446
1 0.1518
1 0.149
1 0.1536
1 0.1539
1 0.1568
1 0.1608
1 0.1582
1 0.171
1 0.1737
1 0.1695
1 0.1625
1 0.1509
1 0.1577
1 0.1544
1 0.1538
1 0.1595
2 0.1611
2 0.1546
2 0.1554
2 0.1504
2 0.1517
2 0.1476
2 0.1393
2 0.1368
2 0.1477
2 0.1471
2 0.147
2 0.1486
2 0.1493
2 0.1514
2 0.1459
2 0.143
2 0.1468
2 0.1464
2 0.151
3 0.1478
3 0.1475
3 0.1448
3 0.1461
3 0.1471
3 0.1515
3 0.1546
3 0.1544
3 0.1566
3 0.155
3 0.1562
3 0.1514
3 0.1528
3 0.1564
3 0.1526
3 0.1501
3 0.1486
3 0.1481
3 0.1419
4 0.1513
4 0.1531
4 0.1627
4 0.164
4 0.1609
4 0.1696
4 0.1679
4 0.159
4 0.1535
4 0.1533
4 0.1535
4 0.15
4 0.1479
4 0.1481
4 0.1526
4 0.147
4 0.1439
4 0.1358
4 0.1383
5 0.1409
5 0.1431
5 0.1504
5 0.1465
5 0.1484
5 0.1473
5 0.151
5 0.1489
5 0.1504
5 0.1475
5 0.1434
5 0.1403
5 0.1405
5 0.1351
5 0.1364
5 0.134
5 0.1318
5 0.130