In [None]:
# This is shamelessly stolen from Vincent Dumoulin's GitHub:
# https://github.com/vdumoulin/ift6266h15/blob/master/assignments/01/solution_theano.py

In [3]:
import gzip
import cPickle
import numpy
import theano
import theano.tensor as tensor

In [4]:
def one_hot_encode(y, num_classes):
    """
    Performs a one-hot encoding of a batch of integer targets

    Parameters
    ----------
    y : numpy.ndarray
        Batch of integer targets of shape (batch_size, ), each in the range
        [0, num_classes)
    num_classes : int
        Number of classes

    Returns
    -------
    Y : numpy.ndarray
        One-hot encoded float matrix of shape (batch_size, num_classes)
        corresponding to y

    Raises
    ------
    IndexError
        If any target lies outside [0, num_classes)
    """
    Y = numpy.zeros((y.shape[0], num_classes))
    if y.shape[0] == 0:
        # Nothing to encode; returning early also avoids indexing with an
        # empty array whose dtype may not be integral.
        return Y
    # Negative targets would silently wrap around to the last columns under
    # numpy indexing, so reject anything outside [0, num_classes) explicitly.
    if (y < 0).any() or (y >= num_classes).any():
        raise IndexError("targets must lie in [0, %d)" % num_classes)
    # Set Y[i, y[i]] = 1 for every row in one indexing operation instead of
    # looping over the batch in Python.
    Y[numpy.arange(y.shape[0]), y] = 1
    return Y

In [5]:
# Load data
# NOTE(review): unpickling can execute arbitrary code; only use a copy of
# mnist.pkl.gz obtained from a trusted source.
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)

# Each split unpacks into an (inputs, integer targets) pair.
(train_X, train_y), (valid_X, valid_y), (test_X, test_y) = (
    train_set, valid_set, test_set)

# Replace the integer targets of every split with one-hot rows over 10 classes.
train_y, valid_y, test_y = (
    one_hot_encode(train_y, 10),
    one_hot_encode(valid_y, 10),
    one_hot_encode(test_y, 10),
)

In [9]:
def relu(x):
    """
    Rectifier activation: keeps the entries of x that are greater than 0 and
    replaces every other entry with 0

    Parameters
    ----------
    x : symbolic tensor expression
        Pre-activation values (anything accepted by tensor.switch)

    Returns
    -------
    Symbolic expression equal to x where x > 0 and to 0 elsewhere
    """
    is_positive = x > 0
    return tensor.switch(is_positive, x, 0)

In [25]:
# Instantiate symbolic variables
# Draw the initial weights from a seeded generator so that re-running the
# notebook starts from the same parameters every time.
rng = numpy.random.RandomState(1234)
X = tensor.matrix('X')  # input batch; 784 columns, matching the rows of W
T = tensor.matrix('T')  # target batch; 10 columns, matching the output Y
# Hidden layer parameters: 784 inputs -> 500 hidden units.
W = theano.shared(
    rng.uniform(low=-0.01, high=0.01, size=(784, 500)), 'W')
b = theano.shared(numpy.zeros(500), 'b')
# Output layer parameters: 500 hidden units -> 10 classes.
V = theano.shared(
    rng.uniform(low=-0.01, high=0.01, size=(500, 10)), 'V')
c = theano.shared(numpy.zeros(10), 'c')
params = [W, b, V, c]

# Build computation graph
# Sigmoid hidden layer. To experiment with a rectifier hidden layer instead,
# replace the right-hand side with: relu(tensor.dot(X, W) + b)
H = tensor.nnet.sigmoid(tensor.dot(X, W) + b)
# Softmax output: one row of class probabilities per example.
Y = tensor.nnet.softmax(tensor.dot(H, V) + c)
# Cross-entropy between targets and predictions, averaged over the batch.
loss = -(T * tensor.log(Y)).sum(axis=1).mean()
# Fraction of examples whose predicted class differs from the target class.
misclass = tensor.neq(T.argmax(axis=1), Y.argmax(axis=1)).mean()

# Symbolic gradients of the loss, in the same order as params.
grads = tensor.grad(loss, params)

In [26]:
# Compile function
learning_rate = 0.032
# One plain gradient-descent step per parameter. The updates are passed as an
# ordered list of (shared variable, new value expression) pairs: a plain dict
# has no defined ordering, which Theano warns about for `updates`.
updates = list((param, param - learning_rate * grad)
               for param, grad in zip(params, grads))
# f applies one training step to the shared parameters and returns nothing.
f = theano.function(inputs=[X, T], updates=updates)
# g evaluates [loss, misclass] on a batch without changing the parameters.
g = theano.function(inputs=[X, T], outputs=[loss, misclass])



In [27]:
# Call function with numerical values
batch_size = 100
num_batches = train_X.shape[0] / batch_size
for epoch in xrange(10):
    for i in xrange(num_batches):
        numpy_X = train_X[batch_size * i: batch_size * (i + 1)]
        numpy_T = train_y[batch_size * i: batch_size * (i + 1)]
        f(numpy_X, numpy_T)
    print "Epoch " + str(epoch + 1) + ":"
    print "    Train loss/misclass: %0.2f/%0.2f" % tuple(g(train_X, train_y))
    print "    Valid loss/misclass: %0.2f/%0.2f" % tuple(g(valid_X, valid_y))
    print "    Test  loss/misclass: %0.2f/%0.2f" % tuple(g(test_X, test_y))

Epoch 1:
    Train loss/misclass: 2.12/0.56
    Valid loss/misclass: 2.12/0.55
    Test  loss/misclass: 2.12/0.55
Epoch 2:
    Train loss/misclass: 1.23/0.32
    Valid loss/misclass: 1.20/0.30
    Test  loss/misclass: 1.21/0.31
Epoch 3:
    Train loss/misclass: 0.78/0.20
    Valid loss/misclass: 0.75/0.18
    Test  loss/misclass: 0.76/0.19
Epoch 4:
    Train loss/misclass: 0.61/0.16
    Valid loss/misclass: 0.57/0.14
    Test  loss/misclass: 0.58/0.15
Epoch 5:
    Train loss/misclass: 0.52/0.14
    Valid loss/misclass: 0.48/0.13
    Test  loss/misclass: 0.50/0.13
Epoch 6:
    Train loss/misclass: 0.47/0.13
    Valid loss/misclass: 0.43/0.12
    Test  loss/misclass: 0.44/0.12
Epoch 7:
    Train loss/misclass: 0.43/0.12
    Valid loss/misclass: 0.40/0.11
    Test  loss/misclass: 0.41/0.11
Epoch 8:
    Train loss/misclass: 0.41/0.11
    Valid loss/misclass: 0.38/0.10
    Test  loss/misclass: 0.39/0.11
Epoch 9:
    Train loss/misclass: 0.39/0.11
    Valid loss/misclass: 0.36/0.10
    Test 