# MNIST digit classifier example

Turn off cuDNN, as its convolution operations are *not reproducible*.

In [1]:
import os
# Configure Theano through the THEANO_FLAGS environment variable: `dnn.enabled=False` switches
# cuDNN off and `optimizer_including=` is given an empty value.
# NOTE(review): this presumably has to run before `theano` is first imported (it is imported in
# the next cell) for the flags to take effect -- confirm against the Theano configuration docs.
os.environ['THEANO_FLAGS'] = 'dnn.enabled=False, optimizer_including='

#### Imports

In [2]:
import numpy as np

import collections

from matplotlib import pyplot as plt

import sklearn.cross_validation

import lasagne
import theano
import theano.tensor as T
from britefury_lasagne import basic_dnn, trainer, image_window_extractor

from fuel.datasets.mnist import MNIST
import fuel

Using gpu device 0: GeForce GTX 970 (CNMeM is enabled with initial size: 25.0% of memory, cuDNN not available)


### Define network architecture

We define the `build_network` function, which takes the input variables as an optional argument and builds the network using the Lasagne API.

NOTE that the final dense layer does *NOT* use the `softmax` nonlinearity as it is supplied by the classifier builder (see below).

In [3]:
def build_network(input_vars=None):
    """Build the convolutional digit classifier network with the Lasagne API.

    :param input_vars: optional sequence of input variables; when supplied, its first element
        becomes the `input_var` of the input layer, otherwise `input_var=None` is used.
    :return: the last layer of the network: a 10-unit dense layer WITHOUT a nonlinearity
    """
    layers = lasagne.layers
    he_uniform = lasagne.init.HeUniform

    # Input: batches of single-channel 28x28 images
    x_var = None if input_vars is None else input_vars[0]
    net = layers.InputLayer(shape=(None, 1, 28, 28), input_var=x_var)

    # Stage 1: one 5x5 convolution with 32 filters, then 2x2 max-pooling
    net = layers.Conv2DLayer(net, num_filters=32, filter_size=(5, 5), W=he_uniform(), name='c1_1')
    net = layers.MaxPool2DLayer(net, pool_size=(2, 2), name='p1')

    # Stage 2: two 3x3 convolutions with 32 filters each, then 2x2 max-pooling
    for conv_name in ('c2_1', 'c2_2'):
        net = layers.Conv2DLayer(net, num_filters=32, filter_size=(3, 3), W=he_uniform(),
                                 name=conv_name)
    net = layers.MaxPool2DLayer(net, pool_size=(2, 2), name='p2')

    # Stage 3: 64-unit fully-connected layer
    net = layers.DenseLayer(net, num_units=64, W=he_uniform(), name='d3')

    # Dropout is deliberately left out: its random state is hard to reset/restore in a way that
    # guarantees reproducibility (even `dropout_layer._srng.set_rstate(some_constant)` did not
    # seem to work), so the following stays disabled:
    # net = lasagne.layers.DropoutLayer(net, p=0.5)

    # Output: 10-unit dense layer with no nonlinearity
    net = layers.DenseLayer(net, num_units=10, nonlinearity=None, name='d4')

    return net

#### Load the train, validation and test sets

In [4]:
# Training uses samples 0-999 of the MNIST 'train' set, validation uses samples 1000-1999 of the
# 'train' set and testing uses samples 0-999 of the 'test' set; all are loaded into memory.
mnist_train = MNIST(which_sets=['train'], load_in_memory=True, subset=slice(0, 1000))
mnist_val = MNIST(which_sets=['train'], load_in_memory=True, subset=slice(1000, 2000))
mnist_test = MNIST(which_sets=['test'], load_in_memory=True, subset=slice(0, 1000))

# Read each dataset through its OWN state handle. Passing the training set's handle to the
# validation and test sets is a copy-paste error.
# NOTE(review): sharing the handle presumably only went unnoticed because `load_in_memory=True`
# makes the handle irrelevant -- confirm against the Fuel docs.
train_X, train_y = mnist_train.get_data(mnist_train.open(), request=slice(0, 1000))
val_X, val_y = mnist_val.get_data(mnist_val.open(), request=slice(0, 1000))
test_X, test_y = mnist_test.get_data(mnist_test.open(), request=slice(0, 1000))

### Train the classifier

In [5]:
# Weight initialisation and mini-batch shuffling each get their OWN seeded random number generator,
# so that the experiment can be repeated below and checked for identical results.
# Building a network's layers draws from an RNG to create the initial weights, whereas the third
# experiment re-uses the existing layers and so draws nothing for weights; keeping shuffling on a
# separate RNG ensures that it operates the same way every time.
shuffle_rng1 = np.random.RandomState(24680)
weight_rng1 = np.random.RandomState(12345)

# Lasagne draws from this RNG
lasagne.random.set_rng(weight_rng1)

# Construct the image classifier around the `build_network` model builder, using Adam updates
print('Building network')
clf = basic_dnn.simple_classifier(
    build_network,
    n_input_spatial_dims=2,
    target_channel_index=0,
    updates_fn=lambda loss, params: lasagne.updates.adam(loss, params, learning_rate=0.001))

# Snapshot the randomly initialised parameter values (including update state) so that training
# can later be restarted without building the net from scratch
blank_state = clf.get_param_values(include_updates=True)

# Epoch-level reporting, 5 epochs of training
clf.trainer.report(verbosity=trainer.VERBOSITY_EPOCH)
clf.trainer.train_for(num_epochs=5)

# Run training with the train, validation and test sets, shuffling with the dedicated RNG
print('Training')
clf.trainer.train(
    [train_X, train_y], [val_X, val_y], [test_X, test_y],
    batchsize=128, shuffle_rng=shuffle_rng1)

# Snapshot of the parameter values after training
clf_state = clf.get_param_values(include_updates=True)

Building network
Training
Epoch 1/5 took 0.30s:  TRAIN y loss=39.954645  VAL y loss=13.138603 err=67.20%  TEST y loss=15.358888 err=72.50%
Epoch 2/5 took 0.28s:  TRAIN y loss=7.458300  VAL y loss=5.213586 err=47.00%  TEST y loss=5.733494 err=52.00%
Epoch 3/5 took 0.28s:  TRAIN y loss=3.116188  VAL y loss=2.710061 err=37.40%  TEST y loss=3.158632 err=40.50%
Epoch 4/5 took 0.28s:  TRAIN y loss=1.593666  VAL y loss=1.950297 err=28.00%  TEST y loss=2.298699 err=32.30%
Epoch 5/5 took 0.28s:  TRAIN y loss=0.936020  VAL y loss=1.507651 err=23.80%  TEST y loss=1.797327 err=26.20%
Final result:
Epoch 5/5 took 1.43s:  TRAIN y loss=0.936020  VAL y loss=1.507651 err=23.80%  TEST y loss=1.797327 err=26.20%


Once more:

In [6]:
# Fresh RNGs carrying the same seeds as in the first experiment
shuffle_rng2 = np.random.RandomState(24680)
weight_rng2 = np.random.RandomState(12345)

# Lasagne draws from this RNG
lasagne.random.set_rng(weight_rng2)

# Construct a second, independent image classifier around the same model builder
print('Building network')
clf2 = basic_dnn.simple_classifier(
    build_network,
    n_input_spatial_dims=2,
    target_channel_index=0,
    updates_fn=lambda loss, params: lasagne.updates.adam(loss, params, learning_rate=0.001))

# Epoch-level reporting, 5 epochs of training
clf2.trainer.report(verbosity=trainer.VERBOSITY_EPOCH)
clf2.trainer.train_for(num_epochs=5)

# Run training with the train, validation and test sets, shuffling with the dedicated RNG
print('Training')
clf2.trainer.train(
    [train_X, train_y], [val_X, val_y], [test_X, test_y],
    batchsize=128, shuffle_rng=shuffle_rng2)

# Snapshot of the second classifier's parameter values after training
clf2_state = clf2.get_param_values(include_updates=True)

Building network
Training
Epoch 1/5 took 0.40s:  TRAIN y loss=39.954645  VAL y loss=13.138603 err=67.20%  TEST y loss=15.358888 err=72.50%
Epoch 2/5 took 0.30s:  TRAIN y loss=7.458300  VAL y loss=5.213586 err=47.00%  TEST y loss=5.733494 err=52.00%
Epoch 3/5 took 0.28s:  TRAIN y loss=3.116188  VAL y loss=2.710061 err=37.40%  TEST y loss=3.158632 err=40.50%
Epoch 4/5 took 0.28s:  TRAIN y loss=1.593666  VAL y loss=1.950297 err=28.00%  TEST y loss=2.298699 err=32.30%
Epoch 5/5 took 0.28s:  TRAIN y loss=0.936020  VAL y loss=1.507651 err=23.80%  TEST y loss=1.797327 err=26.20%
Final result:
Epoch 5/5 took 1.54s:  TRAIN y loss=0.936020  VAL y loss=1.507651 err=23.80%  TEST y loss=1.797327 err=26.20%


Reset the state of the first classifier and train again:

In [7]:
# Fresh shuffling RNG carrying the same seed as in the previous experiments; no weight RNG is
# created here because the existing classifier is re-used
shuffle_rng3 = np.random.RandomState(24680)

# Put the first classifier back into its freshly initialised state (parameters and update state)
clf.set_param_values(blank_state, include_updates=True)

# Run training again with the same data and settings
print('Training')
clf.trainer.train(
    [train_X, train_y], [val_X, val_y], [test_X, test_y],
    batchsize=128, shuffle_rng=shuffle_rng3)

# Snapshot of the parameter values after the repeated training run
clf_state_b = clf.get_param_values(include_updates=True)

Training
Epoch 1/5 took 0.28s:  TRAIN y loss=39.954645  VAL y loss=13.138603 err=67.20%  TEST y loss=15.358888 err=72.50%
Epoch 2/5 took 0.29s:  TRAIN y loss=7.458300  VAL y loss=5.213586 err=47.00%  TEST y loss=5.733494 err=52.00%
Epoch 3/5 took 0.31s:  TRAIN y loss=3.116188  VAL y loss=2.710061 err=37.40%  TEST y loss=3.158632 err=40.50%
Epoch 4/5 took 0.28s:  TRAIN y loss=1.593666  VAL y loss=1.950297 err=28.00%  TEST y loss=2.298699 err=32.30%
Epoch 5/5 took 0.28s:  TRAIN y loss=0.936020  VAL y loss=1.507651 err=23.80%  TEST y loss=1.797327 err=26.20%
Final result:
Epoch 5/5 took 1.43s:  TRAIN y loss=0.936020  VAL y loss=1.507651 err=23.80%  TEST y loss=1.797327 err=26.20%


Check parameters for equality:

In [8]:
def compare_states(s1, s2):
    """Check two sequences of parameter arrays for exact equality, reporting each difference.

    Prints `FAIL at index i/n` (with `n = len(s1)`) for every index `i` at which the two entries
    do not have the same shape and the same values. A length mismatch between the sequences is
    reported as a failure too.

    :param s1: sequence of array-likes, such as the lists returned by `get_param_values` in this
        notebook
    :param s2: sequence of array-likes to compare against `s1`
    :return: `True` if both sequences have the same length and every pair of entries is equal,
        `False` otherwise
    """
    n = len(s1)
    identical = True

    # `zip` stops at the shorter sequence, so surplus entries would otherwise go unchecked
    if len(s2) != n:
        print('FAIL: state lengths differ ({} vs {})'.format(n, len(s2)))
        identical = False

    for i, (a, b) in enumerate(zip(s1, s2)):
        # `np.array_equal` treats differing shapes as unequal (no broadcasting) and also copes
        # with plain scalar entries, which have no `.any()` method
        if not np.array_equal(a, b):
            # Single-argument parenthesised print gives the same output on Python 2 and 3
            print('FAIL at index {}/{}'.format(i, n))
            identical = False

    return identical
            


In [9]:
# First classifier vs. the second classifier, which was built from scratch with identically
# seeded RNGs
compare_states(clf_state, clf2_state)

In [10]:
# Second classifier vs. the first classifier after it was reset and retrained; the preceding cell
# already compares `clf_state` with `clf2_state`, so repeating that comparison adds nothing
compare_states(clf2_state, clf_state_b)

In [11]:
# First training run vs. the same classifier after being reset to `blank_state` and retrained
compare_states(clf_state, clf_state_b)