# MNIST with various deep-learning approaches

### Possible approaches

- standard fully-connected network (baseline)
- LSTM along the pixels (possibly with attention?)
- RL algorithm where an LSTM looks at a *window* of the image, and steers it, while building a classification?

In [7]:
import tensorflow as tf
import numpy as np
from read_mnist import load_dataset
import random


In [78]:
class MNISTClassifier(object):
    def __init__(self, train_path, test_path):
        self.train_in, self.train_out = load_dataset('../data/train')
        self.test_in, self.test_out = load_dataset('../data/t10k')
        
        self.global_step = tf.contrib.framework.get_or_create_global_step()
        self.session = None
        
        with tf.variable_scope(self.name()):
            self.build()
    
    def setup(self):
        if self.session: self.session.close()
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())
    
    def name(self):
        return 'mnist14'
    
    def build(self):
        assert False, '{} does not implement build'.format(self)
    
    def train(self, epochs, max_batches_per_epoch=None):
        for epoch in xrange(epochs):
            for i, (input, output) in enumerate(self.iterate_batches(self.train_in, self.train_out)):
                self.train_on_batch(input, output)
                if max_batches_per_epoch and i >= max_batches_per_epoch: break
            acc = self.eval_accuracy()
            print "Epoch {}: accuracy: {}%".format(epoch, acc * 100)
    
    def iterate_batches(self, input, output):
        batch_size = 32
        xy = list(zip(input, output))
        random.shuffle(xy)
        i = 0
        while i < len(xy):
            batch = xy[i:min(i+batch_size, len(xy))]
            yield [x for x,y in batch], [y for x,y in batch]
            i += batch_size

    
    def train_on_batch(self, input, output):
        assert False, '{} does not implement train_on_batch'.format(self)
    
    def eval_accuracy(self):
        num = 0
        denom = 0
        for x, y in self.iterate_batches(self.test_in, self.test_out):
            denom += len(x)
            num += len(x) * self.accuracy_for_batch(x, y)
        return num * 1.0 / denom
    
    def accuracy_for_batch(self, input, output):
        assert False, '{} does not implement accuracy_for_batch'.format(self)


## How well does a plain fully-connected neural network perform?

In [90]:
class MLPClassifier(MNISTClassifier):
    def build(self):
        self.input = tf.placeholder(tf.float32, [None, 28, 28], name='input')
        self.target = tf.placeholder(tf.int32, [None], name='target')
        x = self.create_logits(self.input)
        self.loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=x, labels=self.target)
        self.prediction = tf.cast(tf.argmax(x, axis=1), tf.int32)
        correct = tf.equal(self.prediction, self.target)
        self.accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))
        self.train_op = tf.train.AdamOptimizer(1e-3).minimize(self.loss)
    
    def create_logits(self, x):
        x = tf.reshape(self.input, [-1, 28 * 28])
        for i, l in enumerate([256, 128, 10]):
            x = tf.layers.dense(x, l, tf.nn.relu, name='fc'+str(i))
            x = tf.nn.dropout(x, 0.7)
        return x
    
    def accuracy_for_batch(self, x, y):
        feed = {
            self.input: x,
            self.target: y
        }
        return self.session.run(self.accuracy, feed_dict=feed)
    
    def train_on_batch(self, input, output):
        feed = {
            self.input: input,
            self.target: output
        }
        _, loss = self.session.run([self.train_op, self.loss], feed_dict=feed)
        return loss

In [64]:
m = MLPClassifier('../data/train', '../data/test')
m.setup()

In [61]:
m.train(3)

Epoch 0: accuracy: 68.76%
Epoch 1: accuracy: 70.16%
Epoch 2: accuracy: 70.28%


## How does a Tensorflow LSTM compare?

_Let's run an LSTM over the image row-by-row and see what happens_

In [None]:
class RNNClassifier(MLPClassifier):
    def name(self):
        return 'rnn1'
    
    def make_cell(self, size):
        return tf.contrib.rnn.BasicLSTMCell(size)
    
    def create_logits(self, x):
        x = tf.reshape(x, [-1, 28, 28]) # each time step is a different row of the image
        
        rnn_cell = self.make_cell(128)
        outputs, state = tf.nn.dynamic_rnn(rnn_cell, x, dtype=tf.float32, scope='lstm1')
        x = outputs[:, -1]
        
        return tf.layers.dense(x, 10, tf.nn.relu, name='fc')

l = LSTMClassifier('../data/train', '../data/test')
l.setup()

In [None]:
l.train(20, max_batches_per_epoch=1)

## Experiment: multiple random initializations

One thing I've noticed when training deep models (especially small ones) is that initial losses can vary _wildly_ between different random initializations.

Before starting gradient descent, why not perform a preliminary search of the space by randomly initializing then network a couple times, training for a couple batches with a high learning rate, and keeping the initialization with the best loss?

In [102]:
class HeadStartClassifier(MLPClassifier):
    def headstart(self, trials=10, batches=10):
        variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=self.name())
        
        saved_vars = []
        accuracies = []
        for trial in xrange(trials):
            self.session.run(tf.initialize_variables(variables))
            self.train(1, max_batches_per_epoch=batches)
            saved_vars.append(self.session.run(variables))
            accuracies.append(self.eval_accuracy())
            print 'Trial {} accuracy: {}'.format(trial, accuracies[-1])
            
        # restore the best set of variables:
        best_idx = accuracies.index(max(accuracies))
        assign_ops = [var.assign(value) for var, value in zip(variables, saved_vars[best_idx])]
        self.session.run(assign_ops)
        print "All accuracies:", accuracies
    
    def name(self):
        return 'headstart5'


In [106]:
hc = HeadStartClassifier('../data/train', '../data/test')
hc.setup()
hc.headstart()
l.train(20, max_batches_per_epoch=4)

### preliminary results:

Randomly initialized the net 20 times, and then trained each for 10 batches.

After this, accuracies ranged from 15% to 30%. After 1 epoch, the best initialization was trained to 85% accuracy.

# Highway networks

- Performance of 3-layer MLP net: Epoch 3: accuracy: 97.78%
- Performance of 20-layer MLP net: Epoch 3: accuracy: 64.82% (fails to improve in subsequent iterations)
- Performance with 20-layer MLP net with relu only applied every third layer: Epoch 3: accuracy: 96.09%
- Performance with 20-layer residual MLP net: Epoch 3: accuracy: 97.32%
- Performance with 20-layer highway (gated) MLP net: Epoch 3: accuracy: 3.23% (generally hovers around 9% – yikes!)

In [124]:
class NLayerClassifier(MLPClassifier):
    def __init__(self, train_path, test_path, n_layers, style=None):
        # style: None, 'residual' or 'highway'
        self.n_layers = n_layers
        self.style = style
        super(NLayerClassifier, self).__init__(train_path, test_path)
    
    def name(self):
        return 'n6_{}.{}'.format(self.n_layers, self.style)
    
    def create_logits(self, x):
        x = tf.reshape(self.input, [-1, 28 * 28])
        
        layers = [256] + ([128] * (self.n_layers-1))
        
        shortcut = None
        
        def identity(x): return x
        
        for i, l in enumerate(layers):
            x_orig = x
            x = tf.layers.dense(x, l, identity, name='fc'+str(i))
            if self.style == 'residual':
                if i % 3 == 0:
                    if shortcut is not None and shortcut.get_shape()[-1].value == x.get_shape()[-1].value:
                        x += shortcut
                    shortcut = x
                    x = tf.nn.relu(x)
            elif self.style == 'highway':
                if x_orig.get_shape()[-1].value == x.get_shape()[-1].value:
                    transform_gate = tf.layers.dense(x_orig, l, tf.nn.sigmoid, name='transform'+str(i))
                    carry = 1 - transform_gate # can also learn carry gate independently, just like the transform gate
                    x = x * transform_gate + x_orig * carry
            else:
                x = tf.nn.relu(x)
        
        return x

# m3 = NLayerClassifier('../data/train', '../data/test', 3)
# m3.setup()

# m20 = NLayerClassifier('../data/train', '../data/test', 20)
# m20.setup()

# m20r = NLayerClassifier('../data/train', '../data/test', 20, 'residual')
# m20r.setup()

m20h = NLayerClassifier('../data/train', '../data/test', 20, 'highway')
m20h.setup()


In [125]:
# m3.train(20)
# m20.train(20)
# m20r.train(20)
m20h.train(20)


Epoch 0: accuracy: 8.87%
Epoch 1: accuracy: 9.57%
Epoch 2: accuracy: 8.67%
Epoch 3: accuracy: 3.23%


KeyboardInterrupt: 