# MNIST 99%+
With the current settings, the test accuracy cruises above 99.5%.

Final settings overview:

- 3 conv layers (6/5/4 kernels, 24/48/64 channels, 1/2/2 stride, no dropout)
- 2 dense (200/10 neurons, .75 dropout keep probability on the first)
- batch normalization on all layers, 0.999 decay
- exponentially decaying learning rate, from 0.02 down to 0.00015

In [None]:
%matplotlib notebook
from matplotlib import pyplot as plt
import numpy as np
from numpy.random import choice, random_integers
from mnist import MNIST
import tensorflow as tf


# Dataset wrapper from the local `mnist` module. The rest of the notebook uses
# its `getTrainingBatch(n)`, `testData` and `testLabels` members.
data = MNIST()
# Number of samples requested per training step.
BATCHSIZE = 100
# Number of training iterations (one mini-batch each). Despite the name these
# are optimizer steps, not passes over the data set.
EPOCHS = 15000
# The test set is evaluated and the plots refreshed every TESTSTEPSIZE steps.
TESTSTEPSIZE = 100

# Learning-rate schedule used by the training loop:
#   rate(i) = RATE_MIN + (RATE_MAX - RATE_MIN) * exp(-i / RATE_DECAY)
RATE_MAX = 0.02
RATE_MIN = 0.00015
RATE_DECAY = 1000

# Value fed to the `drate` placeholder (argument of tf.nn.dropout on the first
# dense layer) while training; evaluation feeds 1 instead.
DROPOUT = 0.75
# Value fed to the `drate_conv` placeholder while training.
# NOTE(review): no op in the visible graph consumes `drate_conv`, so this
# setting currently appears to have no effect -- confirm before tuning it.
DROPOUT_CONV = 0.75

# Output channel counts of the first, second and third convolutional layers.
K = 24
L = 48
M = 64

## Model

In [None]:
def batchnorm(y, is_test, iteration, offset, convolutional=False):
    """Batch-normalize `y` and return the op that updates its running statistics.

    Args:
        y: pre-activation tensor to normalize.
        is_test: boolean tensor; when true the moving averages are used instead
            of the statistics of the current batch.
        iteration: tensor passed as the second argument (`num_updates`) of
            `tf.train.ExponentialMovingAverage`.
        offset: tensor added after normalization (no scale is applied).
        convolutional: if true, statistics are reduced over axes 0, 1 and 2
            (one value per channel); otherwise over axis 0 only.

    Returns:
        A pair `(normalized, update_op)`. `update_op` must be run by the caller
        to refresh the moving averages; it is not attached to `normalized`.
    """
    reduce_axes = [0, 1, 2] if convolutional else [0]

    ema = tf.train.ExponentialMovingAverage(0.999, iteration)
    batch_mean, batch_variance = tf.nn.moments(y, reduce_axes)
    # `apply` creates the shadow variables that `average` looks up below, so it
    # has to come before the selector is used.
    update_moving_avgs = ema.apply([batch_mean, batch_variance])

    def select(batch_stat):
        # Moving average at test time, current batch statistic while training.
        return tf.cond(is_test, lambda: ema.average(batch_stat), lambda: batch_stat)

    ybn = tf.nn.batch_normalization(
        y, select(batch_mean), select(batch_variance), offset, None, 1e-5)
    return ybn, update_moving_avgs

In [None]:
# ---- Inputs -----------------------------------------------------------------
# x:   batch of 28x28 images.
# t:   integer class labels; `(None)` is plain `None`, i.e. the shape is left
#      unspecified.
# lr:  learning rate for the optimizer (fed per step).
# it:  iteration counter handed to the batch-norm moving averages.
# tst: True when evaluating (batch norm then uses the moving averages).
# drate: value passed to tf.nn.dropout on the first dense layer.
# drate_conv: fed by the training loop but not consumed by any op below.
x = tf.placeholder(tf.float32, (None, 28, 28), name='input')
t = tf.placeholder(tf.int64, (None), name='target')
lr = tf.placeholder(tf.float32)
it = tf.placeholder(tf.int32)
tst = tf.placeholder(tf.bool)
drate = tf.placeholder(tf.float32)
drate_conv = tf.placeholder(tf.float32)

# Add the single input channel expected by conv2d.
x_ = tf.reshape(x, (-1,28,28,1))

# ---- Conv layer 0: 6x6 kernel, stride 1, 1 -> K channels ----------------------
# The b* variables are used as the batch-norm offset, not added to the
# convolution output directly.
w0 = tf.Variable(tf.truncated_normal((6,6,1,K), stddev=0.1))
b0 = tf.Variable(tf.ones((K,))/100)
c0 = tf.nn.conv2d(x_, w0, (1,1,1,1), 'SAME')
n0, u0 = batchnorm(c0, tst, it, b0, convolutional=True)
y0 = tf.nn.relu(n0)

# ---- Conv layer 1: 5x5 kernel, stride 2, K -> L channels ----------------------
w1 = tf.Variable(tf.truncated_normal((5,5,K,L), stddev=0.1))
b1 = tf.Variable(tf.ones((L,))/100)
c1 = tf.nn.conv2d(y0, w1, (1,2,2,1), 'SAME')
n1, u1 = batchnorm(c1, tst, it, b1, convolutional=True)
y1 = tf.nn.relu(n1)

# ---- Conv layer 2: 4x4 kernel, stride 2, L -> M channels ----------------------
w2 = tf.Variable(tf.truncated_normal((4,4,L,M), stddev=0.1))
b2 = tf.Variable(tf.ones((M,))/100)
c2 = tf.nn.conv2d(y1, w2, (1,2,2,1), 'SAME')
n2, u2 = batchnorm(c2, tst, it, b2, convolutional=True)
y2 = tf.nn.relu(n2)

# Two stride-2 'SAME' convolutions shrink 28x28 to 7x7; flatten for the dense
# layers.
y2_ = tf.reshape(y2, (-1,7*7*M))

# ---- Dense layer: 200 units, batch norm, ReLU, dropout ------------------------
w3 = tf.Variable(tf.truncated_normal((7*7*M,200), stddev=0.1))
b3 = tf.Variable(tf.ones((200,))/10)
n3, u3 = batchnorm(tf.matmul(y2_, w3), tst, it, b3)
y3 = tf.nn.relu(n3)
y3 = tf.nn.dropout(y3, drate)

# ---- Output layer: 10 logits (no batch norm, no activation) -------------------
wo = tf.Variable(tf.truncated_normal((200,10), stddev=0.1))
bo = tf.Variable(tf.zeros((10,)))
y = tf.matmul(y3, wo) + bo

# Single op that refreshes the moving averages of all four batch-norm layers.
update_ema = tf.group(u0, u1, u2, u3)

# Keyword arguments are required here: TensorFlow 1.x rejects positional
# arguments for this function, and the positional order differed in earlier
# releases. `labels=` / `logits=` is unambiguous in both.
entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=t, logits=y)
# Mean loss over the batch, scaled by 100.
entropy = tf.reduce_mean(entropy) * 100
train_step = tf.train.AdamOptimizer(lr).minimize(entropy)

correct_prediction = tf.equal(tf.argmax(y, 1), t)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

## Training

In [None]:
class Viz:
    """Live Matplotlib dashboard for test accuracy, test loss and learning rate.

    `update` is expected to be called once every TESTSTEPSIZE training
    iterations, starting at iteration 0: the x coordinate of the n-th recorded
    point is `n * TESTSTEPSIZE`.
    """

    def __init__(self):
        # Histories are per-instance. As class attributes the lists would be
        # shared by every Viz object, so a second training run in the same
        # kernel would keep plotting the previous run's points.
        self.test_acc = []
        self.test_loss = []
        self.learning_rate = []

        self.fig = plt.figure('Training...', figsize=(9,7.5))
        self.ax1 = self.fig.add_subplot(221)
        self.ax2 = self.fig.add_subplot(222)
        self.ax3 = self.fig.add_subplot(223)

        self.ax1.set_xlim([0, EPOCHS])
        self.ax1.set_ylim([0, 100])
        self.ax2.set_xlim([0, EPOCHS])
        self.ax2.set_ylim([0, 100])
        self.ax3.set_xlim([0, EPOCHS])
        self.ax3.set_ylim([0, RATE_MAX*1.25])

        self.plt1, = self.ax1.plot([0], [0])
        self.plt2, = self.ax2.plot([0], [0])
        # Placeholder point at iteration 0 with the initial learning rate
        # (x is the iteration, y is the rate).
        self.plt3, = self.ax3.plot([0], [RATE_MAX])
        plt.show()

    def _set_window_title(self, title):
        """Set the figure window title on both old and new Matplotlib releases."""
        canvas = self.fig.canvas
        manager = getattr(canvas, 'manager', None)
        if manager is not None:
            manager.set_window_title(title)
        elif hasattr(canvas, 'set_window_title'):
            # Older Matplotlib exposes the setter on the canvas itself.
            canvas.set_window_title(title)

    def update(self, i, learning_rate, test_acc, test_loss):
        """Record one evaluation point and redraw all three plots.

        Args:
            i: current training iteration.
            learning_rate: learning rate used at iteration `i`.
            test_acc: test accuracy as a fraction; it is stored and shown as a
                percentage.
            test_loss: test loss value.
        """
        test_acc = test_acc * 100
        self.test_acc.append(test_acc)
        self.test_loss.append(test_loss)
        self.learning_rate.append(learning_rate)

        # One x coordinate per recorded point.
        steps = np.arange(len(self.test_acc))*TESTSTEPSIZE
        # Show slightly past the current iteration, never beyond the last one.
        x_max = min(i+100, EPOCHS)

        if i == EPOCHS:
            self._set_window_title('Training done.')
            self.fig.suptitle('Training done. Max test accuracy {:.2f}% and min loss {:.2f}.'
                .format(np.max(self.test_acc), np.min(self.test_loss)))
        else:
            self._set_window_title('Training...')
            # float() keeps the progress percentage correct under Python 2
            # integer division as well.
            self.fig.suptitle( '{:.2f}% (epoch {} of {}) done. Max test accuracy {:.2f}% and min loss {:.2f}.'
                .format((float(i)/EPOCHS)*100, i, EPOCHS, np.max(self.test_acc), np.min(self.test_loss)))

        self.ax1.set_title('Test accuracy {:.2f}%'.format(test_acc))
        # Zoom the y axis to the 10 points below the latest accuracy.
        self.ax1.set_ylim([np.round(test_acc)-10, 100])
        self.ax1.set_xlim([0, x_max])
        self.plt1.set_ydata(self.test_acc)
        self.plt1.set_xdata(steps)

        self.ax2.set_title('Test loss {:.2f}'.format(test_loss))
        self.ax2.set_ylim([0, np.round(test_loss)+10])
        self.ax2.set_xlim([0, x_max])
        self.plt2.set_ydata(self.test_loss)
        self.plt2.set_xdata(steps)

        self.ax3.set_title('Learning Rate {:.8f}'.format(learning_rate))
        self.ax3.set_xlim([0, x_max])
        self.plt3.set_ydata(self.learning_rate)
        self.plt3.set_xdata(steps)

        self.fig.canvas.draw()

In [None]:
# The session is left open on purpose: the cell that plots wrong predictions
# reuses `sess`.
sess = tf.Session()
sess.run(tf.global_variables_initializer())

viz = Viz()

for i in range(EPOCHS + 1):
    batch_x, batch_t = data.getTrainingBatch(BATCHSIZE)
    # Exponential decay from RATE_MAX towards RATE_MIN.
    rate = RATE_MIN + (RATE_MAX - RATE_MIN) * np.exp(-i/RATE_DECAY)

    # 1) Optimizer step on the current batch, with the training dropout values.
    train_feed = {
        x: batch_x, t: batch_t, lr: rate, tst: False,
        drate: DROPOUT, drate_conv: DROPOUT_CONV,
    }
    sess.run(train_step, train_feed)

    # 2) Refresh the batch-norm moving averages on the same batch, feeding 1
    #    for both dropout placeholders.
    ema_feed = {
        x: batch_x, t: batch_t, tst: False, it: i,
        drate: 1, drate_conv: 1,
    }
    sess.run(update_ema, ema_feed)

    # 3) Only every TESTSTEPSIZE-th iteration: evaluate on the test set.
    if i % TESTSTEPSIZE != 0:
        continue
    eval_feed = {
        x: data.testData, t: data.testLabels, tst: True,
        drate: 1, drate_conv: 1,
    }
    test_acc, test_loss = sess.run([accuracy, entropy], eval_feed)
    viz.update(i, rate, test_acc, test_loss)

# Visualize some of the failed predictions

In [None]:
# Run the whole test set through the network in test mode.
eval_feed = {x: data.testData, t: data.testLabels, tst: True, drate: 1, drate_conv: 1}
logits, correctness = sess.run([y, correct_prediction], eval_feed)
predictions = np.argmax(logits, 1)
# Boolean mask selecting the misclassified test samples.
s = np.logical_not(correctness)

plt.figure('wrong predictions', figsize=(10, 15))

# Show at most the first 50 mistakes on a 10x5 grid.
failures = zip(data.testData[s][:50], data.testLabels[s][:50], predictions[s][:50])
for slot, (img, label, pred) in enumerate(failures, start=1):
    ax = plt.subplot(10, 5, slot)
    ax.set_title('saw {}, is {}'.format(pred, label))
    ax.imshow(img, cmap='gray')
    # Hide the tick labels; they carry no information for image tiles.
    ax.set_xticklabels([])
    ax.set_yticklabels([])

plt.tight_layout()