# MNIST TensorFlow
We use:
* convolutional and fully-connected layers
* regularization techniques:
    * dropout
    * max pulling (to help overfitting by providing an abstracted form of the representation + it reduces the computational cost by reducing the number of parameters)
* activation functions:
    * ReLU
    * softmax for probabilities (classification)
* flexible learning rate
* batch normalization
    * batch norm scaling is not useful with relus
    * batch norm offsets are used instead of biases
* TensorBoard vizualization

In [1]:
import numpy as np
import datetime
import tensorflow as tf
from tensorflow.contrib.learn.python.learn.datasets.mnist import read_data_sets
mnist = read_data_sets("data", one_hot=True, reshape=False, validation_size=5000)

Extracting data/train-images-idx3-ubyte.gz
Extracting data/train-labels-idx1-ubyte.gz
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz


In [2]:
def timestamp():
    d = datetime.datetime.now()
    return d.strftime("%Y/%m/%d/%X")

logs_path_train = '/tmp/tensorflow_logs/mnist/train' + timestamp()
logs_path_val = '/tmp/tensorflow_logs/mnist/val' + timestamp()

In [3]:
# Parameters
training_epochs = 20
batch_size = 100
display_step = 1
max_learning_rate = 0.0004
min_learning_rate = 0.0001
decay_speed = 10
bnepsilon = 1e-5

# Net

In [4]:
# Setup placeholders
x = tf.placeholder(tf.float32, [None, 28, 28, 1], name="x")
tf.summary.image('input', x)
y_ = tf.placeholder(tf.float32, [None, 10], name="labels")
# Learning rate
lr = tf.placeholder(tf.float32,)
# Probability of keeping a node during dropout = 1.0 at test time (no dropout) 
# and 0.75 at training time
pkeep_fc = tf.placeholder(tf.float32)
pkeep_conv = tf.placeholder(tf.float32)
# test flag for batch norm
tst = tf.placeholder(tf.bool)
iter = tf.placeholder(tf.int32)

In [5]:
def batchnorm(Ylogits, is_test, iteration, offset, convolutional=False):
    # adding the iteration prevents from averaging across non-existing iterations
    exp_moving_avg = tf.train.ExponentialMovingAverage(0.999, iteration) 
    if convolutional:
        mean, variance = tf.nn.moments(Ylogits, [0, 1, 2])
    else:
        mean, variance = tf.nn.moments(Ylogits, [0])
    update_moving_everages = exp_moving_avg.apply([mean, variance])
    m = tf.cond(is_test, lambda: exp_moving_avg.average(mean), lambda: mean)
    v = tf.cond(is_test, lambda: exp_moving_avg.average(variance), lambda: variance)
    Ybn = tf.nn.batch_normalization(Ylogits, m, v, offset, None, bnepsilon)
    return Ybn, update_moving_everages

def compatible_convolutional_noise_shape(Y):
    noiseshape = tf.shape(Y)
    noiseshape = noiseshape * tf.constant([1,0,0,1]) + tf.constant([0,1,1,0])
    return noiseshape

def conv_layer(input, size_window, size_in, size_out, stride, max_pulling=True, name="conv"):
    """Convolutional layer + max pulling + dropout."""
    with tf.name_scope(name):
        w = tf.Variable(
            tf.truncated_normal([size_window, size_window, size_in, size_out], stddev=0.1), name="weight")
        b = tf.Variable(tf.ones([size_out]), name="bias")
        # CNN
        conv = tf.nn.conv2d(input, w, strides=[1, stride, stride, 1], padding="SAME")
        # BATCH NORMALIZATION
        act, update_ema = batchnorm(conv, tst, iter, b, convolutional=True)
        # ReLU activation
        act = tf.nn.relu(conv)
        if max_pulling == True:
            # MAX PULLING
            act = tf.nn.max_pool(
                act, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME")
            # DROPOUT
            act = tf.nn.dropout(
                act, pkeep_conv, compatible_convolutional_noise_shape(act))
        else:
            # DROPOUT
            act = tf.nn.dropout(
                act, pkeep_conv, compatible_convolutional_noise_shape(act))
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return act, update_ema
    
def fc_layer(input, size_in, size_out, name="fc"):
    """Fully-connected layer."""
    with tf.name_scope(name):
        w = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), 
                        name="weight")
        b = tf.Variable(tf.ones([size_out]), name="bias")
        # FC logits
        fc = tf.matmul(input, w)
        # BATCH NORMALIZATION
        act, update_ema = batchnorm(fc, tst, iter, b, convolutional=False)
        # ReLU activation
        act = tf.nn.relu(act)

        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activations", act)
        return act, update_ema
    
def get_logits(input, size_in, size_out, name="get_logits"):
    """Fully-connected layer without activation function."""
    with tf.name_scope(name):
        w = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), 
                        name="weight")
        b = tf.Variable(tf.ones([size_out]), name="bias")
        logits = tf.matmul(input, w) + b
        return logits

In [6]:
# MODEL
# input is 28x28
conv1, update_ema1 = conv_layer(x, 6, 1, 24, 1, max_pulling=True, name='conv1')
# stride = 1 and max pooling -> output is size_out (24) feature maps with 14x14 size
conv2, update_ema2 = conv_layer(conv1, 5, 24, 48, 1, max_pulling=True, name='conv2')
# stride = 1 ("same" padding -> size remains constant) and max pooling -> size changed: output is 7x7
conv3, update_ema3 = conv_layer(conv2, 4, 48, 64, 1, max_pulling=False, name='conv3')
# stride = 1 -> output is 7x7
flattened = tf.reshape(conv3, shape=[-1, 7 * 7 * 64])
fc1, update_ema4 = fc_layer(flattened, 7 * 7 * 64, 200, "fc1")
logits = get_logits(fc1, 200, 10, "logits")

update_ema = tf.group(update_ema1, update_ema2, update_ema3, update_ema4)

In [7]:
with tf.name_scope('loss'):
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_), name='loss')
    tf.summary.scalar("loss", cross_entropy)
    
with tf.name_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(lr).minimize(cross_entropy)
    
with tf.name_scope('accuracy'):
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(y_, 1))
    acc = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name='accuracy')
    tf.summary.scalar("accuracy", acc)

merged_summary_op = tf.summary.merge_all()

In [8]:
# Initializing the variables
init = tf.global_variables_initializer()

In [9]:
%%time
# Launch the graph
with tf.Session() as sess:
    sess.run(init)

    # op to write logs to Tensorboard
    summary_writer_train = tf.summary.FileWriter(logs_path_train, 
                                                 graph=tf.get_default_graph())
    summary_writer_val = tf.summary.FileWriter(logs_path_val, 
                                               graph=tf.get_default_graph())

    # Training cycle
    for epoch in range(training_epochs):
        avg_train_loss = 0.
        avg_val_loss = 0.
        total_batch = int(mnist.train.num_examples / batch_size)
        learning_rate = min_learning_rate + (
            max_learning_rate - min_learning_rate) * np.exp(-epoch / decay_speed)
          
        # Loop over all batches
        for i in range(total_batch):
            batch_x_train, batch_y_train = mnist.train.next_batch(batch_size)
            batch_x_val, batch_y_val = mnist.validation.next_batch(batch_size)
            
            # Run optimization op (backprop), cost op (to get loss value) and summary nodes
            # and summary nodes
            a, c, summary = sess.run([optimizer, cross_entropy, merged_summary_op], 
                                     feed_dict={x: batch_x_train, 
                                                y_: batch_y_train, 
                                                lr: learning_rate,
                                                tst: False, 
                                                pkeep_fc: 0.75, 
                                                pkeep_conv: 0.75}) 
            
            sess.run(update_ema, {x: batch_x_train, 
                                  y_: batch_y_train, 
                                  tst: False, 
                                  iter: i, 
                                  pkeep_fc: 0.75, 
                                  pkeep_conv: 0.75})
            
            c_val, summary_val = sess.run([cross_entropy, merged_summary_op], 
                                          feed_dict={x: batch_x_val, 
                                                     y_: batch_y_val, 
                                                     lr: learning_rate, 
                                                     tst: False, 
                                                     pkeep_fc: 1.0, 
                                                     pkeep_conv: 1.0}) 

            # Write logs at every iteration
            summary_writer_train.add_summary(summary, epoch * total_batch + i)
            summary_writer_val.add_summary(summary_val, epoch * total_batch + i)

            # Compute average loss
            avg_train_loss += c / total_batch
            avg_val_loss += c_val / total_batch
            
        # Display logs per epoch step
        if (epoch + 1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch + 1),
                  "train loss", "{:.9f}".format(avg_train_loss),
                  "val loss", "{:.9f}".format(avg_val_loss),
                  "learning rate", "{:.9f}".format(learning_rate))

    print("Optimization Finished!")

    # Test model
    # Calculate accuracy    
    total_test_batch = mnist.test.num_examples // batch_size
    print('total_test_batch', total_test_batch)    
    acc_test_lst = []
    for step in range(mnist.test.images.shape[0] // total_test_batch):        
        acc_test = acc.eval(
            {x: mnist.test.images[step * total_test_batch:(step + 1) * total_test_batch, :], 
             y_: mnist.test.labels[step * total_test_batch:(step + 1) * total_test_batch, :],
             lr: learning_rate,
             tst: False, 
             pkeep_fc: 1.0, 
             pkeep_conv: 1.0}) 
        acc_test_lst.append(acc_test)    
    print("Accuracy:", np.mean(acc_test_lst))

    print("Run the command line:\n" \
          "--> tensorboard --logdir=/tmp/tensorflow_logs " \
          "\nThen open http://0.0.0.0:6006/")

Epoch: 0001 train loss 0.383475291 val loss 0.246662935 learning rate 0.000400000
Epoch: 0002 train loss 0.119240872 val loss 0.074134740 learning rate 0.000371451
Epoch: 0003 train loss 0.085277008 val loss 0.054309076 learning rate 0.000345619
Epoch: 0004 train loss 0.068126769 val loss 0.044543734 learning rate 0.000322245
Epoch: 0005 train loss 0.056819013 val loss 0.039922873 learning rate 0.000301096
Epoch: 0006 train loss 0.049651470 val loss 0.037398321 learning rate 0.000281959
Epoch: 0007 train loss 0.044833362 val loss 0.034619518 learning rate 0.000264643
Epoch: 0008 train loss 0.040875166 val loss 0.032598043 learning rate 0.000248976
Epoch: 0009 train loss 0.039633398 val loss 0.031267214 learning rate 0.000234799
Epoch: 0010 train loss 0.036536123 val loss 0.030053893 learning rate 0.000221971
Epoch: 0011 train loss 0.032441446 val loss 0.028197201 learning rate 0.000210364
Epoch: 0012 train loss 0.030286548 val loss 0.026890954 learning rate 0.000199861
Epoch: 0013 trai