In [1]:
# do not forget to swapoff -a
import numpy as np       # linear algebra
import pylab as pl       # plots
import tensorflow as tf  # now we are actually using it 

In [2]:
# choose wisely: keep exactly one of the two loaders below active
#(train, label_train), (test, label_test) = tf.keras.datasets.fashion_mnist.load_data()
(train, label_train), (test, label_test) = tf.keras.datasets.mnist.load_data()

# convert both image arrays to float32 (the labels are left untouched)
train, test = (images.astype(np.float32) for images in (train, test))

# sanity check: image array shape first, label array shape second
print(train.shape, label_train.shape)
print(test.shape, label_test.shape)

((60000, 28, 28), (60000,))
((10000, 28, 28), (10000,))


In [3]:
# optional subsampling: only the leading num_train / num_test examples are kept
num_test, num_train = 10000, 60000

# images and labels are cut with the same slice so they stay aligned
train, label_train = train[:num_train], label_train[:num_train]
test, label_test = test[:num_test], label_test[:num_test]

# confirm the resulting sizes
print(train.shape, label_train.shape)
print(test.shape, label_test.shape)

((60000, 28, 28), (60000,))
((10000, 28, 28), (10000,))


In [4]:
# now define the inputs of the session graph
# images: a batch of 28x28 pixel images with one trailing channel axis
X_tf = tf.placeholder(shape=[None, 28, 28, 1], dtype=tf.float32)
# labels: one scalar integer label per image
Y_tf = tf.placeholder(shape=[None],            dtype=tf.int64)

# make train and test compatible with X_tf by adding the channel axis;
# reshape (unlike expand_dims) is idempotent, so re-running this cell
# does not stack yet another axis onto arrays that already have one
train = train.reshape(-1, 28, 28, 1)
test  = test.reshape(-1, 28, 28, 1)

In [5]:
import tensorflow.contrib.layers as tfl

def prelu(net):
    """Parametric ReLU with a single learnable negative slope.

    Args:
        net: floating point tensor to activate.

    Returns:
        Tensor with the shape of `net`: positive entries pass through
        unchanged, negative entries are scaled by the learnable `alpha`.
    """
    # one scalar slope per call, shared by all entries; 0.0 starts out as a plain ReLU
    alpha = tf.Variable(0.0, dtype=net.dtype)

    # Spelled out as max(0, x) + alpha * min(0, x) instead of calling
    # tf.nn.leaky_relu: later TF 1.x releases treat leaky_relu's alpha as a
    # constant float attribute and reject a variable, and the older
    # maximum(alpha * x, x) formulation is only a valid PReLU for alpha <= 1.
    return tf.nn.relu(net) - alpha * tf.nn.relu(-net)

def residual_conv_block(net, num_filters, kernel_size, stride, is_training=True):
    """Two-branch convolutional block whose branches are concatenated.

    The shortcut branch average-pools the input with the given kernel size
    and stride. The main branch applies a strided convolution, batch
    normalisation and a second stride-1 convolution, all without an
    activation in between (every `activation_fn` is the identity). Despite
    the name, the branches are joined by concatenation along the last axis,
    not by addition, and the result is passed through `prelu`.

    Args:
        net: input tensor; presumably (batch, height, width, channels) -- TODO confirm.
        num_filters: number of output filters of both convolutions.
        kernel_size: kernel size shared by the pooling and both convolutions.
        stride: stride of the pooling and of the first convolution.
        is_training: forwarded to the batch normalisation layer.

    Returns:
        The activated concatenation of the main branch and the shortcut.
    """
    # shortcut branch: pooled copy of the input, built first as in the main path's stride
    shortcut = tfl.avg_pool2d(net, kernel_size=kernel_size, stride=stride, padding="SAME")

    # main branch, step 1: strided convolution (potential downsampling)
    branch = tfl.conv2d(
        net,
        num_outputs=num_filters,
        kernel_size=kernel_size,
        stride=stride,
        padding="SAME",
        activation_fn=tf.identity,
    )

    # main branch, step 2: normalise the convolution output
    branch = tfl.batch_norm(branch, activation_fn=tf.identity, is_training=is_training)

    # main branch, step 3: second convolution that keeps the resolution
    branch = tfl.conv2d(
        branch,
        num_outputs=num_filters,
        kernel_size=kernel_size,
        stride=1,
        padding="SAME",
        activation_fn=tf.identity,
    )

    merged = tf.concat([branch, shortcut], axis=-1)
    return prelu(merged)
    

In [6]:
# four stride-2 blocks with kernel size 3 and a doubling number of filters
# NOTE(review): the blocks are built with their default is_training=True, so
# batch normalisation also uses batch statistics when the test set is evaluated
net = tf.identity(X_tf)
for num_filters in (16, 32, 64, 128):
    net = residual_conv_block(net, num_filters, 3, 2)

# make it completely translation invariant by averaging over both spatial axes
net = tf.reduce_mean(net, axis=(1, 2))
# linear read-out with 10 outputs, used as logits below
net = tfl.fully_connected(net, 10, activation_fn=tf.identity)

# the same loss as in softmax regression, here taken directly from integer labels
loss_tf = tf.losses.sparse_softmax_cross_entropy(labels=Y_tf, logits=net)

# accuracy as a (non-differentiable) metric: fraction of labels matching the arg-max logit
correctly_predicted = tf.equal(Y_tf, tf.argmax(net, axis=1))
metric_tf = tf.reduce_mean(tf.cast(correctly_predicted, tf.float32))

# Adam with learning rate 1E-3; step_tf performs one update of the loss
optimizer = tf.train.AdamOptimizer(1E-3)
step_tf = optimizer.minimize(loss_tf)

In [7]:
import tqdm

# train on random mini-batches, report the training loss/metric periodically
# and finish with a single evaluation on the whole test set
with tf.Session() as sess:

    sess.run(tf.global_variables_initializer())

    num_iterations, print_every, batch_size = 2**9, 2**5, 2**8

    # sampling without replacement cannot draw more examples than exist, so
    # cap the batch size in case the training set was subsampled below it
    batch_size = min(batch_size, train.shape[0])

    for iteration in tqdm.tqdm(range(num_iterations)):

        # fresh random mini-batch without duplicate examples
        indices = np.random.choice(train.shape[0], batch_size, replace=False)
        X, Y = train[indices], label_train[indices]
        batch = {X_tf: X, Y_tf: Y}

        # one optimisation step on this batch
        sess.run(step_tf, feed_dict=batch)

        # every print_every-th iteration: re-evaluate the batch just trained on
        if (iteration + 1) % print_every == 0:
            loss, metric = sess.run([loss_tf, metric_tf], feed_dict=batch)
            print("train loss and metric:",loss, metric)


    # final evaluation: the complete test set is fed in one run
    loss, metric = sess.run([loss_tf, metric_tf], feed_dict={X_tf: test, Y_tf: label_test})
    print("test loss and metric:", loss, metric)


  6%|▋         | 33/512 [00:06<01:34,  5.08it/s]

('train loss and metric:', 0.5179426, 0.8515625)


 13%|█▎        | 65/512 [00:12<01:26,  5.18it/s]

('train loss and metric:', 0.26763347, 0.921875)


 19%|█▉        | 96/512 [00:19<01:25,  4.88it/s]

('train loss and metric:', 0.10120472, 0.984375)


 25%|██▌       | 129/512 [00:27<01:21,  4.73it/s]

('train loss and metric:', 0.07530877, 0.9765625)


 31%|███▏      | 160/512 [00:33<01:14,  4.72it/s]

('train loss and metric:', 0.12366684, 0.953125)


 38%|███▊      | 193/512 [00:41<01:08,  4.64it/s]

('train loss and metric:', 0.09836517, 0.96875)


 44%|████▍     | 224/512 [00:48<01:01,  4.65it/s]

('train loss and metric:', 0.037775334, 0.984375)


 50%|█████     | 256/512 [00:59<00:59,  4.27it/s]

('train loss and metric:', 0.076444976, 0.9765625)


 56%|█████▋    | 288/512 [01:08<00:53,  4.19it/s]

('train loss and metric:', 0.09011425, 0.97265625)


 62%|██████▎   | 320/512 [01:15<00:45,  4.24it/s]

('train loss and metric:', 0.04790362, 0.984375)


 69%|██████▉   | 352/512 [01:22<00:37,  4.29it/s]

('train loss and metric:', 0.052710272, 0.984375)


 75%|███████▌  | 384/512 [01:30<00:30,  4.26it/s]

('train loss and metric:', 0.04114881, 0.984375)


 81%|████████▏ | 417/512 [01:36<00:22,  4.32it/s]

('train loss and metric:', 0.01993398, 0.99609375)


 88%|████████▊ | 449/512 [01:43<00:14,  4.35it/s]

('train loss and metric:', 0.040300496, 0.98828125)


 94%|█████████▍| 480/512 [01:51<00:07,  4.32it/s]

('train loss and metric:', 0.024529546, 0.99609375)


100%|██████████| 512/512 [01:58<00:00,  4.33it/s]

('train loss and metric:', 0.019960929, 0.9921875)





('test loss and metric:', 0.048200987, 0.9847)
