In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

### MNIST Data

MNIST Data Introduction : https://www.tensorflow.org/versions/r0.12/tutorials/mnist/beginners/index.html#mnist-for-ml-beginners

In [None]:
# Import MNIST data; read_data_sets is pointed at /tmp/data/ and one_hot=True
# requests one-hot label vectors (later cells recover the digit with argmax).
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)

# Peek at 50 values from the first test image:
# black and white images, stored flattened, with values in [0.0, 1.0]
mnist.test.images[0][300:350]

### Deep Neural Network Example : Autoencoder

Deep Auto-encoder Introduction : http://speech.ee.ntu.edu.tw/~tlkagk/courses/ML_2016/Lecture/auto%20(v7).pdf

Reference : https://github.com/aymericdamien/TensorFlow-Examples

In [None]:
# Building the encoder
def encoder(x, scope, n_input, n_hidden, n_code, n_layer):
    """Build the encoder half of the auto-encoder.

    Args:
        x: input tensor; it is multiplied by an [n_input, n_hidden] matrix.
        scope: variable scope (or scope name) under which the weights are
            created, so that they can be shared by name.
        n_input: width of the input layer.
        n_hidden: width of every hidden layer.
        n_code: width of the bottleneck ("code") layer.
        n_layer: number of tanh layers in front of the bottleneck
            (the input layer plus n_layer - 1 hidden layers).

    Returns:
        The bottleneck activation, a linear (no non-linearity) projection
        of the last hidden layer onto n_code units.
    """

    def _affine(inputs, w_name, b_name, n_in, n_out):
        # inputs * W + b, with W and b fetched from the current variable scope
        weights = tf.get_variable(w_name, [n_in, n_out])
        bias = tf.get_variable(b_name, [n_out])
        return tf.matmul(inputs, weights) + bias

    # "scope" enables variable sharing (the same variables are returned every time)
    with tf.variable_scope(scope):
        # Input layer
        hidden = tf.nn.tanh(_affine(x, "W_in", "b_in", n_input, n_hidden))

        # Remaining hidden layers - tanh activation (could be others)
        for depth in range(n_layer - 1):
            hidden = tf.nn.tanh(
                _affine(hidden, "W_en_%d" % depth, "b_en_%d" % depth,
                        n_hidden, n_hidden))

        # Bottleneck layer (code) - linear (could be relu, tanh or others)
        return _affine(hidden, "W_encode", "b_encode", n_hidden, n_code)


# Building the decoder
def decoder(code, scope, n_output, n_hidden, n_code, n_layer):
    """Build the decoder half of the auto-encoder.

    Args:
        code: bottleneck tensor; it is multiplied by an [n_code, n_hidden] matrix.
        scope: variable scope (or scope name) under which the weights are created.
        n_output: width of the reconstructed output.
        n_hidden: width of every hidden layer.
        n_code: width of the bottleneck ("code") layer.
        n_layer: number of tanh layers in front of the output layer
            (the bottleneck-input layer plus n_layer - 1 hidden layers).

    Returns:
        The reconstruction, squashed by a sigmoid so that it lies in (0, 1).
    """

    def _affine(inputs, w_name, b_name, n_in, n_out):
        # inputs * W + b, with W and b fetched from the current variable scope
        weights = tf.get_variable(w_name, [n_in, n_out])
        bias = tf.get_variable(b_name, [n_out])
        return tf.matmul(inputs, weights) + bias

    with tf.variable_scope(scope):
        # Bottleneck layer input - tanh activation (could be others)
        hidden = tf.nn.tanh(_affine(code, "W_decode", "b_decode", n_code, n_hidden))

        # Remaining hidden layers - tanh activation (could be others)
        for depth in range(n_layer - 1):
            hidden = tf.nn.tanh(
                _affine(hidden, "W_de_%d" % depth, "b_de_%d" % depth,
                        n_hidden, n_hidden))

        # Output layer - sigmoid, since the original data are values in [0.0, 1.0]
        return tf.nn.sigmoid(_affine(hidden, "W_out", "b_out", n_hidden, n_output))

In [None]:
# Training hyper-parameters
learning_rate = 0.01
training_epochs = 30
batch_size = 256
display_step = 3  # print the cost every `display_step` epochs
examples_to_show = 10  # number of test images reconstructed after training

# Network hyper-parameters
n_hidden = 32
n_code = 2  # code dimension (2 so the codes can be scatter-plotted)
n_layer = 3
n_input = 784  # MNIST data input (img shape: 28*28)

# Graph input: a batch of flattened pictures (None - dynamic batch size)
X = tf.placeholder(tf.float32, shape=[None, n_input])

# Assemble the auto-encoder, its loss and its training op
with tf.variable_scope("autoencoder") as scope:
    # encoder - input images, output codes
    encoder_op = encoder(X, scope, n_input=n_input, n_hidden=n_hidden,
                         n_code=n_code, n_layer=n_layer)
    # decoder - input codes, output decoded images (same width as the input)
    decoder_op = decoder(encoder_op, scope, n_output=n_input, n_hidden=n_hidden,
                         n_code=n_code, n_layer=n_layer)

    # The reconstruction is the prediction; the input itself is the target
    y_pred, y_true = decoder_op, X

    # Mean squared reconstruction error, minimised with RMSProp
    reconstruction_error = y_true - y_pred
    cost = tf.reduce_mean(tf.pow(reconstruction_error, 2))
    optimizer = tf.train.RMSPropOptimizer(learning_rate).minimize(cost)

    # Created last so that it also covers the optimizer's own variables
    init = tf.initialize_all_variables()

In [None]:
# Train the auto-encoder, then show test images next to their reconstructions
with tf.variable_scope("autoencoder", reuse=True) as scope:  # set the scope to reuse variables!

    # Deliberately left open: the code-visualisation cells below reuse `sess`
    sess = tf.InteractiveSession()
    with sess.as_default():
        sess.run(init)

        total_batch = mnist.train.num_examples // batch_size

        # Training cycle
        for epoch in range(training_epochs):

            # Loop over all batches (labels are not needed: the target is the input)
            for batch_idx in range(total_batch):
                batch_xs, _labels = mnist.train.next_batch(batch_size)

                # Run optimization op (backprop) and cost op (to get loss value)
                _, c = sess.run([optimizer, cost], feed_dict={X: batch_xs})

            # Display the cost of the epoch's last batch every `display_step` epochs
            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(c))

        print("Optimization Finished!")

        # Applying encode and decode over test set
        test_images = mnist.test.images[:examples_to_show]
        encode_decode = sess.run(y_pred, feed_dict={X: test_images})

        # Visualize and compare original images (top row) with their
        # reconstructions (bottom row). The grid is sized from examples_to_show
        # instead of a hard-coded 10; squeeze=False keeps `a` two-dimensional
        # even when a single example is shown.
        f, a = plt.subplots(2, examples_to_show,
                            figsize=(examples_to_show, 2), squeeze=False)
        for i in range(examples_to_show):
            a[0][i].imshow(np.reshape(test_images[i], (28, 28)))
            a[1][i].imshow(np.reshape(encode_decode[i], (28, 28)))

        # plt.show() renders under the inline backend; Figure.show() only
        # works with GUI backends and emits a warning here.
        plt.show()

In [None]:
# compute all the codes of training images
with tf.variable_scope("autoencoder", reuse=True) as scope:

    with sess.as_default():
        codes = sess.run(encoder_op, feed_dict={X: mnist.train.images})

# Visualize auto-encoder codes: one colour per digit class (index = digit)
color = np.array(['#fb9a99', '#e31a1c', '#ff7f00', '#fdbf6f', '#b2df8a', '#33a02c',
                  '#a6cee3', '#1f78b4', '#cab2d6', '#6a3d9a'])
# labels are one-hot, argmax recovers the digit
digit_labels = np.argmax(mnist.train.labels, axis=1)

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(codes[:, 0], codes[:, 1], s=4, alpha=0.5, color=color[digit_labels])
ax.set_xlim([-15, 15])
ax.set_ylim([-15, 15])
# Title and axis labels make the figure self-explanatory and tell it apart
# from the otherwise identical test-set plot.
ax.set_title("Auto-encoder codes: MNIST training images")
ax.set_xlabel("code dimension 0")
ax.set_ylabel("code dimension 1")
# fig.savefig('auto_train.png', dpi=200)
plt.show()

In [None]:
# compute all the codes of testing images
with tf.variable_scope("autoencoder", reuse=True) as scope:

    with sess.as_default():
        codes = sess.run(encoder_op, feed_dict={X: mnist.test.images})

# Visualize auto-encoder codes: one colour per digit class (index = digit)
color = np.array(['#fb9a99', '#e31a1c', '#ff7f00', '#fdbf6f', '#b2df8a', '#33a02c',
                  '#a6cee3', '#1f78b4', '#cab2d6', '#6a3d9a'])
# labels are one-hot, argmax recovers the digit
digit_labels = np.argmax(mnist.test.labels, axis=1)

fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(codes[:, 0], codes[:, 1], s=4, alpha=0.5, color=color[digit_labels])
ax.set_xlim([-15, 15])
ax.set_ylim([-15, 15])
# Title and axis labels make the figure self-explanatory and tell it apart
# from the otherwise identical training-set plot.
ax.set_title("Auto-encoder codes: MNIST test images")
ax.set_xlabel("code dimension 0")
ax.set_ylabel("code dimension 1")
# fig.savefig('auto_test.png', dpi=200)
plt.show()

### Recurrent Neural Network

Introduction to Neural Network with Memory : http://speech.ee.ntu.edu.tw/~tlkagk/courses/MLDS_2015_2/Lecture/RNN%20(v4).pdf

Reference : https://github.com/aymericdamien/TensorFlow-Examples

Example for MNIST:

- Break the 28 * 28 image into a sequence of 28 vectors (each row is a vector)
- Feed RNN a sequence of rows of one image and predict the digit (0 - 9)

In [None]:
# Construct a Recurrent Neural Network - support minibatch
def RNN(x, n_input, n_hidden, n_classes, n_steps, scope=None):
    """Unrolled simple recurrent network over a minibatch of sequences.

    Args:
        x: input tensor of shape [batch_size, n_steps, n_input].
        n_input: number of features per time step.
        n_hidden: size of the recurrent memory.
        n_classes: size of the output produced at every step.
        n_steps: sequence length; the graph is unrolled this many times.
        scope: optional variable scope (or name); defaults to "RNN".

    Returns:
        The output of the last time step (memory * W_out), shape
        [batch_size, n_classes]. No softmax is applied.
    """
    with tf.variable_scope(scope or "RNN"):
        # x:[batchsize, n_steps, n_input]; Permuting batch_size and n_steps
        x = tf.transpose(x, [1, 0, 2])
        # Reshape to (n_steps*batch_size, n_input)
        x = tf.reshape(x, [-1, n_input])
        # Split: list of 'n_steps' tensors of shape (batch_size, n_input)
        x = tf.split(0, n_steps, x)

        # The weights are shared by every time step, so they are created once
        # here instead of being looked up again (with the scope switched to
        # reuse mode) on each iteration. Creation order is unchanged.
        # `memory` starts as a learned [1, n_hidden] row that broadcasts over
        # the batch in the first update.
        memory = tf.get_variable("memory", [1, n_hidden])
        W_o = tf.get_variable("W_out", [n_hidden, n_classes])
        W_h = tf.get_variable("W_mem", [n_hidden, n_hidden])
        W_i = tf.get_variable("W_in", [n_input, n_hidden])

        outputs = []
        for x_t in x:
            # memory passed from last step
            memory_read = tf.matmul(memory, W_h)

            # update memory - using "tanh" activation function
            memory = tf.nn.tanh(tf.add(tf.matmul(x_t, W_i), memory_read))

            # output : a sequence
            outputs.append(tf.matmul(memory, W_o))

    return outputs[-1]  # only need the last output to predict the digit

In [None]:
# Parameters
# NOTE: learning_rate, batch_size and display_step rebind the names set by the
# auto-encoder parameter cell above, so the notebook must be run top to bottom.
learning_rate = 0.003
training_iters = 100000  # training stops once step * batch_size reaches this
batch_size = 256
display_step = 50  # report batch loss / accuracy every 50 steps

# Network Parameters
nn_input = 28  # features per time step: one 28-pixel row of a 28*28 MNIST image
nn_steps = 28  # 28 rows
nn_hidden = 128  # hidden layer : num of neurons
nn_classes = 10  # MNIST total classes (0-9 digits)

In [None]:
# Graph inputs: image rows as a sequence, and one-hot digit labels
x = tf.placeholder("float", [None, nn_steps, nn_input])
y = tf.placeholder("float", [None, nn_classes])

# Construct RNN
with tf.variable_scope("RNN_run") as scope:
    # `pred` holds unnormalised scores (logits) of the last time step
    pred = RNN(x, nn_input, nn_hidden, nn_classes, nn_steps, scope)

    # Define loss and optimizer - apply "softmax" on pred and compute the cross entropy.
    # logits/labels are passed by name: the positional order is easy to get
    # wrong and is rejected outright by newer TensorFlow releases.
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))

    # using the Adam optimization technique
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)

    # Evaluate model: fraction of samples whose top-scoring class is the label
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initializing the variables (created after the optimizer so that its
    # own variables are covered too)
    init = tf.initialize_all_variables()

In [None]:
# Train the RNN and evaluate results
with tf.variable_scope("RNN_run", reuse=True) as scope:

    # The context manager makes the session the default one and closes it when
    # the cell finishes, so it is not leaked when `sess` is rebound later.
    with tf.Session() as sess:
        sess.run(init)
        step = 1

        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # get minibatch data
            batch_x, batch_y = mnist.train.next_batch(batch_size)

            # Reshape data to get 28 seq of 28 elements; -1 lets the batch
            # dimension follow whatever next_batch returned
            batch_x = batch_x.reshape((-1, nn_steps, nn_input))
            batch_feed = {x: batch_x, y: batch_y}

            # Run optimization op (backprop) - training
            sess.run(optimizer, feed_dict=batch_feed)

            if step % display_step == 0 or step == 1:
                # Batch accuracy and batch loss from a single forward pass
                acc, loss = sess.run([accuracy, cost], feed_dict=batch_feed)

                print("Iter " + str(step*batch_size) + ", Loss= " + \
                      "{:.6f}".format(loss) + ", Accuracy= " + "{:.5f}".format(acc))

            step += 1

        print("Optimization Finished!")

        # Calculate accuracy for 128 mnist test images
        test_len = 128
        test_data = mnist.test.images[:test_len].reshape((-1, nn_steps, nn_input))
        test_label = mnist.test.labels[:test_len]
        print("Testing Accuracy:", sess.run(accuracy, feed_dict={x: test_data, y: test_label}))


### Recurrent Neural Network with Attention

Introduction to Attention Interface : http://distill.pub/2016/augmented-rnns/#attentional-interfaces

Introduction to Attention-based model : http://speech.ee.ntu.edu.tw/~tlkagk/courses/MLDS_2015_2/Lecture/Attain%20(v3).pdf

Example for MNIST :

- Break the 28 * 28 image into a sequence of 28 vectors (each row is a vector)
- Feed RNN with attention a sequence of rows of one image and predict the digit (0 - 9)

(However, since the attention mechanism is better suited to sequence-to-sequence tasks, <br>
 training on MNIST does not seem to show its advantage.)

In [None]:
# Recurrent Neural Network with Attention
def RNN_attention(x, n_input, n_hidden, n_attn_hidden, n_classes, scope=None):
    """Recurrent network whose input at every step is an attention-weighted
    mix of the whole input sequence.

    At each step the previous memory acts as the query: a small DNN scores
    every sequence position against it, the scores are soft-maxed over the
    sequence, and the resulting weighted sum of the inputs is fed to a tanh
    RNN update.

    Args:
        x: input tensor of shape [batch_size, n_steps, n_input].
        n_input: number of features per sequence position.
        n_hidden: size of the recurrent memory (also the query size).
        n_attn_hidden: width of the two hidden layers of the match DNN.
        n_classes: number of output classes.
        scope: optional variable scope (or name); defaults to "RNN".

    Returns:
        Unnormalised class scores (logits) of shape [batch_size, n_classes],
        computed from the memory after the last step. No softmax is applied:
        the result is meant for tf.nn.softmax_cross_entropy_with_logits,
        which normalises internally (applying softmax here as well would
        squash the logits twice and flatten the gradients). argmax-based
        predictions are unaffected.
    """
    with tf.variable_scope(scope or "RNN"):

        # variables of the "attention match function" - a small DNN
        Wa_in = tf.get_variable("Win_atten", [n_input + n_hidden, n_attn_hidden])
        ba_in = tf.get_variable("bin_atten", [n_attn_hidden])
        Wa_2 = tf.get_variable("W2_atten", [n_attn_hidden, n_attn_hidden])
        ba_2 = tf.get_variable("b2_atten", [n_attn_hidden])
        Wa_out = tf.get_variable("Wout_atten", [n_attn_hidden, 1])
        ba_out = tf.get_variable("bout_atten", [1])

        # variables of the RNN itself
        W_h = tf.get_variable("W_mem", [n_hidden, n_hidden])
        W_i = tf.get_variable("W_in", [n_input, n_hidden])
        b_i = tf.get_variable("b_in", [n_hidden])
        W_o = tf.get_variable("W_out", [n_hidden, n_classes])
        b_o = tf.get_variable("b_out", [n_classes])
        state = tf.get_variable("state0", [n_hidden])

        # The helpers below close over the variables created above instead of
        # re-entering `scope` and fetching them again by name; that lookup
        # failed whenever `scope` was None or a plain string.

        # time-major view of the input: [n_steps, batch_size, n_input]
        x_steps = tf.transpose(x, [1, 0, 2])

        # attention mechanism - given a query, compute the attention weights
        # across the input sequence
        def attention(query):

            def match(_, batch_xi):
                # batch_xi : [batchsize, feature_dim]
                # concatenate the query with the features of this position
                dnn_input = tf.concat(1, [batch_xi, query])

                # attention "match" DNN
                a_attn = tf.nn.relu(tf.add(tf.matmul(dnn_input, Wa_in), ba_in))
                a_attn2 = tf.nn.relu(tf.add(tf.matmul(a_attn, Wa_2), ba_2))
                # attention score at this position : [batchsize, 1]
                return tf.add(tf.matmul(a_attn2, Wa_out), ba_out)

            # tf.scan needs an initializer shaped like match()'s result,
            # i.e. [batchsize, 1]; its value is ignored by match(). It is
            # built from the first *time step* so the leading dimension is
            # the batch (indexing x[0] would pick a batch element and yield
            # [n_steps, 1] instead).
            score_init = tf.matmul(x_steps[0], tf.ones([n_input, 1]))
            attn_scores = tf.scan(match, x_steps, initializer=score_init)

            # [n_seq, batchsize, 1] -> [batchsize, 1, n_seq], then normalise
            # over the sequence axis
            attn_scores = tf.transpose(attn_scores, [1, 2, 0])
            return tf.nn.softmax(attn_scores)

        # RNN cell; the scanned element is ignored because the step input is
        # the attended mix of the entire sequence
        def cell(memory, _):
            # memory passed from last step
            mem = tf.matmul(memory, W_h)

            # use last-step memory as query to compute attention weights
            attn = attention(memory)
            # calculate the attended features as the RNN input
            atten_x = tf.reshape(tf.batch_matmul(attn, x), [-1, n_input])

            # update RNN memory with "tanh" activation
            return tf.nn.tanh(tf.add(tf.add(tf.matmul(atten_x, W_i), b_i), mem))

        # scanning over the batch axis of x repeats the learned initial state
        # once per sample: [batch_size, n_hidden]
        states = tf.scan(lambda carried, _: carried, x, initializer=state)
        # loop over the x sequence and calculate the memory sequence
        out_seq = tf.scan(cell, x_steps, initializer=states)
        # only the last memory is used to predict the digit
        logits = tf.add(tf.matmul(out_seq[-1], W_o), b_o)

    return logits

In [None]:
# Parameters
# NOTE: this cell rebinds learning_rate, batch_size, display_step,
# training_iters, n_input and n_hidden, which earlier sections also define
# (n_input was 784 and n_hidden was 32 for the auto-encoder). Re-running
# earlier cells after this one will pick up these values.
learning_rate = 0.003
training_iters = 100000  # training stops once step * batch_size reaches this
batch_size = 128
display_step = 5  # report batch loss / accuracy every 5 steps

# Network Parameters
n_input = 28  # features per sequence position (one image row)
n_steps = 28  # sequence length (number of rows)
n_hidden = 128  # hidden layer num of neurons
n_classes = 10  # MNIST total classes (0-9 digits)
n_attn_hidden = 32  # width of the attention match DNN's hidden layers

In [None]:
# Graph inputs: sequences of any length with n_input features, one-hot labels
x = tf.placeholder("float", [None, None, n_input])
y = tf.placeholder("float", [None, n_classes])

# The scope is opened WITHOUT reuse=True: this is where the variables are
# created, and tf.get_variable refuses to create variables in a reusing scope.
with tf.variable_scope("RNN_attn_run") as scope:
    pred = RNN_attention(x, n_input, n_hidden, n_attn_hidden, n_classes, scope)

    # Define loss and optimizer (logits/labels passed by name so the argument
    # order cannot be mixed up)
    cost = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))

    adam = tf.train.AdamOptimizer(learning_rate=learning_rate)
    gvs = adam.compute_gradients(cost)
    # Clip every gradient to [-0.1, 0.1]. compute_gradients looks at all
    # trainable variables in the graph, including those of the models built
    # in earlier sections; they do not influence `cost`, so their gradient is
    # None and must be skipped (tf.clip_by_value cannot take None).
    capped_gvs = [(tf.clip_by_value(grad, -0.1, 0.1), var)
                  for grad, var in gvs if grad is not None]
    # `optimizer` is the training op run by the training cell
    optimizer = adam.apply_gradients(capped_gvs)

    # Evaluate model: fraction of samples whose top-scoring class is the label
    correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

    # Initializing the variables (created after the optimizer so that its
    # own variables are covered too)
    init = tf.initialize_all_variables()

In [None]:
# Train the RNN with attention and evaluate results
with tf.variable_scope("RNN_attn_run", reuse=True) as scope:

    # The context manager makes the session the default one and closes it
    # when the cell finishes, releasing its resources.
    with tf.Session() as sess:
        sess.run(init)
        step = 1

        # Keep training until reach max iterations
        while step * batch_size < training_iters:
            # get minibatch data
            batch_x, batch_y = mnist.train.next_batch(batch_size)

            # Reshape data to get 28 seq of 28 elements; -1 lets the batch
            # dimension follow whatever next_batch returned
            batch_x = batch_x.reshape((-1, n_steps, n_input))
            batch_feed = {x: batch_x, y: batch_y}

            # Run optimization op (backprop) - training
            sess.run(optimizer, feed_dict=batch_feed)

            if step % display_step == 0 or step == 1:
                # Batch accuracy and batch loss from a single forward pass
                acc, loss = sess.run([accuracy, cost], feed_dict=batch_feed)

                print("Iter " + str(step*batch_size) + ", Loss= " + \
                      "{:.6f}".format(loss) + ", Accuracy= " + "{:.5f}".format(acc))

            step += 1

        print("Optimization Finished!")

        # Calculate accuracy for 128 mnist test images
        test_len = 128
        test_data = mnist.test.images[:test_len].reshape((-1, n_steps, n_input))
        test_label = mnist.test.labels[:test_len]
        print("Testing Accuracy:", sess.run(accuracy, feed_dict={x: test_data, y: test_label}))
