In [5]:
import tensorflow as tf
import numpy as np

In [6]:
# to make this notebook's output stable across runs
def reset_graph(seed=1):
    tf.reset_default_graph()
    tf.set_random_seed(seed)
    np.random.seed(seed)

## tf implementation of Batch Normalization (BN) layers (mostly for vanishing gradient issue)

### Construct Phase

1. Major net parameters

In [7]:
reset_graph()
n_inputs = 28 * 28 #MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate = 0.01

2. construct computation graph

**Note**:
In graph construct phase, for Batch Normalization graph, the main change is:

The need of a `train` placeholder node with default value as "False" to control if the `tf.layers.batch_normalization()` layers should use the running stats of the mini batches or the whole data stats in calculation. The former is for model training and the latter is for model evaluation / apply


In [63]:
# define placeholder node in the computation graph
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int64, shape=(None), name="y") # y will be just 1D tensor, int

# a controller node in the graph, giving a signal to BN nodes, such that
# it uses different parameters (mean and std dev) of mini-batch or whole data
training = tf.placeholder_with_default(False, shape=(), name='training')

with tf.name_scope('fc'):
    hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
    bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
    bn1_act = tf.nn.elu(bn1)
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
    bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
    bn2_act = tf.nn.elu(bn2)
    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=0.9)

with tf.name_scope('loss'):
    # this function takes labels from y, one hot it, and then use the logits node to 
    # calculate loss
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    # average xentropy as the "loss" of the model current state
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    # tells if the logits perdictions are in top 1
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# define logging systems
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = 'mlp_logs'
logdir = "{}/run-{}/".format(root_logdir, now)

with tf.name_scope('log'):
    acc_summary = tf.summary.scalar('Acc', accuracy)
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

init = tf.global_variables_initializer()
saver = tf.train.Saver()

Alternatively, use `partial()` to define the BN with the same settings and use in the graph construction

In [8]:
# tip: using python native functools.partial() to wrap a function with some default values
from functools import partial

# define placeholder node in the computation graph
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int64, shape=(None), name="y") # y will be just 1D tensor, int
training = tf.placeholder_with_default(False, shape=(), name='training')

# define computation graph

# create a bn function with same parameters that's repeated
my_bn_layer = partial(tf.layers.batch_normalization, training=training, momentum=0.9)

with tf.name_scope('fc'):
    hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
    bn1 = my_bn_layer(hidden1)
    bn1_act = tf.nn.elu(bn1)
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
    bn2 = my_bn_layer(hidden2)
    bn2_act = tf.nn.elu(bn2)
    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
    logits = my_bn_layer(logits_before_bn)
    
with tf.name_scope('loss'):
    # this function takes labels from y, one hot it, and then use the logits node to 
    # calculate loss
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    # average xentropy as the "loss" of the model current state
    loss = tf.reduce_mean(xentropy, name='loss')

with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

with tf.name_scope('eval'):
    # tells if the logits perdictions are in top 1
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# define logging systems
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = 'mlp_logs'
logdir = "{}/run-{}/".format(root_logdir, now)

with tf.name_scope('log'):
    acc_summary = tf.summary.scalar('Acc', accuracy)
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

init = tf.global_variables_initializer()
saver = tf.train.Saver()

W0822 13:26:14.260378 15428 deprecation.py:323] From <ipython-input-8-415c75686bfa>:15: dense (from tensorflow.python.layers.core) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.dense instead.
W0822 13:26:14.267363 15428 deprecation.py:506] From C:\Users\oycy\AppData\Roaming\Python\Python35\site-packages\tensorflow\python\ops\init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0822 13:26:14.660347 15428 deprecation.py:323] From <ipython-input-8-415c75686bfa>:16: batch_normalization (from tensorflow.python.layers.normalization) is deprecated and will be removed in a future version.
Instructions for updating:
Use keras.layers.BatchNormalization instead.  In particular, `tf.control_dependencies(tf.GraphKeys.UPDATE_OPS)`

### Execution Phase

In [9]:
from tensorflow.examples.tutorials.mnist import input_data

mnist = input_data.read_data_sets("/tmp/data/")

W0822 13:26:24.274164 15428 deprecation.py:323] From <ipython-input-9-4141630e56b4>:3: read_data_sets (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
W0822 13:26:24.289788 15428 deprecation.py:323] From C:\Users\oycy\AppData\Roaming\Python\Python35\site-packages\tensorflow\contrib\learn\python\learn\datasets\mnist.py:260: maybe_download (from tensorflow.contrib.learn.python.learn.datasets.base) is deprecated and will be removed in a future version.
Instructions for updating:
Please write your own downloading logic.
W0822 13:26:25.937195 15428 deprecation.py:323] From C:\Users\oycy\AppData\Roaming\Python\Python35\site-packages\tensorflow\contrib\learn\python\learn\datasets\mnist.py:262: extract_images (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.

Extracting /tmp/data/train-images-idx3-ubyte.gz


W0822 13:26:26.530279 15428 deprecation.py:323] From C:\Users\oycy\AppData\Roaming\Python\Python35\site-packages\tensorflow\contrib\learn\python\learn\datasets\mnist.py:267: extract_labels (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use tf.data to implement this functionality.
W0822 13:26:26.691383 15428 deprecation.py:323] From C:\Users\oycy\AppData\Roaming\Python\Python35\site-packages\tensorflow\contrib\learn\python\learn\datasets\mnist.py:290: DataSet.__init__ (from tensorflow.contrib.learn.python.learn.datasets.mnist) is deprecated and will be removed in a future version.
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [10]:
n_epochs = 40
batch_size = 50

**Note**:

In the execution phase, 2 things to note for running a graph with BN layers:

1. The training runs need to feed additional `True` value to the `training` node to overwrite the default `False`. This will help trigger all the BN layers to calculate stats based on mini-batches
2. In training process, the BN layers need to dynamically update the parameters for input data; however these parameters are **not** updated automatically. A `tf.GraphKeys.UPDATE_OPS` collection of ops should be executed with the exact same parameters with the `training_op` in training, and it should run after the `training_op` in the sequence.

In [11]:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size + 1):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            if iteration % 10 == 0:
                summary_str = acc_summary.eval(feed_dict={X:X_batch, y:y_batch})
                # step is total number of minibatches from beginning
                step = epoch * (mnist.train.num_examples // batch_size + 1) + iteration
                # call the file_writer to add the above information
                file_writer.add_summary(summary_str, step)
            sess.run([training_op, extra_update_ops],
                     feed_dict={training:True, X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
                                           y: mnist.validation.labels})
        print("Epoch {0}: Val Accuracy - {1}".format(
            epoch + 1, acc_val))
    save_path = saver.save(sess, './my_model_final.ckpt')
    file_writer.close()

Epoch 1: Val Accuracy - 0.925000011920929
Epoch 2: Val Accuracy - 0.9462000131607056
Epoch 3: Val Accuracy - 0.9557999968528748
Epoch 4: Val Accuracy - 0.9631999731063843
Epoch 5: Val Accuracy - 0.9635999798774719
Epoch 6: Val Accuracy - 0.9679999947547913
Epoch 7: Val Accuracy - 0.9696000218391418
Epoch 8: Val Accuracy - 0.9714000225067139
Epoch 9: Val Accuracy - 0.9732000231742859
Epoch 10: Val Accuracy - 0.974399983882904
Epoch 11: Val Accuracy - 0.9732000231742859
Epoch 12: Val Accuracy - 0.9757999777793884
Epoch 13: Val Accuracy - 0.977400004863739
Epoch 14: Val Accuracy - 0.9746000170707703
Epoch 15: Val Accuracy - 0.9768000245094299
Epoch 16: Val Accuracy - 0.9769999980926514
Epoch 17: Val Accuracy - 0.977400004863739
Epoch 18: Val Accuracy - 0.9783999919891357
Epoch 19: Val Accuracy - 0.9775999784469604
Epoch 20: Val Accuracy - 0.9775999784469604
Epoch 21: Val Accuracy - 0.977400004863739
Epoch 22: Val Accuracy - 0.9782000184059143
Epoch 23: Val Accuracy - 0.9793999791145325
Ep

## Applying Gradient Clipping for Exploding Gradients issue

**Note**
Use the exact same graph from above design, however, gradient clipping needs to happen on the tf optimizer such that:

1. The `.minimize()` method automatically calculates the gradients and then update the weights
2. Instead:
    1. Use `optimizer.compute_gradients()` to obtain the gradients
    2. Then use tf function `clip_by_value()` to clip the gradients
    3. Take the clipped gradients and use `optimizer.apply_gradients()` to update the gradients

In [16]:
reset_graph()
n_inputs = 28 * 28 #MNIST
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

learning_rate = 0.01
# for gradient clipping
gradient_threshold = 1

In [17]:
# tip: using python native functools.partial() to wrap a function with some default values
from functools import partial

# define placeholder node in the computation graph
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name='X')
y = tf.placeholder(tf.int64, shape=(None), name="y") # y will be just 1D tensor, int
training = tf.placeholder_with_default(False, shape=(), name='training')

# define computation graph

# create a bn function with same parameters that's repeated
my_bn_layer = partial(tf.layers.batch_normalization, training=training, momentum=0.9)

with tf.name_scope('fc'):
    hidden1 = tf.layers.dense(X, n_hidden1, name='hidden1')
    bn1 = my_bn_layer(hidden1)
    bn1_act = tf.nn.elu(bn1)
    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name='hidden2')
    bn2 = my_bn_layer(hidden2)
    bn2_act = tf.nn.elu(bn2)
    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name='outputs')
    logits = my_bn_layer(logits_before_bn)
    
with tf.name_scope('loss'):
    # this function takes labels from y, one hot it, and then use the logits node to 
    # calculate loss
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)
    # average xentropy as the "loss" of the model current state
    loss = tf.reduce_mean(xentropy, name='loss')

# notice in training here, the grads and vars come in as a list of
# (grad, var) tuples. Therefore, the clipped grads should come in
# with the exact same data structure
with tf.name_scope('train'):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    grads_and_vars = optimizer.compute_gradients(loss)
    capped_vars = [(tf.clip_by_value(grad, -gradient_threshold, gradient_threshold), var)
                   for grad, var in grads_and_vars]
    training_op = optimizer.apply_gradients(capped_vars)

with tf.name_scope('eval'):
    # tells if the logits perdictions are in top 1
    correct = tf.nn.in_top_k(logits, y, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# define logging systems
from datetime import datetime
now = datetime.utcnow().strftime("%Y%m%d%H%M%S")
root_logdir = 'mlp_logs'
logdir = "{}/run-{}/".format(root_logdir, now)

with tf.name_scope('log'):
    acc_summary = tf.summary.scalar('Acc', accuracy)
    file_writer = tf.summary.FileWriter(logdir, tf.get_default_graph())

init = tf.global_variables_initializer()
saver = tf.train.Saver()

Nothing to change for the execution stage codes

In [18]:
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)

with tf.Session() as sess:
    init.run()
    for epoch in range(n_epochs):
        for iteration in range(mnist.train.num_examples // batch_size + 1):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            if iteration % 10 == 0:
                summary_str = acc_summary.eval(feed_dict={X:X_batch, y:y_batch})
                # step is total number of minibatches from beginning
                step = epoch * (mnist.train.num_examples // batch_size + 1) + iteration
                # call the file_writer to add the above information
                file_writer.add_summary(summary_str, step)
            sess.run([training_op, extra_update_ops],
                     feed_dict={training:True, X: X_batch, y: y_batch})
        acc_val = accuracy.eval(feed_dict={X: mnist.validation.images,
                                           y: mnist.validation.labels})
        print("Epoch {0}: Val Accuracy - {1}".format(
            epoch + 1, acc_val))
    save_path = saver.save(sess, './my_model_final.ckpt')
    file_writer.close()

Epoch 1: Val Accuracy - 0.9241999983787537
Epoch 2: Val Accuracy - 0.9444000124931335
Epoch 3: Val Accuracy - 0.9581999778747559
Epoch 4: Val Accuracy - 0.9613999724388123
Epoch 5: Val Accuracy - 0.9678000211715698
Epoch 6: Val Accuracy - 0.9688000082969666
Epoch 7: Val Accuracy - 0.9706000089645386
Epoch 8: Val Accuracy - 0.972000002861023
Epoch 9: Val Accuracy - 0.9742000102996826
Epoch 10: Val Accuracy - 0.9747999906539917
Epoch 11: Val Accuracy - 0.9750000238418579
Epoch 12: Val Accuracy - 0.975600004196167
Epoch 13: Val Accuracy - 0.9761999845504761
Epoch 14: Val Accuracy - 0.9765999913215637
Epoch 15: Val Accuracy - 0.9753999710083008
Epoch 16: Val Accuracy - 0.9782000184059143
Epoch 17: Val Accuracy - 0.9783999919891357
Epoch 18: Val Accuracy - 0.9779999852180481
Epoch 19: Val Accuracy - 0.9805999994277954
Epoch 20: Val Accuracy - 0.978600025177002
Epoch 21: Val Accuracy - 0.9782000184059143
Epoch 22: Val Accuracy - 0.9782000184059143
Epoch 23: Val Accuracy - 0.9789999723434448
