# Batch Normalization
    Batch normalization is most useful when building deep neural networks. To demonstrate this, we will create a convolutional neural network with 19 convolutional layers (depths 1 through 19), followed by a fully connected layer. We will use it to classify handwritten digits in the MNIST dataset, which should be familiar to you by now.
    
    This notebook includes two versions of the network. The first uses the higher-level functions from tf.layers; the second uses the lower-level tf.nn package.

In [33]:
'''
import packages
'''
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
# Read the MNIST data set from MNIST_data/ with one-hot encoded labels.
# reshape = False is used so images are not flattened; the training code
# feeds them into a [None, 28, 28, 1] placeholder.
mnist = input_data.read_data_sets("MNIST_data/", one_hot = True, reshape = False)

Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz


## Batch normalization using tf.layers.batch_normalization
    This version of the network uses tf.layers for everything, and expects you to implement batch normalization using tf.layers.batch_normalization.
    
    We will use the following function to create fully connected layers in our network. We will create them with the specified number of neurons and a ReLU activation function.

In [34]:
def fully_connected(prev_layer, num_units):
    '''
    build a dense (fully connected) layer with ReLU activation on top of prev_layer
    
    parameters
    ----------
    prev_layer, tensor: output of the previous layer, used as input here
    num_units, int: how many neurons the new layer has
    
    return
    ------
    tensor: output of the dense layer after the ReLU activation
    '''
    return tf.layers.dense(prev_layer, num_units, activation=tf.nn.relu)

    We will use the following function to create convolutional layers in our network. We are using a 3x3 kernel, ReLU activation functions, strides of 2x2 on layers whose depth is a multiple of 3, and strides of 1x1 on all other layers. We don't use pooling layers at all in this network.

In [35]:
def conv_layer(prev_layer, layer_depth):
    '''
    build a 3x3 convolutional layer with ReLU activation on top of prev_layer
    
    parameters
    ----------
    prev_layer, tensor: output of the previous layer, used as input here
    layer_depth, int: position of this layer in the network; it determines both
        the stride and the number of feature maps (layer_depth * 4)
    
    return
    ------
    tensor: output of the new convolutional layer
    '''
    # layers whose depth is a multiple of 3 use a stride of 2, all others a stride of 1
    if layer_depth % 3 == 0:
        strides = 2
    else:
        strides = 1

    return tf.layers.conv2d(
        prev_layer,
        filters=layer_depth * 4,
        kernel_size=3,
        strides=strides,
        padding='same',
        activation=tf.nn.relu,
    )

    This cell builds the network without batch normalization, then trains it on the MNIST dataset. It displays loss and accuracy data periodically while training.

## Train the CNN, with a fully connected layer as output, using the functions created above

In [36]:
'''
a CNN without Batch Normalization
'''
def train(num_batches, batch_size, learning_rate):
    '''
    build a CNN without batch normalization, train it on MNIST and print
    loss and accuracy periodically while training
    
    parameters
    ----------
    num_batches, int: number of training batches (optimizer steps) to run
    batch_size, int: number of training samples drawn for each batch
    learning_rate, float: learning rate handed to the Adam optimizer
    
    return
    ------
    None: progress and the final accuracies are printed, nothing is returned
    '''
    #build placeholders for the input samples and labels
    inputs = tf.placeholder(tf.float32, [None, 28, 28, 1])
    labels = tf.placeholder(tf.float32, [None, 10])
    
    #feed the inputs into a series of 19 convolutional layers (depths 1 through 19)
    layer = inputs
    for layer_i in range(1, 20):
        layer = conv_layer(layer, layer_i)
        
    #flatten the output from the convolutional layers
    orig_shape = layer.get_shape().as_list()
    layer = tf.reshape(layer, shape = [-1, orig_shape[1] * orig_shape[2] * orig_shape[3]])
    
    #add one fully connected layer
    layer = fully_connected(layer, 100)
    
    #create the output layer with one logit per class
    logits = tf.layers.dense(layer, 10)
    
    #define loss and training operations
    #NOTE(review): sigmoid cross entropy scores the 10 outputs independently; for one-hot
    #labels softmax cross entropy is the more common choice. Left unchanged on purpose so
    #results stay comparable with the batch normalized version of this network.
    model_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels))
    train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)
    
    #create operations to measure accuracy
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    
    #train and test the network
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        
        for batch_i in range(num_batches):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            
            #train this batch
            sess.run(train_opt, {inputs: batch_xs, labels: batch_ys})
            
            #every 100 batches report on the validation set,
            #otherwise every 25 batches report on the current training batch
            if batch_i % 100 == 0:
                loss, acc = sess.run([model_loss, accuracy], {inputs: mnist.validation.images, 
                                                              labels: mnist.validation.labels})
                print("batch: {}: validation loss: {:.3f}, validation accuracy: {:.3f}".
                     format(batch_i, loss, acc))
                
            elif batch_i % 25 == 0:
                loss, acc = sess.run([model_loss, accuracy], {inputs: batch_xs, labels: batch_ys})
                #label fixed: this value is the training loss (was printed as "training training")
                print("batch: {}: training loss: {:.3f}, training accuracy: {:.3f}".
                      format(batch_i, loss, acc))
                
        #at the end, score the final accuracy for both the validation and test sets
        acc = sess.run(accuracy, {inputs: mnist.validation.images, labels: mnist.validation.labels})
        print("final validation accuracy: {:.2f}".format(acc))
        acc = sess.run(accuracy, {inputs: mnist.test.images, labels: mnist.test.labels})
        print("final test accuracy: {:.2f}".format(acc))
        
        #score the first 100 test images individually (one image per run)
        correct = 0
        for i in range(100):
            correct += sess.run(accuracy, feed_dict = {inputs: [mnist.test.images[i]], 
                                                       labels: [mnist.test.labels[i]]})
            
        print("accuracy on 100 samples: {:.2f}".format(correct / 100))
        

'''
hyperparameters
'''
# number of training batches (optimizer steps) to run
num_batches = 800
# number of training samples per batch
batch_size = 64
# learning rate passed to the optimizer inside train
learning_rate = 0.002

# clear the default graph, then build and train the model inside a fresh graph
tf.reset_default_graph()
with tf.Graph().as_default():
    train(num_batches, batch_size, learning_rate)

batch: 0: validation loss: 0.691, validation accuracy: 0.099
batch: 25: training training: 0.481, training accuracy: 0.078
batch: 50: training training: 0.331, training accuracy: 0.109
batch: 75: training training: 0.327, training accuracy: 0.062
batch: 100: validation loss: 0.325, validation accuracy: 0.175
batch: 125: training training: 0.326, training accuracy: 0.031
batch: 150: training training: 0.326, training accuracy: 0.141
batch: 175: training training: 0.323, training accuracy: 0.125
batch: 200: validation loss: 0.326, validation accuracy: 0.099
batch: 225: training training: 0.325, training accuracy: 0.203
batch: 250: training training: 0.326, training accuracy: 0.078
batch: 275: training training: 0.323, training accuracy: 0.125
batch: 300: validation loss: 0.326, validation accuracy: 0.092
batch: 325: training training: 0.323, training accuracy: 0.156
batch: 350: training training: 0.323, training accuracy: 0.125
batch: 375: training training: 0.326, training accuracy: 0.0

## Add batch normalization
    Modify the fully_connected and conv_layer functions so that batch normalization is added to the layers they create.

In [37]:
'''
same fully_connected function but added batch normalization
'''
def fully_connected(prev_layer, num_units, is_training):
    '''
    build a dense (fully connected) layer followed by batch normalization and ReLU
    
    parameters
    ----------
    prev_layer, tensor: output of the previous layer, used as input here
    num_units, int: how many neurons the new layer has
    is_training, bool or boolean tensor: forwarded as the `training` argument of
        tf.layers.batch_normalization
    
    return
    ------
    tensor: output of the dense layer after batch normalization and ReLU
    '''
    # linear part only: no bias and no activation, both are handled after normalization
    pre_activation = tf.layers.dense(prev_layer, num_units, use_bias=False, activation=None)
    normalized = tf.layers.batch_normalization(pre_activation, training=is_training)

    return tf.nn.relu(normalized)

In [38]:
'''
same conv_layer function but added batch normalization
'''
def conv_layer(prev_layer, layer_depth, is_training):
    '''
    build a 3x3 convolutional layer followed by batch normalization and ReLU
    
    parameters
    ----------
    prev_layer, tensor: output of the previous layer, used as input here
    layer_depth, int: position of this layer in the network; it determines both
        the stride and the number of feature maps (layer_depth * 4)
    is_training, bool or boolean tensor: forwarded as the `training` argument of
        tf.layers.batch_normalization
    
    return
    ------
    tensor: output of the convolution after batch normalization and ReLU
    '''
    # layers whose depth is a multiple of 3 use a stride of 2, all others a stride of 1
    if layer_depth % 3 == 0:
        strides = 2
    else:
        strides = 1

    # linear convolution only: no bias and no activation, both are handled after normalization
    convolved = tf.layers.conv2d(
        prev_layer,
        filters=layer_depth * 4,
        kernel_size=3,
        strides=strides,
        padding='same',
        use_bias=False,
        activation=None,
    )
    normalized = tf.layers.batch_normalization(convolved, training=is_training)

    return tf.nn.relu(normalized)

## Train a CNN with a fully connected layer as output, now with batch normalization added

In [43]:
'''
modify the network that will be trained with batch normalization
'''
def train(num_batches, batch_size, learning_rate):
    '''
    build a CNN with batch normalization, train it on MNIST and print
    loss and accuracy periodically while training
    
    parameters
    ----------
    num_batches, int: number of training batches (optimizer steps) to run
    batch_size, int: number of training samples drawn for each batch
    learning_rate, float: learning rate handed to the Adam optimizer
    
    return
    ------
    None: progress and the final accuracies are printed, nothing is returned
    '''
    #placeholders for the input samples, the labels and the training/inference switch
    inputs = tf.placeholder(tf.float32, [None, 28, 28, 1])
    labels = tf.placeholder(tf.float32, [None, 10])
    is_training = tf.placeholder(tf.bool)

    #stack the convolutional layers for depths 1 through 19
    net = inputs
    for depth in range(1, 20):
        net = conv_layer(net, depth, is_training)

    #flatten everything except the batch dimension
    _, height, width, channels = net.get_shape().as_list()
    net = tf.reshape(net, shape=[-1, height * width * channels])

    #one fully connected layer, then the output layer with one logit per class
    net = fully_connected(net, 100, is_training)
    logits = tf.layers.dense(net, 10)

    #loss
    elementwise_loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels)
    model_loss = tf.reduce_mean(elementwise_loss)

    #make the training step depend on the ops collected under UPDATE_OPS, so that
    #the population statistics of batch normalization are updated while training
    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
    with tf.control_dependencies(update_ops):
        train_opt = tf.train.AdamOptimizer(learning_rate).minimize(model_loss)

    #accuracy: fraction of samples whose largest logit matches the label
    hits = tf.equal(tf.argmax(logits, 1), tf.argmax(labels, 1))
    accuracy = tf.reduce_mean(tf.cast(hits, tf.float32))

    #feeds used for evaluation; is_training is always False outside the training step
    validation_feed = {inputs: mnist.validation.images,
                       labels: mnist.validation.labels,
                       is_training: False}
    test_feed = {inputs: mnist.test.images,
                 labels: mnist.test.labels,
                 is_training: False}

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for batch_i in range(num_batches):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)

            #one optimizer step on this batch
            sess.run(train_opt, {inputs: batch_xs, labels: batch_ys, is_training: True})

            #every 100 batches report on the validation set,
            #otherwise every 25 batches report on the current training batch
            if batch_i % 100 == 0:
                report_feed = validation_feed
                template = "batch: {} validation loss: {:.2f} validation accuracy: {:.2f}"
            elif batch_i % 25 == 0:
                report_feed = {inputs: batch_xs, labels: batch_ys, is_training: False}
                template = "batch: {} training loss: {:.2f} training accuracy: {:.2f}"
            else:
                continue

            loss, acc = sess.run([model_loss, accuracy], report_feed)
            print(template.format(batch_i, loss, acc))

        #final accuracy on the validation set, then on the test set
        acc = sess.run(accuracy, validation_feed)
        print("final validation accuracy: {:.2f}".format(acc))
        acc = sess.run(accuracy, test_feed)
        print("final test accuracy: {:.2f}".format(acc))

        #score the first 100 test images one at a time, to check that batch
        #normalization also works when a single sample is fed
        correct = sum(
            sess.run(accuracy, feed_dict={inputs: [mnist.test.images[i]],
                                          labels: [mnist.test.labels[i]],
                                          is_training: False})
            for i in range(100)
        )

        print("accuracy on 100 samples: {:.2f}".format(correct / 100))
        
'''
hyperparameters
'''
# number of training batches (optimizer steps) to run
num_batches = 800
# number of training samples per batch
batch_size = 64
# learning rate passed to the optimizer inside train
learning_rate = 0.002

# clear the default graph, then build and train the model inside a fresh graph
tf.reset_default_graph()
with tf.Graph().as_default():
    train(num_batches, batch_size, learning_rate)

batch: 0 validation loss: 0.69 validation accuracy: 0.10
batch: 25 training loss: 0.58 training accuracy: 0.08
batch: 50 training loss: 0.45 training accuracy: 0.08
batch: 75 training loss: 0.38 training accuracy: 0.12
batch: 100 validation loss: 0.36 validation accuracy: 0.09
batch: 125 training loss: 0.34 training accuracy: 0.11
batch: 150 training loss: 0.36 training accuracy: 0.08
batch: 175 training loss: 0.32 training accuracy: 0.22
batch: 200 validation loss: 0.29 validation accuracy: 0.34
batch: 225 training loss: 0.19 training accuracy: 0.64
batch: 250 training loss: 0.13 training accuracy: 0.77
batch: 275 training loss: 0.23 training accuracy: 0.56
batch: 300 validation loss: 0.26 validation accuracy: 0.56
batch: 325 training loss: 0.11 training accuracy: 0.84
batch: 350 training loss: 0.11 training accuracy: 0.88
batch: 375 training loss: 0.06 training accuracy: 0.89
batch: 400 validation loss: 0.07 validation accuracy: 0.90
batch: 425 training loss: 0.01 training accuracy: 