# Using Tensorflow 3
Notes from 11/04/2018 lab lecture

## Generate the model and save it

In [4]:
import tensorflow as tf
import numpy as np

# load MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
# Start from an empty default graph so that re-running this cell does not add a
# second copy of every placeholder/variable next to the ones from a previous run.
tf.reset_default_graph()

mnist = input_data.read_data_sets("/tmp/data/") # download files

# define layers dimensions
n_inputs = 28*28  # MNIST image size (28x28 pixels, flattened)
n_hidden1 = 300   # units in the first hidden layer
n_hidden2 = 100   # units in the second hidden layer
n_outputs = 10    # number of output units

# Placeholders fed at run time: X receives a batch of flattened images,
# y the corresponding integer class labels.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# shape=(None,) is a real 1-tuple (rank 1, any batch size); the former shape=(None)
# evaluated to plain None, i.e. a completely unspecified shape.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

# Hand-written fully connected layer, used below to assemble the network.
def neuron_layer(X, n_neurons, name, activation=None):
    """Build one dense layer on top of the 2-D tensor ``X``.

    Args:
        X: input tensor; its second dimension is used as the number of inputs per unit.
        n_neurons: number of units in the layer.
        name: name scope wrapping the ops of the layer (groups them in TensorBoard).
        activation: optional callable applied to the weighted sum. When ``None``
            the weighted sum itself is returned.

    Returns:
        ``activation(matmul(X, W) + b)`` when an activation is given,
        otherwise ``matmul(X, W) + b``.
    """
    with tf.name_scope(name):
        fan_in = int(X.get_shape()[1])
        # Initial weights come from a truncated normal whose spread shrinks with the fan-in.
        weight_stddev = 2 / np.sqrt(fan_in)
        initial_weights = tf.truncated_normal((fan_in, n_neurons), stddev=weight_stddev)
        W = tf.Variable(initial_weights, name="kernel")
        b = tf.Variable(tf.zeros([n_neurons]), name="bias")  # one bias per unit, starting at 0
        Z = tf.matmul(X, W) + b  # weighted sum of the inputs plus bias
        if activation is None:
            return Z
        return activation(Z)

# Assemble the network: two ReLU hidden layers followed by a linear output layer.
# No softmax is applied here; the loss below works directly on the logits.
with tf.name_scope("dnn"):
    hidden1 = neuron_layer(X, n_hidden1, name="hidden1", activation=tf.nn.relu)
    hidden2 = neuron_layer(hidden1, n_hidden2, name="hidden2", activation=tf.nn.relu)
    logits = neuron_layer(hidden2, n_outputs, name="outputs")

# Loss for single-label multi-class classification: softmax + cross entropy computed
# from the logits and the integer labels in y, then averaged.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Step size handed to the gradient descent optimizer below.
learning_rate = 0.01

# Training step: plain gradient descent minimizing the loss.
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
# Evaluation: fraction of examples whose highest logit is the target class.
with tf.name_scope("eval"):
    # in_top_k with k=1 checks whether the target class has the highest logit
    correct = tf.nn.in_top_k(logits, y, 1) # returns a 1D tensor
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) # cast to float and average to get the accuracy

# Op that initializes the variables created above.
init = tf.global_variables_initializer()

# Saver used further down to write the trained variables to a checkpoint.
saver = tf.train.Saver()

n_epochs = 40
batch_size = 50
steps_per_epoch = mnist.train.num_examples // batch_size

# Train with mini-batch gradient descent, report accuracies after every epoch,
# then write the final weights to a checkpoint.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for step in range(steps_per_epoch):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # "Train accuracy" is measured on the last mini-batch of the epoch only.
        last_batch_feed = {X: X_batch, y: y_batch}
        validation_feed = {X: mnist.validation.images, y: mnist.validation.labels}
        acc_train = sess.run(accuracy, feed_dict=last_batch_feed)
        acc_val = sess.run(accuracy, feed_dict=validation_feed)
        print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.96 Val accuracy: 0.921
1 Train accuracy: 0.98 Val accuracy: 0.9364
2 Train accuracy: 0.9 Val accuracy: 0.9446
3 Train accuracy: 0.98 Val accuracy: 0.95
4 Train accuracy: 1.0 Val accuracy: 0.9532
5 Train accuracy: 0.96 Val accuracy: 0.9608
6 Train accuracy: 1.0 Val accuracy: 0.9636
7 Train accuracy: 0.94 Val accuracy: 0.9638
8 Train accuracy: 0.94 Val accuracy: 0.9674
9 Train accuracy: 0.96 Val accuracy: 0.9684
10 Train accuracy: 0.96 Val accuracy: 0.9672
11 Train accuracy: 0.98 Val accuracy: 0.9696
12 Train accuracy: 0.98 Val accuracy: 0.9722
13 Train accuracy: 0.98 Val accuracy: 0.9718
14 Train accuracy: 0.96 Val accuracy: 0.9708
15 Train accuracy: 0.92 Val accuracy: 0.974
16 Train accuracy: 1.0 Val accuracy: 0.9736
17 Train accuracy: 0.98 Val accuracy: 0.9742
18 Train accuracy:

## Reload the model and make predictions

In [9]:
# Re-create the Saver for the variables of the current default graph.
saver = tf.train.Saver()

# Range of test examples used for the demo predictions (indices 90 to 119).
first_idx, last_idx = 90, 120

# Open a session, load the trained weights and classify the selected test images.
with tf.Session() as sess:
    saver.restore(sess, "./my_model_final.ckpt")  # restore the saved model
    X_new_scaled = mnist.test.images[first_idx:last_idx]
    Z = sess.run(logits, feed_dict={X: X_new_scaled})  # network outputs (logits), one row per image
    y_pred = np.argmax(Z, axis=1)  # predicted class = index of the largest logit

print("Predicted classes:", y_pred)
print("Actual classes:   ", mnist.test.labels[first_idx:last_idx])

INFO:tensorflow:Restoring parameters from ./my_model_final.ckpt
Predicted classes: [3 6 9 3 1 4 1 7 6 9 6 0 5 4 9 9 2 1 9 4 8 7 3 9 7 4 4 4 9 2]
Actual classes:    [3 6 9 3 1 4 1 7 6 9 6 0 5 4 9 9 2 1 9 4 8 7 3 9 7 4 4 4 9 2]


## Weights initialization
The gradient will be almost zero if the initialization is wrong.
There are many different initialization functions that can be used, and each of them can help in learning specific tasks (i.e. can cause fewer problems during gradient descent).
Trade-off: for each unit, normalize the initial weights with respect to the number of incoming and outgoing connections.
Weights of small magnitude are better for computing gradient descent.

In the following example He initialization, which is designed for ReLU units, is used.

In [10]:
import tensorflow as tf
import numpy as np

# load MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
# Drop whatever graph is left over from an earlier cell, as the later sections of
# this notebook do. Otherwise the earlier model's variables stay in the default
# graph and are initialized and saved again together with the new ones.
tf.reset_default_graph()

mnist = input_data.read_data_sets("/tmp/data/") # download files

# define layers dimensions
n_inputs = 28*28  # MNIST image size (28x28 pixels, flattened)
n_hidden1 = 300   # units in the first hidden layer
n_hidden2 = 100   # units in the second hidden layer
n_outputs = 10    # number of output units

# Placeholders fed at run time: X receives a batch of flattened images,
# y the corresponding integer class labels.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# shape=(None,) is a real 1-tuple (rank 1, any batch size); the former shape=(None)
# evaluated to plain None, i.e. a completely unspecified shape.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

# He initialization: variance-scaling initializer with its default settings,
# passed to the hidden layers below as kernel initializer.
he_init = tf.contrib.layers.variance_scaling_initializer()

# Assemble the network with tf.layers.dense: two ReLU hidden layers whose kernels use
# the He initializer, followed by a linear output layer left on its default initializer.
# No softmax is applied here; the loss below works directly on the logits.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, activation=tf.nn.relu, kernel_initializer=he_init, name="hidden1")
    hidden2 = tf.layers.dense(hidden1, n_hidden2, activation=tf.nn.relu, kernel_initializer=he_init, name="hidden2")
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

# Loss for single-label multi-class classification: softmax + cross entropy computed
# from the logits and the integer labels in y, then averaged.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Step size handed to the gradient descent optimizer below.
learning_rate = 0.01

# Training step: plain gradient descent minimizing the loss.
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
# Evaluation: fraction of examples whose highest logit is the target class.
with tf.name_scope("eval"):
    # in_top_k with k=1 checks whether the target class has the highest logit
    correct = tf.nn.in_top_k(logits, y, 1) # returns a 1D tensor
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) # cast to float and average to get the accuracy

# Op that initializes the variables created above.
init = tf.global_variables_initializer()

# Saver used further down to write the trained variables to a checkpoint.
saver = tf.train.Saver()

n_epochs = 40
batch_size = 50
steps_per_epoch = mnist.train.num_examples // batch_size

# Train with mini-batch gradient descent, report accuracies after every epoch,
# then write the final weights to a checkpoint.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for step in range(steps_per_epoch):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
        # "Train accuracy" is measured on the last mini-batch of the epoch only.
        last_batch_feed = {X: X_batch, y: y_batch}
        validation_feed = {X: mnist.validation.images, y: mnist.validation.labels}
        acc_train = sess.run(accuracy, feed_dict=last_batch_feed)
        acc_val = sess.run(accuracy, feed_dict=validation_feed)
        print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")



Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.94 Val accuracy: 0.9084
1 Train accuracy: 0.94 Val accuracy: 0.9284
2 Train accuracy: 0.9 Val accuracy: 0.939
3 Train accuracy: 0.92 Val accuracy: 0.9448
4 Train accuracy: 0.94 Val accuracy: 0.9454
5 Train accuracy: 1.0 Val accuracy: 0.9518
6 Train accuracy: 0.98 Val accuracy: 0.9538
7 Train accuracy: 0.92 Val accuracy: 0.957
8 Train accuracy: 0.94 Val accuracy: 0.9592
9 Train accuracy: 0.94 Val accuracy: 0.9624
10 Train accuracy: 0.94 Val accuracy: 0.9642
11 Train accuracy: 0.96 Val accuracy: 0.966
12 Train accuracy: 1.0 Val accuracy: 0.967
13 Train accuracy: 1.0 Val accuracy: 0.9678
14 Train accuracy: 0.94 Val accuracy: 0.968
15 Train accuracy: 0.96 Val accuracy: 0.9702
16 Train accuracy: 1.0 Val accuracy: 0.971
17 Train accuracy: 1.0 Val accuracy: 0.97
18 Train accuracy: 1.0 V

The **Leaky ReLU** activation function, instead of outputting 0 for negative inputs, outputs (small) negative values; **ELU**, in contrast, uses a non-linear function on its negative part. ELU is predefined in TensorFlow.
**SELU** uses a different non-linear function with respect to ELU, and tries to find the optimal shape in order to improve gradient descent. Normalization with this function gives zero mean and unit standard deviation.

In [17]:
import tensorflow as tf
import numpy as np

# Start from an empty default graph so nodes from earlier cells do not linger.
tf.reset_default_graph()

# load MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/") # download files

# define layers dimensions
n_inputs = 28*28  # MNIST image size (28x28 pixels, flattened)
n_hidden1 = 300   # units in the first hidden layer
n_hidden2 = 100   # units in the second hidden layer
n_outputs = 10    # number of output units

# Placeholders fed at run time: X receives a batch of flattened images,
# y the corresponding integer class labels.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# shape=(None,) is a real 1-tuple (rank 1, any batch size); the former shape=(None)
# evaluated to plain None, i.e. a completely unspecified shape.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

# define Leaky ReLU
def leaky_relu(z, name=None, alpha=0.01):
    """Leaky ReLU activation: element-wise ``max(alpha * z, z)``.

    Args:
        z: tensor of pre-activation values.
        name: optional name given to the resulting op.
        alpha: slope applied to negative inputs. Defaults to 0.01, the value that
            used to be hard-coded, so existing calls behave as before. It should
            not exceed 1, otherwise the maximum selects ``alpha * z`` for positive
            inputs instead of ``z``.

    Returns:
        The result of ``tf.maximum(alpha * z, z)``.
    """
    return tf.maximum(alpha * z, z, name=name)

# define SELU
def selu(z,
         scale=1.0507009873554804934193349852946,
         alpha=1.6732632423543772848170429916717):
    """SELU activation built from ``tf.nn.elu``.

    Non-negative inputs are passed through unchanged, negative inputs are replaced
    by ``alpha * elu(z)``; the result is then multiplied by ``scale``.

    Args:
        z: tensor of pre-activation values.
        scale: factor applied to the whole output.
        alpha: factor applied to the ELU of negative inputs.

    Returns:
        ``scale * where(z >= 0, z, alpha * elu(z))``.
    """
    is_non_negative = z >= 0.0
    negative_branch = alpha * tf.nn.elu(z)
    piecewise = tf.where(is_non_negative, z, negative_branch)
    return scale * piecewise

# Build the network (no softmax here; it is folded into the loss below).
# For each hidden layer three interchangeable activations are listed: the SELU
# variant is the active one, the Leaky ReLU and ELU variants are kept commented
# out so they can be swapped in.
with tf.name_scope("dnn"):
    # Leaky ReLU
    # hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=leaky_relu)
    # ELU
    # hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=tf.nn.elu)
    # SELU
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1", activation=selu)        
    # Leaky ReLU 
    # hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=leaky_relu)
    # ELU 
    # hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=tf.nn.elu)
    # SELU
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2", activation=selu)
    # Linear output layer producing the logits.
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")


# Loss for single-label multi-class classification: softmax + cross entropy computed
# from the logits and the integer labels in y, then averaged.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Step size handed to the gradient descent optimizer below.
learning_rate = 0.01

# Training step: plain gradient descent minimizing the loss.
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
# Evaluation: fraction of examples whose highest logit is the target class.
with tf.name_scope("eval"):
    # in_top_k with k=1 checks whether the target class has the highest logit
    correct = tf.nn.in_top_k(logits, y, 1) # returns a 1D tensor
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) # cast to float and average to get the accuracy

# Op that initializes the variables created above.
init = tf.global_variables_initializer()

# Saver used further down to write the trained variables to a checkpoint.
saver = tf.train.Saver()

n_epochs = 40
batch_size = 50
steps_per_epoch = mnist.train.num_examples // batch_size

# Needed for SELU: the inputs are standardized with the per-pixel mean and standard
# deviation of the training images (the small constant avoids a division by zero).
means = mnist.train.images.mean(axis=0, keepdims=True)
stds = mnist.train.images.std(axis=0, keepdims=True) + 1e-10


def standardize(images):
    """Scale ``images`` with the training-set statistics ``means`` and ``stds``."""
    return (images - means) / stds


# Train on standardized mini-batches, report accuracies after every epoch, then
# write the final weights to a checkpoint. When the hidden layers are switched to
# Leaky ReLU or ELU, the unscaled X_batch / mnist.validation.images can be fed instead.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for step in range(steps_per_epoch):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            X_batch_scaled = standardize(X_batch)
            sess.run(training_op, feed_dict={X: X_batch_scaled, y: y_batch})
        # "Train accuracy" is measured on the last (standardized) mini-batch only.
        acc_train = sess.run(accuracy, feed_dict={X: X_batch_scaled, y: y_batch})
        X_val_scaled = standardize(mnist.validation.images)
        acc_val = sess.run(accuracy, feed_dict={X: X_val_scaled, y: mnist.validation.labels})
        print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")



Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.96 Val accuracy: 0.9244
1 Train accuracy: 0.92 Val accuracy: 0.9362
2 Train accuracy: 0.98 Val accuracy: 0.9452
3 Train accuracy: 0.98 Val accuracy: 0.9506
4 Train accuracy: 0.98 Val accuracy: 0.9552
5 Train accuracy: 1.0 Val accuracy: 0.9568
6 Train accuracy: 0.98 Val accuracy: 0.9586
7 Train accuracy: 1.0 Val accuracy: 0.9594
8 Train accuracy: 0.98 Val accuracy: 0.9624
9 Train accuracy: 0.98 Val accuracy: 0.963
10 Train accuracy: 0.98 Val accuracy: 0.9636
11 Train accuracy: 0.98 Val accuracy: 0.9654
12 Train accuracy: 1.0 Val accuracy: 0.9652
13 Train accuracy: 1.0 Val accuracy: 0.9662
14 Train accuracy: 0.96 Val accuracy: 0.9678
15 Train accuracy: 1.0 Val accuracy: 0.9668
16 Train accuracy: 1.0 Val accuracy: 0.9682
17 Train accuracy: 1.0 Val accuracy: 0.9678
18 Train accuracy:

## Batch normalization
Normalizing the inputs speeds up training. When you have multiple layers, by the end the output is not normalized anymore. The idea is to normalize it again. Backpropagation takes the normalization into account.

In [18]:
import tensorflow as tf
import numpy as np

# Start from an empty default graph so nodes from earlier cells do not linger.
tf.reset_default_graph()

# load MNIST dataset
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/") # download files

# define layers dimensions
n_inputs = 28*28  # MNIST image size (28x28 pixels, flattened)
n_hidden1 = 300   # units in the first hidden layer
n_hidden2 = 100   # units in the second hidden layer
n_outputs = 10    # number of output units

# Placeholders fed at run time: X receives a batch of flattened images,
# y the corresponding integer class labels.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# shape=(None,) is a real 1-tuple (rank 1, any batch size); the former shape=(None)
# evaluated to plain None, i.e. a completely unspecified shape.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

# Scalar flag handed to the batch-normalization layers; it is False unless a
# value is fed explicitly.
training = tf.placeholder_with_default(False, shape=(), name='training')

# Network with batch normalization: every dense layer is created without an
# activation, its output is batch-normalized, and ELU is applied afterwards.
# The `training` placeholder is passed to each batch-normalization layer.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(X, n_hidden1, name="hidden1")
    bn1 = tf.layers.batch_normalization(hidden1, training=training, momentum=0.9)
    bn1_act = tf.nn.elu(bn1)

    hidden2 = tf.layers.dense(bn1_act, n_hidden2, name="hidden2")
    bn2 = tf.layers.batch_normalization(hidden2, training=training, momentum=0.9)
    bn2_act = tf.nn.elu(bn2)

    # The output layer is batch-normalized as well; no activation is applied on top.
    logits_before_bn = tf.layers.dense(bn2_act, n_outputs, name="outputs")
    logits = tf.layers.batch_normalization(logits_before_bn, training=training, momentum=0.9)

# Loss for single-label multi-class classification: softmax + cross entropy computed
# from the logits and the integer labels in y, then averaged.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y,logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")

# Step size handed to the gradient descent optimizer below.
learning_rate = 0.01

# Training step: plain gradient descent minimizing the loss.
with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)
# Evaluation: fraction of examples whose highest logit is the target class.
with tf.name_scope("eval"):
    # in_top_k with k=1 checks whether the target class has the highest logit
    correct = tf.nn.in_top_k(logits, y, 1) # returns a 1D tensor
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32)) # cast to float and average to get the accuracy

# Op that initializes the variables created above.
init = tf.global_variables_initializer()

# Saver used further down to write the trained variables to a checkpoint.
saver = tf.train.Saver()

n_epochs = 40
batch_size = 50
steps_per_epoch = mnist.train.num_examples // batch_size

# Batch normalization registers its update operations (for the moving averages) in
# the UPDATE_OPS collection; they have to be run together with the training step.
extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
train_fetches = [training_op, extra_update_ops]

# Train with mini-batch gradient descent, report accuracies after every epoch,
# then write the final weights to a checkpoint.
with tf.Session() as sess:
    sess.run(init)
    for epoch in range(n_epochs):
        for step in range(steps_per_epoch):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            # `training` is fed as True only for the training step.
            sess.run(train_fetches, feed_dict={training: True, X: X_batch, y: y_batch})
        # Accuracies are evaluated without feeding `training`; "Train accuracy"
        # is measured on the last mini-batch of the epoch only.
        last_batch_feed = {X: X_batch, y: y_batch}
        validation_feed = {X: mnist.validation.images, y: mnist.validation.labels}
        acc_train = sess.run(accuracy, feed_dict=last_batch_feed)
        acc_val = sess.run(accuracy, feed_dict=validation_feed)
        print(epoch, "Train accuracy:", acc_train, "Val accuracy:", acc_val)

    save_path = saver.save(sess, "./my_model_final.ckpt")

# print("\ntf.trainable_variables()")
# for v in tf.trainable_variables():
#    print(v.name)

# print("\ntf.global_variables()")
# for v in tf.global_variables():
#    print(v.name)


Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz
0 Train accuracy: 0.9 Val accuracy: 0.9264
1 Train accuracy: 0.94 Val accuracy: 0.944
2 Train accuracy: 0.96 Val accuracy: 0.956
3 Train accuracy: 0.96 Val accuracy: 0.9612
4 Train accuracy: 0.98 Val accuracy: 0.9626
5 Train accuracy: 0.98 Val accuracy: 0.9658
6 Train accuracy: 0.98 Val accuracy: 0.97
7 Train accuracy: 0.96 Val accuracy: 0.9714
8 Train accuracy: 1.0 Val accuracy: 0.9718
9 Train accuracy: 1.0 Val accuracy: 0.9738
10 Train accuracy: 0.98 Val accuracy: 0.9724
11 Train accuracy: 1.0 Val accuracy: 0.9738
12 Train accuracy: 1.0 Val accuracy: 0.9752
13 Train accuracy: 1.0 Val accuracy: 0.977
14 Train accuracy: 1.0 Val accuracy: 0.976
15 Train accuracy: 1.0 Val accuracy: 0.9774
16 Train accuracy: 1.0 Val accuracy: 0.9806
17 Train accuracy: 1.0 Val accuracy: 0.9776
18 Train accuracy: 1.0 Val