## MLP with TensorFlow 2
The objective of this exercise is to implement computational graphs in TensorFlow 2.0 to train and use a multilayer perceptron (MLP). The constraint we impose on ourselves is to use only **low-level** TensorFlow functions, i.e. we will not use high-level functions to compose layers or to train the parameters.

If you get this error when executing the first cell: `ModuleNotFoundError: No module named 'tensorflow'`, it probably means that TensorFlow 2.0 is not installed on your machine yet.

In [None]:
#############################
# MNIST Dataset Preparation #
#############################

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

# Load MNIST through Keras: images plus integer class labels for train and test
mnist = tf.keras.datasets.mnist
(x_train, y_train_vec), (x_test, y_test_vec) = mnist.load_data()
# rescale the pixel values by 1/255 (this also turns the arrays into floats)
x_train, x_test = x_train / 255.0, x_test / 255.0
# flatten every image into one row vector; the sizes are read from the data
# instead of being hard-coded (60000 x 784 and 10000 x 784 for MNIST)
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)
# convert class vectors to binary class matrices (one one-hot row per sample)
y_train = tf.keras.utils.to_categorical(y_train_vec, 10, dtype='float64')
y_test = tf.keras.utils.to_categorical(y_test_vec, 10, dtype='float64')
N = x_train.shape[0]         # number of samples
D = x_train.shape[1]         # dimension of input sample
n_classes = y_train.shape[1] # output dim
print('MNIST data set ready. N={}, D={}, n_classes={}'.format(N,D,n_classes))

In [None]:
# Function to sample a random batch from dataset
def next_batch(num, data, labels):
    """Draw a random mini-batch without replacement.

    The row indices of `data` are shuffled with NumPy's global random
    generator and the first `num` of them select the batch, so samples and
    labels stay paired.

    Args:
        num: number of samples to draw.
        data: array of samples, indexed along its first axis.
        labels: array of labels, indexed with the same row indices as `data`.

    Returns:
        A tuple `(batch_data, batch_labels)` holding the selected rows.
    """
    order = np.arange(len(data))  # one index per sample
    np.random.shuffle(order)      # random visiting order (in place)
    chosen = order[:num]          # keep only as many indices as the batch needs
    return data[chosen], labels[chosen]

In [None]:
##################
# Training phase #
##################

E = 50                # number of training epochs
B = 128               # batch size (samples per gradient step)
N = x_train.shape[0]  # number of training samples
D = x_train.shape[1]  # dimension of one input sample
H = 300               # number of neurons in the hidden layer
A = 1              # initial (and maximum) learning rate alpha

# Linear LR decay: the training loop computes the rate as A up to epoch
# `normal_epochs`, then lowers it by `decay_per_epoch` every epoch until it
# reaches `min_learning_rate`.
min_learning_rate = 0.001
normal_epochs = 15
decaying_epochs = 20

# per-epoch decrease that takes the rate from A to min_learning_rate in `decaying_epochs` epochs
decay_per_epoch = (A - min_learning_rate) / decaying_epochs

##############################################
#  COMPLETE CODE BELOW WHERE YOU SEE # ...   #
##############################################

# Build the computational graph
@tf.function # this decorator tells tf that a graph is defined
def mlp_train(x, y, alpha):
    """Run one gradient-descent step of the two-layer MLP on a batch.

    Reads and updates the module-level variables w1, b1, w2 and b2 in place.

    Args:
        x: batch of input samples, one row per sample.
        y: target outputs for the batch, one row per sample.
        alpha: learning rate; a Python number or a scalar tensor. It is cast
            to the dtype of the weights before use.

    Returns:
        A tuple `(y_pred, loss)`: the network outputs computed with the
        weights *before* the update, and the sum of squared errors divided
        by the number of samples in the batch.
    """
    # define nodes for forward computation for hidden neurons h and output neurons y_pred
    h = tf.nn.relu(tf.matmul(x, w1) + b1)  # output of first layer after ReLu activation
    y_pred = tf.nn.sigmoid(tf.matmul(h, w2) + b2) # output of second layer after sigmoid activation
    # define the loss: squared error summed over outputs, averaged over the batch.
    # The divisor is the real number of rows in x, so the value stays correct
    # even when a batch does not contain exactly B samples.
    diff = y - y_pred
    batch_size = tf.cast(tf.shape(x)[0], diff.dtype)
    loss = tf.reduce_sum(diff**2) / batch_size
    # define the gradients
    grad_w1, grad_b1, grad_w2, grad_b2 = tf.gradients(ys=loss, xs=[w1, b1, w2, b2])
    # compute the new values of the parameters; alpha is cast first because
    # multiplying a float32 rate with the float64 gradients would raise a dtype error
    alpha = tf.cast(alpha, w1.dtype)
    w1.assign_sub(alpha * grad_w1)
    b1.assign_sub(alpha * grad_b1)
    w2.assign_sub(alpha * grad_w2)
    b2.assign_sub(alpha * grad_b2)
    return y_pred, loss

# Init the tf.Variables w1, b1, w2, b2 following the given examples:
# weights start as small truncated-normal noise, biases start at zero.
# The output width comes from n_classes (set in the data-preparation cell)
# instead of a hard-coded 10, so it always matches the one-hot targets.
w1 = tf.Variable(tf.random.truncated_normal((D, H), stddev=0.1, dtype='float64'))
b1 = tf.Variable(tf.zeros([H], dtype='float64'))
w2 = tf.Variable(tf.random.truncated_normal((H, n_classes), stddev=0.1, dtype='float64'))
b2 = tf.Variable(tf.zeros([n_classes], dtype='float64'))


# Run the computational graph
J = [] # to store the evolution of loss J for each epoch (sum of the batch losses)
batches_per_epoch = N // B  # number of batches to visit for 1 epoch
for epoch in range(E):
    # Learning-rate schedule: A up to epoch `normal_epochs`, then a linear
    # decrease of `decay_per_epoch` per epoch, never below `min_learning_rate`.
    # It depends only on the epoch, so it is computed once here, not per batch.
    a = float(min(A, max(A - (epoch - normal_epochs) * decay_per_epoch, min_learning_rate)))
    # Hand the rate over as a float64 tensor: it matches the dtype of the
    # gradients inside mlp_train, and a tensor argument (unlike a changing
    # Python number) does not make tf.function retrace the graph per value.
    alpha = tf.constant(a, dtype=tf.float64)

    J_epoch = 0.0
    for _ in range(batches_per_epoch):
        # get batches calling the next_batch method provided above
        x_train_batch, y_train_batch = next_batch(B, x_train, y_train)
        with tf.device('/CPU:0'):  # change to /GPU:0 to move it to GPU
            # call the graph with the batched input, target and the scheduled
            # rate (not the initial rate A), so that the decay takes effect
            y_pred, loss_val = mlp_train(x_train_batch, y_train_batch, alpha)
        J_epoch += float(loss_val)  # accumulate as a plain float for printing/plotting
    J.append(J_epoch)
    print("epoch = {}, loss = {}, LR = {}".format(epoch, J_epoch, a))

In [None]:
# Plot the evolution of the loss: J holds one accumulated training-loss value per epoch
plt.plot(J)

In [None]:
#################
# Testing phase #
#################

N = x_test.shape[0]  # number of test samples (this overwrites the training-set value of N)
D = x_test.shape[1]  # dimension of input sample

##############################################
#  COMPLETE CODE BELOW WHERE YOU SEE # ...   #
##############################################
# Build the computational graph
@tf.function # this decorator tells tf that a graph is defined
def mlp_test(x, y=None):
    """Forward pass of the MLP using the current w1, b1, w2 and b2.

    Args:
        x: input samples, one row per sample.
        y: unused; accepted (and optional) only so that existing calls of the
            form `mlp_test(x, y)` keep working.

    Returns:
        The sigmoid outputs of the second layer, one row per input sample.
    """
    h = tf.nn.relu(tf.matmul(x, w1) + b1)  # output of first layer after ReLu activation
    y_pred = tf.nn.sigmoid(tf.matmul(h, w2) + b2) # output of second layer after sigmoid activation
    return y_pred

# Run the computational graph
with tf.device('/CPU:0'):  # change to /GPU:0 to move it to GPU
    # y_test is passed to match mlp_test's signature; the forward pass only uses x_test
    y_pred_test = mlp_test(x_test, y_test)

print('Forward pass on test set done.')
# At this stage, y_pred_test should contain the matrix of outputs on the test set with shape (N_test, 10)

In [None]:
# compute accuracy: the predicted class of a sample is the index of its largest output
y_winner = np.argmax(y_pred_test, axis=1)
N_test = y_winner.size
is_hit = y_winner == y_test_vec   # True where the prediction equals the label
num_correct = is_hit.sum()
num_missed = N_test - num_correct
accuracy = num_correct / N_test
error_rate = num_missed / N_test
for caption, count in (('# samples  : ', N_test),
                       ('# correct  : ', num_correct),
                       ('# missed   : ', num_missed)):
    print(caption, count)
for template, ratio in (('accuracy   :  %2.2f %%', accuracy),
                        ('error rate :  %2.2f %%', error_rate)):
    print(template % (ratio*100.0))

## Using Keras

In [None]:
# Same architecture as the low-level version (D inputs -> H ReLU units -> 10 sigmoid
# outputs), assembled from Keras layers and trained with plain SGD on the MSE loss.
hidden_layer = tf.keras.layers.Dense(H, input_shape=(D,), use_bias=True, activation='relu')
output_layer = tf.keras.layers.Dense(10, use_bias=True, activation='sigmoid')
model = tf.keras.models.Sequential([hidden_layer, output_layer])
model.summary()
sgd = tf.keras.optimizers.SGD(learning_rate=A)  # constant learning rate, no schedule
model.compile(optimizer=sgd, loss='mse', metrics=['accuracy'])
history2 = model.fit(x_train, y_train, batch_size=B, epochs=E)

In [None]:
# Plot the training loss that model.fit recorded in history2
plt.plot(history2.history["loss"])

In [None]:
# Evaluate on the test set with the loss and the 'accuracy' metric given to model.compile
model.evaluate(x_test, y_test, verbose=2)

The Keras implementation was much faster, most probably because the resulting graph is better optimized. The pure TensorFlow implementation reached a lower error rate, but there we also used an LR schedule, which we left out here.

### using softmax and cross entropy

In [None]:
# Same hidden layer as before, but with a softmax output layer trained with the
# categorical cross-entropy loss instead of sigmoid outputs with MSE.
hidden_layer = tf.keras.layers.Dense(H, input_shape=(D,), use_bias=True, activation='relu')
output_layer = tf.keras.layers.Dense(10, use_bias=True, activation='softmax')
model = tf.keras.models.Sequential([hidden_layer, output_layer])
model.summary()
sgd = tf.keras.optimizers.SGD(learning_rate=A)  # constant learning rate, no schedule
model.compile(optimizer=sgd, loss='categorical_crossentropy', metrics=['accuracy'])
history3 = model.fit(x_train, y_train, batch_size=B, epochs=E)

In [None]:
# Plot the training loss that model.fit recorded in history3
plt.plot(history3.history["loss"])

In [None]:
# Evaluate on the test set with the loss and the 'accuracy' metric given to model.compile
model.evaluate(x_test, y_test, verbose=2)

Using the cross-entropy loss function and a softmax activation in the last layer enabled the network to converge much faster. The final error rate on the test set is comparable to the one achieved with the TensorFlow-only implementation using an LR schedule.