# Neural networks: Multi Layer Perceptron
## In this lab we will learn about Multi-layer Perceptrons



In [7]:
import numpy as np
import seaborn as sns
import pandas as pd

import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.examples.tutorials.mnist import input_data


# From sklearn we will use the implementations of the Multi-layer perceptron
from sklearn import cluster
from sklearn.datasets import fetch_mldata
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import binarize
from sklearn.neural_network import BernoulliRBM
from sklearn.datasets.mldata import fetch_mldata
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LinearRegression 
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import load_digits
from keras.datasets import mnist


# We will also use different metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import recall_score
from sklearn import metrics





from IPython.display import display
from IPython.display import Image

In [8]:
# Load MNIST and turn the digit labels into a binary target:
# 1 when the label is greater than 4, 0 otherwise.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
print(y_train)
y_train = (y_train > 4).astype(int)
print(y_train)
print(y_test)
y_test = (y_test > 4).astype(int)
print(y_test)

# The sample counts are read from the arrays instead of being hard-coded, so
# the cell keeps working if a subset of the data is loaded.
train_n_samples = X_train.shape[0]
X_train = np.reshape(X_train, (train_n_samples, 28*28))  # What are these two lines doing?
X_test = np.reshape(X_test, (X_test.shape[0], 28*28))

[5 0 4 ... 5 6 8]
[1 0 0 ... 1 1 1]
[7 2 1 ... 4 5 6]
[1 0 0 ... 0 1 1]


In [9]:
def visualize_coefs(clf, grid_shape=(4, 4), image_shape=(28, 28)):
    """
    Show the first-layer weights of a classifier as a grid of grayscale images.

    clf must be the instanced (and trained) classifier. Its ``coefs_[0]``
    matrix is read column by column: each column is reshaped to
    ``image_shape`` and drawn on its own axis.

    grid_shape is the (rows, columns) layout of the figure; at most
    rows * columns columns of the weight matrix are drawn.
    image_shape is the (height, width) every weight column is reshaped to.
    """
    first_layer = clf.coefs_[0]
    # squeeze=False always returns a 2-D array of axes, even for a 1x1 grid
    fig, axes = plt.subplots(*grid_shape, squeeze=False)
    # use global min / max to ensure all weights are shown on the same scale
    vmin, vmax = first_layer.min(), first_layer.max()
    for coef, ax in zip(first_layer.T, axes.ravel()):
        ax.matshow(coef.reshape(image_shape), cmap=plt.cm.gray, vmin=.5 * vmin, vmax=.5 * vmax)
        # hide the ticks of every image, not only of the last one
        ax.set_xticks(())
        ax.set_yticks(())

    plt.show()

## Exercise 1

Now that we know how to create an MLP the easy way, we are going to go one step further and design an MLP from scratch using tensorflow.

Suggestion: Use the same technique used in the tensorflow notebook to design the MLP. Following the notation used in that notebook (in which we formalized the linear models, y = W*X + b), design a one-hidden-layer MLP, which can be defined as y = (W1*X + b1)*W2 + b2. The rest of the tensorflow components (loss function, optimizer, ...) need no changes to work with this model. The MLP will have one single hidden layer, of 100 neurons.

Take into account that this model will be used for **binary classification** (this affects the loss function that needs to be optimized).

In [10]:
# Start from an empty default graph so that re-running this cell does not
# keep nodes left over from a previous run.
tf.reset_default_graph()
# Number of samples fed to the network in each training step.
mini_batch_size = 100

# Declare the placeholders
# (the training cell feeds them under the names `X` and `Y`; the labels are
# fed reshaped to a single column)



# Declare the rest of variables
# (weights and biases of the hidden layer and of the output layer)



# Create the model y = (W1*X + b1)*W2 + b2 and loss function
# (the training cell evaluates them under the names `prediction` and `loss`,
# and applies tf.nn.sigmoid to `prediction` itself)







In [11]:
# Step size that the optimizer declared below is expected to use.
learning_rate = 1e-4

# Declare an optimizer

# Initializer op, created after the optimizer declaration above.
init = tf.global_variables_initializer()

# Training schedule: number of mini-batch steps and how often to log.
training_epochs, display_step = 40000, 500
perm = np.random.permutation(train_n_samples)

# Mini-batches needed to cover the training set once (ceiling division:
# a trailing partial batch counts as one more batch).
n_batch = -(-train_n_samples // mini_batch_size)

## Exercise 2

Use the designed MLP to learn a **classifier** for the MNIST dataset that was used in the example. You can reuse the training algorithm used in the tensorflow notebook to train this model.

Optional: Along with the error, display the accuracy of the model while training it. For that, you will have to compute it each time (inside the display_step *if*), or use a tensorflow operation (which will be a more optimized and sophisticated approach). For the first option, you will have to compute the predictions and compare them with the actual labels. For the second option, you will have to compute a "secondary loss function" which is not used for optimization, but only to show results.

In [12]:
# Build the sigmoid op once, outside the loop: calling
# tf.nn.sigmoid(prediction) at every display step would add a new node to
# the graph each time.
probabilities = tf.nn.sigmoid(prediction)
init = tf.global_variables_initializer()


with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:

    sess.run(init)

    # Each "epoch" of this loop is one mini-batch step, not a full pass over the data.
    for epoch in range(training_epochs):
        # Walk through the training set in order, wrapping around every n_batch steps.
        i_batch = (epoch % n_batch)*mini_batch_size
        batch = X_train[i_batch:i_batch+mini_batch_size], y_train[i_batch:i_batch+mini_batch_size]
        # Labels are fed as a single column to match the Y placeholder.
        feed = {X: batch[0], Y: np.reshape(batch[1], (-1,1))}
        # run optimization operation

        if (epoch+1) % display_step == 0:
            # err: loss on the current mini-batch; p: sigmoid of the model output
            err, p = sess.run((loss, probabilities), feed_dict=feed)

            # Compute and show the accuracy of the model

            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(err))

    # This loss is evaluated on the test split, so it is reported as a test loss.
    test_loss = sess.run(loss, feed_dict={X: X_test, Y: np.reshape(y_test, (-1,1))})
    print("Test loss=", test_loss, '\n')
    

Epoch: 0500 cost= 30314.716796875
Epoch: 1000 cost= 16630.796875000
Epoch: 1500 cost= 7495.603515625
Epoch: 2000 cost= 9143.077148438
Epoch: 2500 cost= 4318.079101562
Epoch: 3000 cost= 7912.834960938
Epoch: 3500 cost= 7438.613769531
Epoch: 4000 cost= 6567.265136719
Epoch: 4500 cost= 2717.686035156
Epoch: 5000 cost= 4734.321777344
Epoch: 5500 cost= 2321.935302734
Epoch: 6000 cost= 3813.561035156
Epoch: 6500 cost= 4045.684326172
Epoch: 7000 cost= 3392.591308594
Epoch: 7500 cost= 1636.912841797
Epoch: 8000 cost= 2671.666259766
Epoch: 8500 cost= 1197.667236328
Epoch: 9000 cost= 2026.114013672
Epoch: 9500 cost= 2538.968017578
Epoch: 10000 cost= 1859.041259766
Epoch: 10500 cost= 1050.409912109
Epoch: 11000 cost= 1622.656738281
Epoch: 11500 cost= 658.687500000
Epoch: 12000 cost= 1118.507202148
Epoch: 12500 cost= 1590.380371094
Epoch: 13000 cost= 1029.534423828
Epoch: 13500 cost= 739.396240234
Epoch: 14000 cost= 963.368652344
Epoch: 14500 cost= 348.022857666
Epoch: 15000 cost= 631.003967285
Ep

## Exercise 3 

Modify the visualize_coefs function so that it can show some of the coefficients in the first layer of the learned network. Instead of accessing the coefficients computed in the mlp object, you will have to show the coefficients computed in the weight variables.

Up until this point, we have let tensorflow optimize our models (both the simple ones that we did in the last lab, and the more complex MLP in this one) without much knowledge about the numerical computations that the optimizer did. Now that we know more about error backpropagation, we are going to dig deeper into tensorflow, and see what operations are being performed in each step of the model training.

To know how the values change in each learning iteration, we are going to divide the optimizing operation in two halves, gradient computing, and gradient application:

In [13]:
# Split the optimizer step into two separate graph operations.
# NOTE(review): `adam` and `variable_list` are not defined in the visible cells
# above this one; they have to come from the Exercise 1/2 solution code.
# `grads` computes the gradient of `loss` for the variables in `variable_list`.
grads = adam.compute_gradients(loss, var_list=variable_list)
# `application` applies those gradients to the variables when it is run.
application = adam.apply_gradients(grads)

In the previous cell, we have set a tensorflow operation that computes the gradients for all the variables, and then, we have defined another operation that applies these gradients to the variable.

The *grads* object contains the current value of the variables, and the gradient of these variables.
Using these two tensorflow operations results in the same outcome as using the minimize(loss) function of the optimizer.


In [14]:
# Build the sigmoid op once, outside the loop: calling
# tf.nn.sigmoid(prediction) at every display step would add a new node to
# the graph each time.
probabilities = tf.nn.sigmoid(prediction)
init = tf.global_variables_initializer()


with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:

    sess.run(init)

    # Each "epoch" of this loop is one mini-batch step, not a full pass over the data.
    for epoch in range(training_epochs):
        i_batch = (epoch % n_batch)*mini_batch_size
        batch = X_train[i_batch:i_batch+mini_batch_size], y_train[i_batch:i_batch+mini_batch_size]
        # Labels are fed as a single column to match the Y placeholder.
        feed = {X: batch[0], Y: np.reshape(batch[1], (-1,1))}
        show_progress = (epoch+1) % display_step == 0

        if show_progress:
            # Not needed for optimization, just for visualization. The gradients are
            # evaluated before the update is applied, and only on display steps:
            # printing them on every one of the training steps floods the output.
            gs = sess.run(grads, feed_dict=feed)
            print(gs)

        sess.run(application, feed_dict=feed)

        if show_progress:
            # err: loss on the current mini-batch; p: sigmoid of the model output
            err, p = sess.run((loss, probabilities), feed_dict=feed)
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(err))

    # This loss is evaluated on the test split, so it is reported as a test loss.
    test_loss = sess.run(loss, feed_dict={X: X_test, Y: np.reshape(y_test, (-1,1))})
    print("Test loss=", test_loss, '\n')

Epoch: 0500 cost= 35199.300781250
Epoch: 1000 cost= 15801.134765625
Epoch: 1500 cost= 14258.865234375
Epoch: 2000 cost= 15981.629882812
Epoch: 2500 cost= 5333.068847656
Epoch: 3000 cost= 4249.783691406
Epoch: 3500 cost= 5076.312011719
Epoch: 4000 cost= 3628.100097656
Epoch: 4500 cost= 4364.167480469
Epoch: 5000 cost= 5796.270019531
Epoch: 5500 cost= 1905.759399414
Epoch: 6000 cost= 2384.908203125
Epoch: 6500 cost= 2428.159423828
Epoch: 7000 cost= 1798.767822266
Epoch: 7500 cost= 2060.764648438
Epoch: 8000 cost= 2803.760986328
Epoch: 8500 cost= 984.450683594
Epoch: 9000 cost= 1578.440185547
Epoch: 9500 cost= 1150.056762695
Epoch: 10000 cost= 1133.869262695
Epoch: 10500 cost= 1162.719238281
Epoch: 11000 cost= 1608.653442383
Epoch: 11500 cost= 563.721862793
Epoch: 12000 cost= 956.941101074
Epoch: 12500 cost= 625.649902344
Epoch: 13000 cost= 731.614135742
Epoch: 13500 cost= 697.470703125
Epoch: 14000 cost= 957.274169922
Epoch: 14500 cost= 334.106262207
Epoch: 15000 cost= 538.670043945
Epoc

## Exercise 4

Now, we are going to simulate the backpropagation algorithm. As you know, this algorithm computes the gradients (how the weights should change) of the network starting from the last layer to the first one. For that, we have to accomplish the following objectives:

1) Create a deep neural network with more than one hidden layer. Keep the number of neurons relatively low, so that the training of the model does not consume much time.

2) Use the *compute_gradients* function to compute the gradients of the layers. In this exercise, this operation must be done separately for each layer; you need to call the *compute_gradients* function for each layer of the network. Use the *var_list* parameter of the function.

3) Use the *apply_gradients* function as it needs to be used, to create the tensorflow operations that we will later use to update the weights.

4) Modify the training algorithm. Include in this algorithm the gradient application operations that you created.

Take into account the order in which these operations have to be run by tensorflow, maintaining coherence with the backpropagation algorithm.

In [15]:
# Start from an empty default graph so that re-running this cell does not
# keep nodes left over from the previous exercises.
tf.reset_default_graph()
# Number of samples fed to the network in each training step.
mini_batch_size = 100

# Input features: one row per sample, 784 values per row (the data cell
# reshapes every image to 28*28 values). The leading None leaves the number
# of rows open, so both a mini-batch and the whole test set can be fed.
X = tf.placeholder("float", shape=[None, 784], name="X")
# Labels as a single column; the training cell reshapes them to (-1, 1)
# before feeding. Note that the graph name of this placeholder is lowercase "y".
Y = tf.placeholder("float", shape=[None, 1], name="y")


# Declare the rest of variables
# (weights and biases, one pair per layer)



# Create the model
# (the following cells use it under the names `prediction` and `loss`)



In [16]:
# Adam optimizer with a fixed step size.
learning_rate = 1e-4
adam = tf.train.AdamOptimizer(learning_rate=learning_rate)

# Single-op training step over the whole model.
optimizer = adam.minimize(loss)

# Initializer op, created after the optimizer.
init = tf.global_variables_initializer()

# Training schedule: number of mini-batch steps and how often to log.
training_epochs, display_step = 40000, 500
perm = np.random.permutation(train_n_samples)

# Mini-batches needed to cover the training set once (ceiling division:
# a trailing partial batch counts as one more batch).
n_batch = -(-train_n_samples // mini_batch_size)

In [17]:
# Declare the compute_gradients and apply_gradient operations.

In [18]:
# Build the sigmoid op once, outside the loop: calling
# tf.nn.sigmoid(prediction) at every display step would add a new node to
# the graph each time.
probabilities = tf.nn.sigmoid(prediction)
init = tf.global_variables_initializer()


with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:

    sess.run(init)

    # Each "epoch" of this loop is one mini-batch step, not a full pass over the data.
    for epoch in range(training_epochs):
        # Walk through the training set in order, wrapping around every n_batch steps.
        i_batch = (epoch % n_batch)*mini_batch_size
        batch = X_train[i_batch:i_batch+mini_batch_size], y_train[i_batch:i_batch+mini_batch_size]
        # Labels are fed as a single column to match the Y placeholder.
        feed = {X: batch[0], Y: np.reshape(batch[1], (-1,1))}

        # Run applications

        if (epoch+1) % display_step == 0:
            # err: loss on the current mini-batch; p: sigmoid of the model output
            err, p = sess.run((loss, probabilities), feed_dict=feed)

            # Compute and show the accuracy of the model

            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(err))

    # This loss is evaluated on the test split, so it is reported as a test loss.
    test_loss = sess.run(loss, feed_dict={X: X_test, Y: np.reshape(y_test, (-1,1))})
    print("Test loss=", test_loss, '\n')
    

Epoch: 0500 cost= 31113.199218750
Epoch: 1000 cost= 8554.460937500
Epoch: 1500 cost= 8937.524414062
Epoch: 2000 cost= 14167.705078125
Epoch: 2500 cost= 5528.160156250
Epoch: 3000 cost= 6429.468750000
Epoch: 3500 cost= 7346.627929688
Epoch: 4000 cost= 3989.730712891
Epoch: 4500 cost= 3643.077392578
Epoch: 5000 cost= 6967.380859375
Epoch: 5500 cost= 2028.039672852
Epoch: 6000 cost= 3920.076171875
Epoch: 6500 cost= 3830.023437500
Epoch: 7000 cost= 2369.577392578
Epoch: 7500 cost= 2162.732666016
Epoch: 8000 cost= 3556.782470703
Epoch: 8500 cost= 1083.005493164
Epoch: 9000 cost= 2510.074462891
Epoch: 9500 cost= 2099.039550781
Epoch: 10000 cost= 1259.440307617
Epoch: 10500 cost= 1269.145141602
Epoch: 11000 cost= 1929.218383789
Epoch: 11500 cost= 591.718566895
Epoch: 12000 cost= 1578.470947266
Epoch: 12500 cost= 1373.240356445
Epoch: 13000 cost= 687.904907227
Epoch: 13500 cost= 853.590148926
Epoch: 14000 cost= 1093.875488281
Epoch: 14500 cost= 343.560974121
Epoch: 15000 cost= 973.454467773
Ep

## Exercise 5

**This exercise is optional, and, if completed, should be uploaded to egela.**

We can, however, dig further into the computation of the gradients. For this exercise, using the optimizer's *compute_gradients* function is not allowed. Instead of that function, we will have to use the *tf.gradients* function, which computes the gradients. In this case, we are not making use of the optimizers implemented in tensorflow. Therefore, we will need to perform the optimization on our own.

1) Use the *tf.gradients* function (https://www.tensorflow.org/api_docs/python/tf/gradients) to declare an operation that computes the gradients of the variables.

2) Use the *tf.assign* function (https://www.tensorflow.org/api_docs/python/tf/assign) to create an operation that assigns certain value to a variable. In our case, the weights and biases of the model.

3) Modify the training loop so that it can train the model using the defined operations, instead of those that use the tensorflow optimizers.

4) Modify the algorithm so that it implements learning_rate decay.

In [19]:
# Start from an empty default graph so that re-running this cell does not
# keep nodes left over from the previous exercises.
tf.reset_default_graph()
# Number of samples fed to the network in each training step.
mini_batch_size = 100

# Declare placeholders
# (the training cell feeds them under the names `X` and `Y`)

# Declare the rest of variables
# (the next cell expects them collected in a list named `variable_list`)

# Create the model
# (the following cells use it under the names `prediction` and `loss`)



In [20]:
# Training schedule: number of mini-batch steps and how often to log.
training_epochs = 40000
display_step = 500
perm = np.random.permutation(train_n_samples)

# Mini-batches needed to cover the training set once; the boolean term adds
# one batch when the last one is only partially filled.
n_batch = train_n_samples // mini_batch_size + (train_n_samples % mini_batch_size != 0)

# One gradient tensor of `loss` per entry of `variable_list`.
gs = tf.gradients(loss, variable_list)

# The learning rate is a variable so that it can be changed with tf.assign
# (learning-rate decay). It is marked non-trainable because it is a
# hyper-parameter, not a model weight.
learning_rate = tf.Variable(0.00001, trainable=False)

# Single initializer, created only after every variable (including
# learning_rate) exists. An initializer created earlier in the cell would not
# cover learning_rate.
init = tf.global_variables_initializer()

In [21]:
# Create list of variables that need to be updated (first without learning_rate decay, then with decay)
    



In [22]:
#Optimized solution

# Build the sigmoid op once, outside the loop: calling
# tf.nn.sigmoid(prediction) at every display step would add a new node to
# the graph each time.
probabilities = tf.nn.sigmoid(prediction)

with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:

    sess.run(init)

    # Each "epoch" of this loop is one mini-batch step, not a full pass over the data.
    for epoch in range(training_epochs):
        # Walk through the training set in order, wrapping around every n_batch steps.
        i_batch = (epoch % n_batch)*mini_batch_size
        batch = X_train[i_batch:i_batch+mini_batch_size], y_train[i_batch:i_batch+mini_batch_size]
        # Labels are fed as a single column to match the Y placeholder.
        feed = {X: batch[0], Y: np.reshape(batch[1], (-1,1))}
        # Run updates

        if (epoch+1) % display_step == 0:
            # err: loss on the current mini-batch; p: sigmoid of the model output
            err, p = sess.run((loss, probabilities), feed_dict=feed)

            # Compute and show the accuracy of the model

            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(err))

    # This loss is evaluated on the test split, so it is reported as a test loss.
    test_loss = sess.run(loss, feed_dict={X: X_test, Y: np.reshape(y_test, (-1,1))})
    print("Test loss=", test_loss, '\n')

Epoch: 0500 cost= 2643.220947266
Epoch: 1000 cost= 772.092407227
Epoch: 1500 cost= 983.369201660
Epoch: 2000 cost= 405.895874023
Epoch: 2500 cost= 193.922836304
Epoch: 3000 cost= 339.877197266
Epoch: 3500 cost= 187.692291260
Epoch: 4000 cost= 389.246246338
Epoch: 4500 cost= 287.942840576
Epoch: 5000 cost= 277.470214844
Epoch: 5500 cost= 146.926696777
Epoch: 6000 cost= 462.152618408
Epoch: 6500 cost= 200.870971680
Epoch: 7000 cost= 292.698394775
Epoch: 7500 cost= 339.680358887
Epoch: 8000 cost= 286.861572266
Epoch: 8500 cost= 202.637817383
Epoch: 9000 cost= 154.537170410
Epoch: 9500 cost= 189.646255493
Epoch: 10000 cost= 249.607971191
Epoch: 10500 cost= 290.993347168
Epoch: 11000 cost= 378.125854492
Epoch: 11500 cost= 131.790023804
Epoch: 12000 cost= 188.687072754
Epoch: 12500 cost= 255.569412231
Epoch: 13000 cost= 204.271820068
Epoch: 13500 cost= 255.360778809
Epoch: 14000 cost= 633.768920898
Epoch: 14500 cost= 80.204093933
Epoch: 15000 cost= 183.813385010
Epoch: 15500 cost= 249.433731