# **Checking tensorflow version**

In [0]:
# Print the installed TensorFlow version; the next cell only needs to be run
# when this reports a version below 2.0.0.
import tensorflow as tf
print(tf.__version__)

In [0]:
## only run this if the tf version is less than 2.0.0
!pip uninstall tensorflow
!pip install tensorflow==2.0.0

# **Tensorflow Low Level API for MNIST dataset**

In [0]:
## Import dependencies

import datetime, os
import numpy as np 
import tensorflow as tf 
from tqdm import trange
from matplotlib import pyplot as plt
# NOTE: this second import rebinds the name `tf` to the TF1 compatibility API,
# replacing the plain `import tensorflow as tf` above. Every later `tf.` call
# in this notebook (placeholders, Session, tf.train, tf.keras) therefore goes
# through `tensorflow.compat.v1`, with TF2 behaviour switched off.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

In [0]:
## Load the MNIST dataset

# load_data() returns two (images, labels) pairs, unpacked here as the
# training pair and the test pair.
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.mnist.load_data()

In [0]:
## Write a function to preprocess the data for input into the MLP. It must:
## 1) Reshape the data into a 2D matrix (where each example is a row and each column represents a pixel value)
## 2) Convert the labels into one-hot encoding (hint: use scikit-learn's LabelBinarizer)

def mnist_preprocess(X_train, X_test, y_train, y_test, train_size=0.7):
    """Flatten the images, one-hot encode the labels and split off a validation set.

    Args:
        X_train: array of training examples, shape (n_train, ...). Every
            example is flattened into a single row.
        X_test: array of test examples, shape (n_test, ...), flattened the
            same way.
        y_train: 1-D array of non-negative integer class labels for X_train.
        y_test: 1-D array of non-negative integer class labels for X_test.
        train_size: fraction of X_train kept for training. The first
            round(n_train * train_size) examples are the training set and the
            remaining ones become the validation set (no shuffling is done).

    Returns:
        (X_train, y_train), (X_val, y_val), (X_test, y_test), where each X is
        a 2-D matrix with one example per row and each y is a float one-hot
        matrix with one column per class.
    """
    training_set_size = round(X_train.shape[0] * train_size)

    # One row per example; the product of the trailing dimensions gives the
    # number of columns, so inputs of any rank are accepted.
    X_train = X_train.reshape(X_train.shape[0], int(np.prod(X_train.shape[1:])))
    X_test = X_test.reshape(X_test.shape[0], int(np.prod(X_test.shape[1:])))
    X_train, X_val = X_train[:training_set_size], X_train[training_set_size:]

    # Use ONE class count for both label sets so that the train, validation
    # and test one-hot matrices always have the same number of columns, even
    # when one of the sets happens not to contain the highest class.
    num_classes = int(max(y_train.max(), y_test.max())) + 1
    one_hot_rows = np.eye(num_classes)  # row k is the one-hot vector of class k

    y_train_one_hot = one_hot_rows[y_train]
    y_train, y_val = (y_train_one_hot[:training_set_size],
                      y_train_one_hot[training_set_size:])
    y_test = one_hot_rows[y_test]
    return (X_train, y_train), (X_val, y_val), (X_test, y_test)

# Replace the raw arrays loaded earlier with their flattened / one-hot encoded
# versions and split off a validation set. Because the same names are rebound,
# re-run the data-loading cell before running this line a second time.
(X_train, y_train), (X_val, y_val), (X_test, y_test) = mnist_preprocess(X_train, X_test, y_train, y_test, train_size=0.7)

Feedforward propagation. Similar to linear and logistic regression, neural networks require a feedforward portion which calculates the output in a series of operations and transformations. 

In [0]:
## Define the hyperparameters for a 4-layer MLP (1st layer - 784 units, 2nd layer - 500 units, 3rd layer - 100 units, last layer - 10 units)

n_input = 784  # width of the input placeholder; must equal the number of columns of the flattened images
n_hidden_1 = 500  # units in the first hidden layer
n_hidden_2 = 100  # units in the second hidden layer
output = 10  # width of the output layer and of the one-hot label placeholder
batch_size = 2000  # examples per mini-batch (used by both the manual loop and model.fit)
training_epochs = 8  # number of passes over the training set
learning_rate = 0.0005  # step size passed to the Adam optimizer

In [0]:
## Define placeholders for inputs and outputs
## (the leading None dimension leaves the number of rows per batch unspecified)

x = tf.placeholder(dtype=tf.float32, shape=[None, n_input])
y = tf.placeholder(dtype=tf.float32, shape=[None, output])

## Define the additional weights and biases. W1 and b1 have been done for you.

def _random_normal_variable(shape):
    """Create a trainable variable initialised from tf.random_normal(shape)."""
    return tf.Variable(tf.random_normal(shape))

# input -> hidden layer 1
W1 = _random_normal_variable([n_input, n_hidden_1])
b1 = _random_normal_variable([n_hidden_1])
# hidden layer 1 -> hidden layer 2
W2 = _random_normal_variable([n_hidden_1, n_hidden_2])
b2 = _random_normal_variable([n_hidden_2])
# hidden layer 2 -> output layer
W3 = _random_normal_variable([n_hidden_2, output])
b3 = _random_normal_variable([output])

In [0]:
## Construct the hypothesis for the MLP. The first 2 steps have been done for you.
def _affine(inputs, weights, bias):
    """Return the pre-activation `inputs @ weights + bias` for one layer."""
    return tf.add(tf.matmul(inputs, weights), bias)

# Hidden layer 1: affine transform followed by ReLU
y_preact_1 = _affine(x, W1, b1)
y_act_1 = tf.nn.relu(y_preact_1)
# Hidden layer 2: affine transform followed by ReLU
y_preact_2 = _affine(y_act_1, W2, b2)
y_act_2 = tf.nn.relu(y_preact_2)
# Output layer: raw logits only; the softmax is applied inside the cost function
y_preact_3 = _affine(y_act_2, W3, b3)

The loss we commonly use in classification is cross-entropy. Cross-entropy is a concept from information theory:

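$$H_{y'}(y) = -\sum_i y'_i \log(y_i)$$

where $y$ is the predicted probability distribution and $y'$ is the true (one-hot) distribution. (Note that in the code below, the placeholder `y` holds the true labels and the predictions come from the logits `y_preact_3`.)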

Cross-entropy not only captures how correct the model's answers are (whether the maximum probability corresponds to the right answer), it also accounts for how confident they are (how high the probability assigned to the correct answer is). This encourages the model to produce very high probabilities for correct answers while driving down the probabilities of the wrong answers, instead of merely being satisfied with the correct answer being the argmax.

Thereafter, back propagation (feeding the error backwards through the network) takes over. This step is specific to neural networks and is not needed in linear or logistic regression: the "deep", layered architecture of a neural network makes it difficult to work out the partial derivatives of the cost function directly, hence the need for back propagation. Intuitively, it can be seen as working out how much each node in each layer contributes to the total cost, and then adjusting the weights accordingly. For a more detailed explanation and the math behind it: https://towardsdatascience.com/understanding-backpropagation-algorithm-7bb3aa2f95fd

In [0]:
## Define cost function
## (softmax cross-entropy per example, computed from the raw logits, then averaged over the batch)

per_example_cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
    labels=y, logits=y_preact_3)
cost = tf.reduce_mean(per_example_cross_entropy)

## Optimizer
## (running `train_optimizer` in a session performs one Adam update step on `cost`)

adam = tf.train.AdamOptimizer(learning_rate)
train_optimizer = adam.minimize(cost)

In [0]:
## Run model training
## For every epoch: run one optimizer step per training batch, then evaluate the
## cost on every validation batch, and record the mean cost of each phase.

init = tf.global_variables_initializer()
with tf.Session() as sess:
  sess.run(init)
  avg_training_cost = []    # mean training cost of each epoch
  avg_validation_cost = []  # mean validation cost of each epoch
  epoch = []                # epoch indices, used as the x-axis of the cost curves
  # Only full batches are used; examples left over after the last full batch
  # are skipped. The counts do not change between epochs, so compute them once.
  total_num_of_training_batches = X_train.shape[0] // batch_size
  total_num_of_validation_batches = X_val.shape[0] // batch_size
  for j in trange(training_epochs):
    epoch.append(j)
    average_training_cost = 0
    average_validation_cost = 0
    # Visit every batch (range(n), not range(n - 1)): each cost is weighted by
    # 1 / n below, so skipping a batch would bias the epoch average downwards.
    for i in range(total_num_of_training_batches):
      batch_x = X_train[(batch_size*i) : (batch_size*(i+1))]
      batch_y = y_train[(batch_size*i) : (batch_size*(i+1))]
      # feed_dict supplies the values for the placeholders x and y
      _, c = sess.run([train_optimizer, cost], feed_dict={x: batch_x, y: batch_y})
      average_training_cost += c / total_num_of_training_batches
    avg_training_cost.append(average_training_cost)
    # Validation evaluates `cost` only (no optimizer step), feeding the current
    # validation batch rather than the whole validation set on every iteration.
    for i in range(total_num_of_validation_batches):
      batch_x = X_val[(batch_size*i) : (batch_size*(i+1))]
      batch_y = y_val[(batch_size*i) : (batch_size*(i+1))]
      c = sess.run(cost, feed_dict={x: batch_x, y: batch_y})
      average_validation_cost += c / total_num_of_validation_batches
    avg_validation_cost.append(average_validation_cost)
    print("Epoch {} done!".format(j))


Debugging and evaluating model performance is the most important part of building a machine learning model, and it often takes up the most time! We use the cost curves to inform us about the model's performance and to infer which tweaks (especially to the hyperparameters) are needed to improve the results.

In [0]:
## Plot cost curves
## (per-epoch mean training and validation cost recorded by the training loop)
fig, ax = plt.subplots()
ax.plot(epoch, avg_training_cost, label='train')
ax.plot(epoch, avg_validation_cost, label='valid')
ax.legend()
ax.set_xlabel("number of epochs")
ax.set_ylabel("loss")
ax.set_title("cost curves")
plt.show()

# **KERAS WITH TENSORBOARD**

In [0]:
## Builds model with hyperparameters defined
## (layers are stacked one at a time; the first layer also declares the input width)

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(n_input, input_dim=X_train.shape[1], activation=tf.nn.relu))
model.add(tf.keras.layers.Dropout(0.2))
model.add(tf.keras.layers.Dense(n_hidden_1, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(n_hidden_2, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(output, activation=tf.nn.softmax))

model.compile(
    optimizer=tf.train.AdamOptimizer(learning_rate),
    loss=tf.keras.losses.categorical_crossentropy,
    metrics=[tf.keras.metrics.categorical_accuracy],
)

## Model training
## (each run logs to its own timestamped sub-directory of "logs" for TensorBoard)

run_timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join("logs", run_timestamp)
callbacks = tf.keras.callbacks.TensorBoard(logdir, histogram_freq=1)
training = model.fit(
    X_train,
    y_train,
    callbacks=[callbacks],
    epochs=training_epochs,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
)

In [0]:
## Plotting cost curves (preferred)
## (reads the per-epoch losses that model.fit stored in its History object)

history = training.history
fig, ax = plt.subplots()
ax.plot(history['loss'], label='train')
ax.plot(history['val_loss'], label='validation')
ax.set_title('cost curves')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(loc='upper left')
plt.show()

Or, you can use TensorBoard if you're confident...

In [0]:
!pip uninstall -q -y tensorboard
!pip uninstall -q -y tensorflow
# Install nightly TensorFlow with nightly TensorBoard.
!pip install --ignore-installed tf-nightly
!rm -rf ./logs/ 

In [0]:
## Loads tensorboard utilities

# (Re)load the TensorBoard notebook extension, then start TensorBoard on the
# "logs" directory, i.e. the parent of the timestamped run directory that the
# Keras TensorBoard callback was given.
%reload_ext tensorboard
%tensorboard --logdir logs