# TensorFlow

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.optimizers.legacy import Adam
import matplotlib.pyplot as plt

In [2]:
tf.__version__  # show which TensorFlow release this kernel is running

'2.20.0'

## Simple cost function

In [3]:
# Scalar trainable parameter, starting from zero.
w = tf.Variable(initial_value=0, dtype=tf.float32)
# Adam optimizer used by the training step below to update `w`.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

In TensorFlow, only the forward propagation is coded. TF automatically computes back prop. One way to do this is with `tf.GradientTape`.

`tf.GradientTape`:
- records operations during forward prop.
- when played back in reverse, it automatically computes back prop and grads.

In [4]:
def train_step():
    """Run one optimizer update on the notebook-level variable `w`.

    The cost ``w**2 - 10*w + 25`` is evaluated under a `tf.GradientTape`,
    its gradient with respect to `w` is read back from the tape, and the
    notebook-level `optimizer` applies that gradient to `w`.
    """
    with tf.GradientTape() as tape:
        # Forward pass only; the tape records it so the gradient can be derived.
        cost = w ** 2 - 10 * w + 25
    # A one-element source list yields a one-element gradient list.
    (grad,) = tape.gradient(cost, [w])
    optimizer.apply_gradients([(grad, w)])

Given a cost function $J = w^2 - 10w + 25$, the value that minimizes $J$ is $w = 5$.

In [5]:
w  # initial value, before any optimizer step has been applied

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>

In [6]:
train_step()  # run a single optimizer step
w  # display the updated parameter

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0999993085861206>

In [7]:
for i in range(1000):  # run 1000 further optimizer steps
    train_step()
w  # display the result; the cost is minimized at w = 5

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=5.000000953674316>

## Extending to train data

In [8]:
# Fresh parameter so this section starts again from w = 0.
w = tf.Variable(initial_value=0, dtype=tf.float32)
# Coefficients of the quadratic cost: x[0] * w**2 + x[1] * w + x[2].
x = np.array([1.0, -10.0, 25.0], dtype=np.float32)
# New optimizer instance for the re-created parameter.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

In [9]:
def cost_fn(x, weight=None):
    """Evaluate the quadratic cost ``x[0] * w**2 + x[1] * w + x[2]``.

    Args:
        x: Indexable holding the three coefficients of the quadratic
            (quadratic, linear and constant term, in that order).
        weight: Point at which to evaluate the cost. When omitted, the
            notebook-level variable ``w`` is used, which is the original
            behaviour of this function.

    Returns:
        The value of the cost at ``weight``.
    """
    if weight is None:
        # Backward-compatible default: read the notebook-level parameter.
        weight = w
    return (x[0] * weight ** 2) + (x[1] * weight) + x[2]
w  # display the re-created parameter (starts from 0 again)

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0>

In [10]:
def train_step(x, w, optimizer):
    """Apply one optimizer update to `w` for the cost defined by `x`.

    Args:
        x: Coefficients forwarded to `cost_fn`.
        w: Variable to differentiate with respect to and to update.
        optimizer: Object whose `apply_gradients` performs the update.

    Note:
        The cost comes from `cost_fn(x)`, which is called without `w`. The
        `w` passed here should therefore be the same variable that `cost_fn`
        evaluates; otherwise the tape presumably has no path from the cost
        to `w` and no usable gradient is produced.
    """
    with tf.GradientTape() as tape:
        cost = cost_fn(x)
    # A one-element source list yields a one-element gradient list.
    (grad,) = tape.gradient(cost, [w])
    optimizer.apply_gradients([(grad, w)])

In [11]:
train_step(x, w, optimizer)  # a single update using the coefficient array
w  # display the parameter after that one step

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=0.0999993085861206>

In [12]:
def training(x, w, optimizer):
    """Call `train_step` 1000 times and return `w`.

    Args:
        x: Coefficients forwarded to `train_step`.
        w: Variable forwarded to `train_step`; the same object is returned.
        optimizer: Optimizer forwarded to `train_step`.

    Returns:
        The `w` object that was passed in.
    """
    num_steps = 1000
    for _ in range(num_steps):
        train_step(x, w, optimizer)
    return w

w = training(x, w, optimizer)

In [13]:
w  # display the parameter after the 1000-step training loop

<tf.Variable 'Variable:0' shape=() dtype=float32, numpy=5.000000953674316>

## Linear function

In [14]:
def linear_function():
    """Compute ``Y = W X + b`` for randomly drawn operands.

    X is a (3, 1) constant, W a (4, 3) variable and b a (4, 1) variable, all
    filled with samples from `np.random.randn`.

    Returns:
        The (4, 1) tensor ``W X + b``.
    """
    # Draw order (X, then W, then b) is kept so that a given NumPy seed
    # produces the same operands.
    inputs = tf.constant(np.random.randn(3, 1), name='X')
    weights = tf.Variable(np.random.randn(4, 3), name='W')
    bias = tf.Variable(np.random.randn(4, 1), name='b')
    product = tf.matmul(weights, inputs)
    return tf.add(product, bias)

## Sigmoid function

In [15]:
# GRADED FUNCTION: sigmoid

def sigmoid(z):
    """Return the sigmoid of `z`, computed in float32.

    Args:
        z: Value or tensor; it is cast to `tf.float32` before the activation.

    Returns:
        Result of `tf.keras.activations.sigmoid` applied to the cast input.
    """
    return tf.keras.activations.sigmoid(tf.cast(z, tf.float32))

## One hot encodings

Oftentimes, you will have a $Y$ vector whose entries are class indices ranging from $0$ to $C-1$.

For example, if $C = 4$:
$$
y = [1 \; 2 \; 3 \; 0 \; 2 \; 1] \ \text{is often converted to}
\begin{bmatrix}
0 & 0 & 0 & 1 & 0 & 0 \\
1 & 0 & 0 & 0 & 0 & 1 \\
0 & 1 & 0 & 0 & 1 & 0 \\
0 & 0 & 1 & 0 & 0 & 0 \\
\end{bmatrix}
$$

Rows 1 through 4 correspond to classes 0, 1, 2, and 3, respectively.

This is _one hot_ encoding. In the converted representation, exactly one element of each column is _hot_—set to 1.

In TensorFlow, use the following line of code to implement one hot encoding: `tf.one_hot(labels, depth, axis=0)`


In [16]:
def one_hot_matrix(label, C=6):
    """One-hot encode `label` into a rank-1 tensor of length `C`.

    Args:
        label: Class index to encode (assumed to be a single scalar label,
            since the result is reshaped to exactly `C` elements).
        C: Number of classes, i.e. the depth of the encoding.

    Returns:
        Tensor of shape (C,) produced by `tf.one_hot`.
    """
    encoded = tf.one_hot(label, C, axis=0)
    # Flatten to a plain vector of length C.
    return tf.reshape(encoded, shape=[C])

## Parameter initialization

Initializing parameters using the Glorot (Xavier) initializer.

In [17]:
def initialize_parameters():
    """Create the weights and biases of the three-layer network.

    Every tensor is drawn from a Glorot (Xavier) normal initializer and
    wrapped in a `tf.Variable`. The shapes correspond to layer sizes
    12288 -> 25 -> 12 -> 6, with weights stored as (units_out, units_in)
    and biases as (units_out, 1).

    Returns:
        dict mapping "W1", "b1", "W2", "b2", "W3", "b3" to their variables.
    """
    # Built locally: the notebook never defines a global `initializer`, so the
    # function previously raised NameError on a fresh kernel. The fixed seed
    # keeps "Restart & Run All" reproducible.
    initializer = tf.keras.initializers.GlorotNormal(seed=1)

    shapes = {"W1": (25, 12288),
              "b1": (25, 1),
              "W2": (12, 25),
              "b2": (12, 1),
              "W3": (6, 12),
              "b3": (6, 1)}

    # Dict order is insertion order, so variables are created W1, b1, ..., b3.
    parameters = {name: tf.Variable(initializer(shape))
                  for name, shape in shapes.items()}

    return parameters

## Forward propagation

For TensorFlow, you only need to implement the forward prop function. 

It will keep track of the operations you did to calculate the back prop automatically.

In [18]:
def forward_propagation(X, parameters):
    """Forward pass LINEAR -> RELU -> LINEAR -> RELU -> LINEAR.

    Args:
        X: Input tensor; it is left-multiplied by W1, so examples are
            expected along the last axis (features x examples).
        parameters: dict with keys "W1", "b1", "W2", "b2", "W3", "b3".

    Returns:
        Z3, the output of the last linear unit (no activation applied).
    """
    W1, b1, W2, b2, W3, b3 = (
        parameters[key] for key in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3')
    )

    # Layer 1
    hidden1 = tf.keras.activations.relu(
        tf.math.add(tf.linalg.matmul(W1, X), b1))
    # Layer 2
    hidden2 = tf.keras.activations.relu(
        tf.math.add(tf.linalg.matmul(W2, hidden1), b2))
    # Output layer: raw logits; softmax is not applied here.
    return tf.math.add(tf.linalg.matmul(W3, hidden2), b3)

## Total loss

When the classification problem has more than two mutually exclusive classes (multiclass classification), categorical cross-entropy should be the loss function.

Cost is normally computed as the sum of losses over all samples, divided by the total number of samples.

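Summing (rather than averaging) the loss within each mini-batch keeps the final cost consistent: the per-batch totals can be added up and divided once by the total number of samples, even when the last mini-batch is smaller than the others.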

Additional remarks:
- The `y_true` and `y_pred` inputs of [tf.keras.losses.categorical_crossentropy](https://www.tensorflow.org/api_docs/python/tf/keras/losses/categorical_crossentropy) are expected to be of shape (number of examples, num_classes).
- `tf.reduce_sum` does the summation over the examples.
- Softmax is taken care of by `tf.keras.losses.categorical_crossentropy` when its parameter `from_logits=True` is set.

In [19]:
# GRADED FUNCTION: compute_total_loss 

def compute_total_loss(logits, labels):
    """Sum of the categorical cross-entropy over all examples.

    Both inputs are transposed before being handed to
    `tf.keras.losses.categorical_crossentropy`, so they are expected with
    classes along the first axis and examples along the second.

    Args:
        logits: Raw network outputs (softmax is applied inside the loss
            because `from_logits=True`).
        labels: One-hot targets laid out like `logits`.

    Returns:
        Scalar tensor: the per-example losses added together (not averaged).
    """
    y_true = tf.transpose(labels)
    y_pred = tf.transpose(logits)
    per_example = tf.keras.losses.categorical_crossentropy(
        y_true, y_pred, from_logits=True)
    return tf.reduce_sum(per_example)

## Model example

Specify an optimizer in a single line (e.g., `tf.keras.optimizers.Adam`). Then, call it within the training loop.

The `tape.gradient` function retrieves the gradients of the operations recorded inside a `GradientTape` block, using automatic differentiation.

`optimizer.apply_gradients` applies the optimizer's update rules to each trainable parameter.

Use `dataset = dataset.prefetch(8)` to avoid memory bottlenecks when reading data from disk.

`prefetch()` prepares a portion of the data in advance for the next step. Because the iteration is streaming, the data does not need to fit into memory all at once.

In [20]:
def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.0001,
          num_epochs = 1500, minibatch_size = 32, print_cost = True):
    """
    Implements a three-layer tensorflow neural network: LINEAR->RELU->LINEAR->RELU->LINEAR->SOFTMAX.

    Args:
        X_train, Y_train: `tf.data.Dataset` objects with the training inputs
            and their targets; they are zipped element-wise. Targets are
            assumed to be one-hot encoded -- TODO confirm against the caller.
        X_test, Y_test: Same as above for the evaluation split.
        learning_rate: Step size passed to Adam.
        num_epochs: Number of passes over the training set.
        minibatch_size: Number of examples per mini-batch.
        print_cost: When truthy, every 10th epoch the cost and accuracies are
            printed, the test set is evaluated, and the values are appended
            to the returned histories. When falsy the histories stay empty.

    Returns:
        Tuple ``(parameters, costs, train_acc, test_acc)``: the trained
        parameter dict and three lists with one entry per logged epoch.
    """

    costs = [] # keep track of the cost
    train_acc = []
    test_acc = []

    # initialize params
    parameters = initialize_parameters()

    # Fixed list of variables to differentiate and update; built once because
    # it does not change between mini-batches.
    trainable_variables = [parameters[name]
                           for name in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3')]

    optimizer = tf.keras.optimizers.Adam(learning_rate)

    # CategoricalAccuracy will track the accuracy for this multiclass problem
    test_accuracy = tf.keras.metrics.CategoricalAccuracy()
    train_accuracy = tf.keras.metrics.CategoricalAccuracy()

    dataset = tf.data.Dataset.zip((X_train, Y_train))
    test_dataset = tf.data.Dataset.zip((X_test, Y_test))

    # get the num of elements of a dataset using the cardinality method
    m = dataset.cardinality().numpy()

    minibatches = dataset.batch(minibatch_size).prefetch(8)
    test_minibatches = test_dataset.batch(minibatch_size).prefetch(8)

    # train loop
    for epoch in range(num_epochs):

        epoch_total_loss = 0.

        # Start measuring the accuracy from zero each epoch. `reset_state()`
        # is the current Keras API; the older `reset_states()` spelling is
        # not available on Keras 3 metrics.
        train_accuracy.reset_state()

        for (minibatch_X, minibatch_Y) in minibatches:

            with tf.GradientTape() as tape:
                # Batches arrive as (examples, features); the network expects
                # (features, examples), hence the transposes.
                Z3 = forward_propagation(tf.transpose(minibatch_X), parameters) # predict
                minibatch_total_loss = compute_total_loss(Z3, tf.transpose(minibatch_Y)) # loss

            # accumulate the accuracy of all the batches
            train_accuracy.update_state(minibatch_Y, tf.transpose(Z3))

            grads = tape.gradient(minibatch_total_loss, trainable_variables)
            optimizer.apply_gradients(zip(grads, trainable_variables))
            epoch_total_loss += minibatch_total_loss

        # divide the epoch total loss over the number of samples
        epoch_total_loss /= m

        # print cost every 10 epochs
        if print_cost and epoch % 10 == 0:
            print ("Cost after epoch %i: %f" % (epoch, epoch_total_loss))
            print("Train accuracy:", train_accuracy.result())

            # evaluate the test set every 10 epochs to avoid computational overhead
            for (minibatch_X, minibatch_Y) in test_minibatches:
                Z3 = forward_propagation(tf.transpose(minibatch_X), parameters)
                test_accuracy.update_state(minibatch_Y, tf.transpose(Z3))
            print("Test_accuracy:", test_accuracy.result())

            costs.append(epoch_total_loss)
            train_acc.append(train_accuracy.result())
            test_acc.append(test_accuracy.result())
            # Clear the test metric so the next evaluation starts from zero.
            test_accuracy.reset_state()

    return parameters, costs, train_acc, test_acc

In [22]:
def plot_cost(costs, lr=0.0001):
    """Plot the recorded cost history.

    Args:
        costs: Sequence of cost values; singleton dimensions are squeezed
            away before plotting. The axis label assumes one value per 10
            epochs, which is how `model` records its history.
        lr: Learning rate shown in the title (display only).
    """
    plt.plot(np.squeeze(costs))
    plt.ylabel("Cost")
    # Was "iterations (per fives)", but the history holds one point per 10 epochs.
    plt.xlabel("epochs (per tens)")
    plt.title(f"Cost (Learning rate = {lr})")
    plt.show()

In [23]:
def plot_accuracy(train_acc, test_acc, lr=0.0001):
    """Plot train and test accuracy histories on the same axes.

    Args:
        train_acc: Sequence of training-accuracy values.
        test_acc: Sequence of test-accuracy values.
        lr: Learning rate shown in the title (display only).

    The axis label assumes one value per 10 epochs, which is how `model`
    records its history.
    """
    plt.plot(np.squeeze(train_acc), label="Train Accuracy")
    plt.plot(np.squeeze(test_acc), label="Test Accuracy")
    plt.ylabel("Accuracy")
    # Was "iterations (per fives)", but the history holds one point per 10 epochs.
    plt.xlabel("epochs (per tens)")
    plt.title(f"Accuracy (Learning rate = {lr})")
    plt.legend()
    plt.show()