# Multilayer Neural Network Breakdown by Component

We will focus on the main components of a neural network rather than performing an actual regression or classification task.

## 0 - Preparation

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
import copy

The following files are auxiliary files used for unit tests. Due to copyright restrictions, they are not provided in this repo; they serve testing purposes only.

In [3]:
from testCases import *
from dnn_utils import sigmoid, sigmoid_backward, relu, relu_backward
from public_tests import *

## 1 - Helper Functions

`sigmoid` activation function

In [4]:
def sigmoid(Z):
    """
    Numerically stable element-wise sigmoid activation.

    Arguments:
    Z: pre-activation values (scalar or array-like of any shape)

    Returns:
    A: sigmoid(Z), same shape as Z
    cache: Z itself, returned unchanged so the backward pass can reuse it
    """
    Z_arr = np.asarray(Z)
    # exp(-|Z|) always lies in (0, 1], so it cannot overflow the way
    # exp(-Z) does for large negative Z.
    decay = np.exp(-np.abs(Z_arr))
    # Z >= 0: 1 / (1 + e^{-Z});  Z < 0: e^{Z} / (1 + e^{Z}) -- the same function.
    A = np.where(Z_arr >= 0, 1 / (1 + decay), decay / (1 + decay))
    if A.ndim == 0:
        # Scalar in, scalar out (np.where would otherwise hand back a 0-d array).
        A = A[()]
    cache = Z
    return A, cache

`relu` activation function

In [5]:
def relu(Z):
    """
    Element-wise rectified linear unit.

    Arguments:
    Z: pre-activation values

    Returns:
    A: max(0, Z) applied element by element
    cache: Z itself, kept for the backward pass
    """
    # np.maximum compares two operands element by element;
    # np.max would instead reduce Z along an axis, which is not what we want.
    activated = np.maximum(0, Z)
    return activated, Z

## 2 - Initialize Parameters

`init_param` to initialize parameters for neural network with any number of layers

In [6]:
def init_param(layer_size: list[int]):
    """
    Create the weights and biases for a fully connected network of any depth.

    Argument:
    layer_size: neuron counts per layer, ordered from the input layer to the
                output layer (so its length is the number of layers, input
                and output included)

    Returns:
    parameters: {'W1': W1, 'b1': b1, ..., 'Wm': Wm, 'bm': bm}, where Wk has
                shape (layer_size[k], layer_size[k-1]) and bk has shape
                (layer_size[k], 1)
    """
    # Re-seeding on every call makes the returned weights deterministic.
    np.random.seed(3)
    parameters = {}
    # Walk consecutive (previous layer, current layer) size pairs.
    size_pairs = zip(layer_size[:-1], layer_size[1:])
    for idx, (n_prev, n_cur) in enumerate(size_pairs, start=1):
        # Small random weights, zero biases.
        parameters[f'W{idx}'] = np.random.randn(n_cur, n_prev) * 0.01
        parameters[f'b{idx}'] = np.zeros((n_cur, 1))
    return parameters

`TEST`

In [7]:
# Smoke-run init_param on a 5-4-3 layer layout; the result is kept in `parameters`.
parameters = init_param([5,4,3])
# Hand the function itself to the external checker. It is not defined in this
# notebook; presumably it comes from one of the star imports -- TODO confirm.
initialize_parameters_deep_test(init_param)

[92m All tests passed.


## 3 - Forward Propagation

Forward propagation computes the transformation of the input, layer by layer. The formulas are:

$$
Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}
$$

$$
A^{[l]} = \text{relu}(Z^{[l]})
$$
or 
$$
A^{[l]} = \text{sigmoid}(Z^{[l]}) 
$$

`linear_forward` represents a single module of linear transformation without involving activation

In [8]:
def linear_forward(A: np.ndarray, W: np.ndarray, b: np.ndarray):
    """
    Linear (affine) part of one layer: Z = W A + b.

    Arguments:
    A: output of the previous layer, or the raw data when the previous layer
       is the input layer: (previous_layer_size, num_samples)
    W: weights from the previous layer to this one:
       (current_layer_size, previous_layer_size)
    b: bias of this layer: (current_layer_size, 1)

    Returns:
    Z: result of the linear transformation
    cache: the tuple (A, W, b), kept for the backward pass
    """
    # b is broadcast across the sample axis when it is added.
    pre_activation = W @ A + b
    return pre_activation, (A, W, b)

`TEST`

In [9]:
# Fetch a sample (A, W, b) triple from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
t_A, t_W, t_b = linear_forward_test_case()
# Run linear_forward once by hand, keeping both the output and its cache.
t_Z, t_linear_cache = linear_forward(t_A, t_W, t_b)
# Pass the function itself to the external checker.
linear_forward_test(linear_forward)

[92m All tests passed.


`linear_activate_forward` represents a single module of linear transformation PLUS activation functions

In [10]:
def linear_activate_forward(A: np.ndarray, W: np.ndarray, b: np.ndarray, activation_func: str):
    """
    One full layer: linear transformation followed by an activation.

    Arguments:
    A: output of the previous layer, or the raw data when the previous layer
       is the input layer: (previous_layer_size, num_samples)
    W: weights from the previous layer to this one:
       (current_layer_size, previous_layer_size)
    b: bias of this layer: (current_layer_size, 1)
    activation_func: 'sigmoid' or 'relu'

    Returns:
    A_cur: the activated output of this layer
    cache: the tuple (linear_cache, activation_cache), where linear_cache is
           the (A, W, b) tuple from linear_forward and activation_cache is
           whatever the activation function returned as its cache (Z)

    Raises:
    ValueError: if activation_func is neither 'sigmoid' nor 'relu'
    """
    # Reject an unknown activation before doing any matrix work. ValueError is
    # a subclass of Exception, so callers that catch Exception still work.
    if activation_func not in ('sigmoid', 'relu'):
        raise ValueError(f'The input {activation_func} is not supported.')
    Z, linear_cache = linear_forward(A, W, b)
    if activation_func == 'sigmoid':
        A_cur, activation_cache = sigmoid(Z)
    else:
        A_cur, activation_cache = relu(Z)
    return A_cur, (linear_cache, activation_cache)

`TEST`

In [11]:
# Fetch sample inputs from the external test-case helper (not defined in this
# notebook; presumably from the star imports -- TODO confirm).
t_A_prev, t_W, t_b = linear_activation_forward_test_case()
# Exercise both supported activations. The second call overwrites the results
# of the first, so only the 'relu' output remains bound afterwards.
t_A, t_linear_activation_cache = linear_activate_forward(t_A_prev, t_W, t_b, 'sigmoid')
t_A, t_linear_activation_cache = linear_activate_forward(t_A_prev, t_W, t_b, 'relu')
# Pass the function itself to the external checker.
linear_activation_forward_test(linear_activate_forward)

[92m All tests passed.


`forward` function to combine multiple linear forward plus activation modules together

In [12]:
def forward(X: np.ndarray, parameters: dict):
    """
    Full forward pass:
    (linear -> relu) for every layer but the last, then linear -> sigmoid.

    Arguments:
    X: input data: (num_features, num_samples)
    parameters: {'W1': W1, 'b1': b1, ..., 'Wm': Wm, 'bm': bm}

    Returns:
    AL: output of the final sigmoid layer
    caches: list with one linear_activate_forward cache per layer, in layer order
    """
    # Each layer contributes one W and one b entry.
    depth = len(parameters) // 2
    caches = []
    activation = X
    # Every layer before the output layer uses relu.
    for idx in range(1, depth):
        W, b = parameters[f'W{idx}'], parameters[f'b{idx}']
        activation, hidden_cache = linear_activate_forward(activation, W, b, 'relu')
        caches.append(hidden_cache)
    # The output layer uses sigmoid.
    W_out, b_out = parameters[f'W{depth}'], parameters[f'b{depth}']
    AL, output_cache = linear_activate_forward(activation, W_out, b_out, 'sigmoid')
    caches.append(output_cache)
    return AL, caches

`TEST`

In [13]:
# Fetch sample data and parameters from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
t_X, t_parameters = L_model_forward_test_case_2hidden()
# Run the full forward pass once by hand.
t_AL, t_caches = forward(t_X, t_parameters)
# Pass the function itself to the external checker.
L_model_forward_test(forward)

[92m All tests passed.


## 4 - Loss Function (Cross Entropy)

$$
J = -\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right))
$$

In [14]:
def compute_loss(AL: np.ndarray, Y: np.ndarray):
    """
    Binary cross-entropy loss averaged over all samples.

    Arguments:
    AL: output of the last layer (1, number of examples)
    Y: ground truth (1, number of examples)

    Returns:
    loss: the loss value calculated over all samples
    """
    AL = np.asarray(AL)
    if not np.issubdtype(AL.dtype, np.floating):
        # finfo and log need a floating dtype.
        AL = AL.astype(float)
    num_samples = AL.shape[1]
    # A saturated output (AL exactly 0 or 1) would produce log(0) = -inf, and
    # 0 * -inf = nan even when the prediction is correct. Clipping to
    # [eps, 1 - eps] keeps the loss finite and leaves every non-saturated
    # value untouched.
    eps = np.finfo(AL.dtype).eps
    AL_safe = np.clip(AL, eps, 1 - eps)
    loss = 1 / num_samples * np.sum(- np.dot(Y, np.log(AL_safe.T)) - np.dot((1 - Y), np.log(1 - AL_safe.T)))
    return loss

`TEST`

In [15]:
# Fetch sample labels and predictions from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
t_Y, t_AL = compute_cost_test_case()
# Note the argument order: compute_loss takes (AL, Y), the reverse of the
# order in which the helper's return value is unpacked above.
t_cost = compute_loss(t_AL, t_Y)
# Pass the function itself to the external checker.
compute_cost_test(compute_loss)

[92m All tests passed.


## 5 - Backward Propagation

Backward propagation computes the gradients with respect to each parameter. The formulas are listed below:

$$ 
dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T}
$$

$$ 
db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}
$$

$$
dA^{[l-1]} = \frac{\partial \mathcal{J} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]}
$$

In [16]:
def linear_backward(dZ, cache):
    """
    Backward pass through the linear part of one layer, given dZ.

    Arguments:
    dZ: gradient of the cost with respect to this layer's Z (same shape as Z)
    cache: (A_prev, W, b) saved by the forward pass of this layer

    Returns:
    dA_prev: gradient with respect to the previous activation, shaped like A_prev
    dW: gradient with respect to W, shaped like W
    db: gradient with respect to b, shaped like b
    """
    # b is stored in the cache but is not needed for any of the gradients.
    A_prev, W, _ = cache
    # Averaging factor over the sample axis.
    inv_m = 1 / A_prev.shape[1]
    grad_W = inv_m * (dZ @ A_prev.T)
    grad_b = inv_m * np.sum(dZ, axis=1, keepdims=True)
    grad_A_prev = W.T @ dZ
    return grad_A_prev, grad_W, grad_b

`TEST`

In [17]:
# Fetch a sample dZ and linear cache from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
t_dZ, t_linear_cache = linear_backward_test_case()
# Run linear_backward once by hand.
t_dA_prev, t_dW, t_db = linear_backward(t_dZ, t_linear_cache)
# Pass the function itself to the external checker.
linear_backward_test(linear_backward)

[92m All tests passed.


`linear_activate_backward` computes the backward propagation with activation function

In [18]:
def linear_activate_backward(dA, cache, activation):
    """
    Backward propagation for one LINEAR->ACTIVATION layer.

    Arguments:
    dA: gradient with respect to this layer's activation output
    cache: tuple of values (linear_cache, activation_cache)
    activation: "sigmoid" or "relu"

    Returns:
    dA_prev: gradient with respect to A in the previous layer
    dW: gradient with respect to W in the current layer
    db: gradient with respect to b in the current layer

    Raises:
    ValueError: if activation is neither "sigmoid" nor "relu"
    """
    # Reject an unknown activation before touching the cache. ValueError is a
    # subclass of Exception, so callers that catch Exception still work.
    if activation not in ("relu", "sigmoid"):
        raise ValueError(f'The input {activation} is not supported.')
    linear_cache, activation_cache = cache
    # relu_backward / sigmoid_backward are imported from dnn_utils; they are
    # used here to turn dA into dZ for the matching activation.
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)
    # The linear step is the same for both activations, so it is done once.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

`TEST`

In [19]:
# Fetch a sample dA and layer cache from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
t_dAL, t_linear_activation_cache = linear_activation_backward_test_case()
# Exercise both supported activations. The second call overwrites the results
# of the first, so only the "relu" gradients remain bound afterwards.
t_dA_prev, t_dW, t_db = linear_activate_backward(t_dAL, t_linear_activation_cache, activation = "sigmoid")
t_dA_prev, t_dW, t_db = linear_activate_backward(t_dAL, t_linear_activation_cache, activation = "relu")
# Pass the function itself to the external checker.
linear_activation_backward_test(linear_activate_backward)

[92m All tests passed.


`backward` combines multiple backward modules together

In [20]:
def backward(AL, Y, caches):
    """
    Full backward pass: sigmoid output layer first, then every relu layer in
    reverse order.

    Arguments:
    AL: probability output of the final sigmoid layer: (1, num_samples)
    Y: ground truth: (1, num_samples); reshaped to AL's shape before use
    caches: list of linear_activate_forward caches, one per layer, in layer order

    Returns:
    grads: for a network with L = len(caches) layers, a dict holding
           'dA0' ... 'dA{L-1}', 'dW1' ... 'dWL' and 'db1' ... 'dbL'
    """
    depth = len(caches)
    labels = Y.reshape(AL.shape)
    # Derivative of the cross-entropy loss with respect to AL.
    upstream = - (np.divide(labels, AL) - np.divide(1-labels, 1-AL))
    # Output layer (sigmoid).
    dA_prev, dW, db = linear_activate_backward(upstream, caches[-1], 'sigmoid')
    grads = {
        f'dA{depth-1}': dA_prev,
        f'dW{depth}': dW,
        f'db{depth}': db,
    }
    # Remaining layers (relu), from the second-to-last down to the first.
    for idx in reversed(range(depth - 1)):
        dA_prev, dW, db = linear_activate_backward(dA_prev, caches[idx], 'relu')
        grads.update({
            f'dA{idx}': dA_prev,
            f'dW{idx+1}': dW,
            f'db{idx+1}': db,
        })
    return grads

`TEST`

In [21]:
# Fetch sample outputs, labels and caches from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
t_AL, t_Y_assess, t_caches = L_model_backward_test_case()
# Run the full backward pass once by hand. Unlike the other test variables,
# `grads` has no `t_` prefix.
grads = backward(t_AL, t_Y_assess, t_caches)
# Pass the function itself to the external checker.
L_model_backward_test(backward)

[92m All tests passed.


## 6 - Update Parameters

`update_param` function to update the parameters using gradient descent

In [22]:
def update_param(params: dict, grads: dict, learning_rate: float=1e-3):
    """
    One gradient-descent step on every layer's W and b.

    Arguments:
    params: {'W1': W1, 'b1': b1, ..., 'Wm': Wm, 'bm': bm}
    grads: gradient dict containing at least 'dW1', 'db1', ..., 'dWm', 'dbm'
    learning_rate: step size of the update

    Returns:
    parameters: a new dict with the same keys as params, holding the updated
                values; the input dict is left untouched
    """
    # Work on a deep copy so the caller's parameters are never modified.
    updated = copy.deepcopy(params)
    depth = len(params.keys()) // 2
    for idx in range(1, depth + 1):
        w_key, b_key = f'W{idx}', f'b{idx}'
        updated[w_key] = updated[w_key] - learning_rate * grads[f'dW{idx}']
        updated[b_key] = updated[b_key] - learning_rate * grads[f'db{idx}']
    return updated

In [23]:
# Fetch sample parameters and gradients from the external test-case helper (not
# defined in this notebook; presumably from the star imports -- TODO confirm).
# This rebinds `grads`, replacing the value set in the backward-pass test cell.
t_parameters, grads = update_parameters_test_case()
# Apply one gradient-descent step with learning rate 0.1 and rebind
# `t_parameters` to the updated copy.
t_parameters = update_param(t_parameters, grads, 0.1)
# Pass the function itself to the external checker.
update_parameters_test(update_param)

[92m All tests passed.
