In [2]:
import copy

import numpy as np

## 01 Initialization

### 01.01 2-layer NN
Initializing the parameters of the 2-layer neural network.

In [3]:
def initialize_parameters(n_x, n_h, n_y):
    """
    Create the initial weights and biases of the 2-layer neural network.

    Weights are sampled from a standard normal distribution and scaled by 0.01;
    biases start at zero.

    Arguments:
    n_x: size of the input layer
    n_h: size of the hidden layer
    n_y: size of the output layer

    Returns:
    parameters: dict containing the parameters:
                    W1: weight matrix of shape (n_h, n_x)
                    b1: bias vector of shape (n_h, 1)
                    W2: weight matrix of shape (n_y, n_h)
                    b2: bias vector of shape (n_y, 1)
    """
    weight_scale = 0.01

    # Draw the hidden-layer weights before the output-layer weights so that a
    # caller who seeds numpy's global RNG gets the same values for each matrix.
    hidden_weights = np.random.randn(n_h, n_x) * weight_scale
    output_weights = np.random.randn(n_y, n_h) * weight_scale

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

### 01.02 L-layer NN
Generalization of the initialization process to $L$ layers.

In [5]:
def initialize_parameters_deep(layer_dims):
    """
    Create the initial weights and biases for a network with an arbitrary number of layers.

    Arguments:
    layer_dims: list containing the dimensions of each layer in the network,
                input layer first

    Returns:
    parameters: dict containing the parameters "W1", "b1", ..., "WL", "bL":
                    Wl: weight matrix of shape (layer_dims[l], layer_dims[l-1]),
                        standard-normal samples scaled by 0.01
                    bl: bias vector of shape (layer_dims[l], 1), all zeros
    """
    parameters = {}

    # Walk over consecutive (previous layer, current layer) size pairs; the
    # first pair describes layer 1, hence the 1-based numbering.
    size_pairs = zip(layer_dims[:-1], layer_dims[1:])
    for layer_no, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        w_key = 'W' + str(layer_no)
        b_key = 'b' + str(layer_no)

        parameters[w_key] = np.random.randn(n_curr, n_prev) * 0.01
        parameters[b_key] = np.zeros((n_curr, 1))

        # Sanity-check the shapes before moving on to the next layer
        assert parameters[w_key].shape == (n_curr, n_prev)
        assert parameters[b_key].shape == (n_curr, 1)

    return parameters

## 02 Forward propagation

Build the forward propagation in 3 steps.
1. LINEAR
2. LINEAR -> ACTIVATION where ACTIVATION will be either ReLU or Sigmoid. 
3. [LINEAR -> RELU] $\times$ (L-1) -> LINEAR -> SIGMOID (whole model)

### 02.01 Linear forward
Implementation of the linear part of a layer's forward propagation.

The linear forward module (vectorized over all the examples) computes the following equations:

$$Z^{[l]} = W^{[l]}A^{[l-1]} +b^{[l]}, \quad \text{where } A^{[0]} = X$$

In [7]:
def linear_forward(A, W, b):
    """
    Compute the linear part of a layer's forward pass, Z = W A + b.

    Arguments:
    A: activations from previous layer (or input data): (size of previous layer, num of examples)
    W: weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b: bias vector, numpy array of shape (size of the current layer, 1)

    Returns:
    Z: the input of the activation function, also called pre-activation parameter
    cache: a python tuple containing "A", "W" and "b"; stored for computing the backward pass efficiently
    """
    # The backward pass needs exactly the inputs of this step
    linear_inputs = (A, W, b)

    # b is added to the matrix product through numpy broadcasting
    weighted_sum = np.dot(W, A)
    return weighted_sum + b, linear_inputs

### 02.02 Linear-activation forward
Implementation of the forward propagation for the LINEAR->ACTIVATION layer.

In [8]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Run one LINEAR->ACTIVATION layer of the forward pass.

    Arguments:
    A_prev: activations from previous layer (or input data): (size of previous layer, number of examples)
    W: weights matrix, numpy array of shape (size of current layer, size of previous layer)
    b: bias vector, numpy array of shape (size of the current layer, 1)
    activation: the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    A: the output of the activation function, also called the post-activation value
    cache: tuple containing "linear_cache" and "activation_cache"; stored for computing the backward pass efficiently

    Raises:
    ValueError: if activation is neither "sigmoid" nor "relu"
    """
    # Reject an unknown activation before doing any work; otherwise neither
    # branch below would run and the function would fail later with an
    # UnboundLocalError that hides the real cause.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            'activation must be "sigmoid" or "relu", got {!r}'.format(activation)
        )

    # The linear step is the same for both activations
    Z, linear_cache = linear_forward(A_prev, W, b)

    # sigmoid() and relu() are helpers that are not defined in this section;
    # both are used as returning the pair (A, activation_cache).
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    cache = (linear_cache, activation_cache)

    return A, cache

### 02.03 L-layer model forward
Implementation of forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation.


In [10]:
def L_model_forward(X, parameters):
    """
    Implement forward propagation for the [LINEAR->RELU]*(L-1)->LINEAR->SIGMOID computation

    Arguments:
    X: input data, numpy array of shape (input size, number of examples)
    parameters: dict holding "W1", "b1", ..., "WL", "bL" (output of initialize_parameters_deep())

    Returns:
    AL: activation value from the output layer
    caches: list of caches containing:
                - every cache of linear_activation_forward()
                - there are L of them, indexed from 0 to L-1
    """
    caches = []
    A = X
    L = len(parameters) // 2  # each layer contributes one "W" and one "b" entry

    # [LINEAR -> RELU]*(L-1): hidden layers 1 .. L-1
    for l in range(1, L):
        A, cache = linear_activation_forward(A,
                                             parameters['W' + str(l)],
                                             parameters['b' + str(l)],
                                             'relu')
        caches.append(cache)

    # LINEAR -> SIGMOID: output layer L
    AL, cache = linear_activation_forward(A,
                                          parameters['W' + str(L)],
                                          parameters['b' + str(L)],
                                          'sigmoid')
    caches.append(cache)

    return AL, caches

## 03 Cost function

Compute the cross-entropy cost $J$: 
$$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} \left(y^{(i)}\log(a^{[L] (i)}) + (1-y^{(i)})\log(1- a^{[L](i)})\right)$$


In [11]:
def compute_cost(AL, Y):
    """
    Compute the cross-entropy cost between predictions and labels.

    Arguments:
    AL: probability vector corresponding to your label predictions, shape (1, number of examples)
    Y: true "label" vector (for example: containing 0 if non-cat, 1 if cat), shape (1, number of examples)

    Returns:
    cost: cross-entropy cost
    """
    m = Y.shape[1]

    # Keep predictions strictly inside (0, 1). A prediction of exactly 0 or 1
    # would otherwise produce log(0) = -inf, and 0 * -inf = nan would make the
    # whole cost nan. Values already inside the interval are left unchanged.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)

    # Per-example log-likelihood of the true label
    logprobs = np.multiply(Y, np.log(AL_safe)) + np.multiply((1 - Y), np.log(1 - AL_safe))
    cost = -(1/m)*np.sum(logprobs)

    cost = np.squeeze(cost) # e.g. turns [[17]] into 17

    return cost

## 04 Backward propagation

Build the backward propagation in three steps:
1. LINEAR backward
2. LINEAR -> ACTIVATION backward where ACTIVATION computes the derivative of either the ReLU or sigmoid activation
3. [LINEAR -> RELU] $\times$ (L-1) -> LINEAR -> SIGMOID backward (whole model)

### 04.01 Linear backward
Implementation of the linear portion of backward propagation for a single layer $l$.

$$ dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T}$$
$$ db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]}$$

In [13]:
def linear_backward(dZ, cache):
    """
    Compute the gradients of the linear part of a single layer l.

    Arguments:
    dZ: grad of the cost with respect to the linear output (of current layer l)
    cache: tuple of values (A_prev, W, b) coming from the forward prop in the current layer

    Returns:
    dA_prev: grad of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW: grad of the cost with respect to W (current layer l), same shape as W
    db: grad of the cost with respect to b (current layer l), same shape as b
    """
    # The bias value itself is not needed to compute any of the gradients
    A_prev, W, _ = cache

    # Averaging factor over the examples (columns of A_prev)
    inv_m = 1 / A_prev.shape[1]

    grad_W = inv_m * np.dot(dZ, A_prev.T)
    grad_b = inv_m * np.sum(dZ, axis=1, keepdims=True)
    grad_A_prev = np.dot(W.T, dZ)

    return grad_A_prev, grad_W, grad_b

### 04.02 Linear-activation backward
Implementation of the backward propagation for the LINEAR->ACTIVATION layer.

In [15]:
def linear_activation_backward(dA, cache, activation):
    """
    Run the backward pass of one LINEAR->ACTIVATION layer.

    Arguments:
    dA: post-activation grad for current layer l
    cache: tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation: the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    Returns:
    dA_prev: grad of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW: grad of the cost with respect to W (current layer l), same shape as W
    db: grad of the cost with respect to b (current layer l), same shape as b

    Raises:
    ValueError: if activation is neither "sigmoid" nor "relu"
    """
    # Reject an unknown activation before doing any work; otherwise neither
    # branch below would run and the return statement would fail with an
    # UnboundLocalError that hides the real cause.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            'activation must be "sigmoid" or "relu", got {!r}'.format(activation)
        )

    linear_cache, activation_cache = cache

    # relu_backward() and sigmoid_backward() are helpers that are not defined
    # in this section; both are used as mapping (dA, activation_cache) to dZ.
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)

    # The linear step is the same for both activations
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

### 04.03 L-layer model backward
To backpropagate through this network, recall that the output is $A^{[L]} = \sigma(Z^{[L]})$.

Therefore, we need to compute `dAL` $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$:
```python
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
```

Then, use this post-activation gradient `dAL` to keep propagating backward. Feed in `dAL` into the LINEAR->SIGMOID backward function.

Afterwards, use a `for` loop to iterate through all the other layers with the LINEAR->RELU backward function.

Store each dA, dW, and db in the grads dictionary using the following formula:
$$\text{grads}[\text{"}dW\text{"} + \text{str}(l)] = dW^{[l]}$$

Implementation of the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group:

In [16]:
def L_model_backward(AL, Y, caches):
    """
    Implement the backward propagation for the [LINEAR->RELU] * (L-1) -> LINEAR -> SIGMOID group

    Arguments:
    AL: probability vector, output of the forward propagation (L_model_forward())
    Y: true "label" vector (containing 0 if non-cat, 1 if cat)
    caches: list of caches containing:
                - every cache of linear_activation_forward() with "relu" (it's caches[l], for l in range(L-1) i.e l = 0...L-2)
                - the cache of linear_activation_forward() with "sigmoid" (it's caches[L-1])

    Returns:
    grads: A dictionary with the gradients
             grads["dA" + str(l)] = ... (l = 0 ... L-1)
             grads["dW" + str(l)] = ... (l = 1 ... L)
             grads["db" + str(l)] = ... (l = 1 ... L)
    """
    grads = {}
    L = len(caches) # the number of layers
    Y = Y.reshape(AL.shape) # make Y take the same shape as AL

    # init backprop: derivative of the cross-entropy loss with respect to AL.
    # NOTE: entries of AL that are exactly 0 or 1 divide by zero here.
    dAL = -(np.divide(Y, AL) - np.divide(1-Y, 1-AL))

    # Lth layer (SIGMOID -> LINEAR) grad. The output layer's cache is the last
    # one in the list; it must be bound before it is used.
    current_cache = caches[L-1]
    dA_prev_temp, dW_temp, db_temp = linear_activation_backward(dAL,
                                                                current_cache,
                                                                'sigmoid')
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["dW" + str(L)] = dW_temp
    grads["db" + str(L)] = db_temp

    # Loop from l=(L-2) to l=0
    for l in reversed(range(L-1)):
        # lth layer: (RELU -> LINEAR) grad. caches[l] belongs to layer l+1,
        # which is why dW/db are stored under l+1 and dA under l.
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(dA_prev_temp,
                                                                    current_cache,
                                                                    'relu')
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

### 04.04 Update parameters
Update the parameters of the model, using gradient descent: 
$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} $$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} $$

where $\alpha$ is the learning rate. 

After computing the updated parameters, store them in the parameters dictionary.

In [17]:
def update_parameters(params, grads, learning_rate):
    """
    Apply one gradient-descent step to every W and b, without modifying the input dict.

    Arguments:
    params: dict containing your parameters
    grads: dict containing your gradients, output of L_model_backward
    learning_rate: step size (alpha) that scales each gradient in the update

    Returns:
    parameters: dict containing your updated parameters
                  parameters["W" + str(l)] = ...
                  parameters["b" + str(l)] = ...
    """
    # Work on a deep copy so the caller's dict and arrays stay untouched.
    # This relies on the standard-library `copy` module being imported.
    parameters = copy.deepcopy(params)
    L = len(parameters) // 2 # number of layers in the neural network

    # Update rule for each parameter; layers are numbered from 1 to L
    for l in range(1, L + 1):
        layer = str(l)
        parameters["W" + layer] = parameters["W" + layer] - learning_rate*grads["dW" + layer]
        parameters["b" + layer] = parameters["b" + layer] - learning_rate*grads["db" + layer]

    return parameters