In [None]:
# Mount Google Drive inside the Colab VM; google.colab is only importable on Colab.
from google.colab import drive
drive.mount("/content/gdrive")
# Switch the working directory to the notebook's Drive folder.
# NOTE(review): presumably this is what lets the `lib.*` imports in the next cell resolve — confirm.
%cd /content/gdrive/MyDrive/Colab Notebooks/11 - Deep Learning/

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/Colab Notebooks/11 - Deep Learning


In [None]:
import numpy as np
import h5py
import matplotlib.pyplot as plt
from lib.testCases_v4a import *
from lib.dnn_utils_v2 import sigmoid, sigmoid_backward, relu, relu_backward
# Notebook-wide matplotlib defaults.
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest' # show image pixels without smoothing
plt.rcParams['image.cmap'] = 'gray' # default colormap for images

# Initialization

In [None]:
def initialize_parameters(n_x, n_h, n_y):
    """Create the weights and biases of a two-layer network.

    Arguments:
    n_x -- size of the input layer
    n_h -- size of the hidden layer
    n_y -- size of the output layer

    Returns:
    parameters -- dict with keys, in this order:
                  "W1" -- weight matrix of shape (n_h, n_x)
                  "b1" -- zero bias vector of shape (n_h, 1)
                  "W2" -- weight matrix of shape (n_y, n_h)
                  "b2" -- zero bias vector of shape (n_y, 1)
    """
    # Fixed seed: every call returns the same weights. Note that this resets
    # NumPy's global random state as a side effect.
    np.random.seed(1)

    # Sampling order matters for reproducibility: W1 is drawn before W2.
    hidden_weights = np.random.randn(n_h, n_x) * 0.01
    output_weights = np.random.randn(n_y, n_h) * 0.01

    return {
        "W1": hidden_weights,
        "b1": np.zeros((n_h, 1)),
        "W2": output_weights,
        "b2": np.zeros((n_y, 1)),
    }

In [None]:
# Smoke test: 3 inputs, 2 hidden units, 1 output unit.
parameters = initialize_parameters(3,2,1)
# Bare last expression so the notebook displays the resulting dict.
parameters

{'W1': array([[ 0.01624345, -0.00611756, -0.00528172],
        [-0.01072969,  0.00865408, -0.02301539]]),
 'W2': array([[ 0.01744812, -0.00761207]]),
 'b1': array([[0.],
        [0.]]),
 'b2': array([[0.]])}

# L-layer Neural Network

The initialization for a deeper L-layer neural network is more complicated because there are many more weight matrices and bias vectors. When completing `initialize_parameters_deep`, make sure that the dimensions match between consecutive layers. Recall that $n^{[l]}$ is the number of units in layer $l$. Thus, for example, if the size of our input $X$ is $(12288, 209)$ (with $m=209$ examples), then:

<table style="width:100%">
    <tr>
        <td><b>Layer Name</b></td> 
        <td><b>Shape of W</b></td> 
        <td><b>Shape of b</b></td> 
        <td><b>Activation</b></td>
        <td><b>Shape of Activation</b></td> 
    </tr>
    <tr>
        <td><b>Layer 1</b></td> 
        <td>$(n^{[1]},12288)$</td>
        <td>$(n^{[1]},1)$</td>
        <td>$Z^{[1]} = W^{[1]}  X + b^{[1]} $</td>
        <td>$(n^{[1]},209)$</td>
    </tr>
    <tr>
        <td><b>Layer 2</b></td>
        <td>$(n^{[2]}, n^{[1]})$</td>
        <td>$(n^{[2]},1)$</td>
        <td>$Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}$</td>
        <td>$(n^{[2]}, 209)$</td>
    </tr>
    <tr>
        <td>$\vdots$</td>
        <td>$\vdots$</td>
        <td>$\vdots$</td>
        <td>$\vdots$</td> 
        <td>$\vdots$</td>
   </tr>
   <tr>
        <td><b>Layer L-1</b></td> 
        <td>$(n^{[L-1]}, n^{[L-2]})$</td> 
        <td>$(n^{[L-1]}, 1)$</td> 
        <td>$Z^{[L-1]} =  W^{[L-1]} A^{[L-2]} + b^{[L-1]}$</td> 
        <td>$(n^{[L-1]}, 209)$</td> 
   </tr>
   <tr>
        <td><b>Layer L</b></td>
        <td>$(n^{[L]}, n^{[L-1]})$</td>
        <td>$(n^{[L]}, 1)$</td>
        <td>$Z^{[L]} =  W^{[L]} A^{[L-1]} + b^{[L]}$</td>
        <td>$(n^{[L]}, 209)$</td>
    </tr>

</table>

When we compute $W X + b$ with NumPy, broadcasting adds the column vector $b$ to every column of $WX$. For example, if: 

$$ W = \begin{bmatrix}
    j  & k  & l\\
    m  & n & o \\
    p  & q & r 
\end{bmatrix}\;\;\; X = \begin{bmatrix}
    a  & b  & c\\
    d  & e & f \\
    g  & h & i 
\end{bmatrix} \;\;\; b =\begin{bmatrix}
    s  \\
    t  \\
    u
\end{bmatrix}\tag{2}$$

Then $WX + b$ will be:

$$ WX + b = \begin{bmatrix}
    (ja + kd + lg) + s  & (jb + ke + lh) + s  & (jc + kf + li)+ s\\
    (ma + nd + og) + t & (mb + ne + oh) + t & (mc + nf + oi) + t\\
    (pa + qd + rg) + u & (pb + qe + rh) + u & (pc + qf + ri)+ u
\end{bmatrix}\tag{3}  $$

In [None]:
def initialize_parameters_deep(layer_dims, seed=None):
    """Create the weights and biases of an L-layer network.

    Arguments:
    layer_dims -- sequence with the number of units in each layer, input layer
                  first (e.g. [5, 4, 3] is 5 inputs, 4 hidden units, 3 outputs)
    seed -- optional integer seed. When given, the weights are drawn from a
            private generator, so the call is reproducible and NumPy's global
            random state is left untouched. When None (the default), the global
            generator is used, exactly as before.

    Returns:
    parameters -- dict with, for every layer l = 1 .. len(layer_dims) - 1:
                  "W" + str(l) -- weights of shape (layer_dims[l], layer_dims[l-1])
                  "b" + str(l) -- zero biases of shape (layer_dims[l], 1)
                  The dict is empty when layer_dims has fewer than two entries.
    """
    # np.random and RandomState both expose randn, so one code path serves both.
    rng = np.random if seed is None else np.random.RandomState(seed)

    parameters = {}
    L = len(layer_dims)

    for l in range(1, L):
        n_prev, n_curr = layer_dims[l - 1], layer_dims[l]
        # Small random weights break the symmetry between units.
        parameters['W' + str(l)] = rng.randn(n_curr, n_prev) * 0.01  # (layer_dims[l], layer_dims[l-1])
        parameters['b' + str(l)] = np.zeros((n_curr, 1))  # (layer_dims[l], 1)

    return parameters

In [None]:
# Smoke test: layer sizes 5 -> 4 -> 3, so W1 is (4, 5) and W2 is (3, 4).
parameters = initialize_parameters_deep([5, 4, 3])
# Bare last expression so the notebook displays the resulting dict.
parameters

{'W1': array([[-0.01244123, -0.00626417, -0.00803766, -0.02419083, -0.00923792],
        [-0.01023876,  0.01123978, -0.00131914, -0.01623285,  0.00646675],
        [-0.00356271, -0.01743141, -0.0059665 , -0.00588594, -0.00873882],
        [ 0.00029714, -0.02248258, -0.00267762,  0.01013183,  0.00852798]]),
 'W2': array([[ 0.01108187,  0.01119391,  0.01487543, -0.01118301],
        [ 0.00845833, -0.0186089 , -0.00602885, -0.01914472],
        [ 0.01048148,  0.01333738, -0.00197415,  0.01774645]]),
 'b1': array([[0.],
        [0.],
        [0.],
        [0.]]),
 'b2': array([[0.],
        [0.],
        [0.]])}

# Forward propagation module

In [None]:
def linear_forward(A, W, b):
    """Linear part of a layer's forward pass: Z = W A + b.

    Arguments:
    A -- activations of the previous layer (or the input data)
    W -- weight matrix of the current layer
    b -- bias of the current layer

    Returns:
    Z -- pre-activation values, np.dot(W, A) + b
    cache -- the tuple (A, W, b), kept for the backward pass
    """
    weighted_input = np.dot(W, A)
    # NumPy broadcasting applies when b has fewer columns than the product.
    Z = weighted_input + b
    return Z, (A, W, b)

In [None]:
# linear_forward_test_case is not defined in this notebook; presumably it comes
# from the star import of lib.testCases_v4a — verify.
A, W, b = linear_forward_test_case()
Z, linear_cache = linear_forward(A, W, b)
# Display the cache, i.e. the (A, W, b) tuple returned by linear_forward.
linear_cache

(array([[ 1.62434536, -0.61175641],
        [-0.52817175, -1.07296862],
        [ 0.86540763, -2.3015387 ]]),
 array([[ 1.74481176, -0.7612069 ,  0.3190391 ]]),
 array([[-0.24937038]]))

## Linear-Activation Forward

In [None]:
def sigmoid(Z):
    """Element-wise logistic sigmoid.

    Note: this definition shadows the `sigmoid` imported from
    lib.dnn_utils_v2 at the top of the notebook.

    Arguments:
    Z -- pre-activation values (scalar or numpy array)

    Returns:
    A -- 1 / (1 + exp(-Z))
    cache -- Z itself, kept for the backward pass
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z
    
def relu(Z):
    """Element-wise rectified linear unit.

    Note: this definition shadows the `relu` imported from
    lib.dnn_utils_v2 at the top of the notebook.

    Arguments:
    Z -- pre-activation values

    Returns:
    A -- max(0, Z), element-wise
    cache -- Z itself, kept for the backward pass
    """
    rectified = np.maximum(0, Z)
    return rectified, Z

In [None]:
def linear_activation_forward(A_prev, W, b, activation):
    """Forward pass of one LINEAR -> ACTIVATION layer.

    Arguments:
    A_prev -- activations of the previous layer (or the input data)
    W -- weight matrix of the current layer
    b -- bias of the current layer
    activation -- name of the activation to apply: "sigmoid" or "relu"

    Returns:
    A -- post-activation values, of shape (W.shape[0], A_prev.shape[1])
    cache -- tuple (linear_cache, activation_cache), kept for the backward pass

    Raises:
    ValueError -- if activation is not "sigmoid" or "relu"
    """
    # Reject unknown names up front. Previously they fell through both branches
    # and surfaced as an UnboundLocalError on `A` further down.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            'activation must be "sigmoid" or "relu", got ' + repr(activation)
        )

    # The linear step is identical for both activations.
    Z, linear_cache = linear_forward(A_prev, W, b)

    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

In [None]:
# linear_activation_forward_test_case is not defined in this notebook;
# presumably it comes from the star import of lib.testCases_v4a — verify.
A_prev, W, b = linear_activation_forward_test_case()

# Same inputs through LINEAR -> SIGMOID
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation = "sigmoid")
print(A)

# Same inputs through LINEAR -> RELU (overwrites A and the cache from above)
A, linear_activation_cache = linear_activation_forward(A_prev, W, b, activation = "relu")
print(A)

[[0.96890023 0.11013289]]
[[3.43896131 0.        ]]


## L-Layer Forward 


<img src="https://raw.githubusercontent.com/sebastianbirk/coursera-deep-learning-specialization/master/01_neural_networks_and_deep_learning/04_deep_neural_nets_with_numpy/images/model_architecture_kiank.png" style="width:600px;height:300px;">
<caption><center> **Figure 2** : *[LINEAR -> RELU] $\times$ (L-1) -> LINEAR -> SIGMOID* model</center></caption><br>

In [None]:
def L_model_forward(X, parameters):
    """Forward pass of the full [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID model.

    Arguments:
    X -- input data, one example per column
    parameters -- dict holding "W1", "b1", ..., "WL", "bL"

    Returns:
    AL -- output of the final sigmoid layer, asserted to have shape (1, X.shape[1])
    caches -- list with one cache from linear_activation_forward per layer,
              ordered from layer 1 to layer L
    """
    # Every layer contributes one "W" and one "b" entry.
    num_layers = len(parameters) // 2

    caches = []
    activation = X

    # Hidden layers 1 .. L-1: LINEAR -> RELU.
    for layer in range(1, num_layers):
        activation, layer_cache = linear_activation_forward(
            activation,
            parameters['W' + str(layer)],
            parameters['b' + str(layer)],
            activation='relu',
        )
        caches.append(layer_cache)

    # Output layer L: LINEAR -> SIGMOID.
    AL, output_cache = linear_activation_forward(
        activation,
        parameters['W' + str(num_layers)],
        parameters['b' + str(num_layers)],
        activation='sigmoid',
    )
    caches.append(output_cache)

    # The model is a binary classifier: one output unit per example.
    assert(AL.shape == (1, X.shape[1]))
    return AL, caches

In [None]:
# L_model_forward_test_case_2hidden is not defined in this notebook; presumably
# it comes from the star import of lib.testCases_v4a — verify.
X, parameters = L_model_forward_test_case_2hidden()
AL, caches = L_model_forward(X, parameters)
print(X)
print("AL = " + str(AL))
# One cache is appended per layer, so this is also the number of layers.
print("Length of caches list = " + str(len(caches)))

[[-0.31178367  0.72900392  0.21782079 -0.8990918 ]
 [-2.48678065  0.91325152  1.12706373 -1.51409323]
 [ 1.63929108 -0.4298936   2.63128056  0.60182225]
 [-0.33588161  1.23773784  0.11112817  0.12915125]
 [ 0.07612761 -0.15512816  0.63422534  0.810655  ]]
AL = [[0.03921668 0.70498921 0.19734387 0.04728177]]
Length of caches list = 3


# Cost function

Compute the cross-entropy cost $J$ using the following formula: $$J = -\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) \tag{7}$$


In [None]:
def compute_cost(AL, Y, eps=1e-15):
    """Cross-entropy cost of the predictions AL against the labels Y.

    Arguments:
    AL -- predicted probabilities, same shape as Y
    Y -- true labels (0 or 1), of shape (1, number of examples)
    eps -- predictions are clipped to [eps, 1 - eps] before taking logs, so
           that a saturated output (exactly 0 or 1) gives a finite cost instead
           of NaN/inf. Predictions already inside that range are unchanged.

    Returns:
    cost -- scalar cross-entropy cost (shape ())
    """
    m = Y.shape[1]

    # Guard against log(0): 0 * log(0) evaluates to NaN and would poison the sum.
    AL_safe = np.clip(AL, eps, 1 - eps)

    cost = (-1/m) * np.sum(np.multiply(Y, np.log(AL_safe)) + np.multiply(1-Y, np.log(1-AL_safe)))

    # Make sure the result is a scalar, e.g. turns [[17]] into 17.
    cost = np.squeeze(cost)
    assert(cost.shape == ())
    return cost

In [None]:
# compute_cost_test_case is not defined in this notebook; presumably it comes
# from the star import of lib.testCases_v4a. It returns (Y, AL), which is the
# reverse of compute_cost's (AL, Y) argument order.
Y, AL = compute_cost_test_case()
print("cost = " + str(compute_cost(AL, Y)))

cost = 0.2797765635793422


# Backward propagation

<img src="https://raw.githubusercontent.com/sebastianbirk/coursera-deep-learning-specialization/master/01_neural_networks_and_deep_learning/04_deep_neural_nets_with_numpy/images/backprop_kiank.png" style="width:650px;height:250px;">
<caption><center> **Figure 3** : Forward and Backward propagation for *LINEAR->RELU->LINEAR->SIGMOID* <br> *The purple blocks represent the forward propagation, and the red blocks represent the backward propagation.*  </center></caption>

<!-- 
For those of you who are expert in calculus (you don't need to be to do this assignment), the chain rule of calculus can be used to derive the derivative of the loss $\mathcal{L}$ with respect to $z^{[1]}$ in a 2-layer network as follows:

$$\frac{d \mathcal{L}(a^{[2]},y)}{{dz^{[1]}}} = \frac{d\mathcal{L}(a^{[2]},y)}{{da^{[2]}}}\frac{{da^{[2]}}}{{dz^{[2]}}}\frac{{dz^{[2]}}}{{da^{[1]}}}\frac{{da^{[1]}}}{{dz^{[1]}}} \tag{8} $$

In order to calculate the gradient $dW^{[1]} = \frac{\partial L}{\partial W^{[1]}}$, you use the previous chain rule and you do $dW^{[1]} = dz^{[1]} \times \frac{\partial z^{[1]} }{\partial W^{[1]}}$. During the backpropagation, at each step you multiply your current gradient by the gradient corresponding to the specific layer to get the gradient you wanted.

Equivalently, in order to calculate the gradient $db^{[1]} = \frac{\partial L}{\partial b^{[1]}}$, you use the previous chain rule and you do $db^{[1]} = dz^{[1]} \times \frac{\partial z^{[1]} }{\partial b^{[1]}}$.

This is why we talk about **backpropagation**.
!-->

## Linear backward

For layer $l$, the linear part is: $Z^{[l]} = W^{[l]} A^{[l-1]} + b^{[l]}$ (followed by an activation).

Suppose you have already calculated the derivative $dZ^{[l]} = \frac{\partial \mathcal{L} }{\partial Z^{[l]}}$. You want to get $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$.

<img src="https://raw.githubusercontent.com/sebastianbirk/coursera-deep-learning-specialization/master/01_neural_networks_and_deep_learning/04_deep_neural_nets_with_numpy/images/linearback_kiank.png" style="width:250px;height:300px;">
<caption><center> **Figure 4** </center></caption>

The three outputs $(dW^{[l]}, db^{[l]}, dA^{[l-1]})$ are computed using the input $dZ^{[l]}$. Here are the formulas you need:
$$ dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} \tag{8}$$
$$ db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}\tag{9}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]} \tag{10}$$


In [None]:
def linear_backward(dZ, cache):
    """Backward pass through the linear part of one layer.

    Arguments:
    dZ -- gradient of the cost with respect to the layer's linear output Z
    cache -- tuple (A_prev, W, b) stored by linear_forward for this layer

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations, same shape as A_prev
    dW -- gradient with respect to W, same shape as W
    db -- gradient with respect to b, same shape as b
    """
    A_prev, W, b = cache
    # Number of examples, i.e. columns of A_prev.
    inv_m = 1 / A_prev.shape[1]

    dA_prev = np.dot(W.T, dZ)
    dW = inv_m * np.dot(dZ, A_prev.T)
    # keepdims keeps db as a column vector so it matches b.
    db = inv_m * np.sum(dZ, axis=1, keepdims=True)

    for gradient, reference in ((dA_prev, A_prev), (dW, W), (db, b)):
        assert (gradient.shape == reference.shape)

    return dA_prev, dW, db

In [None]:
# linear_backward_test_case is not defined in this notebook; presumably it
# comes from the star import of lib.testCases_v4a — verify.
dZ, linear_cache = linear_backward_test_case()
dA_prev, dW, db = linear_backward(dZ, linear_cache)
# Print the incoming gradient first, then the three gradients derived from it.
print(dZ)
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db))

[[ 1.62434536 -0.61175641 -0.52817175 -1.07296862]
 [ 0.86540763 -2.3015387   1.74481176 -0.7612069 ]
 [ 0.3190391  -0.24937038  1.46210794 -2.06014071]]
dA_prev = [[-1.15171336  0.06718465 -0.3204696   2.09812712]
 [ 0.60345879 -3.72508701  5.81700741 -3.84326836]
 [-0.4319552  -1.30987417  1.72354705  0.05070578]
 [-0.38981415  0.60811244 -1.25938424  1.47191593]
 [-2.52214926  2.67882552 -0.67947465  1.48119548]]
dW = [[ 0.07313866 -0.0976715  -0.87585828  0.73763362  0.00785716]
 [ 0.85508818  0.37530413 -0.59912655  0.71278189 -0.58931808]
 [ 0.97913304 -0.24376494 -0.08839671  0.55151192 -0.10290907]]
db = [[-0.14713786]
 [-0.11313155]
 [-0.13209101]]


## Linear-Activation backward

In [None]:
def relu_backward(dA, cache):
    """Backward pass through a single RELU unit.

    Note: this definition shadows the `relu_backward` imported from
    lib.dnn_utils_v2 at the top of the notebook.

    Arguments:
    dA -- gradient with respect to the post-activation values
    cache -- Z, the pre-activation values stored during the forward pass

    Returns:
    dZ -- gradient with respect to Z: dA where Z > 0, zero elsewhere
    """
    pre_activation = cache
    # Work on a copy so the caller's dA is left untouched.
    gradient = np.array(dA, copy=True)

    # The gradient is blocked wherever the unit was inactive (Z <= 0).
    inactive = pre_activation <= 0
    gradient[inactive] = 0

    assert (gradient.shape == pre_activation.shape)
    return gradient

def sigmoid_backward(dA, cache):
    """Backward pass through a single SIGMOID unit.

    Note: this definition shadows the `sigmoid_backward` imported from
    lib.dnn_utils_v2 at the top of the notebook.

    Arguments:
    dA -- gradient with respect to the post-activation values
    cache -- Z, the pre-activation values stored during the forward pass

    Returns:
    dZ -- gradient with respect to Z: dA * s * (1 - s), with s = sigmoid(Z)
    """
    pre_activation = cache
    # Recompute the sigmoid from Z; its derivative is s * (1 - s).
    sig = 1 / (1 + np.exp(-pre_activation))
    gradient = dA * sig * (1 - sig)

    assert (gradient.shape == pre_activation.shape)
    return gradient

In [None]:
def linear_activation_backward(dA, cache, activation):
    """Backward pass of one LINEAR -> ACTIVATION layer.

    Arguments:
    dA -- gradient with respect to this layer's post-activation values
    cache -- tuple (linear_cache, activation_cache) stored by
             linear_activation_forward for this layer
    activation -- name of the activation used in this layer: "sigmoid" or "relu"

    Returns:
    dA_prev -- gradient with respect to the previous layer's activations
    dW -- gradient with respect to this layer's W
    db -- gradient with respect to this layer's b

    Raises:
    ValueError -- if activation is not "sigmoid" or "relu"
    """
    # Reject unknown names up front. Previously they fell through both branches
    # and surfaced as an UnboundLocalError on `dA_prev` at the return.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            'activation must be "sigmoid" or "relu", got ' + repr(activation)
        )

    linear_cache, activation_cache = cache

    # Undo the activation first ...
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)

    # ... then the linear step, which is the same for both activations.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

In [None]:
# linear_activation_backward_test_case is not defined in this notebook;
# presumably it comes from the star import of lib.testCases_v4a — verify.
dAL, linear_activation_cache = linear_activation_backward_test_case()

# Same gradient and cache pushed back through LINEAR -> SIGMOID ...
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, activation = "sigmoid")
print ("sigmoid:")
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db) + "\n")

# ... and through LINEAR -> RELU (overwrites the gradients from above).
dA_prev, dW, db = linear_activation_backward(dAL, linear_activation_cache, activation = "relu")
print ("relu:")
print ("dA_prev = "+ str(dA_prev))
print ("dW = " + str(dW))
print ("db = " + str(db))

sigmoid:
dA_prev = [[ 0.11017994  0.01105339]
 [ 0.09466817  0.00949723]
 [-0.05743092 -0.00576154]]
dW = [[ 0.10266786  0.09778551 -0.01968084]]
db = [[-0.05729622]]

relu:
dA_prev = [[ 0.44090989  0.        ]
 [ 0.37883606  0.        ]
 [-0.2298228   0.        ]]
dW = [[ 0.44513824  0.37371418 -0.10478989]]
db = [[-0.20837892]]


## L-Model Backward 

<img src="https://raw.githubusercontent.com/sebastianbirk/coursera-deep-learning-specialization/master/01_neural_networks_and_deep_learning/04_deep_neural_nets_with_numpy/images/mn_backward.png" style="width:450px;height:300px;">
<caption><center>  **Figure 5** : Backward pass  </center></caption>

**Initializing backpropagation**:
To backpropagate through this network, we know that the output is, 
$A^{[L]} = \sigma(Z^{[L]})$. Your code thus needs to compute `dAL` $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$.
To do so, use this formula (derived using calculus which you don't need in-depth knowledge of):
```python
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
```

You can then use this post-activation gradient `dAL` to keep going backward. As seen in Figure 5, you can now feed in `dAL` into the LINEAR->SIGMOID backward function you implemented (which will use the cached values stored by the L_model_forward function). After that, you will have to use a `for` loop to iterate through all the other layers using the LINEAR->RELU backward function. You should store each dA, dW, and db in the grads dictionary. To do so, use this formula : 

$$grads["dW" + str(l)] = dW^{[l]}\tag{15} $$

For example, for $l=3$ this would store $dW^{[l]}$ in `grads["dW3"]`.

In [None]:
def L_model_backward(AL, Y, caches):
    """Backward pass of the full [LINEAR -> RELU] * (L-1) -> LINEAR -> SIGMOID model.

    Arguments:
    AL -- output of the forward pass (L_model_forward)
    Y -- true labels; reshaped to AL's shape before use
    caches -- list of per-layer caches from the forward pass, where caches[l]
              belongs to layer l + 1; the last entry is the sigmoid layer

    Returns:
    grads -- dict with the gradients
             grads["dA" + str(l)] for l = 0 .. L-1
             grads["dW" + str(l)] and grads["db" + str(l)] for l = 1 .. L
    """
    grads = {}

    # the number of layers
    L = len(caches)

    # after this line, Y is the same shape as AL
    Y = Y.reshape(AL.shape)

    # Initializing the backpropagation: derivative of the cost with respect to AL.
    # AL is clipped away from exactly 0 and 1 first; otherwise the divisions
    # below produce inf/NaN for saturated outputs. Values already inside
    # [eps, 1 - eps] are unchanged.
    eps = 1e-15
    AL_safe = np.clip(AL, eps, 1 - eps)
    dAL = - (np.divide(Y, AL_safe) - np.divide(1 - Y, 1 - AL_safe))

    # Lth layer (SIGMOID -> LINEAR) gradients. Inputs: "dAL, current_cache". Outputs: "grads["dAL-1"], grads["dWL"], grads["dbL"]
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        # lth layer: (RELU -> LINEAR) gradients.
        # Inputs: "grads["dA" + str(l + 1)], current_cache". Outputs: "grads["dA" + str(l)] , grads["dW" + str(l + 1)] , grads["db" + str(l + 1)]
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l + 1)], current_cache, activation = "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

In [None]:
# L_model_backward_test_case and print_grads are not defined in this notebook;
# presumably both come from the star import of lib.testCases_v4a — verify.
AL, Y_assess, caches = L_model_backward_test_case()
grads = L_model_backward(AL, Y_assess, caches)
print_grads(grads)

dW1 = [[0.41010002 0.07807203 0.13798444 0.10502167]
 [0.         0.         0.         0.        ]
 [0.05283652 0.01005865 0.01777766 0.0135308 ]]
db1 = [[-0.22007063]
 [ 0.        ]
 [-0.02835349]]
dA1 = [[ 0.12913162 -0.44014127]
 [-0.14175655  0.48317296]
 [ 0.01663708 -0.05670698]]


## Update Parameters

In this section you will update the parameters of the model, using gradient descent: 

$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} \tag{16}$$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} \tag{17}$$

where $\alpha$ is the learning rate. After computing the updated parameters, store them in the parameters dictionary. 

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every layer's W and b.

    Arguments:
    parameters -- dict holding "W1", "b1", ..., "WL", "bL"
    grads -- dict holding "dW1", "db1", ..., "dWL", "dbL"
    learning_rate -- step size of the update

    Returns:
    parameters -- the same dict object that was passed in, with each entry
                  rebound to value - learning_rate * gradient
    """
    # Every layer contributes one "W" and one "b" entry.
    num_layers = len(parameters) // 2

    for layer in range(1, num_layers + 1):
        suffix = str(layer)
        for kind in ("W", "b"):
            key = kind + suffix
            # Rebinding the key (rather than `-=`) leaves the old arrays untouched.
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]

    return parameters

In [None]:
# update_parameters_test_case is not defined in this notebook; presumably it
# comes from the star import of lib.testCases_v4a — verify.
parameters, grads = update_parameters_test_case()
# One gradient-descent step with learning rate 0.1.
parameters = update_parameters(parameters, grads, 0.1)

print ("W1 = "+ str(parameters["W1"]))
print ("b1 = "+ str(parameters["b1"]))
print ("W2 = "+ str(parameters["W2"]))
print ("b2 = "+ str(parameters["b2"]))

W1 = [[-0.59562069 -0.09991781 -2.14584584  1.82662008]
 [-1.76569676 -0.80627147  0.51115557 -1.18258802]
 [-1.0535704  -0.86128581  0.68284052  2.20374577]]
b1 = [[-0.04659241]
 [-1.28888275]
 [ 0.53405496]]
W2 = [[-0.55569196  0.0354055   1.32964895]]
b2 = [[-0.84610769]]
