# Deep Learning
* In this notebook all required functions will be implemented manually.
* This notebook shows how to build a deep neural network (with at least 2 hidden layers).
* ( Test cases will be added )

#### First import the required libraries

In [None]:
import numpy as np
import h5py
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

print("Libraries are imported :) ")

## Initialization

### 2-layer Neural Network
* Create and initialize the parameters

In [None]:
def init_parameters(n_x, n_h, n_y):
    """
    Create the starting weights and biases of a 2-layer network.

    n_x = Size of the input layer
    n_h = Size of the hidden layer
    n_y = Size of the output layer

    Returns a dict with the keys 'w1', 'w2', 'b1' and 'b2'.
    """
    # Why are the random weights multiplied by 0.01?
    # If the weights were set to zero like the biases, the hidden units would
    # all compute the same thing, so there would be no difference between them.
    # If they were too large, learning would be slow because gradient
    # descent's convergence is affected by big values.
    scale = 0.01

    # Weights are drawn in layer order (hidden first, then output).
    hidden_weights = np.random.randn(n_h, n_x) * scale
    output_weights = np.random.randn(n_y, n_h) * scale

    return {
        'w1': hidden_weights,
        'w2': output_weights,
        'b1': np.zeros((n_h, 1)),
        'b2': np.zeros((n_y, 1)),
    }

In [None]:
# Quick check: build a network with 3 inputs, 2 hidden units and 1 output,
# then display the resulting parameter dictionary.
parameters = init_parameters(3,2,1)
print(parameters)

### Activation Functions

In [None]:
def sigmoid(Z):
    """
    Apply the logistic sigmoid element-wise.

    Z -- pre-activation values

    Returns (A, cache) where A = 1 / (1 + exp(-Z)) and cache is Z itself,
    returned unchanged.
    """
    exp_neg = np.exp(-Z)
    activation = 1 / (1 + exp_neg)

    return activation, Z

def relu(Z):
    """
    Apply the rectified linear unit element-wise.

    Z -- pre-activation values

    Returns (A, cache) where A = max(0, Z) and cache is Z itself,
    returned unchanged.
    """
    rectified = np.maximum(0, Z)
    # The activation must not change the layout of its input.
    assert rectified.shape == Z.shape

    return rectified, Z

**Output Table**:
       
<table style="width:65%">
  <tr>
    <td> -- > w1 | </td>
    <td> [[-0.00525133,  0.00427224, -0.00012176],
       [ 0.00343739, -0.00911596, -0.0039911 ]] </td> 
  </tr>

  <tr>
    <td> -- > b1 |</td>
    <td>[[ 0.]
 [ 0.]]</td> 
  </tr>
  
  <tr>
    <td>-- > w2 |</td>
    <td> [[0.00046495, 0.00540605]]</td>
  </tr>
  
  <tr>
    <td>-- > b2 | </td>
    <td> [[ 0.]] </td> 
  </tr>
  
</table>

### L-Layer Neural Network

* We should make sure that our dimensions match between each layer. Recall that $n^{[l]}$ is the number of units in layer $l$. Thus for example if the size of our input $X$ is $(1309, 209)$ (with $m=209$ examples) then:

<table style="width:100%">
    <tr>
        <td>  </td> 
        <td><b>Shape of "w"</b> </td> 
        <td><b>Shape of "b"</b></td> 
        <td> <b>Activation</b> </td>
        <td> <b>Shape of Activation</b> </td> 
    </tr>
    <tr>
        <td> <b>Layer 1</b></td> 
        <td> $(n^{[1]},1309)$ </td> 
        <td> $(n^{[1]},1)$ </td> 
        <td> $Z^{[1]} = W^{[1]}  X + b^{[1]} $ </td> 
        <td> $(n^{[1]},209)$ </td> 
    </tr>
    <tr>
        <td><b>Layer 2</b></td> 
        <td> $(n^{[2]}, n^{[1]})$  </td> 
        <td> $(n^{[2]},1)$ </td> 
        <td>$Z^{[2]} = W^{[2]} A^{[1]} + b^{[2]}$ </td> 
        <td> $(n^{[2]}, 209)$ </td> 
    </tr>
    <tr>
        <td> $\vdots$ </td> 
        <td> $\vdots$  </td> 
        <td> $\vdots$  </td> 
        <td> $\vdots$</td> 
        <td> $\vdots$  </td> 
    </tr>
    <tr>
        <td> <b>Layer L-1</b> </td> 
        <td> $(n^{[L-1]}, n^{[L-2]})$ </td> 
        <td> $(n^{[L-1]}, 1)$  </td> 
        <td>$Z^{[L-1]} =  W^{[L-1]} A^{[L-2]} + b^{[L-1]}$ </td> 
        <td> $(n^{[L-1]}, 209)$ </td> 
    </tr>
    <tr>
        <td> <b>Layer L</b> </td> 
        <td> $(n^{[L]}, n^{[L-1]})$ </td> 
        <td> $(n^{[L]}, 1)$ </td>
        <td> $Z^{[L]} =  W^{[L]} A^{[L-1]} + b^{[L]}$</td>
        <td> $(n^{[L]}, 209)$  </td> 
    </tr>
</table>

In [None]:
def init_parameters_L(layer_dimension):
    """
    Create the starting weights and biases of an L-layer network.

    layer_dimension -- dimensions of each layer in the network, input layer first

    Returns a dict holding, for every layer l >= 1, 'w<l>' of shape
    (layer_dimension[l], layer_dimension[l-1]) filled with small random
    values and 'b<l>' of shape (layer_dimension[l], 1) filled with zeros.
    """
    parameters = {}

    # Walk over consecutive (previous layer, current layer) size pairs.
    size_pairs = zip(layer_dimension, layer_dimension[1:])
    for layer, (n_prev, n_curr) in enumerate(size_pairs, start=1):
        w_key = 'w' + str(layer)
        b_key = 'b' + str(layer)

        parameters[w_key] = np.random.randn(n_curr, n_prev) * 0.01
        parameters[b_key] = np.zeros((n_curr, 1))

        assert parameters[w_key].shape == (n_curr, n_prev)
        assert parameters[b_key].shape == (n_curr, 1)

    return parameters

In [None]:
# Quick check: layer sizes 5 -> 4 -> 3, i.e. two weight/bias pairs to display.
parameters = init_parameters_L([5,4,3])
print("w1 = " + str(parameters["w1"]))
print("b1 = " + str(parameters["b1"]))
print("w2 = " + str(parameters["w2"]))
print("b2 = " + str(parameters["b2"]))

## Forward Propagation

### Linear Forward

In [None]:
def linear_forward(a, w, b):
    """
    Compute the linear part of one layer's forward pass: z = w . a + b.

    a -- activations from previous layer (or input data): (size of previous layer, number of examples)
    w -- weights matrix: numpy array of shape (size of current layer, size of previous layer)
    b -- bias vector, numpy array of shape (size of the current layer, 1)

    Returns (z, cache):
    z -- the input of the activation function, also called pre-activation parameter
    cache -- the tuple (a, w, b), kept for computing the backward pass
    """
    weighted_sum = np.dot(w, a)
    z = weighted_sum + b

    # One row per unit of the current layer, one column per example.
    expected_shape = (w.shape[0], a.shape[1])
    assert z.shape == expected_shape

    return z, (a, w, b)

### Linear Activation Forward

Mathematical relation is: $A^{[l]} = g(Z^{[l]}) = g(W^{[l]}A^{[l-1]} +b^{[l]})$ where the activation "g" can be sigmoid() or relu().

In [None]:
def linear_activation_forward(A_prev, W, b, activation):
    """
    Forward pass of one LINEAR -> ACTIVATION layer.

    A_prev -- activations from previous layer (or input data): (size of previous layer, number of examples)
    W -- weights matrix: (size of current layer, size of previous layer)
    b -- bias vector: (size of current layer, 1)
    activation -- the activation to be used in this layer: "sigmoid" or "relu"

    A -- the output of the activation function, also called the post-activation value
    cache -- a tuple containing "linear_cache" and "activation_cache";
             stored for computing the backward pass efficiently

    Raises ValueError if activation is neither "sigmoid" nor "relu".
    """
    # Resolve the activation first so an unsupported name fails fast with a
    # clear message instead of an UnboundLocalError further down.
    if activation == "sigmoid":
        activation_fn = sigmoid
    elif activation == "relu":
        activation_fn = relu
    else:
        raise ValueError(
            "Unsupported activation %r; expected 'sigmoid' or 'relu'" % (activation,)
        )

    # The linear step is the same for every activation.
    Z, linear_cache = linear_forward(A_prev, W, b)
    A, activation_cache = activation_fn(Z)

    assert (A.shape == (W.shape[0], A_prev.shape[1]))
    cache = (linear_cache, activation_cache)

    return A, cache

### L-Layer Model
* When implementing the $L$-layer Neural Net, we will need a function that replicates the previous one (`linear_activation_forward` with RELU) $L-1$ times, then follows that with one `linear_activation_forward` with SIGMOID.
* Implementing the forward propagation of the above model.

In [None]:
def L_model_forward(X, parameters):
    """
    Forward propagation for the [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID model.

    X -- input data, shape (input size, number of examples)
    parameters -- output of init_parameters_L(); weights are read from the
                  keys 'w1'..'wL' (upper-case 'W1'..'WL' is accepted as a
                  fallback) and biases from 'b1'..'bL'

    AL -- last post-activation value
    caches -- list with one cache from linear_activation_forward() per layer,
              in layer order (index l-1 holds the cache of layer l)
    """

    caches = []
    A = X
    L = len(parameters) // 2 # number of layers in the neural network

    def _weights(layer):
        # init_parameters_L() stores weights under lower-case 'w<l>'; fall
        # back to 'W<l>' so dictionaries using that spelling keep working.
        key = 'w' + str(layer)
        if key not in parameters:
            key = 'W' + str(layer)
        return parameters[key]

    # Implement [LINEAR -> RELU]*(L-1). Add "cache" to the "caches" list.
    for l in range(1, L):
        A_prev = A
        A, cache = linear_activation_forward(A_prev, _weights(l), parameters['b' + str(l)], activation = "relu")
        caches.append(cache)

    # Implement LINEAR -> SIGMOID. Add "cache" to the "caches" list.
    AL, cache = linear_activation_forward(A, _weights(L), parameters['b' + str(L)], activation = "sigmoid")
    # Append the output layer's cache (not the list itself) so the backward
    # pass finds a (linear_cache, activation_cache) tuple at caches[L-1].
    caches.append(cache)
    assert(AL.shape == (1,X.shape[1]))

    return AL, caches

## Cost Function
* We need to check if our model is learning or not.
* Denote, Cost(J) = $$-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(a^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- a^{[L](i)}\right)) $$

In [None]:
def compute_cost(al, y):
    """
    Cross-entropy cost: -1/m * sum(y*log(al) + (1-y)*log(1-al)).

    al -- probability vector corresponding to label predictions, shape (1, number of examples)
    y -- true "label" vector (for example: containing 0 false, 1 if true), shape (1, number of examples)

    cost -- cost, returned as a 0-dimensional value (non-negative for
            probabilities strictly between 0 and 1)
    """

    # Number of examples comes from the label array passed in as `y`.
    m = y.shape[1]

    # Compute loss from al and y.
    log_probs = np.multiply(np.log(al), y) + np.multiply(np.log(1 - al), (1 - y))
    # The minus sign is applied exactly once here; the result is returned as is.
    cost = -np.sum(log_probs) / m

    cost = np.squeeze(cost)
    assert(cost.shape == ())

    return cost

## Linear Backward
$$ dW^{[l]} = \frac{\partial \mathcal{J} }{\partial W^{[l]}} = \frac{1}{m} dZ^{[l]} A^{[l-1] T} $$
$$ db^{[l]} = \frac{\partial \mathcal{J} }{\partial b^{[l]}} = \frac{1}{m} \sum_{i = 1}^{m} dZ^{[l](i)}$$
$$ dA^{[l-1]} = \frac{\partial \mathcal{L} }{\partial A^{[l-1]}} = W^{[l] T} dZ^{[l]}$$

In [None]:
def relu_backward(dA, cache):
    """
    Backward pass through a RELU unit.

    dA -- post-activation gradient
    cache -- the Z value stored by the forward pass

    Returns dZ: a copy of dA with entries zeroed wherever Z <= 0.
    """
    pre_activation = cache
    # Work on a copy so the caller's gradient array is left untouched.
    grad = np.array(dA, copy=True)

    inactive = pre_activation <= 0
    grad[inactive] = 0
    assert grad.shape == pre_activation.shape

    return grad

def sigmoid_backward(dA, cache):
    """
    Backward pass through a SIGMOID unit.

    dA -- post-activation gradient
    cache -- the Z value stored by the forward pass

    Returns dZ = dA * s * (1 - s) with s = sigmoid(Z).
    """
    pre_activation = cache

    sig = 1 / (1 + np.exp(-pre_activation))
    grad = dA * sig
    grad = grad * (1 - sig)
    assert grad.shape == pre_activation.shape

    return grad

In [None]:
def linear_backward(dZ, cache):
    """
    Backward pass through the linear part of a single layer.

    dZ -- Gradient of the cost with respect to the linear output (of current layer l)
    cache -- tuple of values (A_prev, W, b) coming from the forward propagation in the current layer

    Returns:
    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b
    """
    prev_activations, weights, bias = cache
    # The second axis of A_prev is used as the example count m.
    inv_m = 1. / prev_activations.shape[1]

    grad_prev = np.dot(weights.T, dZ)
    grad_w = inv_m * np.dot(dZ, prev_activations.T)
    grad_b = inv_m * np.sum(dZ, axis=1, keepdims=True)

    # Every gradient must have the shape of the quantity it belongs to.
    assert grad_prev.shape == prev_activations.shape
    assert grad_w.shape == weights.shape
    assert grad_b.shape == bias.shape

    return grad_prev, grad_w, grad_b

* If $g(.)$ is the activation function, 
`sigmoid_backward` and `relu_backward` compute $$dZ^{[l]} = dA^{[l]} * g'(Z^{[l]}) \tag{11}$$.  

* Backpropagation for the *LINEAR->ACTIVATION* layer.

In [None]:
def linear_activation_backward(dA, cache, activation):
    """
    Backward pass of one LINEAR -> ACTIVATION layer.

    dA -- post-activation gradient for current layer l
    cache -- tuple of values (linear_cache, activation_cache) we store for computing backward propagation efficiently
    activation -- the activation to be used in this layer, stored as a text string: "sigmoid" or "relu"

    dA_prev -- Gradient of the cost with respect to the activation (of the previous layer l-1), same shape as A_prev
    dW -- Gradient of the cost with respect to W (current layer l), same shape as W
    db -- Gradient of the cost with respect to b (current layer l), same shape as b

    Raises ValueError if activation is neither "sigmoid" nor "relu".
    """
    linear_cache, activation_cache = cache

    # Resolve the activation's derivative first so an unsupported name fails
    # fast with a clear message instead of an UnboundLocalError at the return.
    if activation == "relu":
        activation_backward = relu_backward
    elif activation == "sigmoid":
        activation_backward = sigmoid_backward
    else:
        raise ValueError(
            "Unsupported activation %r; expected 'sigmoid' or 'relu'" % (activation,)
        )

    # dZ = dA * g'(Z), then propagate through the linear part of the layer.
    dZ = activation_backward(dA, activation_cache)
    dA_prev, dW, db = linear_backward(dZ, linear_cache)

    return dA_prev, dW, db

### 6.3 - L-Model Backward 

Now it is time to implement the backward function for the whole network. At each iteration of the forward pass, we stored a cache which contains (X, W, b, and z). 
To backpropagate through this network, we know that the output is, 
$A^{[L]} = \sigma(Z^{[L]})$. Your code thus needs to compute `dAL` $= \frac{\partial \mathcal{L}}{\partial A^{[L]}}$.
To do so, use this formula (derived using calculus which you don't need in-depth knowledge of):
```python
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL)) # derivative of cost with respect to AL
```
We will use this formula:
$$grads["dW" + str(l)] = dW^{[l]}\tag{15} $$

For example, for $l=2$ this would store $dW^{[2]}$ in `grads["dW2"]`.

In [None]:
def L_model_backward(al, y, caches):
    """
    Backward propagation for the [LINEAR -> RELU]*(L-1) -> LINEAR -> SIGMOID model.

    al -- probability vector, output of the forward propagation (L_model_forward())
    y -- true "label" vector (containing 0 if false, 1 if true); reshaped to al's shape
    caches -- list of caches, one (linear_cache, activation_cache) tuple per
              layer in layer order; the last one belongs to the sigmoid layer

    Returns grads, a dictionary with the gradients
             grads["dA" + str(l)] = ...  for l = 0 .. L-1
             grads["dW" + str(l)] = ...  for l = 1 .. L
             grads["db" + str(l)] = ...  for l = 1 .. L
    """
    grads = {}
    L = len(caches) # the number of layers
    # Rebinds the local name only; the caller's label array is not modified.
    y = y.reshape(al.shape) # after this line, y is the same shape as al

    # Init the backpropagation: derivative of the cost with respect to al
    dAL = -(np.divide(y, al) - np.divide(1 - y, 1 - al))

    # Output layer: (sigmoid ---> linear) gradients.
    current_cache = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(dAL, current_cache, activation = "sigmoid")

    # Loop from l=L-2 to l=0
    for l in reversed(range(L-1)):
        # lth layer: (relu ---> linear) gradients.
        current_cache = caches[l]
        dA_prev_temp, dW_temp, db_temp = linear_activation_backward(grads["dA" + str(l+1)], current_cache, activation = "relu")
        grads["dA" + str(l)] = dA_prev_temp
        grads["dW" + str(l + 1)] = dW_temp
        grads["db" + str(l + 1)] = db_temp

    return grads

### 6.4 - Update Parameters
* We will update the parameters of the model, using gradient descent: 

$$ W^{[l]} = W^{[l]} - \alpha \text{ } dW^{[l]} $$
$$ b^{[l]} = b^{[l]} - \alpha \text{ } db^{[l]} $$

where $\alpha$ is the learning rate.

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """
    Update parameters using gradient descent

    parameters -- python dictionary containing parameters; weights are read
                  from 'w1'..'wL' (the keys produced by init_parameters_L()),
                  with upper-case 'W1'..'WL' accepted as a fallback, and
                  biases from 'b1'..'bL'
    grads -- dictionary containing your gradients, output of the L_model_backward
             (keys "dW" + str(l) and "db" + str(l))
    learning_rate -- step size applied to every gradient

    Returns the same dictionary object, updated in place; each weight is
    written back under the key it was read from.
    """
    L = len(parameters) // 2 # number of layers in the neural network

    # Update for each param
    for l in range(1, L + 1):
        layer = str(l)

        # Prefer the lower-case key used by the initialisers; fall back to
        # 'W<l>' so dictionaries using that spelling keep working.
        w_key = "w" + layer
        if w_key not in parameters:
            w_key = "W" + layer
        b_key = "b" + layer

        parameters[w_key] = parameters[w_key] - learning_rate * grads["dW" + layer]
        parameters[b_key] = parameters[b_key] - learning_rate * grads["db" + layer]
    return parameters