In [None]:
import numpy as np
from matplotlib import pyplot as plt




1.   Initialize the parameters
2.   Forward propagation
3.   Compute the loss
4.   Backward propagation
5.   Updating the parameters




# 1.Initialization of the parameters


Here the parameter values are initialized: the weights are assigned randomly and the biases start at zero. However, we first need to determine the shapes of the parameters. To do so we can do the following analysis. $$Z^{[i]}=W^{[i]}.a^{[i-1]}+b^{[i]}$$ where $a^{[0]}=x$ and $a^{[i]}=\sigma(Z^{[i]})$ or $a^{[i]}=RELU(Z^{[i]})$. \
The dimensions of $Z^{[i]}$ are $(n^{[i]},1)$, so the dimensions of $W^{[i]}$ are $(n^{[i]},n^{[i-1]})$ and the dimensions of $b^{[i]}$ are $(n^{[i]},1)$.


In [None]:
def initialize_parameters(layer_dims):
    """Create the weight and bias arrays for every layer of the network.

    Parameters
    ----------
    layer_dims : list of int
        Number of units in each layer; ``layer_dims[0]`` is the number of
        input features.

    Returns
    -------
    dict
        Maps ``"W<i>"`` to an array of shape ``(layer_dims[i], layer_dims[i-1])``
        drawn from ``np.random.randn`` and ``"b<i>"`` to a zero array of shape
        ``(layer_dims[i], 1)``, for ``i = 1 .. len(layer_dims) - 1``.
    """
    parameters = {}  # dictionary containing the parameters of every layer
    for i in range(1, len(layer_dims)):
        parameters["W" + str(i)] = np.random.randn(layer_dims[i], layer_dims[i - 1])
        parameters["b" + str(i)] = np.zeros((layer_dims[i], 1))
        # The shape checks must live inside the loop and use the loop index,
        # otherwise they reference an undefined name and never validate anything.
        assert parameters["W" + str(i)].shape == (layer_dims[i], layer_dims[i - 1])
        assert parameters["b" + str(i)].shape == (layer_dims[i], 1)
    return parameters

# 2. Forward Propagation


Now that $W^{[i]}$ and $b^{[i]}$ values are assigned we can find the input of the activation function.
$$Z^{[i]}=W^{[i]}.a^{[i-1]}+b^{[i]}$$

In [None]:
def forward_Z(a,W,b):
    """Linear part of a layer's forward step.

    Computes ``np.dot(W, a) + b`` and returns it together with the tuple
    ``(a, W, b)``, which the backward pass reuses.
    """
    linear_inputs = (a, W, b)
    return np.dot(W, a) + b, linear_inputs

In this neural network I will use the ReLU and sigmoid activation functions.

In [None]:
#relu and sigmoid function
def relu(Z):
    """Element-wise ReLU activation.

    Parameters
    ----------
    Z : numpy array
        Pre-activation values.

    Returns
    -------
    A : numpy array
        ``max(0, Z)`` element-wise, same shape as ``Z``.
    cache : numpy array
        ``Z`` itself, kept for the backward pass.
    """
    # Use the parameter ``Z``; the lowercase ``z`` was an undefined name.
    A = np.maximum(0, Z)
    cache = Z
    assert A.shape == Z.shape
    return A, cache
def sigmoid(Z):
    """Element-wise logistic sigmoid.

    Returns the activation ``1 / (1 + exp(-Z))`` and ``Z`` itself as the
    cache for the backward pass.
    """
    denominator = 1 + np.exp(-Z)
    return 1 / denominator, Z

In [None]:
def activation(A_prev,W,b,activation):
    """Forward step of one layer: linear transform followed by an activation.

    Parameters
    ----------
    A_prev : numpy array
        Activations coming from the previous layer (or the input data).
    W, b : numpy array
        Weights and bias of this layer.
    activation : str
        Either ``"sigmoid"`` or ``"relu"``.

    Returns
    -------
    A : numpy array
        Output of the chosen activation function.
    cache : tuple
        ``(linear_cache, activation_cache)`` as returned by ``forward_Z`` and
        the activation function; used by the backward pass.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound local later.
    if activation not in ("sigmoid", "relu"):
        raise ValueError(
            "activation must be 'sigmoid' or 'relu', got {!r}".format(activation)
        )

    # The linear step is the same for both activations.
    Z, linear_cache = forward_Z(A_prev, W, b)
    if activation == "sigmoid":
        A, activation_cache = sigmoid(Z)
    else:
        A, activation_cache = relu(Z)
    cache = (linear_cache, activation_cache)
    return A, cache

For this neural network I will repeat the ReLU layer L-1 times and, as the last step, apply one sigmoid layer.

In [None]:
def forward(X,parameters):
    """Full forward pass: (L-1) ReLU layers followed by one sigmoid layer.

    Parameters
    ----------
    X : numpy array
        Input data of shape (input data size, number of examples).
    parameters : dict
        Holds ``"W<i>"`` and ``"b<i>"`` for every layer, so it has 2*L entries.

    Returns
    -------
    AL : numpy array
        Output of the final sigmoid layer.
    caches : list
        One cache per layer, in layer order, as returned by ``activation``.
    """
    num_layers = len(parameters) // 2
    caches = []
    current = X  # activation fed into the next layer; starts as the data

    # Hidden layers 1 .. L-1 all use ReLU.
    for layer in range(1, num_layers):
        weights = parameters[f"W{layer}"]
        bias = parameters[f"b{layer}"]
        current, layer_cache = activation(current, weights, bias, activation="relu")
        caches.append(layer_cache)

    # Output layer L uses sigmoid.
    out_weights = parameters[f"W{num_layers}"]
    out_bias = parameters[f"b{num_layers}"]
    AL, layer_cache = activation(current, out_weights, out_bias, activation="sigmoid")
    caches.append(layer_cache)
    return AL, caches




# 3.Cost Function


Next i will calculate the cost function: 
$$Cost=-\frac{1}{m} \sum\limits_{i = 1}^{m} (y^{(i)}\log\left(A^{[L] (i)}\right) + (1-y^{(i)})\log\left(1- A^{[L](i)}\right))$$




In [None]:
def cost(AL,Y):
    """Cross-entropy cost averaged over the examples.

    Parameters
    ----------
    AL : numpy array
        Output of the sigmoid function (last step of the forward propagation).
    Y : numpy array
        True label vector; ``Y.shape[1]`` is used as the number of examples.

    Returns
    -------
    The cost with all length-one axes squeezed away (shape ``()``).
    """
    num_examples = Y.shape[1]
    log_likelihood = Y * np.log(AL) + (1 - Y) * np.log(1 - AL)
    value = np.squeeze(-1 / num_examples * np.sum(log_likelihood))
    assert value.shape == ()
    return value

# 4.Back Propagation


In the 5th step i will update the parameters with the following formula:
$$W=W-\alpha dW$$
$$b=b-\alpha db$$ Where $\alpha$ is the pre-determined learning rate and $dW$ and $db$ are the short notations for the following:

 $$dW^{[i]}=\frac{\partial J}{\partial W^{[i]}}$$
$$db^{[i]}=\frac{\partial J}{\partial b^{[i]}}$$
Here we will need to use chain rule in order to find above equations. The full chain is the following:
$$\frac{\partial L}{\partial w_1}=\frac{\partial L}{\partial A^{[L]}}\frac{\partial A^{[L]}}{\partial z}\frac{\partial z}{\partial w_{1}}$$
but i will assume $\frac{\partial L}{\partial z}$ is given. So we will instead calculate: 
$$\frac{\partial L}{\partial w_1}=\frac{\partial L}{\partial z}\frac{\partial z}{\partial w_1}$$
But we need $\frac{\partial J}{\partial W}$, so we sum over the $m$ examples and divide by $m$. Recall the definition of $Z$:

$$Z^{[i]} = W^{[i]} A^{[i-1]} + b^{[i]}$$ So $dW^{[i]}$ becomes
$$dW^{[i]}=\frac{1}{m}dZ^{[i]}A^{[i-1]T}$$
$$dA^{[i-1]}=W^{[i]T}dZ^{[i]}$$
and $db^{[i]}$ becomes
$$db^{[i]}=\frac{1}{m}\sum_{j = 1}^{m}{dZ^{[i](j)}}$$


In [None]:
def linear_backward(dZ,cache):
    """Backward step for the linear part of one layer.

    Parameters
    ----------
    dZ : numpy array
        Gradient of the cost with respect to this layer's linear output.
    cache : tuple
        ``(A_prev, W, b)`` saved by ``forward_Z`` during the forward pass.

    Returns
    -------
    dA_prev : numpy array
        Gradient with respect to the previous layer's activations
        (same shape as ``A_prev``).
    dW : numpy array
        Gradient with respect to ``W`` (same shape as ``W``).
    db : numpy array
        Gradient with respect to ``b`` (same shape as ``b``).
    """
    A_prev, W, b = cache
    # Number of examples = number of columns of A_prev. This must be an
    # assignment; ``m.A_prev.shape[1]`` was an attribute access on an undefined name.
    m = A_prev.shape[1]

    dW = 1 / m * np.dot(dZ, A_prev.T)
    db = 1 / m * np.sum(dZ, axis=1, keepdims=True)  # keepdims keeps the (n, 1) shape of b
    dA_prev = np.dot(W.T, dZ)

    assert dA_prev.shape == A_prev.shape
    assert dW.shape == W.shape
    assert db.shape == b.shape

    return dA_prev, dW, db

Now finding dZ:
$$A=g(z)$$
$$\frac{dL}{dZ}=\frac{dL}{dA}*g^{\prime}(z)$$
$$dZ=dA*g^{\prime}$$

In [None]:
def relu_backward(dA,cache):
    """Backward step through a ReLU activation.

    Parameters
    ----------
    dA : numpy array
        Gradient with respect to the ReLU output.
    cache : numpy array
        The pre-activation ``Z`` stored during the forward pass.

    Returns
    -------
    numpy array
        A copy of ``dA`` with entries zeroed wherever ``Z <= 0``.
    """
    pre_activation = cache
    # Work on a copy so the caller's dA is left untouched.
    grad = np.array(dA, copy=True)
    # ReLU has slope 1 where Z > 0 (gradient passes through unchanged) and
    # slope 0 elsewhere, so only the non-positive positions need zeroing.
    inactive = pre_activation <= 0
    grad[inactive] = 0
    assert grad.shape == pre_activation.shape

    return grad

def sigmoid_backward(dA,cache):
    """Backward step through a sigmoid activation.

    Parameters
    ----------
    dA : numpy array
        Gradient with respect to the sigmoid output.
    cache : numpy array
        The pre-activation ``Z`` stored during the forward pass.

    Returns
    -------
    numpy array
        ``dA * s * (1 - s)`` where ``s`` is the sigmoid of ``Z``.
    """
    pre_activation = cache
    sig = 1 / (1 + np.exp(-pre_activation))
    grad = dA * sig * (1 - sig)  # s * (1 - s) is the sigmoid's derivative
    assert grad.shape == pre_activation.shape

    return grad



Combining the previous 2 cells:

In [None]:
def activation_backward(dA,cache,activation):
    """Backward step of one layer: activation gradient, then linear gradient.

    Parameters
    ----------
    dA : numpy array
        Gradient with respect to this layer's activation output.
    cache : tuple
        ``(linear_cache, activation_cache)`` stored during the forward pass.
    activation : str
        Either ``"relu"`` or ``"sigmoid"``.

    Returns
    -------
    dA_prev, dW, db : numpy arrays
        The values returned by ``linear_backward`` for this layer.

    Raises
    ------
    ValueError
        If ``activation`` is not one of the supported names.
    """
    # Fail early with a clear message instead of hitting an unbound local later.
    if activation not in ("relu", "sigmoid"):
        raise ValueError(
            "activation must be 'relu' or 'sigmoid', got {!r}".format(activation)
        )

    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
    else:
        dZ = sigmoid_backward(dA, activation_cache)

    # The linear step is the same for both activations.
    dA_prev, dW, db = linear_backward(dZ, linear_cache)
    return dA_prev, dW, db

In [None]:
def L_backward(AL,Y,caches):
    """Full backward pass: one sigmoid layer followed by (L-1) ReLU layers.

    Parameters
    ----------
    AL : numpy array
        Output of the forward pass (last sigmoid activation).
    Y : numpy array
        True label vector; reshaped to the shape of ``AL``.
    caches : list
        One cache per layer, in layer order, as produced by the forward pass.

    Returns
    -------
    dict
        ``grads["dA<i>"]``, ``grads["dW<i>"]`` and ``grads["db<i>"]`` holding
        the gradients for each layer.
    """
    grads = {}  # dA1=.. dW1=.. db1=.. etc.
    L = len(caches)
    Y = Y.reshape(AL.shape)  # make the labels line up with the predictions

    # Derivative of the cross-entropy cost with respect to AL:
    #   dAL = -(Y / AL - (1 - Y) / (1 - AL))
    # np.divide needs both operands, and the second term enters with a plus sign.
    dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))

    # Layer L (last step of the forward pass) uses the sigmoid activation.
    current_cache = caches[L - 1]
    grads["dA" + str(L - 1)], grads["dW" + str(L)], grads["db" + str(L)] = \
        activation_backward(dAL, current_cache, activation="sigmoid")

    # Layers L-1 down to 1 all use ReLU; i is the zero-based index into caches.
    for i in reversed(range(L - 1)):
        current_cache = caches[i]
        dA_prev_temp, dW_temp, db_temp = activation_backward(
            grads["dA" + str(i + 1)], current_cache, "relu"
        )
        grads["dA" + str(i)] = dA_prev_temp
        grads["dW" + str(i + 1)] = dW_temp
        grads["db" + str(i + 1)] = db_temp
    return grads

# 5.Updating the parameters

As a last step parameters will be updated: 
$$W=W-\alpha dW$$
$$b=b-\alpha db$$ where $\alpha$ is the pre-determined learning rate and $dW$ and $db$ are the gradients computed in the back propagation step.

In [None]:
def update_parameters(parameters, grads, learning_rate):
    """Apply one gradient-descent step to every layer's weights and biases.

    Parameters
    ----------
    parameters : dict
        Holds ``"W<i>"`` and ``"b<i>"`` for each layer; its entries are
        reassigned in place.
    grads : dict
        Holds the matching ``"dW<i>"`` and ``"db<i>"`` gradients.
    learning_rate : float
        Step size multiplying each gradient.

    Returns
    -------
    dict
        The same ``parameters`` dictionary, with updated entries.
    """
    num_layers = len(parameters) // 2
    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            key = prefix + str(layer)
            parameters[key] = parameters[key] - learning_rate * grads["d" + key]
    return parameters