In [None]:
#importing all necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
'''Some variables used
n_x=No.of features of input X
m=No.of training examples in X
n_y=No.of nodes in last layer(No.of classes for multiclass system)

X(Input matrix) should be of dimensions (n_x,m)
Y(True output matrix) should be of dimensions (n_y,m)
If it is a multiclass system,each of the n_y rows of Y corresponds to one class,
so every column of Y is one-hot:1 in the row of the class that example belongs to and 0 elsewhere(this form is used by the loss fn)

layer_dims-It is an array containing the no.of nodes in each layer,starting from input layer
to output layer.So it is of the form [n_x,...,n_y]


'''

In [None]:
def initialize_parameters(layer_dims):
    """Create the starting weights and biases for every layer of the network.

    Input : layer_dims - number of nodes in each layer, of the form [n_x,...,n_y]
    Output: dictionary with keys "W1","b1","W2","b2",...
            W<k> has shape (layer_dims[k],layer_dims[k-1]) and holds
            np.random.randn draws scaled by 0.01
            b<k> has shape (layer_dims[k],1) and is all zeros
    """
    parameters={}
    #Walk over consecutive (fan_in,fan_out) size pairs;layer numbering starts at 1
    for layer,(fan_in,fan_out) in enumerate(zip(layer_dims[:-1],layer_dims[1:]),start=1):
        weights=np.random.randn(fan_out,fan_in)*0.01
        bias=np.zeros((fan_out,1))
        assert weights.shape==(fan_out,fan_in)
        assert bias.shape==(fan_out,1)
        parameters["W"+str(layer)]=weights
        parameters["b"+str(layer)]=bias

    return parameters

In [None]:
'''The following part will contain functions required for FORWARD PROPAGATION'''

In [None]:
#Helper used by forward_act:computes the linear (pre-activation) part of a layer
def forward_prop(A_prev,W,b):
    """Linear step of one layer: Z = W.A_prev + b.

    Inputs: A_prev - activations of the previous layer
            W,b    - weights and bias of the current layer
    Output: Z,asserted to have shape (W.shape[0],A_prev.shape[1])
    """
    weighted_input=np.dot(W,A_prev)
    Z=weighted_input+b
    expected_shape=(W.shape[0],A_prev.shape[1])
    assert Z.shape==expected_shape
    return Z

In [None]:
#Called by L_layer_forward,which runs the complete forward pass
def forward_act(A_prev,W,b,activation):
    """Compute the activation of one layer.

    Inputs: A_prev     - activations of the previous layer
            W,b        - weights and bias of the current layer
            activation - one of 'sigmoid','relu','tanh','softmax'
    Output: (A,Z) - activation of the current layer and its pre-activation Z
            (Z is cached by L_layer_forward)
    Raises: ValueError if activation is not one of the supported names
    """
    Z=forward_prop(A_prev,W,b)
    if activation=='sigmoid':
        A=1/(1+np.exp(-Z))
    elif activation=='relu':
        A=np.maximum(0,Z)
    elif activation=='tanh':
        #np.tanh stays finite for large |Z|;the explicit exp ratio gives inf/inf=nan
        A=np.tanh(Z)
    elif activation=='softmax':
        #Subtracting the column-wise max leaves softmax unchanged but keeps exp() from overflowing
        shifted=np.exp(Z-np.max(Z,axis=0,keepdims=True))
        A=shifted/np.sum(shifted,axis=0,keepdims=True)
    else:
        #Previously an unknown name fell through and failed later with an unbound local
        raise ValueError("Unknown activation: "+repr(activation))

    assert(Z.shape==(W.shape[0],A_prev.shape[1]))
    return A ,Z

In [None]:
def L_layer_forward(X,parameters):
    """Run the full forward pass through every layer.

    Inputs: X          - the input matrix
            parameters - dictionary holding "W<k>"/"b<k>" for each layer k
    Output: AL       - activations of the last layer
            Z_caches - list with the Z of every layer,in layer order
            A_caches - list [X,A1,...,AL] of all activations(X is included
                       so the backward loop can index it uniformly)

    Every layer uses relu except the last one,which uses softmax for
    multiclass classification;change the names below if another setup is needed.
    """
    n_layers=len(parameters)//2 #each layer contributes one W and one b
    Z_caches=[]
    A_caches=[X]
    current=X
    for layer in range(1,n_layers+1):
        W=parameters["W"+str(layer)]
        b=parameters["b"+str(layer)]
        is_output=(layer==n_layers)
        activation,Z=forward_act(current,W,b,'softmax' if is_output else 'relu')
        Z_caches.append(Z)
        if is_output:
            AL=activation
        else:
            #Hidden layer:its output feeds the next layer and is cached
            current=activation
            A_caches.append(activation)
    A_caches.append(AL)

    return AL,Z_caches,A_caches

In [None]:
'''The next part contains the cost function
Different function should be selected according to the model requirements
****Regression****
J=1/(2*m)*np.sum(np.power((AL-Y),2))
****Binary classification****
J=-1/m*np.sum(Y*np.log(AL)+(1-Y)*(np.log(1-AL)))
****Multiclass classification****
J=-1/m*np.sum(np.sum(np.log(AL)*Y,axis=0,keepdims=True),axis=1,keepdims=True)
*********************************
(Note:The concept behind the classification cost functions is the same:the only term remaining in the loss
function is the one for the true class(where y=1 at that pt),i.e. the log of the predicted probability
that the example belongs to that class.To minimise the error that prob. should be as high as possible)
'''

In [None]:
def cost(AL,Y):
    """Multiclass cross-entropy cost,averaged over the training examples.

    Inputs: AL - activations of the last layer(predicted class probabilities),
                 same shape as Y
            Y  - true output matrix of shape (n_y,m),one-hot per column
    Output: J  - the cost as a Python float

    This is the loss that pairs with the softmax output layer used in
    L_layer_forward.Replace the formula for J if the model needs another
    cost(e.g. regression or binary classification).
    """
    m=Y.shape[1] #number of training examples
    #Floor the probabilities so log(0) cannot produce -inf (and 0*-inf=nan)
    log_probs=np.log(np.maximum(AL,1e-15))
    J=-1/m*np.sum(Y*log_probs)
    return float(J)

In [None]:
'''The following part contains BACKWARD PROPAGATION'''

In [None]:
#Computes dZ for a layer from its dA,i.e. applies the g'(Z) factor
#It is called by grad
def backwards(dA,Z,activation):
    """Turn dA of a layer into dZ by multiplying element-wise with g'(Z).

    Inputs: dA         - gradient of the cost w.r.t. the layer's activation
            Z          - pre-activation of that layer(cached in the forward pass)
            activation - activation the layer used:'relu','softmax','sigmoid' or 'tanh'
    Output: dZ for that layer
    Raises: ValueError if activation is not one of the supported names

    NOTE:the softmax branch uses only the element-wise term s*(1-s),i.e. the
    diagonal of the softmax Jacobian;cross terms between classes are ignored.
    """
    if activation=='relu':
        dZ=dA*(Z>0) #derivative is 1 where Z>0 and 0 elsewhere
    elif activation=='softmax':
        #Max-shifted softmax:same value,but exp() cannot overflow into nan
        shifted=np.exp(Z-np.max(Z,axis=0,keepdims=True))
        s=shifted/np.sum(shifted,axis=0,keepdims=True)
        dZ=dA*s*(1-s)
    elif activation=='sigmoid':
        s=1/(1+np.exp(-Z))
        dZ=dA*s*(1-s)
    elif activation=='tanh':
        #np.tanh stays finite for large |Z|;the explicit exp ratio gives inf/inf=nan
        dZ=dA*(1-np.power(np.tanh(Z),2))
    else:
        #Previously an unknown name fell through and failed with an unbound local
        raise ValueError("Unknown activation: "+repr(activation))

    return dZ

In [None]:
def grad(A_caches,Z_caches,parameters,AL,Y):
    """Backward pass:compute the gradients of every layer.

    Inputs: A_caches   - list [X,A1,...,AL] built during the forward pass
            Z_caches   - list of the Z of every layer built during the forward pass
            parameters - dictionary holding "W<k>"/"b<k>" for each layer k
            AL         - activation of the last layer(also the last entry of A_caches)
            Y          - the true output matrix,shape (n_y,m)
    Output: dictionary with "dZ<k>","dW<k>" and "db<k>" for every layer k

    The hidden layers are assumed to use relu,matching L_layer_forward.
    """
    grads=dict()
    l=len(parameters)//2 #parameters holds one W and one b per layer
    m=Y.shape[1] #number of training examples;taken from Y instead of a global
    grads["dZ"+str(l)]=AL-Y #output-layer dZ,used for all the cost functions listed in this notebook
    for i in reversed(range(l)):
        dZ=grads["dZ"+str(i+1)]
        grads["dW"+str(i+1)]=1/m*np.dot(dZ,A_caches[i].T)
        grads["db"+str(i+1)]=1/m*np.sum(dZ,axis=1,keepdims=True)
        if i==0:break #we dont want to calculate dJ/dX(input)
        dA=np.dot(parameters["W"+str(i+1)].T,dZ)
        #Layer i is always a hidden layer here(the output layer's dZ is AL-Y above),
        #so its activation is relu-never softmax
        grads["dZ"+str(i)]=backwards(dA,Z_caches[i-1],'relu')

    return grads

In [None]:
def update_paramters(parameters,grads,learning_rate):
    """Apply one gradient-descent step to every weight and bias.

    Inputs: parameters    - dictionary holding "W<k>"/"b<k>" for each layer k
            grads         - dictionary holding the matching "dW<k>"/"db<k>"
            learning_rate - alpha(try plotting J vs num_iter to pick a good value)
    Output: the same parameters dictionary,updated in place
    """
    n_layers=len(parameters)//2 #one W and one b per layer
    for layer in range(1,n_layers+1):
        for prefix in ("W","b"):
            key=prefix+str(layer)
            parameters[key]-=learning_rate*grads["d"+key]

    return parameters

In [None]:
'''The next part contains a whole model trainer,which utilizes the above declared functions'''

In [None]:
def Train_model(X,Y,layer_dims,num_iter=1500,print_cost=False,learning_rate=0.01):
    """Train the network with batch gradient descent.

    Inputs: X             - the input matrix
            Y             - the true output matrix
            layer_dims    - number of nodes in each layer,of the form [n_x,...,n_y]
            num_iter      - number of gradient-descent iterations
            print_cost    - if True,print the cost every 100 iterations and
                            plot J vs num_iter at the end
            learning_rate - step size alpha passed to update_paramters
    Output: the parameters after gradient descent
    """
    costs=[]
    parameters=initialize_parameters(layer_dims)
    for i in range(num_iter):
        AL,Z_caches,A_caches=L_layer_forward(X,parameters)
        J=cost(AL,Y)
        grads=grad(A_caches,Z_caches,parameters,AL,Y)
        parameters=update_paramters(parameters,grads,learning_rate)
        #Report and record J every 100 iterations
        if print_cost and i % 100 == 0:
            print("Cost after iteration "+str(i)+":"+str(J))
            costs.append(J)

    #Plot J vs num_iter only when requested;otherwise costs is empty
    #and an empty figure would be shown
    if print_cost:
        plt.plot(costs)
        plt.ylabel('cost')
        plt.xlabel('iterations (per hundreds)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

    return parameters