# In [96]:
import numpy as np
import pandas as pd 

# In [98]:
class NeuralNetwork:
    def __init__(self , N , hidden_layer_size , lr, A_function ,max_iter , Weight_fun , batch_size ) :
        self.N = N
        self.hidden_layer_size = hidden_layer_size
        self.lr = lr ## learning rate
        self.A_function = A_function
        self.batch_size = batch_size
        self.max_iter = max_iter ## number of epochs
        self.Weight_fun = Weight_fun  # Weight initialization function
        self.layers = []



# ##### Helper functions for Part A
    def forward(self , X):   ## . helper function for going into next layer
        W1 = self.random_init_function(self.hidden_layer_size[0] , self.hidden_layer_size[1])   
        ## Weight matrix using function
        A_prev = X   ## initial value at the previous layer
            
        Z = np.dot(A_prev, W1 )     ## Multiply with weights to get probability

        if(self.A_function == "relu"):
            A = self.relu(Z)

        elif (self.A_function == "tanh"):
            A = self.tanh(Z)

        elif (self.A_function == "linear"):
            A = self.linear(Z)

        elif (self.A_function == "sigmoid"):
            A = self.sigmoid(Z)

        return A 
    
    def loss_function(self, A, y):  ## Helper function
        log_val = - np.log(A[np.arange(len(y)), y.argmax(axis=1)])
        loss = np.sum(log_val)/ len(y)
        return loss


## Part A
#   
##
    def fit(self , X , Y):
        train_loss = [] ## array for storing the train loss
        for epoch in self.max_iter:
            train_batch_loss = []

            for batch in self.batch_size:
                A = self.forward(X) ## Moving to next nueron
                curr_loss = self.loss_function(A , Y[batch]) ## Calculating loss using the helper function
                train_batch_loss.append(curr_loss) 
            
            train_loss.append( train_batch_loss)
        return train_loss



    def predict_proba(self , X):
        y_proba = []        
        
        output = X;   
        for layer in self.layers:
            output = self.forward(output) ## Going forward through all the layers
            y_proba.append(output)
        y_proba = self.softmax(y_proba)     ## using softmax as this is the last layer
        return np.array(y_proba) ## class wise probability 


    def predict(self , X):
        y_pred = []
        y_proba = self.predict_proba(X)
        for i in range(len(y_proba)):
            y_pred.append(np.argmax(y_proba[i])) ##Predicting the y by passing the X_train into the predict_proba function  
        return np.array(y_pred)                     ## and getting the y pred

    def score(self , X , Y):
        y_pred = self.predict(X) ## predicting the y
        return np.mean(y_pred== Y)   ## returning he score


### Part B
 
    ## Here gradients are partial derivatives of the functions

    def relu(self, X):
        return X * (X>=0) ### return max(0,X)

    def relu_grad(self, X):
        return 1*(X>=0)     ## return 1 if x>0 else 0

    def leaky_relu(self, X):
        return np.maximum(0.1 * X, X)      ## return 0.01 * X if x<0 else X

    def leaky_relu_grad(self, X):
        if X>0 :
            return X
        else :
            return 0.01*X        ## return 0.01 if x<0 else X
    
    def linear(self, X):   ## in linear we return X only
        return X

    def linear_grad(self, X):   ## the matrix with all 1s instead of values of x
        return np.ones(X.shape)
    
    def sigmoid(self, X):
        return 1/(1+np.exp(-X)) ## returning this value using the definition of sigmoid function

    def sigmoid_grad(self, X):
        return self.sigmoid(X) * (1-self.sigmoid (X)) ## We get this result on taking derivative of the sigmoid function

    def tanh(self, X):   # Compute hyperbolic tangent element-wise ( (e^X - e^-X)/(e^X + e^-X) )
        return np.tanh(X)

    def tanh_grad(self, X):
        return 1 - self.tanh(X)*self.tanh(X)  ## derivative of tanh(x) is 1 - tanh2(x)

    def softmax(self, X):
        exp = np.exp(X)
        return exp/(np.sum(exp))        ## softmax is a normalized exponential function

    def softmax_grad(self, X):
        result_matrix = np.diag(X) ## constructing jacobian matrix of softmax as it is the maxtrix of partial derivatives

        for i in range(len(result_matrix)):
            for j in range(len(result_matrix)):
                if i == j:
                    result_matrix[i][j] = (1-X[i])
                else: 
                    result_matrix[i][j] = -X[i]*X[j]
        return result_matrix            
    


#### Part C

    ## Here shape = (self.hidden_layer_size[0],self.hidden_layer_size[1])

    def zero_init_function(self, shape):
        weight = np.zeros(shape)    ## returns weight with just zero initialization 
        return weight

    def random_init_function(self, shape):
        weight = np.random.rand(shape[0], shape[1])     ## return weights with random initialization
        return weight

    def normal_init_function(self, shape):
        weight = np.random.normal(0 , 1 , size = shape , scale = 0.01)    
        ##samples from the parameterized normal distribution (Guassian)
        ## Mean 0 , Variance 1
        return weight

