In [None]:
class NN:
  """Fully connected classifier with two hidden layers and a softmax output.

  Layer shapes follow from the weight matrices created in ``__init__``:
  ``X`` is multiplied by ``w1`` (input_dim x hidden_dim1), then ``w2``
  (hidden_dim1 x hidden_dim2), then ``w3`` (hidden_dim2 x output_dim), so
  inputs are expected as one sample per row.

  The training cost is the per-sample sum of squared differences between the
  softmax output and the one-hot target, averaged over samples. Gradients
  returned by ``backward`` are the exact gradients of that cost.
  """

  def __init__(self, input_dim, output_dim, alpha, activation_func, num_of_epoch, weight_initial, hidden_dim1, hidden_dim2):
    """Configure the network and initialise its parameters.

    Args:
      input_dim: number of input features.
      output_dim: number of output classes.
      alpha: learning rate used by ``update_params``.
      activation_func: hidden-layer activation, one of 'relu', 'sigmoid'
        or 'softmax'.
      num_of_epoch: number of full-batch update steps performed by ``fit``.
      weight_initial: weight initialisation, one of 'random', 'zero' or
        'constant' (all weights 0.5). Biases always start at zero.
      hidden_dim1: number of units in the first hidden layer.
      hidden_dim2: number of units in the second hidden layer.

    Raises:
      ValueError: if ``activation_func`` or ``weight_initial`` is not one of
        the supported names.
    """
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.alpha = alpha
    self.cost_list = [] # One cost value is appended per call to backward().
    self.num_of_epoch = num_of_epoch

    activations = {
      'relu': (self.relu, self.deriv_relu),
      'softmax': (self.softmax, self.deriv_softmax),
      'sigmoid': (self.sigmoid, self.deriv_sigmoid),
    }
    # Fail at construction time instead of with an AttributeError in forward().
    if activation_func not in activations:
      raise ValueError(
        "activation_func must be one of 'relu', 'sigmoid' or 'softmax', got {!r}".format(activation_func))
    self.activation_func, self.deriv_activation_func = activations[activation_func]

    initialisers = {
      'random': lambda rows, cols: np.random.rand(rows, cols),
      'zero': lambda rows, cols: np.zeros((rows, cols)),
      'constant': lambda rows, cols: np.ones((rows, cols)) * 0.5,
    }
    if weight_initial not in initialisers:
      raise ValueError(
        "weight_initial must be one of 'random', 'zero' or 'constant', got {!r}".format(weight_initial))
    make_weights = initialisers[weight_initial]

    # Re-seed NumPy's global generator so 'random' weights are identical on
    # every run. The draw order (w1, w2, w3) is part of that reproducibility.
    np.random.seed(0)

    self.w1 = make_weights(input_dim, hidden_dim1)
    self.b1 = np.zeros((1, hidden_dim1))

    self.w2 = make_weights(hidden_dim1, hidden_dim2)
    self.b2 = np.zeros((1, hidden_dim2))

    self.w3 = make_weights(hidden_dim2, output_dim)
    self.b3 = np.zeros((1, output_dim))

  def relu(self, Z): # The ReLU activation function.
    return np.maximum(0, Z)

  def sigmoid(self, Z): # The sigmoid activation function.
    return 1/(1+np.exp(-Z))

  def softmax(self, Z): # The row-wise softmax activation function.
    # Subtracting the row maximum leaves the result unchanged but keeps
    # np.exp from overflowing.
    exp_Z = np.exp(Z - np.max(Z, axis=1, keepdims=True))
    return exp_Z / np.sum(exp_Z, axis=1, keepdims=True)

  def deriv_softmax(self, Z):
    """Return the diagonal of the softmax Jacobian, ``s * (1 - s)``.

    This is only the element-wise part of the derivative; back-propagation
    in this class uses the full Jacobian-vector product instead (see
    ``_activation_backward``).
    """
    s = self.softmax(Z)
    return s * (1 - s)

  def deriv_relu(self, Z): # The derivative of the ReLU activation function (boolean mask).
    return Z > 0

  def deriv_sigmoid(self, Z): # The derivative of the sigmoid activation function.
    s = self.sigmoid(Z)
    return s*(1-s)

  def _propagate(self, X):
    """Run X through all three layers without touching cached state.

    Returns:
      Tuple ``(z1, a1, z2, a2, z3, a3)`` of pre-activations and activations.
    """
    z1 = np.matmul(X, self.w1) + self.b1
    a1 = self.activation_func(z1)

    # Each layer consumes the previous layer's *activation*, not its
    # pre-activation.
    z2 = np.matmul(a1, self.w2) + self.b2
    a2 = self.activation_func(z2)

    z3 = np.matmul(a2, self.w3) + self.b3
    a3 = self.softmax(z3)
    return z1, a1, z2, a2, z3, a3

  def _activation_backward(self, Z, A, upstream):
    """Back-propagate ``upstream`` (dCost/dA) through the hidden activation.

    Args:
      Z: pre-activation of the layer.
      A: activation of the layer, i.e. ``activation_func(Z)``.
      upstream: gradient of the cost with respect to ``A``.

    Returns:
      Gradient of the cost with respect to ``Z``.
    """
    if self.activation_func == self.softmax:
      # Softmax couples the units within a row, so the full Jacobian-vector
      # product is required rather than an element-wise derivative.
      return A * (upstream - np.sum(upstream * A, axis=1, keepdims=True))
    return upstream * self.deriv_activation_func(Z)

  def forward(self, X):
    """Forward propagate X and cache every layer's values on ``self``.

    Stores ``z1, a1, z2, a2, z3, a3`` as attributes for use by ``backward``.

    Returns:
      The softmax output ``a3`` (one row of class probabilities per sample).
    """
    (self.z1, self.a1,
     self.z2, self.a2,
     self.z3, self.a3) = self._propagate(X)
    return self.a3

  def one_hot(self, y):
    """Convert a vector of class labels to a one-hot matrix.

    The matrix always has ``output_dim`` columns so that it matches the
    network output even when the largest class is absent from ``y``.

    Raises:
      ValueError: if a label is not an integer in ``[0, output_dim)``.
    """
    labels = np.asarray(y).ravel()
    if labels.dtype.kind not in 'iu':
      as_int = labels.astype(int)
      if not np.array_equal(as_int, labels):
        raise ValueError("class labels must be integers")
      labels = as_int
    if labels.size and (labels.min() < 0 or labels.max() >= self.output_dim):
      raise ValueError(
        "class labels must lie in [0, {}), got range [{}, {}]".format(
          self.output_dim, labels.min(), labels.max()))

    one_hot_y = np.zeros((labels.size, self.output_dim))
    one_hot_y[np.arange(labels.size), labels] = 1
    return one_hot_y

  def backward(self, X, y):
    """Record the current cost and compute its gradients.

    Must be called after ``forward(X)``; it reads the cached ``z*``/``a*``
    values.

    Args:
      X: the inputs that were passed to ``forward``.
      y: one-hot target matrix with the same shape as ``a3``.

    Returns:
      Tuple ``(dw1, db1, dw2, db2, dw3, db3)``; each gradient has the same
      shape as the parameter it belongs to.
    """
    n_samples = y.shape[0]
    error = self.a3 - y

    cost = np.sum(error**2, axis=1).mean()
    self.cost_list.append(cost)

    # d(cost)/d(a3): derivative of the mean over samples of the squared error.
    grad_a3 = 2.0 * error / n_samples

    # Output layer: softmax Jacobian-vector product.
    dz3 = self.a3 * (grad_a3 - np.sum(grad_a3 * self.a3, axis=1, keepdims=True))
    dw3 = np.matmul(self.a2.T, dz3)
    db3 = np.sum(dz3, axis=0, keepdims=True)

    # Second hidden layer.
    dz2 = self._activation_backward(self.z2, self.a2, np.matmul(dz3, self.w3.T))
    dw2 = np.matmul(self.a1.T, dz2)
    db2 = np.sum(dz2, axis=0, keepdims=True)

    # First hidden layer.
    dz1 = self._activation_backward(self.z1, self.a1, np.matmul(dz2, self.w2.T))
    dw1 = np.matmul(X.T, dz1)
    db1 = np.sum(dz1, axis=0, keepdims=True)

    return dw1, db1, dw2, db2, dw3, db3

  def update_params(self, dw1, db1, dw2, db2, dw3, db3): # Applies one gradient-descent step with learning rate alpha.
    self.w1 = self.w1 - self.alpha*dw1
    self.b1 = self.b1 - self.alpha*db1

    self.w2 = self.w2 - self.alpha*dw2
    self.b2 = self.b2 - self.alpha*db2

    self.w3 = self.w3 - self.alpha*dw3
    self.b3 = self.b3 - self.alpha*db3

  def stochastic_gradient_descent(self, X, y):
    """Train for ``num_of_epoch`` steps.

    NOTE: despite the name, every step uses the whole of X and y (full-batch
    gradient descent); no sampling or mini-batching is performed.
    """
    for _ in range(self.num_of_epoch):
      self.forward(X)
      gradients = self.backward(X, y)
      self.update_params(*gradients)

  def fit(self, X, y):
    """Train the network on inputs X and integer class labels y.

    Raises:
      ValueError: if X and y contain a different number of samples, or if
        y contains labels outside ``[0, output_dim)``.
    """
    one_hot_y = self.one_hot(y)
    if one_hot_y.shape[0] != np.shape(X)[0]:
      raise ValueError(
        "X and y must contain the same number of samples, got {} and {}".format(
          np.shape(X)[0], one_hot_y.shape[0]))
    self.stochastic_gradient_descent(X, one_hot_y)

  def predict(self, X):
    """Return the predicted class index for each row of X.

    Uses the same hidden activation as training and leaves the values
    cached by ``forward`` untouched.
    """
    a3_p = self._propagate(X)[-1]
    return np.argmax(a3_p, axis=1)