In [1]:
import numpy as np


def sigmoid(self, z):
    """Numerically stable logistic sigmoid, ``1 / (1 + exp(-z))``.

    Args:
        self: Unused. This is a module-level function, so nothing is bound
            to this parameter; it is kept only so that existing calls of the
            form ``sigmoid(obj, z)`` continue to work.
        z: Real-valued scalar or array-like input.

    Returns:
        The element-wise sigmoid of ``z``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``z``.
    """
    z = np.asarray(z)
    # exp(-|z|) always lies in (0, 1], so it cannot overflow the way
    # exp(-z) does for large negative z.
    decay = np.exp(-np.abs(z))
    # For z >= 0 this is the textbook form; for z < 0 the numerator and
    # denominator are both multiplied by exp(z), which is mathematically
    # the same value.
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves arrays with one or more dimensions as they are.
    return result[()]

def sigmoid_derivative(self, z):
    """Return ``z * (1 - z)`` element-wise.

    Because sigma'(x) = sigma(x) * (1 - sigma(x)), this expression equals the
    sigmoid's derivative only when ``z`` is already a sigmoid *output*
    (an activation), not the pre-activation value.

    Args:
        self: Unused; this is a module-level function and nothing is bound
            to this parameter.
        z: Scalar or NumPy array.

    Returns:
        ``z * (1 - z)``, with the same shape as ``z``.
    """
    complement = 1 - z
    return z * complement

def relu(x):
    """Rectified linear unit: element-wise ``max(0, x)``.

    Args:
        x: Scalar or array-like input.

    Returns:
        ``x`` with every negative entry replaced by zero.
    """
    floor = 0
    return np.maximum(floor, x)

def relu_derivative(x):
    """Derivative of ReLU: ``1.0`` where ``x > 0`` and ``0.0`` elsewhere.

    The derivative at exactly ``x == 0`` is taken to be ``0.0``.

    Args:
        x: Scalar, sequence, or NumPy array. The input is coerced with
            ``np.asarray`` so that plain Python numbers and lists are
            accepted as well as arrays.

    Returns:
        A float result with the same shape as ``x``.
    """
    # A Python bool (from comparing a plain float) has no ``astype``, and a
    # list cannot be compared with ``> 0``; coercing first handles both.
    return (np.asarray(x) > 0).astype(float)

def linear(x):
    """Identity activation: hand the input back unchanged.

    Args:
        x: Any value.

    Returns:
        The very same object that was passed in (no copy is made).
    """
    output = x
    return output

def linear_derivative(x):
    """Derivative of the identity activation: one for every element.

    Args:
        x: Array-like whose shape and dtype the result mirrors.

    Returns:
        An array of ones shaped like ``x``.
    """
    ones = np.ones_like(x)
    return ones

def mse(y_true, y_pred):
    """Halved mean squared error.

    Computes the mean, over every element, of
    ``0.5 * (y_true - y_pred) ** 2``.

    Args:
        y_true: Target values.
        y_pred: Predicted values, broadcast-compatible with ``y_true``.

    Returns:
        A single scalar loss value.
    """
    residual = y_true - y_pred
    half_squared = 0.5 * residual ** 2
    return np.mean(half_squared)

def mse_derivative(y_true, y_pred):
    """Gradient of ``0.5 * (y_true - y_pred) ** 2`` with respect to ``y_pred``.

    Note that this is the per-element gradient: it does not include the
    ``1 / N`` averaging factor that ``mse`` applies through ``np.mean``.

    Args:
        y_true: Target values.
        y_pred: Predicted values, broadcast-compatible with ``y_true``.

    Returns:
        ``y_pred - y_true``, element-wise.
    """
    error = y_pred - y_true
    return error

# --- Network configuration --------------------------------------------------
# Layer widths: input features -> hidden layer 1 -> hidden layer 2 -> outputs.
input_size, hidden1_size, hidden2_size, output_size = 4, 8, 6, 2

# Step size used by the SGD parameter updates.
lr = 0.01

# Seed NumPy's global generator so every draw below is reproducible.
np.random.seed(42)

# --- Parameters -------------------------------------------------------------
# Weights are standard-normal draws scaled by 0.1. The draw order
# (W1, W2, W3, then X, y) determines the values, so keep it as is.
W1 = 0.1 * np.random.randn(input_size, hidden1_size)
W2 = 0.1 * np.random.randn(hidden1_size, hidden2_size)
W3 = 0.1 * np.random.randn(hidden2_size, output_size)

# Biases start at zero, stored as (1, width) rows so they broadcast over
# a batch of samples.
b1, b2, b3 = (
    np.zeros((1, width))
    for width in (hidden1_size, hidden2_size, output_size)
)

# --- Toy training data ------------------------------------------------------
# Five random samples and five random regression targets.
X = np.random.randn(5, input_size)
y = np.random.randn(5, output_size)

# Train with per-sample stochastic gradient descent: each epoch visits every
# row of X once and updates all parameters after each individual sample.
for epoch in range(2):
    total_loss = 0
    # Two epochs over len(X) samples give 2 * len(X) parameter updates in total.
    for i in range(len(X)):
        # Forward pass
        # Slicing with i:i+1 (rather than indexing with i) keeps a leading
        # axis of length 1, so each sample stays 2-D for the matrix products.
        x_sample = X[i:i+1]  
        y_sample = y[i:i+1]

        z1 = np.dot(x_sample, W1) + b1
        a1 = relu(z1)
        z2 = np.dot(a1, W2) + b2
        a2 = relu(z2)
        z3 = np.dot(a2, W3) + b3
        y_pred = linear(z3)

        # Regression loss for this sample, measured before its update is applied.
        loss = mse(y_sample, y_pred)
        total_loss += loss

        # Backpropagation
        # Gradient with respect to z3. linear_derivative returns all ones, so
        # this equals the output of mse_derivative.
        dL_dy = mse_derivative(y_sample, y_pred) * linear_derivative(z3)
        dL_dW3 = np.dot(a2.T, dL_dy)
        # Summing over axis 0 (the sample axis) with keepdims=True yields a
        # (1, n) row that matches the bias shape. With a single-sample batch
        # the sum leaves the values unchanged.
        dL_db3 = np.sum(dL_dy, axis=0, keepdims=True)

        # Despite the name, this is the gradient with respect to the
        # pre-activation z2: the gradient propagated back through W3 is
        # already multiplied by the ReLU derivative. W3 is read here before
        # it is updated below.
        dL_da2 = np.dot(dL_dy, W3.T) * relu_derivative(z2)
        dL_dW2 = np.dot(a1.T, dL_da2)
        dL_db2 = np.sum(dL_da2, axis=0, keepdims=True)

        # Same pattern one layer down: gradient with respect to z1, using W2
        # before it is updated below.
        dL_da1 = np.dot(dL_da2, W2.T) * relu_derivative(z1)
        dL_dW1 = np.dot(x_sample.T, dL_da1)
        dL_db1 = np.sum(dL_da1, axis=0, keepdims=True)

        # Weight updates (SGD): in-place step against each gradient, applied
        # only after every gradient for this sample has been computed.
        W3 -= lr * dL_dW3
        b3 -= lr * dL_db3
        W2 -= lr * dL_dW2
        b2 -= lr * dL_db2
        W1 -= lr * dL_dW1
        b1 -= lr * dL_db1

    # Report the mean per-sample loss accumulated over this epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(X):.4f}")

# Report every weight matrix and bias row as it stands after training.
print("Updated Weights and Biases :")
# One line group per parameter, formatted as "<name> :  <array>"; the two
# spaces after the colon reproduce print()'s default separator.
print("\n".join(
    f"{label} :  {values}"
    for label, values in (
        ("W1", W1), ("b1", b1),
        ("W2", W2), ("b2", b2),
        ("W3", W3), ("b3", b3),
    )
))


Epoch 1, Loss: 0.5474
Epoch 2, Loss: 0.5426
Updated Weights and Biases :
W1 :  [[ 0.04963782 -0.01380574  0.06478025  0.15227964 -0.02344647 -0.02337419
   0.15782983  0.07673328]
 [-0.04688789  0.05419524 -0.04638782 -0.04655272  0.02425938 -0.19133543
  -0.172563   -0.05622581]
 [-0.10094809  0.03115187 -0.09092753 -0.1409703   0.14688987 -0.02280657
   0.00644426 -0.14239888]
 [-0.05405665  0.0110691  -0.11550417  0.03756425 -0.05998458 -0.02917767
  -0.06040685  0.18522887]]
b1 :  [[-2.78475937e-04 -1.53543807e-04  2.07243084e-04 -9.64785220e-05
   1.02909720e-04  1.17904560e-04  2.60529501e-04 -3.95783223e-05]]
W2 :  [[-0.00134972 -0.10577109  0.08203591 -0.12177368  0.02072917 -0.19596701]
 [-0.1328186   0.01968612  0.07362862  0.01735462 -0.01156111 -0.03011037]
 [-0.1478522  -0.07198442 -0.04643931  0.10611355  0.03439221 -0.17630402]
 [ 0.0324084  -0.03850823 -0.0676922   0.06140887  0.10255425  0.09312801]
 [-0.08392175 -0.03092124  0.03233812  0.09831982 -0.04783947 -0.01856