In [1]:
import numpy as np

def relu(x):
    return np.maximum(0, x)

def heaviside(x):
    return (x > 0).astype(float)

def forward_pass(X, W1, W3):
    X1 = np.dot(X, W1)
    X2 = relu(X1)
    X3 = np.dot(X2, W3)
    print("Shape of X1:", X1.shape)
    print("Shape of X2:", X2.shape)
    print("Shape of X3:", X3.shape)
    return X1, X2, X3

def backward_pass(X, y, W1, W3, X1, X2, X3):
    # Compute the derivative of the loss with respect to X3 (output of the network)
    # dE_dX3 represents the error gradient at the output layer
    dE_dX3 = -(y - X3) 

    # Compute the derivative of the loss with respect to W3 (weights of the output layer)
    # dE_dW3 represents the gradient of the loss with respect to the weights of the output layer
    # It is computed by multiplying the error gradient (dE_dX3) with the activations of the previous layer (X2)
    dE_dW3 = dE_dX3 * X2

    # Compute the derivative of the loss with respect to X2 (activations of the hidden layer)
    # dE_dX2 represents the error gradient propagated back to the hidden layer
    # It is computed by multiplying the weights of the output layer (W3) with the error gradient (dE_dX3)
    dE_dX2 = W3 * dE_dX3

    # Compute the derivative of the loss with respect to X1 (input to the hidden layer)
    # dE_dX1 represents the error gradient propagated back to the input of the hidden layer
    # It is computed by multiplying the transpose of the Heaviside function (derivative of ReLU) of X2 with dE_dX2
    dE_dX1 = heaviside(X2).T * dE_dX2
    
    # Compute the derivative of the loss with respect to W1 (weights of the hidden layer)
    # dE_dW1 represents the gradient of the loss with respect to the weights of the hidden layer
    # It is computed by performing a matrix multiplication between dE_dX1 and the input data X
    dE_dW1 = dE_dX1 @ X 

    return dE_dW1, dE_dW3

# Example usage
X = np.array([[0.5, 0.8]])  # Input data
print("Shape of X:", X.shape)
y = np.array([[1.0]])  # Ground truth labels
print("Shape of y:", y.shape)

input_dim = X.shape[1]
hidden_dim = 2
output_dim = 1

# Initialize weights randomly
W1 = np.random.randn(input_dim, hidden_dim)
print("Shape of W1:", W1.shape)
W2 = np.random.randn(hidden_dim, output_dim)
print("Shape of W2:", W2.shape)

# Perform forward pass
X1, X2, X3 = forward_pass(X, W1, W2)

# Perform backward pass
dE_dW1, dE_dW3 = backward_pass(X, y, W1, W2, X1, X2, X3)

print("Gradient with respect to W1:")
print(dE_dW1)
print("Gradient with respect to W2:")
print(dE_dW3)

Shape of X: (1, 2)
Shape of y: (1, 1)
Shape of W1: (2, 2)
Shape of W2: (2, 1)
Shape of X1: (1, 2)
Shape of X2: (1, 2)
Shape of X3: (1, 1)
Gradient with respect to W1:
[[0.11002194 0.1760351 ]
 [0.11509157 0.18414651]]
Gradient with respect to W2:
[[-0.39088535 -0.00685729]]
