## Imports and definitions

In [1]:
import numpy as np

# --- Activation Functions ---
def relu(z):
    """Element-wise ReLU activation."""
    return np.maximum(0, z)

def relu_derivative(z):
    """Element-wise derivative of ReLU."""
    return np.where(z > 0, 1.0, 0.0)

def softmax(z):
    """Softmax activation for a single sample or batch (along axis 1)."""
    # Shift z for numerical stability (subtract max)
    shift_z = z - np.max(z, axis=0, keepdims=True)
    exps = np.exp(shift_z)
    return exps / np.sum(exps, axis=0, keepdims=True)

### Neural net definition

In [2]:
# Use column vectors for biases and inputs
W1 = np.array([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]) # Shape (3, 2)
b1 = np.array([[0.1], [0.1], [0.1]])              # Shape (3, 1)
W2 = np.array([[0.7, 0.8, 0.9], [1.0, 1.1, 1.2]]) # Shape (2, 3)
b2 = np.array([[0.2], [0.2]])                     # Shape (2, 1)

# Store parameters in lists for scalability
weights = [W1, W2]
biases = [b1, b2]
num_layers = len(weights) # L = 2

### Input + sample target

In [3]:
x = np.array([[1.0], [0.5]]) # Shape (2, 1)
y = np.array([[0], [1]])     # Shape (2, 1) (one-hot)

## Forward pass

In [5]:
# --- Forward Pass ---
activations = [x] # Store activations, a^0 = x
pre_activations = [] # Store z^l

a = x
for l in range(num_layers):
    W = weights[l]
    b = biases[l]
    z = W @ a + b
    pre_activations.append(z)
    if l < num_layers - 1: # Hidden layers use ReLU
        a = relu(z)
    else: # Output layer uses Softmax
        a = softmax(z)
    activations.append(a)

In [6]:
activations

[array([[1. ],
        [0.5]]),
 array([[0.3],
        [0.6],
        [0.9]]),
 array([[0.36818758],
        [0.63181242]])]

### Output prediction

In [7]:
y_hat = activations[-1]
print("--- Forward Pass ---")
print(f"z^1:\n{pre_activations[0]}")
print(f"a^1:\n{activations[1]}")
print(f"z^2:\n{pre_activations[1]}")
print(f"a^2 (ŷ):\n{y_hat}")

--- Forward Pass ---
z^1:
[[0.3]
 [0.6]
 [0.9]]
a^1:
[[0.3]
 [0.6]
 [0.9]]
z^2:
[[1.7 ]
 [2.24]]
a^2 (ŷ):
[[0.36818758]
 [0.63181242]]


## Backward Pass

In [9]:
gradients_W = [None] * num_layers
gradients_b = [None] * num_layers

In [10]:
# Calculate delta for the output layer (L)
# delta_L = dJ/dz^L = y_hat - y
delta = y_hat - y # delta^2 for L=2

# Gradient for the last layer
gradients_W[-1] = delta @ activations[-2].T # dJ/dW^L = delta^L * (a^(L-1))^T
gradients_b[-1] = delta                   # dJ/db^L = delta^L

# Iterate backwards from L-1 down to 1
for l in range(num_layers - 2, -1, -1):
    # Propagate delta backwards: delta^l = (W^(l+1)^T @ delta^(l+1)) .* relu'(z^l)
    W_next = weights[l+1]
    z_current = pre_activations[l]
    delta = (W_next.T @ delta) * relu_derivative(z_current)

    # Gradient for current layer l
    gradients_W[l] = delta @ activations[l].T # dJ/dW^l = delta^l * (a^(l-1))^T (activations[l] is a^l)
    gradients_b[l] = delta                  # dJ/db^l = delta^l

print("\n--- Backward Pass (Gradients) ---")
print(f"dJ/dW^2 (shape {gradients_W[1].shape}):\n{gradients_W[1]}")
print(f"dJ/db^2 (shape {gradients_b[1].shape}):\n{gradients_b[1]}")
print(f"dJ/dW^1 (shape {gradients_W[0].shape}):\n{gradients_W[0]}")
print(f"dJ/db^1 (shape {gradients_b[0].shape}):\n{gradients_b[0]}")


--- Backward Pass (Gradients) ---
dJ/dW^2 (shape (2, 3)):
[[ 0.11045627  0.22091255  0.33136882]
 [-0.11045627 -0.22091255 -0.33136882]]
dJ/db^2 (shape (2, 1)):
[[ 0.36818758]
 [-0.36818758]]
dJ/dW^1 (shape (3, 2)):
[[-0.11045627 -0.05522814]
 [-0.11045627 -0.05522814]
 [-0.11045627 -0.05522814]]
dJ/db^1 (shape (3, 1)):
[[-0.11045627]
 [-0.11045627]
 [-0.11045627]]


In [11]:
# --- Verification (Compare with manual calculation) ---
# Manual results (approx):
# dJ/dW^2 ≈ [[0.1104, 0.2208, 0.3312], [-0.1104, -0.2208, -0.3312]]
# dJ/db^2 ≈ [0.368, -0.368]^T
# dJ/dW^1 ≈ [[-0.1104, -0.0552], [-0.1104, -0.0552], [-0.1104, -0.0552]]
# dJ/db^1 ≈ [-0.1104, -0.1104, -0.1104]^T

# Check if code results match (allowing for floating point differences)
assert np.allclose(gradients_W[1], [[0.1104, 0.2208, 0.3312], [-0.1104, -0.2208, -0.3312]], atol=1e-4)
assert np.allclose(gradients_b[1], [[0.368], [-0.368]], atol=1e-4)
assert np.allclose(gradients_W[0], [[-0.1104, -0.0552], [-0.1104, -0.0552], [-0.1104, -0.0552]], atol=1e-4)
assert np.allclose(gradients_b[0], [[-0.1104], [-0.1104], [-0.1104]], atol=1e-4)

AssertionError: 