In [2]:
import numpy as np

# -------------------------------
# 1) Setup: inputs, initial params
# -------------------------------
# Network inputs
x1, x2 = 0.05, 0.10

# Input -> hidden weights (starting values for this step)
w1 = 0.35  # x1 -> H1
w2 = 0.45  # x2 -> H1
w3 = 0.55  # x1 -> H2
w4 = 0.60  # x2 -> H2

# Hidden -> output weights (starting values for this step)
w5 = 0.65  # H1 -> y1
w6 = 0.70  # H2 -> y1
w7 = 0.75  # H1 -> y2
w8 = 0.80  # H2 -> y2

# One bias per layer: b1 is shared by H1 and H2, b2 by y1 and y2
b1, b2 = 0.35, 0.60

# Desired outputs for y1 and y2
T1, T2 = 0.01, 0.99

# Step size used for every gradient-descent update
lr = 0.5

# -------------------------------
# 2) Activation functions
# -------------------------------
def sigmoid(z):
    """Numerically stable logistic sigmoid, 1 / (1 + exp(-z)).

    Args:
        z: Real-valued scalar or array-like net input.

    Returns:
        The element-wise sigmoid of ``z``: a NumPy scalar for scalar input,
        an ndarray of the same shape otherwise.
    """
    z = np.asarray(z)
    # exp() is only ever applied to a non-positive argument, so it cannot
    # overflow. The naive form evaluates exp(-z), which overflows for large
    # negative z and emits a RuntimeWarning.
    e = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    # Same function, rearranged so both branches stay finite.
    result = np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))
    # Indexing with () unwraps a 0-d array into a NumPy scalar and leaves
    # higher-dimensional arrays unchanged.
    return result[()]

def sigmoid_derivative_from_activation(a):
    """Return the sigmoid slope expressed through its output.

    ``a`` is an already-computed activation, a = sigmoid(z); the slope at
    that point is a * (1 - a), so z itself is not needed.
    """
    complement = 1.0 - a
    return a * complement

# -------------------------------
# 3) Helper: forward pass
# -------------------------------
def forward_pass(x1, x2, w1, w2, w3, w4, w5, w6, w7, w8, b1, b2):
    """
    Run the 2-2-2 network once and report every intermediate value.

    Hidden neurons H1/H2 read the inputs through w1..w4 and share bias b1;
    output neurons y1/y2 read the hidden activations through w5..w8 and
    share bias b2. Every neuron applies ``sigmoid`` to its net input.

    Returns a dict holding each neuron's net input ("<name>_net") and its
    activation ("<name>") for H1, H2, y1 and y2.
    """
    def fire(in_a, weight_a, in_b, weight_b, bias):
        # Weighted sum of the two incoming signals plus bias, then sigmoid.
        net = in_a * weight_a + in_b * weight_b + bias
        return net, sigmoid(net)

    # Hidden layer
    hidden1_net, hidden1 = fire(x1, w1, x2, w2, b1)
    hidden2_net, hidden2 = fire(x1, w3, x2, w4, b1)

    # Output layer, fed by the hidden activations
    output1_net, output1 = fire(hidden1, w5, hidden2, w6, b2)
    output2_net, output2 = fire(hidden1, w7, hidden2, w8, b2)

    return dict(
        H1_net=hidden1_net, H2_net=hidden2_net,
        H1=hidden1, H2=hidden2,
        y1_net=output1_net, y2_net=output2_net,
        y1=output1, y2=output2,
    )

# -------------------------------
# 4) Forward before update (print step-by-step)
# -------------------------------
print("\n=== FORWARD PASS (before weight update) ===")
# Evaluate the network with the starting parameters, keeping every
# intermediate value so each step can be displayed.
out = forward_pass(
    x1=x1, x2=x2,
    w1=w1, w2=w2, w3=w3, w4=w4,
    w5=w5, w6=w6, w7=w7, w8=w8,
    b1=b1, b2=b2,
)

# One line per neuron: net input, then its sigmoid activation.
print(
    f"H1_net = {out['H1_net']:.7f} => H1 = sigmoid(H1_net) = {out['H1']:.9f}",
    f"H2_net = {out['H2_net']:.7f} => H2 = sigmoid(H2_net) = {out['H2']:.9f}",
    f"y1_net = {out['y1_net']:.9f} => y1 = sigmoid(y1_net) = {out['y1']:.9f}",
    f"y2_net = {out['y2_net']:.9f} => y2 = sigmoid(y2_net) = {out['y2']:.9f}",
    sep="\n",
)

# Half squared error of each output against its target, then their sum.
E1, E2 = (
    0.5 * (target - out[name])**2
    for target, name in ((T1, 'y1'), (T2, 'y2'))
)
E_total = E1 + E2

print(
    f"\nE1 = 0.5*(T1 - y1)^2 = {E1:.9f}",
    f"E2 = 0.5*(T2 - y2)^2 = {E2:.9f}",
    f"Total error E_total = E1 + E2 = {E_total:.9f}",
    sep="\n",
)

# -------------------------------
# 5) BACKPROPAGATION — output layer
#    compute deltas and gradients for w5..w8
# -------------------------------
print("\n=== BACKPROP: output layer ===")

# Chain rule for output neuron i:
#   delta_i = dE/dy_i * dy_i/dnet_i
# With E = 1/2*(T - y)^2 the first factor is dE/dy_i = -(T_i - y_i);
# the second is the sigmoid slope taken from the activation y_i.
dE_dy1, dE_dy2 = -(T1 - out['y1']), -(T2 - out['y2'])
dy1_dnet, dy2_dnet = (
    sigmoid_derivative_from_activation(out[name]) for name in ('y1', 'y2')
)
delta1, delta2 = dE_dy1 * dy1_dnet, dE_dy2 * dy2_dnet  # scalars

print(
    f"dE/dy1 = {dE_dy1:.9f}, dy1/dnet = {dy1_dnet:.9f}, => delta1 = {delta1:.9f}",
    f"dE/dy2 = {dE_dy2:.9f}, dy2/dnet = {dy2_dnet:.9f}, => delta2 = {delta2:.9f}",
    sep="\n",
)

# A hidden->output weight's gradient is the delta of the output neuron it
# feeds times the hidden activation it carries.
dw5, dw6 = delta1 * out['H1'], delta1 * out['H2']
dw7, dw8 = delta2 * out['H1'], delta2 * out['H2']

print(
    "\nGradients for hidden->output weights:",
    f"dw5 (for w5) = delta1 * H1 = {dw5:.9f}",
    f"dw6 (for w6) = delta1 * H2 = {dw6:.9f}",
    f"dw7 (for w7) = delta2 * H1 = {dw7:.9f}",
    f"dw8 (for w8) = delta2 * H2 = {dw8:.9f}",
    sep="\n",
)

# Gradient-descent step: move each weight against its gradient.
w5_new, w6_new, w7_new, w8_new = (
    weight - lr * grad
    for weight, grad in zip((w5, w6, w7, w8), (dw5, dw6, dw7, dw8))
)

print(
    "\nUpdated hidden->output weights (one step):",
    f"w5 -> {w5_new:.9f}",
    f"w6 -> {w6_new:.9f}",
    f"w7 -> {w7_new:.9f}",
    f"w8 -> {w8_new:.9f}",
    sep="\n",
)

# -------------------------------
# 6) BACKPROPAGATION — hidden layer
#    compute deltas for H1, H2 and gradients for w1..w4
# -------------------------------
print("\n=== BACKPROP: hidden layer ===")

# Each hidden neuron feeds both outputs, so its error signal is the sum of
# the two output deltas, each weighted by the connecting weight, times the
# sigmoid slope at the hidden activation. The original w5..w8 are used here
# (not the *_new values): the gradient belongs to the forward pass above.
# delta_H1 = (delta1*w5 + delta2*w7) * sigmoid'(H1_net)
# delta_H2 = (delta1*w6 + delta2*w8) * sigmoid'(H2_net)
delta_H1 = (delta1 * w5 + delta2 * w7) * sigmoid_derivative_from_activation(out['H1'])
delta_H2 = (delta1 * w6 + delta2 * w8) * sigmoid_derivative_from_activation(out['H2'])

print(f"delta_H1 = (delta1*w5 + delta2*w7) * sigmoid'(H1) = {delta_H1:.12f}")
print(f"delta_H2 = (delta1*w6 + delta2*w8) * sigmoid'(H2) = {delta_H2:.12f}")

# Gradients for input->hidden weights: hidden delta times the input that
# the weight multiplies.
dw1 = delta_H1 * x1
dw2 = delta_H1 * x2
dw3 = delta_H2 * x1
dw4 = delta_H2 * x2

print("\nGradients for input->hidden weights:")
print(f"dw1 (for w1) = delta_H1 * x1 = {dw1:.12f}")
print(f"dw2 (for w2) = delta_H1 * x2 = {dw2:.12f}")
print(f"dw3 (for w3) = delta_H2 * x1 = {dw3:.12f}")
print(f"dw4 (for w4) = delta_H2 * x2 = {dw4:.12f}")

# Update input->hidden weights (gradient descent):
w1_new = w1 - lr * dw1
w2_new = w2 - lr * dw2
w3_new = w3 - lr * dw3
w4_new = w4 - lr * dw4

print("\nUpdated input->hidden weights (one step):")
print(f"w1 -> {w1_new:.9f}")
print(f"w2 -> {w2_new:.9f}")
print(f"w3 -> {w3_new:.9f}")
print(f"w4 -> {w4_new:.9f}")

# For completeness, update the biases too. Each bias is shared by both
# neurons of its layer, so its gradient is the sum of that layer's deltas.
b2_new = b2 - lr * (delta1 + delta2)  # b2 feeds y1 and y2
b1_new = b1 - lr * (delta_H1 + delta_H2)  # b1 feeds H1 and H2

print("\nUpdated biases:")
print(f"b1 -> {b1_new:.9f}")
print(f"b2 -> {b2_new:.9f}")

# -------------------------------
# 7) Forward pass after the update (to show error decreased)
# -------------------------------
print("\n=== FORWARD PASS (after weight update) ===")
# Re-evaluate the same inputs with every updated weight and bias.
out_after = forward_pass(
    x1=x1, x2=x2,
    w1=w1_new, w2=w2_new, w3=w3_new, w4=w4_new,
    w5=w5_new, w6=w6_new, w7=w7_new, w8=w8_new,
    b1=b1_new, b2=b2_new,
)

print(
    f"H1 (after) = {out_after['H1']:.9f}",
    f"H2 (after) = {out_after['H2']:.9f}",
    f"y1 (after) = {out_after['y1']:.9f}",
    f"y2 (after) = {out_after['y2']:.9f}",
    sep="\n",
)

# Same half-squared-error measure as before the update, for comparison.
E1_after, E2_after = (
    0.5 * (target - out_after[name])**2
    for target, name in ((T1, 'y1'), (T2, 'y2'))
)
E_total_after = E1_after + E2_after

print(
    f"\nE_total (before) = {E_total:.9f}",
    f"E_total (after)  = {E_total_after:.9f}",
    "\n(You should see the total error decreased after one backprop step.)",
    sep="\n",
)


=== FORWARD PASS (before weight update) ===
H1_net = 0.4125000 => H1 = sigmoid(H1_net) = 0.601687180
H2_net = 0.4375000 => H2 = sigmoid(H2_net) = 0.607663170
y1_net = 1.416460886 => y1 = sigmoid(y1_net) = 0.804782995
y2_net = 1.537395921 => y2 = sigmoid(y2_net) = 0.823085850

E1 = 0.5*(T1 - y1)^2 = 0.315840005
E2 = 0.5*(T2 - y2)^2 = 0.013930167
Total error E_total = E1 + E2 = 0.329770172

=== BACKPROP: output layer ===
dE/dy1 = 0.794782995, dy1/dnet = 0.157107326, => delta1 = 0.124866231
dE/dy2 = -0.166914150, dy2/dnet = 0.145615534, => delta2 = -0.024305293

Gradients for hidden->output weights:
dw5 (for w5) = delta1 * H1 = 0.075130410
dw6 (for w6) = delta1 * H2 = 0.075876610
dw7 (for w7) = delta2 * H1 = -0.014624183
dw8 (for w8) = delta2 * H2 = -0.014769431

Updated hidden->output weights (one step):
w5 -> 0.612434795
w6 -> 0.662061695
w7 -> 0.757312092
w8 -> 0.807384716

=== BACKPROP: hidden layer ===
delta_H1 = (delta1*w5 + delta2*w7) * sigmoid'(H1) = 0.015082763906
delta_H2 = (de