# Solution: Debug Drill 07 - The Exploding Loss

This is the solution notebook for the diverging gradient descent drill.

In [None]:
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(42)

In [None]:
def loss_fn(x, y):
    """Quadratic bowl loss ``x**2 + y**2``.

    Uses only arithmetic operators, so it accepts plain scalars as well as
    NumPy arrays (evaluated element-wise, e.g. on a meshgrid).  The minimum
    value 0 is reached at ``x == y == 0``.
    """
    x_term = x ** 2
    y_term = y ** 2
    return x_term + y_term

def gradient_fn(x, y):
    """Gradient of ``x**2 + y**2`` at ``(x, y)``.

    Returns a NumPy array whose first entry is the partial derivative with
    respect to ``x`` (``2*x``) and whose second entry is the partial
    derivative with respect to ``y`` (``2*y``).
    """
    d_dx = 2 * x
    d_dy = 2 * y
    return np.array([d_dx, d_dy])

def gradient_descent(start, lr, n_steps=50, loss=None, grad=None):
    """Run fixed-step gradient descent and record the whole trajectory.

    Parameters
    ----------
    start : sequence of float
        Initial position.  It is copied into a new float array, so the
        caller's object is never modified.
    lr : float
        Learning rate (step size) applied to every update.
    n_steps : int, optional
        Number of updates to perform (default 50).
    loss : callable, optional
        Objective, called as ``loss(*position)``.  Defaults to the
        notebook's ``loss_fn``.
    grad : callable, optional
        Gradient of the objective, called as ``grad(*position)``; it must
        return one component per coordinate.  Defaults to the notebook's
        ``gradient_fn``.

    Returns
    -------
    path : numpy.ndarray
        Visited positions, starting point included: one row per recorded
        position, ``n_steps + 1`` rows in total.
    losses : numpy.ndarray
        Loss at each recorded position (``n_steps + 1`` values).
    """
    # Resolve the defaults lazily so the module-level functions are only
    # looked up when the caller did not supply their own.
    if loss is None:
        loss = loss_fn
    if grad is None:
        grad = gradient_fn

    position = np.array(start, dtype=float)
    path = [position]
    losses = [loss(*position)]

    for _ in range(n_steps):
        step_grad = np.asarray(grad(*position), dtype=float)
        # Rebinding (rather than ``position -= ...``) creates a fresh array
        # every step, so the entries already stored in ``path`` are never
        # overwritten and no defensive copies are needed.
        position = position - lr * step_grad
        path.append(position)
        losses.append(loss(*position))

    return np.array(path), np.array(losses)

In [None]:
# Baseline run: reproduce the divergent training from the drill
START_POSITION = [-2.0, 2.0]
LEARNING_RATE = 1.2

path_bad, losses_bad = gradient_descent(START_POSITION, LEARNING_RATE, n_steps=20)

print("=== Original (Divergent) ===")
print(f"Learning rate: {LEARNING_RATE}")
# Report at most the first five recorded losses; stop at the first one that
# is no longer a finite number.
for i, step_loss in enumerate(losses_bad[:5]):
    if not np.isfinite(step_loss):
        print(f"  Step {i}: EXPLODED!")
        break
    print(f"  Step {i}: {step_loss:.2f}")

In [None]:
# SOLUTION: rerun the same descent with a smaller step size
LEARNING_RATE_FIXED = 0.1  # Safe learning rate

path_fixed, losses_fixed = gradient_descent(
    START_POSITION,
    LEARNING_RATE_FIXED,
    n_steps=50,
)

# Name the end-of-run values once so the report below reads cleanly.
initial_loss = losses_fixed[0]
final_loss = losses_fixed[-1]
final_x, final_y = path_fixed[-1]

print(f"=== Fixed Training (lr={LEARNING_RATE_FIXED}) ===")
print(f"Starting loss: {initial_loss:.4f}")
print(f"Final loss: {final_loss:.6f}")
print(f"Final position: ({final_x:.4f}, {final_y:.4f})")
# The run counts as converged once the final loss is below 0.01.
print(f"\nConverged: {final_loss < 0.01}")

In [None]:
# Visualization: descent paths on the loss contours (left) and loss curves (right)
x = np.linspace(-5, 5, 100)
y = np.linspace(-5, 5, 100)
X, Y = np.meshgrid(x, y)
Z = loss_fn(X, Y)

# Only the first ten divergent positions are drawn, clipped to the plotted
# window [-5, 5] so the line stays inside the axes.
bad_path_clipped = np.clip(path_bad[:10], -5, 5)
# The divergent losses are capped at 50 so the converging curve stays readable.
bad_losses_capped = np.clip(losses_bad, 0, 50)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes

# Left panel: both trajectories over the contour lines of the loss.
ax1.contour(X, Y, Z, levels=20, cmap='viridis', alpha=0.5)
ax1.plot(
    bad_path_clipped[:, 0], bad_path_clipped[:, 1], 'r-o',
    markersize=4, linewidth=1, alpha=0.7,
    label=f'Divergent (lr={LEARNING_RATE})',
)
ax1.plot(
    path_fixed[:, 0], path_fixed[:, 1], 'b-o',
    markersize=4, linewidth=1,
    label=f'Converged (lr={LEARNING_RATE_FIXED})',
)
# Green star marks the minimum of the loss at the origin.
ax1.scatter([0], [0], color='green', s=100, marker='*', zorder=5)
ax1.set(
    xlabel='θ₁',
    ylabel='θ₂',
    title='Gradient Descent Paths',
    xlim=(-5, 5),
    ylim=(-5, 5),
)
ax1.legend()

# Right panel: loss per step for both learning rates.
ax2.plot(bad_losses_capped, 'r-', linewidth=2, label=f'lr={LEARNING_RATE} (diverges)')
ax2.plot(losses_fixed, 'b-', linewidth=2, label=f'lr={LEARNING_RATE_FIXED} (converges)')
ax2.set(xlabel='Step', ylabel='Loss', title='Loss Over Time')
ax2.legend()
ax2.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

## Sample Postmortem

### What happened:
- Loss increased instead of decreasing, growing by a constant factor every step (i.e. without bound)
- Training completely failed to converge

### Root cause:
- Learning rate (1.2) was too high
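- For this quadratic loss ($x^2 + y^2$, gradient $2x$, $2y$), each update multiplies the position by $(1 - 2\,\text{lr})$, so it converges only when $|1 - 2\,\text{lr}| < 1$, i.e. $0 < \text{lr} < 1.0$
- With lr > 1, every update overshoots the minimum and lands farther away than it started, amplifying the error (at lr = 1.2 the distance grows ×1.4 and the loss ×1.96 per step)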

### How to prevent:
- Start with a small learning rate (0.001–0.1)
- If loss explodes, reduce lr by 10x
- Use learning rate schedulers for stability
- Monitor loss during early training iterations