# Chapter 4 Bonus: Neural Network Experimental Lab

This bonus notebook extends **Chapter 4 – Neural Networks** with practical experiments that deepen your intuition about how architectural and training choices affect learning.

We will run a series of controlled experiments on simple datasets to see how different **activations, initializations, depths, and optimizers** change training dynamics and decision boundaries.

## What we will explore

1. **Comparing Activations** – Sigmoid vs Tanh vs ReLU in the same network
2. **Impact of Initialization** – good vs bad initialization
3. **Depth and Width Experiment** – from logistic regression to deeper MLPs
4. **Optimizers and Learning Rates** – SGD, Momentum, Adam, and different learning rates
5. **(Optional) Manual Backprop Verification** – sanity-checking gradients on a tiny network

These experiments complement the theory in Chapter 4 and prepare you for designing and debugging your own networks.

## Setup and Imports

We will use **PyTorch** for defining and training networks, **NumPy** for utilities, **matplotlib** for plots, and **scikit-learn** for generating synthetic datasets.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons, make_circles
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader

# Plot style.  The seaborn style sheets were renamed to "seaborn-v0_8-*" in
# matplotlib 3.6, so fall back to the legacy name on older installs instead of
# failing in the very first cell.  If neither name exists the default style
# is kept.
for _style in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid"):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 11

# Reproducibility: seed both the NumPy and the PyTorch global RNGs.
np.random.seed(42)
torch.manual_seed(42)

# Prefer the GPU when one is visible; `device` is the default device argument
# of the training and plotting helpers defined below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

In [None]:
# Utility: create a 2D nonlinear dataset (moons) and dataloaders

def make_moons_dataloaders(n_samples=600, batch_size=64, test_size=0.3,
                           noise=0.25, random_state=42):
    """Build a standardized two-moons dataset plus train/test DataLoaders.

    Args:
        n_samples: Total number of points requested from ``make_moons``.
        batch_size: Batch size used by both loaders.
        test_size: Fraction of the samples held out for the test split.
        noise: ``noise`` argument forwarded to ``make_moons``.
        random_state: Seed used for both data generation and the split.

    Returns:
        A 3-tuple ``(arrays, loaders, scaler)``:

        * ``arrays`` is ``(X_train, y_train, X_test, y_test)`` as NumPy
          arrays whose features are ALREADY standardized,
        * ``loaders`` is ``(train_loader, test_loader)``; only the training
          loader shuffles,
        * ``scaler`` is the fitted ``StandardScaler``.
    """
    X, y = make_moons(n_samples=n_samples, noise=noise, random_state=random_state)

    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # Fit the scaler on the training split only, then reuse its statistics for
    # the test split, so no information about held-out points leaks into
    # preprocessing.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    X_train_t = torch.tensor(X_train, dtype=torch.float32)
    y_train_t = torch.tensor(y_train, dtype=torch.long)
    X_test_t = torch.tensor(X_test, dtype=torch.float32)
    y_test_t = torch.tensor(y_test, dtype=torch.long)

    train_ds = TensorDataset(X_train_t, y_train_t)
    test_ds = TensorDataset(X_test_t, y_test_t)

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    return (X_train, y_train, X_test, y_test), (train_loader, test_loader), scaler


# Utility: plot decision boundary for a 2D classifier

def plot_decision_boundary(model, X, y, title="Decision boundary", scaler=None, ax=None, device=device):
    """Draw the model's class-1 probability as filled contours, data on top.

    Args:
        model: Classifier returning at least two logits per row; column 1 of
            the softmax over the logits is what gets plotted.
        X: 2-D points (two feature columns) in the coordinates used for the
            plot axes.
        y: Labels; points with ``y == 0`` are drawn blue, ``y == 1`` red.
        title: Axes title.
        scaler: Optional transformer applied to the plotting grid before it
            is fed to the model.  Pass it only when ``X`` is in raw units and
            the model expects scaled inputs.  If ``X`` is already in the
            model's input space leave this as ``None``; otherwise the grid is
            scaled twice and the boundary is drawn in the wrong place.
        ax: Axes to draw on; a new 6x5 inch figure is created when omitted.
        device: Device the grid tensor is created on; it should be the
            device the model lives on.

    Returns:
        The axes that were drawn on.

    Note:
        The model is switched to eval mode and left in that mode.
    """
    model.eval()
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 5))

    X = np.asarray(X)
    y = np.asarray(y)

    # Evaluation grid: the data range padded by 0.5 on every side.
    x1_min, x1_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
    x2_min, x2_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
    xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max, 200),
                           np.linspace(x2_min, x2_max, 200))
    grid = np.c_[xx1.ravel(), xx2.ravel()]

    grid_scaled = scaler.transform(grid) if scaler is not None else grid

    with torch.no_grad():
        inputs = torch.tensor(grid_scaled, dtype=torch.float32, device=device)
        logits = model(inputs)
        probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()
    Z = probs.reshape(xx1.shape)

    ax.contourf(xx1, xx2, Z, levels=20, cmap="RdBu", alpha=0.6)
    # The black line marks P(class 1) = 0.5, i.e. the decision boundary.
    ax.contour(xx1, xx2, Z, levels=[0.5], colors="k", linewidths=2)

    ax.scatter(X[y == 0, 0], X[y == 0, 1], c="blue", edgecolor="k", label="Class 0", alpha=0.7)
    ax.scatter(X[y == 1, 0], X[y == 1, 1], c="red", edgecolor="k", label="Class 1", alpha=0.7)

    ax.set_title(title)
    ax.set_xlabel("x1")
    ax.set_ylabel("x2")
    ax.legend(loc="upper left")
    # Lay out the figure that owns `ax`, not whichever figure happens to be
    # pyplot's "current" one.
    ax.figure.tight_layout()
    return ax

# 1. Comparing Activations: Sigmoid vs Tanh vs ReLU

Activation functions strongly affect gradient flow and training speed.

In this experiment we:

- Use the **same architecture** (2-layer MLP: 2 → 32 → 2) and the same dataset (moons)
- Train three networks that differ **only** in activation: Sigmoid, Tanh, ReLU
- Track the training loss curves

We expect to see that:

- Sigmoid may train more slowly and can suffer from **saturation** (vanishing gradients)
- Tanh is often better than Sigmoid but can still saturate
- ReLU tends to converge faster on this kind of problem

This visualizes the ideas discussed in the **Activation Functions** section of Chapter 4.

In [None]:
class SimpleMLP(nn.Module):
    """One-hidden-layer MLP whose hidden non-linearity is selected by name.

    ``activation`` must be ``"sigmoid"``, ``"tanh"`` or ``"relu"``; any other
    value raises ``ValueError``.  The forward pass returns the raw output of
    the second linear layer (no softmax is applied).
    """

    def __init__(self, in_dim=2, hidden_dim=32, out_dim=2, activation="relu"):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)

        # Walk the (name, module class) table and instantiate the first match.
        known_activations = (
            ("sigmoid", nn.Sigmoid),
            ("tanh", nn.Tanh),
            ("relu", nn.ReLU),
        )
        for known_name, module_cls in known_activations:
            if activation == known_name:
                self.act = module_cls()
                break
        else:
            raise ValueError(f"Unknown activation: {activation}")

    def forward(self, x):
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)


def train_model(model, train_loader, test_loader, epochs=100, lr=1e-2, device=device):
    """Train ``model`` with plain SGD and cross-entropy loss.

    The model is moved to ``device``.  Each epoch consists of one pass over
    ``train_loader`` with parameter updates, followed by one gradient-free
    pass over ``test_loader``.

    Returns:
        ``(train_losses, test_losses)``: two lists with one entry per epoch,
        each the batch losses weighted by batch size and divided by the size
        of the corresponding dataset.
    """
    model.to(device)
    sgd = torch.optim.SGD(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()
    train_losses, test_losses = [], []

    for _ in range(epochs):
        # Optimisation pass
        model.train()
        weighted_train_loss = 0.0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            sgd.zero_grad()
            batch_loss = loss_fn(model(inputs), targets)
            batch_loss.backward()
            sgd.step()
            # criterion returns a batch mean; weight it by the batch size so
            # a smaller final batch does not skew the epoch average.
            weighted_train_loss += batch_loss.item() * inputs.size(0)
        train_losses.append(weighted_train_loss / len(train_loader.dataset))

        # Evaluation pass (no gradients, no updates)
        model.eval()
        weighted_test_loss = 0.0
        with torch.no_grad():
            for inputs, targets in test_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                batch_loss = loss_fn(model(inputs), targets)
                weighted_test_loss += batch_loss.item() * inputs.size(0)
        test_losses.append(weighted_test_loss / len(test_loader.dataset))

    return train_losses, test_losses


# Run the activation comparison experiment
(X_train, y_train, X_test, y_test), (train_loader, test_loader), scaler = make_moons_dataloaders()

activations = ["sigmoid", "tanh", "relu"]
results = {}

for act in activations:
    print(f"\nTraining MLP with {act} activation")
    # Re-seed before every run so the three networks start from identical
    # initial weights and (because the shuffling DataLoader draws from the
    # same global RNG) see the same mini-batch order.  Without this the runs
    # differ in more than just the activation function.
    torch.manual_seed(42)
    model = SimpleMLP(activation=act)
    train_losses, test_losses = train_model(model, train_loader, test_loader, epochs=150, lr=5e-2)
    results[act] = (model, train_losses, test_losses)

# Plot loss curves (training loss only)
plt.figure(figsize=(10, 5))
for act in activations:
    _, train_losses, _ = results[act]
    plt.plot(train_losses, label=f"{act} (train)")
plt.title("Training Loss vs Epoch for Different Activations")
plt.xlabel("Epoch")
plt.ylabel("Cross-Entropy Loss")
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Visualize decision boundaries for the final models
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
for ax, act in zip(axes, activations):
    model, _, _ = results[act]
    # X_train / X_test come back from make_moons_dataloaders already
    # standardized, i.e. in the space the models were trained on.  Passing the
    # scaler would standardize the plotting grid a second time and draw the
    # boundary in the wrong place, so no scaler is given here.
    plot_decision_boundary(model, np.vstack([X_train, X_test]),
                           np.hstack([y_train, y_test]),
                           title=f"Activation: {act}", scaler=None, ax=ax)
plt.show()

**Takeaways:**

- All three activations can eventually learn a non-linear boundary that separates most of the (noisy) moons dataset.
- **Sigmoid** often yields slower training and can plateau earlier due to **vanishing gradients** when activations saturate.
- **Tanh** is usually better behaved than Sigmoid (zero-centered), but can still saturate.
- **ReLU** tends to converge faster and is the de facto default for hidden layers in modern networks.

This aligns with the activation function guidelines from Chapter 4.

# 2. Impact of Initialization

Initialization can dramatically influence how fast (or whether) a network learns.

In this experiment we:

- Use one fixed ReLU MLP architecture (2 → 64 → 64 → 2) for every run
- Compare three initialization schemes:
  1. **He initialization** (good for ReLU)
  2. **Too small** weights (almost zero)
  3. **Too large** weights (can cause saturation / exploding activations)
- Train each model and compare loss curves

From Chapter 4 you know that for ReLU, **He initialization** keeps the variance of activations roughly constant across layers, which helps gradients flow.

In [None]:
class DeepMLP(nn.Module):
    """ReLU MLP with two hidden layers of equal width.

    Layout: ``in_dim -> hidden_dim -> hidden_dim -> out_dim``.  ReLU follows
    each hidden layer; the output layer returns raw values.
    """

    def __init__(self, in_dim=2, hidden_dim=64, out_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, out_dim)
        self.act = nn.ReLU()

    def forward(self, x):
        # Both hidden layers share the same activation module.
        for hidden_layer in (self.fc1, self.fc2):
            x = self.act(hidden_layer(x))
        return self.fc3(x)


def init_he(model):
    """Re-initialize every ``nn.Linear`` in ``model`` with He-normal weights.

    Weights are drawn with ``nn.init.kaiming_normal_(..., nonlinearity="relu")``
    and biases are set to zero.  Parameters are modified in place; nothing is
    returned.
    """
    linear_layers = (m for m in model.modules() if isinstance(m, nn.Linear))
    for layer in linear_layers:
        nn.init.kaiming_normal_(layer.weight, nonlinearity="relu")
        nn.init.zeros_(layer.bias)


def init_small(model, scale=1e-3):
    """Give every ``nn.Linear`` in ``model`` tiny zero-mean normal weights.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation ``scale``; biases are set to zero.  Works in place and returns
    nothing.
    """
    for layer in filter(lambda m: isinstance(m, nn.Linear), model.modules()):
        nn.init.normal_(layer.weight, mean=0.0, std=scale)
        nn.init.zeros_(layer.bias)


def init_large(model, scale=1.0):
    """Give every ``nn.Linear`` in ``model`` large zero-mean normal weights.

    Weights are drawn from a normal distribution with mean 0 and standard
    deviation ``scale``; biases are set to zero.  Works in place and returns
    nothing.
    """
    for module in model.modules():
        if not isinstance(module, nn.Linear):
            continue
        nn.init.normal_(module.weight, mean=0.0, std=scale)
        nn.init.zeros_(module.bias)


# Moons data and loaders for the initialization experiment
(X_train2, y_train2, X_test2, y_test2), (train_loader2, test_loader2), scaler2 = make_moons_dataloaders()

# Label -> callable that re-initializes a model in place
inits = dict(
    he=init_he,
    small=lambda m: init_small(m, scale=1e-3),
    large=lambda m: init_large(m, scale=1.0),
)

init_results = {}

for name in inits:
    init_fn = inits[name]
    print(f"\nTraining DeepMLP with {name} initialization")
    model = DeepMLP()
    init_fn(model)  # overwrite PyTorch's default init before training
    train_losses, test_losses = train_model(model, train_loader2, test_loader2, epochs=120, lr=1e-2)
    init_results[name] = (model, train_losses, test_losses)

# Training-loss curve for each initialization scheme
plt.figure(figsize=(10, 5))
for name in inits:
    _, train_losses, _ = init_results[name]
    plt.plot(train_losses, label=f"{name} init")
plt.title("Initialization Impact on Training Loss (ReLU MLP)")
plt.xlabel("Epoch")
plt.ylabel("Cross-Entropy Loss")
plt.legend()
plt.tight_layout()
plt.show()

# 3. Depth and Width Experiment

Here we illustrate how **network capacity** (depth and width) affects what decision boundaries a model can represent.

We will train three models of increasing capacity on the same moons dataset:

1. **Logistic regression** (no hidden layer → linear boundary)
2. **Shallow MLP** with 1 hidden layer (2 → 32 → 2)
3. **Deeper MLP** with 2 hidden layers (2 → 32 → 32 → 2)

We will then visualize their decision regions to see how depth allows increasingly complex boundaries.

In [None]:
class LogisticReg(nn.Module):
    """Logistic regression expressed as a single linear layer.

    The forward pass returns the linear layer's output directly, so the model
    has no hidden layer and no non-linearity of its own.
    """

    def __init__(self, in_dim=2, out_dim=2):
        super().__init__()
        self.fc = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        logits = self.fc(x)
        return logits


class MLP1Hidden(nn.Module):
    """MLP with a single ReLU hidden layer: ``in_dim -> hidden_dim -> out_dim``.

    The output layer returns raw values (no activation is applied to it).
    """

    def __init__(self, in_dim=2, hidden_dim=32, out_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.act = nn.ReLU()

    def forward(self, x):
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)


class MLP2Hidden(nn.Module):
    """MLP with two ReLU hidden layers of equal width.

    Layout: ``in_dim -> hidden_dim -> hidden_dim -> out_dim``; the output
    layer returns raw values.
    """

    def __init__(self, in_dim=2, hidden_dim=32, out_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, out_dim)
        self.act = nn.ReLU()

    def forward(self, x):
        first_hidden = self.act(self.fc1(x))
        second_hidden = self.act(self.fc2(first_hidden))
        return self.fc3(second_hidden)


# Use the same training data as before
(X_train3, y_train3, X_test3, y_test3), (train_loader3, test_loader3), scaler3 = make_moons_dataloaders()

models_depth = {
    "logistic": LogisticReg(),
    "1-hidden": MLP1Hidden(),
    "2-hidden": MLP2Hidden(),
}

trained_models_depth = {}

for name, model in models_depth.items():
    print(f"\nTraining model: {name}")
    train_losses, _ = train_model(model, train_loader3, test_loader3, epochs=150, lr=5e-2)
    trained_models_depth[name] = model

# Plot decision regions
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
all_X = np.vstack([X_train3, X_test3])
all_y = np.hstack([y_train3, y_test3])

for ax, (name, model) in zip(axes, trained_models_depth.items()):
    # all_X is already standardized (make_moons_dataloaders returns scaled
    # arrays), which is the space the models were trained on.  Passing
    # scaler3 would standardize the plotting grid a second time and misplace
    # the boundary, so no scaler is given here.
    plot_decision_boundary(model, all_X, all_y, title=name, scaler=None, ax=ax)

plt.show()

**Takeaways:**

- **Logistic regression** can only learn a **linear** decision boundary; it struggles to separate the interleaving moons.
- A **single hidden layer** already allows a non-linear boundary that fits the data much better.
- Adding a **second hidden layer** can further increase flexibility, often capturing subtler structure with smoother boundaries.

This connects directly to the motivation for deeper networks in Chapter 4.

# 4. Optimizers and Learning Rates

So far we used vanilla SGD. In practice, optimization details matter a lot.

In this experiment we:

- Use the same 2-layer MLP (2 → 32 → 2) with ReLU
- Compare three optimizers:
  - **SGD**
  - **SGD with Momentum**
  - **Adam**
- Compare two learning rates for SGD: a reasonable one and a too-large one

We will plot loss curves to see differences in convergence speed and stability.

In [None]:
class SmallMLP(nn.Module):
    """Small ReLU network used for the optimizer experiments.

    Layout: ``in_dim -> hidden_dim -> out_dim`` with ReLU after the first
    linear layer; the second layer's output is returned as-is.
    """

    def __init__(self, in_dim=2, hidden_dim=32, out_dim=2):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, out_dim)
        self.act = nn.ReLU()

    def forward(self, x):
        return self.fc2(self.act(self.fc1(x)))


def train_with_optimizer(optimizer_name, lr, train_loader, test_loader, epochs=80):
    """Train a new ``SmallMLP`` with the named optimizer; return its loss curve.

    Args:
        optimizer_name: ``"sgd"``, ``"momentum"`` (SGD with momentum 0.9) or
            ``"adam"``.
        lr: Learning rate passed to the optimizer.
        train_loader: Loader providing the training batches.
        test_loader: Accepted but not used by this function.
        epochs: Number of passes over ``train_loader``.

    Returns:
        A list with one training loss per epoch (batch losses weighted by
        batch size, divided by the training-set size).

    Raises:
        ValueError: If ``optimizer_name`` is not one of the names above.
    """
    model = SmallMLP().to(device)

    if optimizer_name in ("sgd", "momentum"):
        # Both variants are torch.optim.SGD; only "momentum" sets momentum.
        sgd_kwargs = {"momentum": 0.9} if optimizer_name == "momentum" else {}
        optimizer = torch.optim.SGD(model.parameters(), lr=lr, **sgd_kwargs)
    elif optimizer_name == "adam":
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    else:
        raise ValueError("Unknown optimizer")

    loss_fn = nn.CrossEntropyLoss()
    train_losses = []

    for _ in range(epochs):
        model.train()
        weighted_loss = 0.0
        for inputs, targets in train_loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            optimizer.zero_grad()
            batch_loss = loss_fn(model(inputs), targets)
            batch_loss.backward()
            optimizer.step()
            weighted_loss += batch_loss.item() * inputs.size(0)
        train_losses.append(weighted_loss / len(train_loader.dataset))

    return train_losses


# Build the moons loaders.  The generator seed is fixed, so this is the same
# data as in the earlier experiments rather than a new sample.
_, (train_loader_opt, test_loader_opt), _ = make_moons_dataloaders()

optimizers = ["sgd", "momentum", "adam"]
lr = 0.05
opt_results = {}

for opt in optimizers:
    print(f"Training with optimizer: {opt}")
    # Re-seed before every run so each optimizer starts from the same initial
    # weights; otherwise the curves also reflect different random inits.
    torch.manual_seed(42)
    losses = train_with_optimizer(opt, lr, train_loader_opt, test_loader_opt, epochs=80)
    opt_results[opt] = losses

# Learning rate experiment for SGD (same seed for both runs, so the learning
# rate is the only thing that differs)
print("\nLearning rate experiment with SGD")
torch.manual_seed(42)
sgd_good = train_with_optimizer("sgd", lr=0.05, train_loader=train_loader_opt, test_loader=test_loader_opt, epochs=80)
torch.manual_seed(42)
sgd_bad = train_with_optimizer("sgd", lr=0.5, train_loader=train_loader_opt, test_loader=test_loader_opt, epochs=80)

# Plot optimizer comparison
plt.figure(figsize=(10, 4))
for opt in optimizers:
    plt.plot(opt_results[opt], label=opt)
plt.title("Optimizer Comparison (Training Loss)")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

# Plot learning rate comparison
plt.figure(figsize=(10, 4))
plt.plot(sgd_good, label="SGD lr=0.05")
plt.plot(sgd_bad, label="SGD lr=0.5")
plt.title("Learning Rate Effect (SGD)")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

**Takeaways:**

- **Adam** and **SGD with Momentum** usually converge faster and more smoothly than plain SGD.
- Too large a learning rate can cause loss to oscillate or diverge, even for simple problems.
- In practice you should **tune both the optimizer and the learning rate** rather than relying on defaults.

# 5. (Optional) Manual Backprop Verification

To build trust in backpropagation, we can verify gradients on a tiny network by comparing them to **numerical gradients**.

We will:

- Define a very small network: 2 → 2 → 1 with Tanh activation
- Use a single input–target pair
- Compute the loss and gradients using PyTorch autograd
- Approximate the gradients with finite differences
- Compare the two sets of gradients.

In [None]:
class TinyNet(nn.Module):
    """Minimal 2 -> 2 -> 1 network with a Tanh hidden layer.

    Small enough that every parameter can be perturbed individually when
    checking gradients numerically.
    """

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 2)
        self.fc2 = nn.Linear(2, 1)
        self.act = nn.Tanh()

    def forward(self, x):
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)


def numerical_gradient(model, x, y, loss_fn, eps=1e-4):
    """Estimate d(loss)/d(parameter) for every parameter by central differences.

    The estimate is computed on a CPU, float64 deep copy of ``model`` so that

    * ``model`` itself is never modified,
    * the function works whatever device ``model`` lives on, and
    * round-off in ``loss(theta + eps) - loss(theta - eps)`` stays far below
      the difference being measured (in float32 it is comparable to it for
      ``eps`` around 1e-4).

    Args:
        model: Network whose parameters are perturbed one entry at a time.
        x: Input passed to the model.
        y: Target passed to ``loss_fn`` as its second argument.
        loss_fn: Callable ``loss_fn(prediction, y)`` returning a scalar loss.
        eps: Perturbation size used in the central difference.

    Returns:
        Dict mapping each parameter name to a CPU tensor with the same shape
        and dtype as that parameter.  The tensors do not require grad.
    """
    import copy

    # Private float64 CPU copy: perturbing it leaves the caller's model intact.
    probe = copy.deepcopy(model).cpu().double()

    x = x.detach().cpu()
    y = y.detach().cpu()
    # Only floating-point tensors are promoted; integer class labels (if any)
    # must keep their dtype.
    if x.is_floating_point():
        x = x.double()
    if y.is_floating_point():
        y = y.double()

    original_dtypes = {name: p.dtype for name, p in model.named_parameters()}
    grads = {}

    # No autograd graph is needed for finite differences.
    with torch.no_grad():
        for name, param in probe.named_parameters():
            grad_approx = torch.zeros_like(param)

            for idx in np.ndindex(tuple(param.shape)):
                original = param[idx].item()

                # f(theta + eps)
                param[idx] = original + eps
                loss_pos = loss_fn(probe(x), y).item()

                # f(theta - eps)
                param[idx] = original - eps
                loss_neg = loss_fn(probe(x), y).item()

                # restore before moving on to the next entry
                param[idx] = original

                # central difference
                grad_approx[idx] = (loss_pos - loss_neg) / (2 * eps)

            grads[name] = grad_approx.to(original_dtypes[name])
    return grads


# Gradient check
torch.manual_seed(0)
net = TinyNet().to(device)

# A single input-target pair is enough for the comparison
x = torch.tensor([[0.5, -1.0]], dtype=torch.float32, device=device)
y = torch.tensor([[1.0]], dtype=torch.float32, device=device)

loss_fn = nn.MSELoss()

# Autograd gradients: one forward/backward pass, then copy each .grad to CPU
net.zero_grad()
y_pred = net(x)
loss = loss_fn(y_pred, y)
loss.backward()
autograd_grads = {
    pname: tensor.grad.detach().cpu().clone()
    for pname, tensor in net.named_parameters()
}

# Numerical gradients (on a CPU copy carrying the same weights)
net_cpu = TinyNet()
net_cpu.load_state_dict(net.state_dict())
num_grads = numerical_gradient(net_cpu, x.detach().cpu(), y.detach().cpu(), loss_fn)

print("Gradient check (autograd vs numerical):")
for name, auto_grad in autograd_grads.items():
    numeric_grad = num_grads[name]
    print(f"\nParameter: {name}")
    print("Autograd:\n", auto_grad.numpy())
    print("Numerical:\n", numeric_grad.detach().numpy())
    diff = auto_grad - numeric_grad
    print("Max abs diff:", diff.abs().max().item())

## Summary

In this experimental lab we:

- Compared **Sigmoid, Tanh, and ReLU** activations and saw how they affect training speed and decision boundaries.
- Demonstrated how **initialization** (He vs too small vs too large) influences convergence.
- Showed how **depth** changes model capacity, from linear logistic regression to deeper MLPs.
- Compared **optimizers** (SGD, Momentum, Adam) and learning rates, highlighting convergence and stability issues.
- Performed a small **gradient check** to verify that backpropagated gradients match numerical approximations.

These experiments bring the theory of Chapter 4 to life and provide intuition for designing, initializing, and training your own neural networks.