In [None]:
from IPython.display import YouTubeVideo

# Embed the YouTube video with ID "hfMk-kjRv4c" in the cell output as a 640x360 player.
YouTubeVideo("hfMk-kjRv4c", width=640, height=360)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Create figure and axis
fig, ax = plt.subplots()
xdata, ydata = [], []  # points traced so far; mutated in place by init/update
ln, = ax.plot([], [], 'ro-')

def init():
    """Set the axis limits and reset the traced curve before a playback pass.

    Returns:
        A one-element tuple holding the line artist, as blitting requires.
    """
    ax.set_xlim(0, 2*np.pi)
    ax.set_ylim(-1.1, 1.1)
    # Start every pass from an empty curve. Without this, rendering the
    # animation a second time appends to the points left over from the
    # previous pass and draws a line back from 2*pi to 0.
    xdata.clear()
    ydata.clear()
    ln.set_data(xdata, ydata)
    return ln,

def update(frame):
    """Append the point (frame, sin(frame)) to the curve and redraw the line.

    Args:
        frame: x position for this frame, taken from the ``frames`` sequence.

    Returns:
        A one-element tuple holding the line artist, as blitting requires.
    """
    xdata.append(frame)
    ydata.append(np.sin(frame))
    ln.set_data(xdata, ydata)
    return ln,

ani = FuncAnimation(fig, update, frames=np.linspace(0, 2*np.pi, 64),
                    init_func=init, blit=True, interval=50)

# Display in Jupyter: render the frames first, then close the figure so an
# inline backend does not also emit a static copy of it below the animation.
animation_html = ani.to_jshtml()
plt.close(fig)
HTML(animation_html)

In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Seed both generators so the data set and the model initialisation are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Toy data set: two Gaussian blobs in 2D (standard deviation 0.4 each)
n_points = 100
noise = 0.2  # not referenced anywhere else in this cell

# Class 0 is centred on (-0.5, -0.5); class 1 on (0.5, 0.5)
X0 = np.array([-0.5, -0.5]) + 0.4 * np.random.randn(n_points, 2)
X1 = np.array([0.5, 0.5]) + 0.4 * np.random.randn(n_points, 2)

# Stack the classes; labels are float32 because BCELoss compares float targets
X = np.concatenate([X0, X1], axis=0).astype(np.float32)
y = np.concatenate([np.zeros(n_points), np.ones(n_points)]).astype(np.float32)

# Torch views of the arrays; labels become a column so they match the model output
X_tensor = torch.from_numpy(X)
y_tensor = torch.from_numpy(y)[:, None]

# Single linear unit followed by a sigmoid
class SimpleClassifier(nn.Module):
    """Map a 2-feature input to one value in (0, 1) via a linear layer and a sigmoid."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(in_features=2, out_features=1)

    def forward(self, x):
        """Return ``sigmoid(linear(x))`` for a batch ``x`` whose last dimension is 2."""
        logit = self.linear(x)
        return logit.sigmoid()

model = SimpleClassifier()

# Report the size of each learnable tensor, then the overall count
print("\nModel Parameters:")
total_params = 0
for name, param in model.named_parameters():
    print(f"  {name}: {param.shape} = {param.numel()} params")
    total_params += param.numel()
print(f"\nTotal: {total_params} parameters")

# Binary cross-entropy on the sigmoid output, optimised with plain SGD
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)

# One snapshot of the model output and one loss value is kept per epoch
epochs = 100
boundaries, losses = [], []

# 100x100 lattice over [-2, 2] x [-2, 2] on which the model is evaluated
xx, yy = np.meshgrid(np.linspace(-2, 2, 100), np.linspace(-2, 2, 100))
grid = torch.from_numpy(np.column_stack([xx.ravel(), yy.ravel()]).astype(np.float32))

# Full-batch training: one gradient step per epoch
for epoch in range(epochs):
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # The recorded loss was computed before this epoch's step; the snapshot
    # below is taken after it.
    losses.append(loss.item())
    with torch.no_grad():
        Z = model(grid).numpy().reshape(xx.shape)
    boundaries.append(Z.copy())

# Create animation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# Initialize plots
contour = [None]  # placeholder; nothing in this cell reads it
# The data points are drawn once, with a high zorder, so that every contour
# added later renders underneath them and they never need to be redrawn.
scatter0 = ax1.scatter(X0[:, 0], X0[:, 1], c='red', edgecolors='white', s=60,
                       zorder=10, label='Red')
scatter1 = ax1.scatter(X1[:, 0], X1[:, 1], c='blue', edgecolors='white', s=60,
                       zorder=10, label='Blue')
ax1.set_xlim(-2, 2)
ax1.set_ylim(-2, 2)
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.legend()
title = ax1.set_title('Epoch 0')

# Loss plot
loss_line, = ax2.plot([], [], 'b-', linewidth=2)
ax2.set_xlim(0, epochs)
ax2.set_ylim(0, max(losses) * 1.1)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Loss')
ax2.set_title('Training Loss')
ax2.grid(True, alpha=0.3)

def init():
    """Animation init hook; returns no artists because blitting is disabled."""
    return []

def update(frame):
    """Draw the state recorded for one epoch.

    Replaces the previous frame's contours with the decision regions and the
    0.5 boundary for ``boundaries[frame]``, then updates the title and the
    loss curve.

    Args:
        frame: Zero-based index into ``boundaries`` and ``losses``.

    Returns:
        An empty list (the animation is created with ``blit=False``).
    """
    # The first two collections are the scatter plots; everything after them
    # is a contour left over from the previous frame.
    for stale in ax1.collections[2:]:
        stale.remove()

    # Shaded class regions plus the 0.5 decision boundary
    ax1.contourf(xx, yy, boundaries[frame], levels=[0, 0.5, 1],
                 colors=['#ffcccc', '#ccccff'], alpha=0.6)
    ax1.contour(xx, yy, boundaries[frame], levels=[0.5],
                colors=['black'], linewidths=2)

    title.set_text(f'Epoch {frame + 1} | Loss: {losses[frame]:.4f}')

    # Update loss plot
    loss_line.set_data(range(frame + 1), losses[:frame + 1])

    return []

ani = FuncAnimation(fig, update, frames=epochs, init_func=init,
                    interval=100, blit=False)
plt.tight_layout()
# Render the frames first, then close the figure so an inline backend does not
# also emit a static copy of it below the animation.
animation_html = ani.to_jshtml()
plt.close(fig)
HTML(animation_html)


## Single Output vs Two Output Classifier

| Aspect | 1 Output (Above) | 2 Outputs (Below) |
|--------|------------------|-------------------|
| **Architecture** | 2 → 1 | 2 → 2 |
| **Output Activation** | Sigmoid | Softmax |
| **Loss Function** | BCELoss | CrossEntropyLoss |
| **Output Meaning** | P(blue) | [P(red), P(blue)] |
| **Parameters** | 3 (2 weights + 1 bias) | 6 (4 weights + 2 biases) |
| **Decision** | output > 0.5 → blue | argmax([out0, out1]) |

**Why use 2 outputs?**
- Scales naturally to multi-class (3+)
- Each class has its own "confidence score"
- Softmax ensures probabilities sum to 1


In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML

# Seed both generators so the data set and the model initialisation are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Toy data set: two Gaussian blobs in 2D (standard deviation 0.4 each)
n_points = 100

# Class 0 (red) is centred on (-0.5, -0.5); class 1 (blue) on (0.5, 0.5)
X0 = np.array([-0.5, -0.5]) + 0.4 * np.random.randn(n_points, 2)
X1 = np.array([0.5, 0.5]) + 0.4 * np.random.randn(n_points, 2)

# Stack the classes; labels are integer class indices (0 = red, 1 = blue)
X = np.concatenate([X0, X1], axis=0).astype(np.float32)
y = np.repeat([0, 1], n_points)

# Torch views of the arrays; CrossEntropyLoss takes int64 class indices as targets
X_tensor = torch.from_numpy(X)
y_tensor = torch.from_numpy(y).long()

# Linear model with one output per class
class TwoOutputClassifier(nn.Module):
    """Map a 2-feature input to two raw class scores (logits) with one linear layer."""

    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(in_features=2, out_features=2)

    def forward(self, x):
        """Return the unnormalised logits; no softmax is applied here."""
        logits = self.linear(x)
        return logits

model = TwoOutputClassifier()

print("Model Architecture: 2 inputs → 2 outputs")
print("\nModel Parameters:")
for name, param in model.named_parameters():
    print(f"  {name}: {param.shape} = {param.numel()} params")
total_params = sum(p.numel() for p in model.parameters())
print(f"\nTotal: {total_params} parameters")

# CrossEntropyLoss = LogSoftmax + NLLLoss, so the model must output raw logits
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1.0)

# One snapshot of P(blue) over the grid and one loss value is kept per epoch
epochs = 100
boundaries = []
losses = []

# Create mesh grid for decision boundary visualization
xx, yy = np.meshgrid(np.linspace(-2, 2, 100), np.linspace(-2, 2, 100))
grid = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))

# Training loop - full-batch, one gradient step per epoch
for epoch in range(epochs):
    # Forward pass
    outputs = model(X_tensor)  # Shape: [200, 2]
    loss = criterion(outputs, y_tensor)

    # Backward pass
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Save decision boundary (probability of class 1 / blue). The recorded
    # loss was computed before this epoch's step; the snapshot is taken after it.
    with torch.no_grad():
        logits = model(grid)
        probs = torch.softmax(logits, dim=1)  # Convert to probabilities
        Z = probs[:, 1].numpy().reshape(xx.shape)  # P(blue)
        boundaries.append(Z.copy())
        losses.append(loss.item())

# Create animation
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('Two-Output Classifier (2→2 with Softmax)', fontsize=14, fontweight='bold')

# Initialize plots. The data points are drawn once, with a high zorder, so that
# every contour added later renders underneath them.
scatter0 = ax1.scatter(X0[:, 0], X0[:, 1], c='red', edgecolors='white', s=60,
                       zorder=10, label='Red (class 0)')
scatter1 = ax1.scatter(X1[:, 0], X1[:, 1], c='blue', edgecolors='white', s=60,
                       zorder=10, label='Blue (class 1)')
ax1.set_xlim(-2, 2)
ax1.set_ylim(-2, 2)
ax1.set_xlabel('Feature 1')
ax1.set_ylabel('Feature 2')
ax1.legend()
title = ax1.set_title('Epoch 0')

# Loss plot
loss_line, = ax2.plot([], [], 'b-', linewidth=2)
ax2.set_xlim(0, epochs)
ax2.set_ylim(0, max(losses) * 1.1)
ax2.set_xlabel('Epoch')
ax2.set_ylabel('CrossEntropy Loss')
ax2.set_title('Training Loss')
ax2.grid(True, alpha=0.3)

def init():
    """Animation init hook; returns no artists because blitting is disabled."""
    return []

def update(frame):
    """Draw the state recorded for one epoch.

    Replaces the previous frame's contours with the decision regions and the
    0.5 boundary for ``boundaries[frame]``, then updates the title and the
    loss curve.

    Args:
        frame: Zero-based index into ``boundaries`` and ``losses``.

    Returns:
        An empty list (the animation is created with ``blit=False``).
    """
    # The first two collections are the scatter plots; everything after them
    # is a contour left over from the previous frame.
    for stale in ax1.collections[2:]:
        stale.remove()

    # Shaded class regions plus the P(blue) = 0.5 decision boundary
    ax1.contourf(xx, yy, boundaries[frame], levels=[0, 0.5, 1],
                 colors=['#ffcccc', '#ccccff'], alpha=0.6)
    ax1.contour(xx, yy, boundaries[frame], levels=[0.5],
                colors=['black'], linewidths=2)

    title.set_text(f'Epoch {frame + 1} | Loss: {losses[frame]:.4f}')

    # Update loss plot
    loss_line.set_data(range(frame + 1), losses[:frame + 1])

    return []

ani = FuncAnimation(fig, update, frames=epochs, init_func=init,
                    interval=100, blit=False)
plt.tight_layout()
# Render the frames first, then close the figure so an inline backend does not
# also emit a static copy of it below the animation.
animation_html = ani.to_jshtml()
plt.close(fig)
HTML(animation_html)


## What is a Logit?

**Logit** = raw output of a neural network *before* applying sigmoid/softmax.

```
Input → Linear Layer → [LOGIT] → Sigmoid/Softmax → Probability
                          ↑
                    Range: -∞ to +∞
```

### Why is it called "Logit"?

The name comes from **"log"** + **"unit"** (or logistic unit).

It's the **log-odds** (logarithm of the odds ratio):

$$\text{logit}(p) = \log\left(\frac{p}{1-p}\right)$$

Where $p$ is a probability and $\frac{p}{1-p}$ is the **odds**.

| Probability | Odds (p / 1-p) | Logit (log-odds) |
|-------------|----------------|------------------|
| 0.5 | 1:1 = 1.0 | 0 |
| 0.73 | ~2.7:1 = 2.7 | ~1.0 |
| 0.95 | 19:1 = 19 | ~2.9 |
| 0.01 | 1:99 = 0.01 | -4.6 |

### Converting Between Logit and Probability

**Logit → Probability** (Sigmoid function):

$$P = \frac{1}{1 + e^{-\text{logit}}}$$

| Logit | → Probability |
|-------|---------------|
| -3 | 0.05 |
| 0 | 0.50 |
| +3 | 0.95 |

**Probability → Logit** (Logit function):

$$\text{logit} = \log\left(\frac{P}{1-P}\right)$$

These are inverse functions of each other.

### Why use logits in neural networks?

1. **Numerical stability** — probabilities near 0 or 1 cause issues; logits don't
2. **CrossEntropyLoss expects logits** — applies softmax internally for stability
3. **Unbounded range** — easier for gradient descent to optimize


## From Sigmoid to Softmax (The Natural Progression)

### Binary Classification (2 classes)

With **1 output** neuron, we use **sigmoid**:

$$P(\text{blue}) = \frac{1}{1 + e^{-z}}$$

where $z$ is the single logit. Then $P(\text{red}) = 1 - P(\text{blue})$.

---

### Multi-class Classification (3+ classes)

What if we have **Red, Blue, Green**? We need 3 outputs.

Each class gets its own logit: $z_{\text{red}}, z_{\text{blue}}, z_{\text{green}}$

**Problem**: How do we convert 3 logits into 3 probabilities that sum to 1?

**Solution**: **Softmax**

$$P(\text{class } i) = \frac{e^{z_i}}{\sum_j e^{z_j}}$$

**Example**:
| Class | Logit ($z$) | $e^z$ | Softmax (probability) |
|-------|-------------|-------|----------------------|
| Red | 2.0 | 7.4 | 7.4 / 11.7 = **0.63** |
| Blue | 1.0 | 2.7 | 2.7 / 11.7 = **0.23** |
| Green | 0.5 | 1.6 | 1.6 / 11.7 = **0.14** |
| | | **Sum: 11.7** | **Sum: 1.00** |

### Softmax = Generalized Sigmoid

For **2 classes**, softmax reduces to sigmoid! (try the math)

```
Softmax with 2 outputs    ≡    Sigmoid with 1 output
     [z₀, z₁]                        z = z₁ - z₀
```

---

## From BCE Loss to Cross-Entropy Loss

### Binary Cross-Entropy (BCE) Loss

For binary classification (sigmoid output):

$$\text{BCE} = -[y \cdot \log(p) + (1-y) \cdot \log(1-p)]$$

- If true label $y=1$ (blue): Loss = $-\log(p)$ → punish low $p$
- If true label $y=0$ (red): Loss = $-\log(1-p)$ → punish high $p$

### Cross-Entropy Loss (Multi-class)

For multi-class (softmax outputs):

$$\text{CE} = -\log(p_{\text{correct class}})$$

Just the negative log of the probability assigned to the **true class**.

**Example**: True class = Red, model outputs $[0.63, 0.23, 0.14]$
$$\text{Loss} = -\log(0.63) \approx 0.46$$

### Summary Table

| Classes | Output Activation | Loss Function | PyTorch |
|---------|------------------|---------------|---------|
| 2 (binary) | Sigmoid | BCE Loss | `nn.BCELoss()` |
| 2+ (multi) | Softmax | Cross-Entropy | `nn.CrossEntropyLoss()` |

**Note**: `nn.CrossEntropyLoss()` combines softmax + CE loss internally for numerical stability. That's why we pass **logits**, not probabilities.


## Why Exponentiate? Why Not Just Normalize?

Great question! You're asking: why use $\frac{e^{z_i}}{\sum e^{z_j}}$ instead of just $\frac{z_i}{\sum z_j}$?

### Problem 1: Logits Can Be Negative

Logits range from $-\infty$ to $+\infty$. Simple normalization breaks:

| Class | Logit | Simple normalize | Problem |
|-------|-------|------------------|---------|
| Red | 3 | 3/4 = 0.75 | |
| Blue | 1 | 1/4 = 0.25 | ✓ works |

| Class | Logit | Simple normalize | Problem |
|-------|-------|------------------|---------|
| Red | 2 | 2/0 = ??? | ÷ by zero! |
| Blue | -2 | -2/0 = ??? | |

| Class | Logit | Simple normalize | Problem |
|-------|-------|------------------|---------|
| Red | -1 | -1/-3 = 0.33 | |
| Blue | -2 | -2/-3 = 0.67 | ❌ Blue wins but has LOWER score! |

**Exponential fixes this**: $e^z > 0$ always, so no negatives or zeros.

### Problem 2: We Want to Amplify Differences

Softmax makes the **largest logit dominate**:

| Class | Logit | $e^z$ | Softmax |
|-------|-------|-------|---------|
| Red | 5 | 148.4 | **0.88** |
| Blue | 3 | 20.1 | 0.12 |

The difference was only 2, but softmax gives Red 88% confidence.

With simple average: Red = 5/8 = 0.625. Much less decisive.

### Problem 3: Mathematical Properties

1. **Gradients are clean**: $\frac{\partial}{\partial z_i} \text{softmax}(z)_i = p_i(1-p_i)$ (same form as sigmoid!)
2. **Connects to physics**: Boltzmann distribution in thermodynamics
3. **Information theory**: Minimizing cross-entropy = maximizing likelihood

### Your Intuition Isn't Wrong Though!

There ARE alternatives:
- **Sparsemax**: Can output exact zeros (sparse probabilities)
- **Temperature scaling**: $\frac{e^{z_i / T}}{\sum e^{z_j / T}}$ controls "sharpness"
  - $T \to 0$: Winner takes all (argmax)
  - $T \to \infty$: Uniform distribution (your averaging idea!)
  - $T = 1$: Standard softmax


## Multi-Layer Neural Network

```
Input (2)  →  Hidden1 (16 neurons)  →  Hidden2 (16 neurons)  →  Output (2)
   x₁ ─────┬──→ [16 neurons] ────────→ [16 neurons] ────────┬──→ Red
   x₂ ─────┘        + ReLU                 + ReLU           └──→ Blue
```

**Why 16 neurons?** More neurons = more capacity to learn complex curved boundaries.

With only 2-3 neurons, the network can only combine a few "directions" → still mostly linear. With 16, it can carve out circular/complex shapes.


In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
from IPython.display import HTML, display
import ipywidgets as widgets

def train_and_animate(layer1_neurons, layer2_neurons, seed, epochs):
    """Train a 2 -> n1 -> n2 -> 2 ReLU classifier on disc-vs-ring data and animate it.

    The data consists of 150 points inside a disc of radius 0.5 (class 0, red)
    and 150 points in a ring with radius between 0.7 and 1.0 (class 1, blue).
    The model is trained full-batch with Adam (lr=0.05) and cross-entropy loss.
    Every second epoch a snapshot of P(blue) over a 50x50 grid is stored, and
    the snapshots become the frames of the returned animation.

    Args:
        layer1_neurons: Width of the first hidden layer.
        layer2_neurons: Width of the second hidden layer.
        seed: Seed applied to both the torch and the NumPy global generators.
        epochs: Number of training epochs; must be at least 1.

    Returns:
        An IPython ``HTML`` object wrapping the JavaScript animation.

    Raises:
        ValueError: If ``epochs`` is smaller than 1 (there would be no frame
            to animate and no loss to report).
    """
    if epochs < 1:
        raise ValueError(f"epochs must be at least 1, got {epochs}")

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Generate circular data
    n_points = 150

    # Class 0 (Red): inner disc, radius drawn uniformly from [0, 0.5)
    theta0 = np.random.uniform(0, 2*np.pi, n_points)
    r0 = np.random.uniform(0, 0.5, n_points)
    X0 = np.column_stack([r0 * np.cos(theta0), r0 * np.sin(theta0)])

    # Class 1 (Blue): outer ring, radius drawn uniformly from [0.7, 1.0)
    theta1 = np.random.uniform(0, 2*np.pi, n_points)
    r1 = np.random.uniform(0.7, 1.0, n_points)
    X1 = np.column_stack([r1 * np.cos(theta1), r1 * np.sin(theta1)])

    X = np.vstack([X0, X1]).astype(np.float32)
    y = np.array([0]*n_points + [1]*n_points)

    X_tensor = torch.from_numpy(X)
    y_tensor = torch.from_numpy(y).long()

    # Build network with custom architecture
    class CustomClassifier(nn.Module):
        """Two hidden ReLU layers followed by a linear layer producing 2 logits."""

        def __init__(self, n1, n2):
            super().__init__()
            self.layer1 = nn.Linear(2, n1)
            self.layer2 = nn.Linear(n1, n2)
            self.layer3 = nn.Linear(n2, 2)
            self.relu = nn.ReLU()

        def forward(self, x):
            x = self.relu(self.layer1(x))
            x = self.relu(self.layer2(x))
            x = self.layer3(x)
            return x

    model = CustomClassifier(layer1_neurons, layer2_neurons)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    print(f"Architecture: 2 → {layer1_neurons} → {layer2_neurons} → 2")
    print(f"Total parameters: {total_params}")

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

    # Training
    boundaries = []
    losses = []

    # Reduced grid resolution for faster animation (50x50 instead of 100x100)
    xx, yy = np.meshgrid(np.linspace(-1.5, 1.5, 50), np.linspace(-1.5, 1.5, 50))
    grid = torch.from_numpy(np.c_[xx.ravel(), yy.ravel()].astype(np.float32))

    # Progress display
    progress = widgets.IntProgress(value=0, min=0, max=epochs, description='Training:')
    status = widgets.Label(value='')
    display(widgets.HBox([progress, status]))

    for epoch in range(epochs):
        outputs = model(X_tensor)
        loss = criterion(outputs, y_tensor)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Update progress
        progress.value = epoch + 1
        status.value = f'Epoch {epoch + 1}/{epochs} | Loss: {loss.item():.4f}'

        # Keep a snapshot every second epoch to halve the number of frames
        if epoch % 2 == 0:
            with torch.no_grad():
                logits = model(grid)
                probs = torch.softmax(logits, dim=1)
                Z = probs[:, 1].numpy().reshape(xx.shape)
                boundaries.append(Z.copy())
                losses.append(loss.item())

    # Report the loss of the last epoch that actually ran. ``losses[-1]`` is
    # only the last *snapshotted* loss, which lags one epoch behind whenever
    # the epoch count is even.
    status.value = f'Done! Final Loss: {loss.item():.4f}'

    # Create animation
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(f'Network: 2 → {layer1_neurons} → {layer2_neurons} → 2  ({total_params} params)',
                 fontsize=14, fontweight='bold')

    # The data points are drawn once, with a high zorder, so that every
    # contour added later renders underneath them.
    ax1.scatter(X0[:, 0], X0[:, 1], c='red', edgecolors='white', s=40,
                zorder=10, label='Red')
    ax1.scatter(X1[:, 0], X1[:, 1], c='blue', edgecolors='white', s=40,
                zorder=10, label='Blue')
    ax1.set_xlim(-1.5, 1.5)
    ax1.set_ylim(-1.5, 1.5)
    ax1.set_aspect('equal')
    ax1.legend(loc='upper right')
    title = ax1.set_title('Epoch 0')

    loss_line, = ax2.plot([], [], 'b-', linewidth=2)
    ax2.set_xlim(0, len(boundaries))
    ax2.set_ylim(0, max(losses) * 1.1)
    ax2.set_xlabel('Epoch (÷2)')
    ax2.set_ylabel('Loss')
    ax2.grid(True, alpha=0.3)

    def update(frame):
        """Draw snapshot ``frame``: probability field, boundary, title and loss curve."""
        # The first two collections are the scatter plots; everything after
        # them is a contour left over from the previous frame.
        for stale in ax1.collections[2:]:
            stale.remove()

        ax1.contourf(xx, yy, boundaries[frame], levels=np.linspace(0, 1, 20),
                     cmap='RdBu', alpha=0.6)
        ax1.contour(xx, yy, boundaries[frame], levels=[0.5], colors=['black'], linewidths=2)

        # Snapshot ``frame`` was taken during loop index 2*frame, which the
        # status line above reports as epoch 2*frame + 1.
        title.set_text(f'Epoch {frame * 2 + 1} | Loss: {losses[frame]:.4f}')
        loss_line.set_data(range(frame + 1), losses[:frame + 1])
        return []

    ani = FuncAnimation(fig, update, frames=len(boundaries), interval=80, blit=False)
    plt.tight_layout()
    # Render the frames first, then close the figure so an inline backend does
    # not also emit a static copy of it.
    animation_html = ani.to_jshtml()
    plt.close(fig)
    return HTML(animation_html)

# Controls for the interactive training demo
layer1_input = widgets.IntSlider(
    description='Layer 1:', value=8, min=2, max=32, step=1)
layer2_input = widgets.IntSlider(
    description='Layer 2:', value=8, min=2, max=32, step=1)
seed_input = widgets.IntText(description='Seed:', value=0)
epochs_input = widgets.IntSlider(
    description='Epochs:', value=50, min=10, max=300, step=10)
run_button = widgets.Button(button_style='primary', description='Train & Animate')
# Printed text and the animation are rendered into this output area
output = widgets.Output()

def on_button_click(b):
    """Train with the current widget values and show the animation in ``output``.

    Args:
        b: The clicked button instance supplied by ipywidgets (not used).
    """
    # Drop whatever the previous run left in the output area
    output.clear_output()
    with output:
        print(f"🎲 Seed: {seed_input.value} | Epochs: {epochs_input.value}")
        result = train_and_animate(layer1_input.value, layer2_input.value,
                                   seed_input.value, epochs_input.value)
        display(result)

# Run a training pass whenever the button is pressed
run_button.on_click(on_button_click)

# Layout: architecture sliders, then seed/epochs, then the button and its output area
display(widgets.VBox(children=[
    widgets.HBox(children=[layer1_input, layer2_input]),
    widgets.HBox(children=[seed_input, epochs_input]),
    run_button,
    output,
]))


In [None]:
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
import ipywidgets as widgets
from IPython.display import display

def test_architecture_with_decay(n1, n2, num_seeds=20, epochs=300):
    """Test architecture and track alive neurons per epoch."""
    all_results = []
    
    for seed in range(num_seeds):
        torch.manual_seed(seed)
        np.random.seed(seed)
        
        # Generate circular data
        n_points = 150
        theta0 = np.random.uniform(0, 2*np.pi, n_points)
        r0 = np.random.uniform(0, 0.5, n_points)
        X0 = np.column_stack([r0 * np.cos(theta0), r0 * np.sin(theta0)])
        
        theta1 = np.random.uniform(0, 2*np.pi, n_points)
        r1 = np.random.uniform(0.7, 1.0, n_points)
        X1 = np.column_stack([r1 * np.cos(theta1), r1 * np.sin(theta1)])
        
        X = torch.from_numpy(np.vstack([X0, X1]).astype(np.float32))
        y = torch.from_numpy(np.array([0]*n_points + [1]*n_points)).long()
        
        # Model with activation tracking
        class TrackedNet(nn.Module):
            def __init__(self, n1, n2):
                super().__init__()
                self.l1 = nn.Linear(2, n1)
                self.l2 = nn.Linear(n1, n2)
                self.l3 = nn.Linear(n2, 2)
                self.act1 = None
                self.act2 = None
            
            def forward(self, x):
                x = torch.relu(self.l1(x))
                self.act1 = x.detach()
                x = torch.relu(self.l2(x))
                self.act2 = x.detach()
                return self.l3(x)
        
        model = TrackedNet(n1, n2)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
        criterion = nn.CrossEntropyLoss()
        
        # Track alive neurons PER LAYER over epochs
        alive_l1_history = []
        alive_l2_history = []
        
        for epoch in range(epochs):
            out = model(X)
            loss = criterion(out, y)
            
            # Count alive neurons per layer
            alive1 = (model.act1.sum(dim=0) > 0).sum().item()
            alive2 = (model.act2.sum(dim=0) > 0).sum().item()
            alive_l1_history.append(alive1)
            alive_l2_history.append(alive2)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        # Final accuracy
        with torch.no_grad():
            preds = model(X).argmax(dim=1)
            acc = (preds == y).float().mean().item()
        
        all_results.append({
            'seed': seed,
            'accuracy': acc,
            'loss': loss.item(),
            'alive_l1': alive_l1_history,
            'alive_l2': alive_l2_history,
            'final_alive': alive_l1_history[-1] + alive_l2_history[-1]
        })
    
    return all_results, n1, n2

# Controls for the multi-seed reliability experiment
layer1_test = widgets.IntSlider(
    description='Layer 1:', value=4, min=2, max=32, step=1)
layer2_test = widgets.IntSlider(
    description='Layer 2:', value=4, min=2, max=32, step=1)
seeds_test = widgets.IntSlider(
    description='# Seeds:', value=20, min=5, max=50, step=5)
test_button = widgets.Button(button_style='success', description='Test Success Rate')
# Printed text and the figure are rendered into this output area
test_output = widgets.Output()

def on_test_click(b):
    """Run the multi-seed experiment for the selected architecture and report it.

    Reads the layer widths and the seed count from the widgets, calls
    ``test_architecture_with_decay``, prints the success rate, plots the
    alive-neuron histories (one subplot per seed) and prints a summary.
    Everything is rendered into ``test_output``.

    Args:
        b: The clicked button instance supplied by ipywidgets (not used).
    """
    success_threshold = 0.95  # a run counts as a success above this accuracy

    test_output.clear_output()
    with test_output:
        n1, n2 = layer1_test.value, layer2_test.value
        num_seeds = seeds_test.value
        total_neurons = n1 + n2

        print(f"Testing 2 → {n1} → {n2} → 2 with {num_seeds} seeds...")
        print("=" * 50)

        results, _, _ = test_architecture_with_decay(n1, n2, num_seeds)

        # Split the runs once; both lists are reused by the summary below
        succeeded = [r for r in results if r['accuracy'] > success_threshold]
        failed = [r for r in results if r['accuracy'] <= success_threshold]
        successes = len(succeeded)
        rate = successes / num_seeds * 100

        # Color-coded result
        if rate >= 80:
            emoji = "🟢"
        elif rate >= 50:
            emoji = "🟡"
        else:
            emoji = "🔴"

        print(f"\n{emoji} Success Rate: {successes}/{num_seeds} = {rate:.0f}%")

        # Weights plus biases of the three linear layers
        params = 2*n1 + n1 + n1*n2 + n2 + n2*2 + 2
        print(f"   Total Parameters: {params}")
        print(f"   Total Neurons: {total_neurons}")

        # Plot: Grid of graphs - one per seed (BIGGER - 2 per row)
        cols = 2
        rows = (num_seeds + cols - 1) // cols
        fig, axes = plt.subplots(rows, cols, figsize=(14, 5 * rows))
        axes = axes.flatten()

        for i, r in enumerate(results):
            ax = axes[i]
            is_success = r['accuracy'] > success_threshold

            # Color based on success/fail
            if is_success:
                color = 'green'
                status = '✅ SUCCESS'
            else:
                color = 'red'
                status = '❌ FAILED'

            # Take the x extent from the recorded history instead of assuming
            # 300 epochs; with 300 epochs the labels sit at x = 290 as before.
            n_epochs = len(r['alive_l1'])
            epochs_range = range(n_epochs)
            label_x = n_epochs * 29 / 30

            # Layer 1 line
            ax.plot(epochs_range, r['alive_l1'], color='blue', linewidth=2.5,
                    label=f'Layer 1 (max {n1})')
            ax.axhline(y=n1, color='blue', linestyle='--', alpha=0.3, linewidth=1)

            # Layer 2 line
            ax.plot(epochs_range, r['alive_l2'], color='orange', linewidth=2.5,
                    label=f'Layer 2 (max {n2})')
            ax.axhline(y=n2, color='orange', linestyle='--', alpha=0.3, linewidth=1)

            # Mark final values
            end_l1 = r['alive_l1'][-1]
            end_l2 = r['alive_l2'][-1]

            ax.annotate(f'L1: {end_l1}/{n1}', xy=(label_x, end_l1), fontsize=9, color='blue',
                        fontweight='bold', ha='right')
            # Shift the layer-2 label down so it does not sit on the layer-1 label
            ax.annotate(f'L2: {end_l2}/{n2}', xy=(label_x, end_l2 - 0.8), fontsize=9, color='orange',
                        fontweight='bold', ha='right')

            # Background color based on success/fail
            ax.set_facecolor('#e8f5e9' if is_success else '#ffebee')

            ax.set_ylim(0, max(n1, n2) + 1)
            ax.set_xlim(0, n_epochs)
            ax.set_title(f"Seed {r['seed']} — {status}\nAccuracy: {r['accuracy']*100:.0f}%",
                         fontsize=11, fontweight='bold', color=color)
            ax.set_xlabel('Epoch', fontsize=10)
            ax.set_ylabel('Alive Neurons', fontsize=10)
            ax.grid(True, alpha=0.3)
            ax.legend(loc='lower left', fontsize=8)

        # Hide empty subplots
        for i in range(num_seeds, len(axes)):
            axes[i].axis('off')

        fig.suptitle(f'How Many Neurons Stay Alive During Training?\n'
                     f'Architecture: 2 → {n1} → {n2} → 2 (Total: {total_neurons} neurons)\n'
                     f'Line going DOWN = neurons dying = network losing capacity',
                     fontsize=14, fontweight='bold')
        plt.tight_layout()
        plt.show()

        # Add explanation
        print("\n" + "="*60)
        print("📖 HOW TO READ THESE GRAPHS:")
        print("="*60)
        print(f"• 🔵 Blue line = Layer 1 alive neurons (max {n1})")
        print(f"• 🟠 Orange line = Layer 2 alive neurons (max {n2})")
        print(f"• Dashed lines = Maximum for each layer")
        print(f"• Green background = SUCCESS | Red background = FAILED")
        print(f"• Line DROPS → neurons died → that layer lost capacity")
        print(f"• If BOTH layers lose neurons → network can't learn!")

        # Print summary
        print(f"\n📊 Neuron Survival Summary:")
        dead_runs = [r for r in results if r['final_alive'] < total_neurons * 0.5]
        print(f"   Runs with >50% dead neurons: {len(dead_runs)}/{num_seeds}")

        if failed:
            avg_alive_failed = np.mean([r['final_alive'] for r in failed])
            print(f"   Avg alive neurons in FAILED runs: {avg_alive_failed:.1f}/{total_neurons}")

        if succeeded:
            avg_alive_success = np.mean([r['final_alive'] for r in succeeded])
            print(f"   Avg alive neurons in SUCCESS runs: {avg_alive_success:.1f}/{total_neurons}")

        # Only show best/worst if there's variance. The comparisons use the
        # integer counts rather than float equality on ``rate``.
        if successes == num_seeds:
            print(f"\n🎉 All seeds succeeded! Most neurons alive:")
            for r in sorted(results, key=lambda x: -x['final_alive'])[:3]:
                print(f"   Seed {r['seed']}: {r['final_alive']}/{total_neurons} alive")
        elif successes == 0:
            print(f"\n💀 All seeds failed! Least dead neurons:")
            for r in sorted(results, key=lambda x: -x['final_alive'])[:3]:
                print(f"   Seed {r['seed']}: {r['final_alive']}/{total_neurons} alive, {r['accuracy']*100:.1f}% acc")
        else:
            print(f"\n✅ Best seeds (succeeded):")
            for r in sorted(succeeded, key=lambda x: -x['final_alive'])[:3]:
                print(f"   Seed {r['seed']}: {r['final_alive']}/{total_neurons} alive")

            print(f"\n❌ Worst seeds (failed):")
            for r in sorted(failed, key=lambda x: x['accuracy'])[:3]:
                print(f"   Seed {r['seed']}: {r['accuracy']*100:.1f}% acc, {r['final_alive']}/{total_neurons} alive")

# Run the experiment whenever the button is pressed
test_button.on_click(on_test_click)

# Layout: heading, the three sliders, then the button and its output area
display(widgets.VBox([
    widgets.HTML("<h3>🧪 Test Architecture Success Rate</h3>"),
    widgets.HBox([layer1_test, layer2_test, seeds_test]),
    test_button,
    test_output
]))


## Why Do Some Architectures Work and Others Fail?

Great observation! You noticed:
- 4,4 fails | 4,3 works
- 3,6 fails | 3,7 works  
- 5,3 works

### It's Not Exactly Polynomials — It's **Piecewise Linear Regions**

With **ReLU activation**, neural networks don't create smooth polynomial curves. Instead, they create **piecewise linear** decision boundaries (like a polygon approximating a circle).

```
Smooth circle:  ○        ReLU network:  ⬡ (polygon with flat edges)
```

### How Many "Sides" Can a Network Create?

Each ReLU neuron creates a **hyperplane** (a line in 2D). The network combines these to carve up space:

| Layer 1 neurons | Creates | Effect |
|----------------|---------|--------|
| 1 | 1 line | Splits space in 2 |
| 2 | 2 lines | Up to 4 regions |
| 3 | 3 lines | Up to 7 regions |
| n | n lines | Up to (n² + n + 2)/2 regions |

**Maximum linear regions** for a ReLU network with input dimension $d$ and layers of width $n_1, n_2, ..., n_L$:

$$\text{Regions} \leq \prod_{i=1}^{L} \sum_{j=0}^{\min(n_i, d)} \binom{n_i}{j}$$

For practical purposes: **more neurons ≈ more "polygon sides" ≈ smoother curves**

### Why Some Fail Despite Having "Enough" Neurons?

1. **Random initialization** — Some starting weights land in bad spots
2. **Optimization landscape** — Gradient descent can get stuck
3. **First layer width matters most** — It does the initial "space transformation"

Your observations hint at this: it's not just total neurons, but **how they're distributed**.

### Try This Experiment

| Architecture | Total Params | Linear Regions (rough) | Works? |
|-------------|--------------|------------------------|--------|
| 2 → 4 → 4 → 2 | 42 | ~16-64 | ❓ |
| 2 → 8 → 2 → 2 | 48 | ~8-16 | ❓ |
| 2 → 4 → 8 → 2 | 70 | ~32-128 | ❓ |

The **depth vs width** tradeoff: wider first layers help, but deeper networks can compose more complex functions.


## Key Insight: 4,4 IS Capable ‚Äî But Gets Stuck!

You just discovered one of the classic problems in deep learning! 🎯

| Result | Meaning |
|--------|---------|
| 45% success | Architecture CAN learn it |
| 100% when works | Perfectly learns the circle |
| 50% when fails | Predicts ONE class for everything |
| Loss = 0.6931 | Exactly $-\ln(0.5)$ = stuck at "random guessing" |

### What's Happening?

```
Good initialization:          Bad initialization:
   ↘                             ↘
    → finds solution ✓            → stuck in flat region ✗
                                     (gradients ≈ 0)
```

This is the **"dead ReLU" problem** or **saddle point problem**:

1. Some initial weights cause ReLU neurons to output 0 for all inputs
2. If all neurons are "dead", gradients are 0 → no learning
3. The network predicts ~50% (random) and can't escape

### Why Does 8,8 Work More Reliably?

More neurons = **redundancy**:
- If 2 out of 8 neurons die, 6 still work
- If 2 out of 4 neurons die, only 2 left → might not be enough

### Solutions to This Problem:

1. **Use more neurons** (redundancy)
2. **Try different random seeds** (you found this!)
3. **Use LeakyReLU** instead of ReLU (no dead neurons)
4. **Better initialization** (He initialization, etc.)
5. **Use batch normalization**
