# Module 15: Neural Networks

**Goal:** Understand how neural networks learn and when to use them vs simpler models.

**Prerequisites:** Modules 7 (Optimization), 11 (Regularization)

**Expected Runtime:** ~30 minutes

**Outputs:**
- Built and trained a simple neural network
- Visualized decision boundaries
- Compared NN vs simpler models

---

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import make_circles, make_moons, make_classification
import warnings
# Silence all Python warnings so cell output stays focused on the results.
warnings.filterwarnings(action='ignore')

# Default figure size; individual cells may still pass their own figsize.
plt.rcParams.update({'figure.figsize': (12, 5)})
# Seed NumPy's global RNG so calls that draw from it are repeatable.
np.random.seed(seed=42)

## Part 1: The Need for Neural Networks

Linear models can't learn non-linear patterns. Let's see why.

In [None]:
# Generate XOR-like data: the label is 1 when exactly one of the two features
# is positive, so no single straight line separates the classes.
np.random.seed(42)
n = 200
X = np.random.randn(n, 2)
y = ((X[:, 0] > 0) != (X[:, 1] > 0)).astype(int)  # XOR pattern

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
scatter_ax, bar_ax = axes

# Left panel: the raw points, with dashed lines marking the quadrant borders.
for label in (0, 1):
    mask = y == label
    scatter_ax.scatter(X[mask, 0], X[mask, 1], label=f'Class {label}', alpha=0.7)
scatter_ax.set_xlabel('Feature 1')
scatter_ax.set_ylabel('Feature 2')
scatter_ax.set_title('XOR Pattern (Not Linearly Separable)')
scatter_ax.legend()
scatter_ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
scatter_ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)

# Fit both models on the full dataset. Accuracy below is measured on the same
# data the models were fit on: the point is whether each model can represent
# the pattern at all, not how well it generalizes.
lr = LogisticRegression()
lr.fit(X, y)
lr_acc = lr.score(X, y)

nn = MLPClassifier(hidden_layer_sizes=(8,), max_iter=1000, random_state=42)
nn.fit(X, y)
nn_acc = nn.score(X, y)

# Right panel: side-by-side accuracy of the two models.
bar_ax.bar(['Logistic\nRegression', 'Neural\nNetwork'], [lr_acc, nn_acc],
           color=['#ef4444', '#22c55e'])
bar_ax.set_ylabel('Accuracy')
bar_ax.set_title('Model Comparison on XOR')
bar_ax.set_ylim(0, 1)

plt.tight_layout()
plt.show()

print(f"Logistic Regression: {lr_acc:.1%}")
print(f"Neural Network: {nn_acc:.1%}")
print("\n💡 Linear models can't solve XOR. Neural networks can!")

## Part 2: Building a Neural Network

Using sklearn's MLPClassifier (Multi-Layer Perceptron).

In [None]:
# Generate more complex data (two interleaving half-moons) and hold out 20%.
X, y = make_moons(n_samples=500, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features (important for neural networks!). The scaler is fit on the
# training split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build neural network
model = MLPClassifier(
    hidden_layer_sizes=(16, 8),  # Two hidden layers: 16 and 8 neurons
    activation='relu',           # ReLU activation
    solver='adam',               # Adam optimizer
    max_iter=500,
    random_state=42,
    early_stopping=True,         # Stop when validation stops improving
    validation_fraction=0.1,
)

model.fit(X_train_scaled, y_train)

# Report the architecture from the data and the fitted model rather than
# hard-coding the counts, so the summary stays correct if the dataset changes.
n_features = X_train_scaled.shape[1]
n_classes = len(model.classes_)

print("=== Network Architecture ===")
print(f"Input layer: {n_features} features")
for layer_no, size in enumerate(model.hidden_layer_sizes, start=1):
    print(f"Hidden layer {layer_no}: {size} neurons")
print(f"Output layer: {n_classes} classes")

# Every weight matrix entry plus every bias term counts as one parameter.
n_weights = sum(weights.size for weights in model.coefs_)
n_biases = sum(biases.size for biases in model.intercepts_)
print(f"\nTotal parameters: {n_weights + n_biases}")
print(f"Training iterations: {model.n_iter_}")

In [None]:
# Evaluate: accuracy on the training split vs. the held-out test split.
train_acc = model.score(X_train_scaled, y_train)
test_acc = model.score(X_test_scaled, y_test)

print(f"Training Accuracy: {train_acc:.1%}")
print(f"Test Accuracy: {test_acc:.1%}")

# Training accuracy more than 0.1 (10 percentage points) above test accuracy
# is treated as a sign of overfitting.
if train_acc > test_acc + 0.1:
    print("⚠️ Gap suggests possible overfitting")
else:
    print("✓ Good generalization")

## Part 3: Visualizing Decision Boundaries

In [None]:
def plot_decision_boundary(model, X, y, title, ax, h=0.02, padding=0.5):
    """Draw a classifier's predicted regions with the data points on top.

    The model is evaluated on a regular grid spanning the range of the first
    two columns of ``X`` (extended by ``padding`` on every side); the predicted
    labels are drawn as filled contours and the samples labelled 0 and 1 are
    scattered over them.

    Args:
        model: Object with a ``predict`` method that accepts an array with two
            columns and returns one label per row.
        X: 2-D array; columns 0 and 1 are used as the plot coordinates.
        y: 1-D array of labels aligned with the rows of ``X``. Only samples
            labelled 0 or 1 are drawn.
        title: Title set on ``ax``.
        ax: Axes-like object to draw on.
        h: Grid step along both axes. Smaller values give a smoother boundary
            at the cost of more points to predict.
        padding: Margin added around the data range on every side.

    Raises:
        ValueError: If ``h`` is not positive.
    """
    if h <= 0:
        raise ValueError(f"h must be positive, got {h}")

    x_min, x_max = X[:, 0].min() - padding, X[:, 0].max() + padding
    y_min, y_max = X[:, 1].min() - padding, X[:, 1].max() + padding
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

    # Predict every grid point in a single call, then restore the grid shape
    # so the labels line up with xx / yy for contouring.
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    Z = model.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.4, cmap='RdYlBu')
    ax.scatter(X[y == 0, 0], X[y == 0, 1], c='#ef4444', label='Class 0', edgecolor='white')
    ax.scatter(X[y == 1, 0], X[y == 1, 1], c='#0ea5e9', label='Class 1', edgecolor='white')
    ax.set_title(title)
    ax.legend()

# Compare models: same training data, three classifiers of growing capacity.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

lr = LogisticRegression()
nn_shallow = MLPClassifier(hidden_layer_sizes=(4,), max_iter=500, random_state=42)
nn_deep = MLPClassifier(hidden_layer_sizes=(16, 8), max_iter=500, random_state=42)

# One (classifier, panel label) pair per subplot, left to right.
panels = [
    (lr, 'Logistic Regression'),
    (nn_shallow, 'NN - 1 Layer, 4 Neurons'),
    (nn_deep, 'NN - 2 Layers, 16-8 Neurons'),
]

for ax, (clf, panel_label) in zip(axes, panels):
    clf.fit(X_train_scaled, y_train)
    # The boundary is drawn over the training points; the percentage in the
    # title is accuracy on the held-out test split.
    test_score = clf.score(X_test_scaled, y_test)
    plot_decision_boundary(clf, X_train_scaled, y_train,
                           f'{panel_label} ({test_score:.1%})', ax)

plt.tight_layout()
plt.show()

print("💡 More layers and neurons → more complex decision boundaries")

## Part 4: Activation Functions

Different activations create different decision boundaries.

In [None]:
# Visualize activation functions on a common input range.
x = np.linspace(-5, 5, 100)

activations = {
    'ReLU': np.maximum(0, x),
    'Sigmoid': 1 / (1 + np.exp(-x)),
    'Tanh': np.tanh(x),
    'Identity (Linear)': x,
}

fig, axes = plt.subplots(1, 4, figsize=(14, 3))

# The loop variable is named `values`, not `y`: in a notebook the loop target
# stays bound after the loop ends, and `y` already holds the dataset labels.
for ax, (name, values) in zip(axes, activations.items()):
    ax.plot(x, values, color='#f97316', linewidth=2)
    ax.axhline(y=0, color='gray', linestyle='--', alpha=0.5)
    ax.axvline(x=0, color='gray', linestyle='--', alpha=0.5)
    ax.set_title(name)
    ax.set_xlabel('Input')
    ax.set_ylabel('Output')
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("💡 ReLU is most common for hidden layers (fast, avoids vanishing gradients)")
print("   Sigmoid/Softmax used for output layer (produces probabilities)")

## Part 5: Overfitting Demonstration

In [None]:
# Generate small dataset (prone to overfitting)
X_small, y_small = make_moons(n_samples=100, noise=0.3, random_state=42)
# random_state pins the split so the table below is the same on every run,
# regardless of how much of NumPy's global RNG earlier cells have consumed.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_small, y_small, test_size=0.3, random_state=42
)

scaler_s = StandardScaler()
X_train_s_scaled = scaler_s.fit_transform(X_train_s)
X_test_s_scaled = scaler_s.transform(X_test_s)

# Compare different architectures: (hidden_layer_sizes, display name)
architectures = [
    ((4,), 'Small (4)'),
    ((16, 8), 'Medium (16, 8)'),
    ((64, 32, 16), 'Large (64, 32, 16)'),
]

results = []
for arch, name in architectures:
    # Cell-specific names (clf, acc_train, acc_test) keep this loop from
    # overwriting the notebook-level `model`, `train_acc` and `test_acc`.
    clf = MLPClassifier(hidden_layer_sizes=arch, max_iter=1000, random_state=42)
    clf.fit(X_train_s_scaled, y_train_s)

    acc_train = clf.score(X_train_s_scaled, y_train_s)
    acc_test = clf.score(X_test_s_scaled, y_test_s)

    results.append({
        'Architecture': name,
        'Train Acc': acc_train,
        'Test Acc': acc_test,
        'Gap': acc_train - acc_test,
    })

results_df = pd.DataFrame(results)
print(f"=== Overfitting Analysis (Small Dataset, {len(X_train_s)} training samples) ===")
print(results_df.to_string(index=False))

print("\n💡 Larger networks can overfit on small data. Gap between train/test indicates overfitting.")

## Part 6: NN vs Gradient Boosting on Tabular Data

In [None]:
# Generate tabular classification data
X_tab, y_tab = make_classification(
    n_samples=1000, n_features=20, n_informative=10,
    n_redundant=5, n_clusters_per_class=2, random_state=42
)

# random_state pins the split so the reported accuracies are the same on every
# run, regardless of the state of NumPy's global RNG.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(
    X_tab, y_tab, test_size=0.2, random_state=42
)

scaler_t = StandardScaler()
X_train_t_scaled = scaler_t.fit_transform(X_train_t)
X_test_t_scaled = scaler_t.transform(X_test_t)

# Compare models
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'Neural Network': MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=500, random_state=42),
}

n_samples_tab, n_features_tab = X_tab.shape
print(f"=== Tabular Data Comparison ({n_samples_tab} samples, {n_features_tab} features) ===")

# One {'Model', 'Test Accuracy'} record per model, kept for later inspection.
comparison = []

for name, clf in models.items():
    # GBM doesn't need scaling, so it is fit on the raw features; the other
    # two models are fit on the standardized features.
    if name == 'Gradient Boosting':
        fit_X, eval_X = X_train_t, X_test_t
    else:
        fit_X, eval_X = X_train_t_scaled, X_test_t_scaled

    clf.fit(fit_X, y_train_t)
    acc = clf.score(eval_X, y_test_t)

    comparison.append({'Model': name, 'Test Accuracy': acc})
    print(f"{name}: {acc:.1%}")

print("\n💡 For tabular data, gradient boosting often matches or beats neural networks!")
print("   NNs shine on unstructured data (images, text) with large datasets.")

## Part 7: TODO - Hyperparameter Tuning

In [None]:
# TODO: Experiment with different hyperparameters.
# Try different combinations and record the results.
#
# Search space defined in param_grid below:
# - hidden_layer_sizes: (8,), (16, 8), (32, 16)
#   (add deeper options such as (32, 16, 8) to widen the search)
# - activation: 'relu', 'tanh'
# - alpha: 0.0001, 0.001, 0.01 (L2 regularization)

from sklearn.model_selection import GridSearchCV

# Each key is an MLPClassifier constructor argument; each value lists the
# candidates to try for it.
param_grid = dict(
    hidden_layer_sizes=[(8,), (16, 8), (32, 16)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001, 0.01],
)

# Uncomment to run grid search (takes a few minutes)
# grid = GridSearchCV(MLPClassifier(max_iter=500, random_state=42), 
#                     param_grid, cv=5, scoring='accuracy')
# grid.fit(X_train_t_scaled, y_train_t)
# print(f"Best params: {grid.best_params_}")
# print(f"Best CV score: {grid.best_score_:.3f}")

print("TODO: Uncomment grid search code above to find best hyperparameters")

## Part 8: TODO - Stakeholder Summary

Explain to a product manager:
1. When to use neural networks vs simpler models
2. What the tradeoffs are (interpretability, data requirements, compute)
3. How to know if a neural network is overfitting

### Your Summary:

*Write your explanation here...*

---

## Key Takeaways

1. **Neural networks** learn features automatically through layers
2. **Activation functions** add non-linearity (ReLU is the default)
3. **More layers/neurons** → more complex patterns, but risk overfitting
4. **Always scale features** before training neural networks
5. **For tabular data**, gradient boosting often works as well or better
6. **NNs shine** on unstructured data with large datasets

### Next Steps
- Explore the interactive playground
- Complete the quiz
- Move to Module 16: Transformers