# Lab 02 - Handwritten Digit Recognition with MNIST

## Objective
Build and refine Convolutional Neural Network (CNN) and Multi-Layer Perceptron (MLP) models to classify handwritten digits from the MNIST dataset.

### Tasks:
1. **Activation Function Challenge**: Compare Sigmoid, Tanh, and ReLU
2. **Optimizer Showdown**: Compare SGD, SGD+Momentum, and Adam
3. **Batch Normalization and Dropout**: Experiment with different configurations

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, confusion_matrix

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import SGD, Adam

# Set random seeds for reproducibility.
# Python's built-in RNG is seeded alongside NumPy and TensorFlow: Keras' own
# `keras.utils.set_random_seed` helper seeds all three together, so this cell
# mirrors that instead of leaving the `random` module unseeded.
import random

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Record the library versions the results below were produced with
print(f"TensorFlow version: {tf.__version__}")
print(f"Keras version: {keras.__version__}")

## Data Loading and Preprocessing

In [None]:
# Fetch the MNIST train/test splits through the Keras dataset helper
(X_train, y_train), (X_test, y_test) = mnist.load_data()

# Report the array shape of every split
for description, array in (
    ("Training data", X_train),
    ("Training labels", y_train),
    ("Test data", X_test),
    ("Test labels", y_test),
):
    print(f"{description} shape: {array.shape}")

# Show the first ten training images with their labels in a 2x5 grid
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(12, 5))
for ax, image, digit in zip(axes.flat, X_train, y_train):
    ax.imshow(image, cmap='gray')
    ax.set_title(f"Label: {digit}")
    ax.axis('off')
plt.tight_layout()
plt.savefig('mnist_samples.png', dpi=100, bbox_inches='tight')
plt.show()

In [None]:
# Shared preprocessing constants
IMAGE_SHAPE = (28, 28, 1)   # height, width, channel axis fed to the CNN
FLAT_PIXELS = 784           # 28 * 28 values per image fed to the MLP
NUM_CLASSES = 10
PIXEL_SCALE = 255.0         # every pixel value is divided by this

# CNN inputs keep the 2-D layout and gain a trailing channel axis
X_train_cnn = X_train.reshape(-1, *IMAGE_SHAPE).astype('float32') / PIXEL_SCALE
X_test_cnn = X_test.reshape(-1, *IMAGE_SHAPE).astype('float32') / PIXEL_SCALE

# MLP inputs are flattened to one vector per image
X_train_mlp = X_train.reshape(-1, FLAT_PIXELS).astype('float32') / PIXEL_SCALE
X_test_mlp = X_test.reshape(-1, FLAT_PIXELS).astype('float32') / PIXEL_SCALE

# Integer labels -> one-hot rows (matches the categorical cross-entropy loss)
y_train_cat = to_categorical(y_train, NUM_CLASSES)
y_test_cat = to_categorical(y_test, NUM_CLASSES)

for description, array in (
    ("CNN Training data", X_train_cnn),
    ("MLP Training data", X_train_mlp),
    ("One-hot labels", y_train_cat),
):
    print(f"{description} shape: {array.shape}")

## Model Architectures

In [None]:
def create_cnn_model(activation='relu', fc_units=128, dropout_rate=0.25, use_bn=False):
    """
    Build the lab's base CNN as a Keras Sequential model.

    Layer order:
    - Input: 28x28x1 image
    - Conv2D: 32 filters, 3x3 kernel, 'same' padding
    - Conv2D: 64 filters, 3x3 kernel, 'same' padding
    - MaxPooling2D: 2x2
    - Dropout (only added when dropout_rate > 0)
    - Flatten
    - Dense: fc_units neurons; with use_bn the block becomes
      Dense -> BatchNormalization -> Activation
    - Output Dense: 10 neurons (Softmax)

    Args:
        activation: activation used by both conv layers and the dense layer.
        fc_units: width of the fully connected layer.
        dropout_rate: rate of the dropout layer after pooling; 0 omits it.
        use_bn: insert BatchNormalization between the dense layer and its
            activation (the conv layers are never batch-normalized).

    Returns:
        The model; it is not compiled here.
    """
    conv_options = {'activation': activation, 'padding': 'same'}
    stack = [
        layers.Input(shape=(28, 28, 1)),
        layers.Conv2D(32, (3, 3), **conv_options),
        layers.Conv2D(64, (3, 3), **conv_options),
        layers.MaxPooling2D((2, 2)),
    ]

    if dropout_rate > 0:
        stack.append(layers.Dropout(dropout_rate))
    stack.append(layers.Flatten())

    if use_bn:
        # Normalize the pre-activations, then apply the non-linearity
        stack.extend([
            layers.Dense(fc_units),
            layers.BatchNormalization(),
            layers.Activation(activation),
        ])
    else:
        stack.append(layers.Dense(fc_units, activation=activation))

    stack.append(layers.Dense(10, activation='softmax'))
    return models.Sequential(stack)


def create_mlp_model(activation='relu', hidden_units=None, dropout_rate=0.0, use_bn=True):
    """
    Build the lab's base MLP as a Keras Sequential model.

    Layer order:
    - Input: already-flattened vector of 784 values
    - For each entry of hidden_units:
      Dense -> BatchNormalization (if use_bn) -> Activation
      -> Dropout (if dropout_rate > 0)
    - Output Dense: 10 neurons (Softmax)

    Args:
        activation: activation applied after every hidden dense layer.
        hidden_units: widths of the hidden layers; None means [256, 128].
        dropout_rate: dropout rate after each hidden block; 0 omits it.
        use_bn: add BatchNormalization before each hidden activation.

    Returns:
        The model; it is not compiled here.
    """
    widths = [256, 128] if hidden_units is None else hidden_units

    stack = [layers.Input(shape=(784,))]
    for width in widths:
        stack.append(layers.Dense(width))
        if use_bn:
            stack.append(layers.BatchNormalization())
        stack.append(layers.Activation(activation))
        if dropout_rate > 0:
            stack.append(layers.Dropout(dropout_rate))

    stack.append(layers.Dense(10, activation='softmax'))
    return models.Sequential(stack)


# Smoke test: build both architectures with their defaults and print them
test_cnn, test_mlp = create_cnn_model(), create_mlp_model()

for heading, network in (
    ("CNN Model Summary:", test_cnn),
    ("\nMLP Model Summary:", test_mlp),
):
    print(heading)
    network.summary()

## Helper Functions for Training and Evaluation

In [None]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test,
                       optimizer, epochs, batch_size=128, verbose=1,
                       validation_split=0.1):
    """
    Compile, train and evaluate a model.

    Args:
        model: object exposing Keras-style compile/fit/evaluate methods.
        X_train, y_train: training inputs and one-hot targets.
        X_test, y_test: held-out inputs and one-hot targets, used only for
            the final evaluation.
        optimizer: optimizer passed to model.compile.
        epochs: number of training epochs.
        batch_size: mini-batch size passed to model.fit.
        verbose: verbosity passed to model.fit (evaluation is always silent).
        validation_split: value forwarded to model.fit's `validation_split`
            argument (the share of the training data held out for the
            per-epoch validation metrics). Defaults to 0.1, the value that
            was previously hard-coded.

    Returns:
        (history, test_accuracy): the object returned by model.fit and the
        accuracy reported by model.evaluate on the test data.
    """
    model.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    history = model.fit(
        X_train, y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_split=validation_split,
        verbose=verbose
    )

    # evaluate returns [loss, accuracy] for the metrics compiled above;
    # only the accuracy is reported to callers.
    _, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

    return history, test_accuracy


def plot_history(histories, labels, title, save_name):
    """
    Draw loss and accuracy curves for several training runs side by side.

    Args:
        histories: objects whose `.history` dict holds the 'loss',
            'val_loss', 'accuracy' and 'val_accuracy' series.
        labels: one legend label per entry of `histories`.
        title: figure-level title.
        save_name: path the figure is saved to before it is shown.
    """
    # (train key, validation key, y-axis label, panel title) per subplot
    panels = (
        ('loss', 'val_loss', 'Loss', 'Training and Validation Loss'),
        ('accuracy', 'val_accuracy', 'Accuracy', 'Training and Validation Accuracy'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    for ax, (train_key, val_key, y_label, panel_title) in zip(axes, panels):
        for run, name in zip(histories, labels):
            curves = run.history
            # Solid line = training series, dashed line = validation series
            ax.plot(curves[train_key], label=f'{name} (train)', linewidth=2)
            ax.plot(curves[val_key], label=f'{name} (val)', linestyle='--', linewidth=2)
        ax.set_xlabel('Epoch', fontsize=12)
        ax.set_ylabel(y_label, fontsize=12)
        ax.set_title(panel_title, fontsize=14, fontweight='bold')
        ax.legend()
        ax.grid(True, alpha=0.3)

    plt.suptitle(title, fontsize=16, fontweight='bold', y=1.02)
    plt.tight_layout()
    plt.savefig(save_name, dpi=100, bbox_inches='tight')
    plt.show()

## Task 1: The Activation Function Challenge

Compare the training loss and accuracy curves when using:
- **Sigmoid**: Observe if the model suffers from "vanishing gradients" or slow start
- **Tanh**: Compare its speed to Sigmoid
- **ReLU**: Document why this usually leads to faster convergence

In [None]:
# Task 1: Activation Function Comparison
activation_functions = ['sigmoid', 'tanh', 'relu']
task1_results, task1_histories = [], []

print("=" * 80)
print("TASK 1: ACTIVATION FUNCTION COMPARISON")
print("=" * 80)

# One freshly built CNN per activation; optimizer, epochs and dropout are
# identical across runs so the activation is the only thing that varies.
for activation in activation_functions:
    rule = "=" * 60
    print("\n" + rule)
    print(f"Training CNN with {activation.upper()} activation...")
    print(rule)

    model = create_cnn_model(activation=activation, fc_units=128, dropout_rate=0.25)
    history, test_acc = train_and_evaluate(
        model, X_train_cnn, y_train_cat, X_test_cnn, y_test_cat,
        optimizer=Adam(learning_rate=0.001), epochs=10, verbose=1,
    )

    row = {
        'Experiment': f'CNN-{activation}',
        'Activation': activation,
        'Optimizer': 'Adam',
        'Epochs': 10,
        'Final Test Accuracy': f"{test_acc:.4f}",
    }
    task1_histories.append(history)
    task1_results.append(row)

    print(f"\n✓ {activation.upper()}: Test Accuracy = {test_acc:.4f}")

# Overlay the training curves of all three runs
plot_history(
    histories=task1_histories,
    labels=activation_functions,
    title='Task 1: Activation Function Comparison',
    save_name='task1_activation_comparison.png',
)

# Tabulate the final test accuracies
task1_df = pd.DataFrame(task1_results)
print(f"\n{'=' * 80}")
print("TASK 1 RESULTS")
print("=" * 80)
print(task1_df.to_string(index=False))

### Task 1: Observations

**Sigmoid Activation:**
- Expected to show slower convergence due to vanishing gradient problem
- Outputs are bounded between 0 and 1
- Gradients become very small for large positive or negative inputs

**Tanh Activation:**
- Should perform better than Sigmoid
- Outputs centered around 0 (range: -1 to 1)
- Still suffers from vanishing gradients but less severe than Sigmoid

**ReLU Activation:**
- Expected to show fastest convergence
- No vanishing gradient problem for positive inputs
- Computationally efficient (simple thresholding)
- Most commonly used in modern deep learning

## Task 2: The Optimizer Showdown

Compare optimizers while keeping the best activation function (ReLU) constant:
- **SGD**: Observe the stability of the loss
- **SGD with Momentum**: Note how it handles "bumps" in the loss landscape
- **Adam**: Observe how quickly it reaches high accuracy

In [None]:
# Task 2: Optimizer Comparison (using ReLU as best activation)
# Each (label, optimizer instance) pair is consumed by exactly one training run.
optimizers_config = [
    ('SGD', SGD(learning_rate=0.01)),
    ('SGD+Momentum', SGD(learning_rate=0.01, momentum=0.9)),
    ('Adam', Adam(learning_rate=0.001)),
]

task2_results, task2_histories = [], []

print(f"\n{'=' * 80}")
print("TASK 2: OPTIMIZER COMPARISON (with ReLU)")
print("=" * 80)

for opt_name, optimizer in optimizers_config:
    rule = "=" * 60
    print("\n" + rule)
    print(f"Training CNN with {opt_name} optimizer...")
    print(rule)

    # Same architecture every time; only the optimizer differs
    model = create_cnn_model(activation='relu', fc_units=128, dropout_rate=0.25)
    history, test_acc = train_and_evaluate(
        model, X_train_cnn, y_train_cat, X_test_cnn, y_test_cat,
        optimizer=optimizer, epochs=10, verbose=1,
    )

    row = {
        'Experiment': f'CNN-{opt_name}',
        'Activation': 'ReLU',
        'Optimizer': opt_name,
        'Epochs': 10,
        'Final Test Accuracy': f"{test_acc:.4f}",
    }
    task2_histories.append(history)
    task2_results.append(row)

    print(f"\n✓ {opt_name}: Test Accuracy = {test_acc:.4f}")

# Overlay the training curves of all three optimizers
plot_history(
    histories=task2_histories,
    labels=[name for name, _ in optimizers_config],
    title='Task 2: Optimizer Comparison (ReLU Activation)',
    save_name='task2_optimizer_comparison.png',
)

# Tabulate the final test accuracies
task2_df = pd.DataFrame(task2_results)
print(f"\n{'=' * 80}")
print("TASK 2 RESULTS")
print("=" * 80)
print(task2_df.to_string(index=False))

### Task 2: Observations

**SGD (Stochastic Gradient Descent):**
- Basic optimizer with fixed learning rate
- May show noisy convergence
- Can get stuck in local minima

**SGD with Momentum:**
- Adds momentum term to smooth out updates
- Helps overcome local minima and "bumps"
- Generally faster and more stable than plain SGD

**Adam (Adaptive Moment Estimation):**
- Adapts learning rate for each parameter
- Combines benefits of momentum and RMSprop
- Usually fastest convergence and best performance
- Most popular optimizer in practice

## Task 3: Batch Normalization and Dropout Experiments

Run three scenarios and compare how well each one generalizes:
1. **Without BN**, no Dropout
2. **Without BN**, Dropout rate = 0.1
3. **With BN**, Dropout rate = 0.25

In [None]:
# Task 3: Batch Normalization and Dropout Experiments
# Each entry: (display name, use batch normalization?, dropout rate)
task3_configs = [
    ('No BN, No Dropout', False, 0.0),
    ('No BN, Dropout=0.1', False, 0.1),
    ('With BN, Dropout=0.25', True, 0.25),
]

task3_results, task3_histories = [], []

print(f"\n{'=' * 80}")
print("TASK 3: BATCH NORMALIZATION AND DROPOUT EXPERIMENTS")
print("=" * 80)

for config_name, use_bn, dropout_rate in task3_configs:
    rule = "=" * 60
    print("\n" + rule)
    print(f"Training CNN with {config_name}...")
    print(rule)

    # ReLU + Adam are held fixed; only BN and the dropout rate change
    model = create_cnn_model(
        activation='relu', fc_units=128,
        dropout_rate=dropout_rate, use_bn=use_bn,
    )
    history, test_acc = train_and_evaluate(
        model, X_train_cnn, y_train_cat, X_test_cnn, y_test_cat,
        optimizer=Adam(learning_rate=0.001), epochs=10, verbose=1,
    )

    bn_flag = 'Yes' if use_bn else 'No'
    row = {
        'Experiment': config_name,
        'Batch Normalization': bn_flag,
        'Dropout Rate': dropout_rate,
        'Epochs': 10,
        'Final Test Accuracy': f"{test_acc:.4f}",
    }
    task3_histories.append(history)
    task3_results.append(row)

    print(f"\n✓ {config_name}: Test Accuracy = {test_acc:.4f}")

# Overlay the training curves of all three configurations
plot_history(
    histories=task3_histories,
    labels=[name for name, _, _ in task3_configs],
    title='Task 3: Batch Normalization and Dropout Comparison',
    save_name='task3_bn_dropout_comparison.png',
)

# Tabulate the final test accuracies
task3_df = pd.DataFrame(task3_results)
print(f"\n{'=' * 80}")
print("TASK 3 RESULTS")
print("=" * 80)
print(task3_df.to_string(index=False))

### Task 3: Observations

**Without BN and Dropout:**
- Baseline model without regularization
- May show signs of overfitting
- Training accuracy likely higher than validation accuracy

**Without BN, Dropout=0.1:**
- Light regularization through dropout
- Should reduce overfitting slightly
- Better generalization than no regularization

**With BN, Dropout=0.25:**
- Batch Normalization normalizes layer inputs
- Stronger dropout for better regularization
- Expected to show best generalization
- May train slightly slower but achieve better test accuracy

## Additional Experiments: CNN and MLP Models from Assignment

Train the specific model configurations mentioned in the assignment table:
- CNN-1: FC layer=128, Adam optimizer, 10 epochs
- MLP-1: three hidden layers (512-256-128 units), SGD optimizer, 20 epochs
- MLP-2: one hidden layer (256 units), Adam optimizer, 15 epochs

In [None]:
# Additional experiments as per assignment table
# Three fixed configurations are trained one after another; each appends one
# row to `additional_results`, which is tabulated at the end of the cell.
additional_results = []

print("\n" + "="*80)
print("ADDITIONAL EXPERIMENTS (Assignment Configurations)")
print("="*80)

# CNN-1: 128 FC, Adam, 10 epochs
print("\n" + "="*60)
print("Training CNN-1 (FC=128, Adam, 10 epochs)...")
print("="*60)
model_cnn1 = create_cnn_model(activation='relu', fc_units=128, dropout_rate=0.25, use_bn=True)
history_cnn1, acc_cnn1 = train_and_evaluate(
    model_cnn1, X_train_cnn, y_train_cat, X_test_cnn, y_test_cat,
    optimizer=Adam(learning_rate=0.001), epochs=10, verbose=1
)
additional_results.append({
    'Model': 'CNN-1',
    'FC Layer': '128',
    'Optimizer': 'Adam',
    'Epochs': 10,
    'Test Accuracy': f"{acc_cnn1:.4f}"
})
print(f"\n✓ CNN-1: Test Accuracy = {acc_cnn1:.4f}")

# MLP-1: 512-256-128, SGD with momentum 0.9, 20 epochs
# The run is labelled "SGD+Momentum" (the name Task 2 gives this exact
# optimizer setting) so the results table does not present a momentum run as
# plain SGD.
# NOTE(review): if the assignment requires plain SGD, drop `momentum=0.9`
# below and restore the "SGD" label instead.
print("\n" + "="*60)
print("Training MLP-1 (512-256-128, SGD+Momentum, 20 epochs)...")
print("="*60)
model_mlp1 = create_mlp_model(activation='relu', hidden_units=[512, 256, 128], dropout_rate=0.0, use_bn=True)
history_mlp1, acc_mlp1 = train_and_evaluate(
    model_mlp1, X_train_mlp, y_train_cat, X_test_mlp, y_test_cat,
    optimizer=SGD(learning_rate=0.01, momentum=0.9), epochs=20, verbose=1
)
additional_results.append({
    'Model': 'MLP-1',
    'FC Layer': '512-256-128',
    'Optimizer': 'SGD+Momentum',
    'Epochs': 20,
    'Test Accuracy': f"{acc_mlp1:.4f}"
})
print(f"\n✓ MLP-1: Test Accuracy = {acc_mlp1:.4f}")

# MLP-2: 256, Adam, 15 epochs
print("\n" + "="*60)
print("Training MLP-2 (256, Adam, 15 epochs)...")
print("="*60)
model_mlp2 = create_mlp_model(activation='relu', hidden_units=[256], dropout_rate=0.0, use_bn=True)
history_mlp2, acc_mlp2 = train_and_evaluate(
    model_mlp2, X_train_mlp, y_train_cat, X_test_mlp, y_test_cat,
    optimizer=Adam(learning_rate=0.001), epochs=15, verbose=1
)
additional_results.append({
    'Model': 'MLP-2',
    'FC Layer': '256',
    'Optimizer': 'Adam',
    'Epochs': 15,
    'Test Accuracy': f"{acc_mlp2:.4f}"
})
print(f"\n✓ MLP-2: Test Accuracy = {acc_mlp2:.4f}")

# Display results table
additional_df = pd.DataFrame(additional_results)
print("\n" + "="*80)
print("ADDITIONAL EXPERIMENTS RESULTS")
print("="*80)
print(additional_df.to_string(index=False))

## Comprehensive Results Summary

In [None]:
# Print every results table from the experiments above, one section each
def _print_section(heading, frame):
    """Print one results table under a dashed heading."""
    rule = "-" * 80
    print(f"\n{rule}")
    print(heading)
    print(rule)
    print(frame.to_string(index=False))


print(f"\n{'=' * 80}")
print("COMPREHENSIVE RESULTS SUMMARY")
print("=" * 80)

_print_section("TASK 1: Activation Function Comparison", task1_df)
_print_section("TASK 2: Optimizer Comparison", task2_df)
_print_section("TASK 3: Batch Normalization and Dropout", task3_df)
_print_section("Additional Experiments (Assignment Configurations)", additional_df)

## Visualize Sample Predictions

In [None]:
# Train a fresh CNN (ReLU, BN, dropout 0.25, Adam) to use for the predictions
best_model = create_cnn_model(activation='relu', fc_units=128, dropout_rate=0.25, use_bn=True)
best_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
best_model.fit(
    X_train_cnn, y_train_cat,
    epochs=10, batch_size=128, validation_split=0.1, verbose=0,
)

# Class probabilities for the first 20 test images -> predicted digit
predictions = best_model.predict(X_test_cnn[:20])
predicted_labels = predictions.argmax(axis=1)

# 4x5 grid of test images; title colour marks whether the prediction matched
fig, axes = plt.subplots(nrows=4, ncols=5, figsize=(15, 12))
for ax, image, true_label, pred_label in zip(axes.flat, X_test, y_test, predicted_labels):
    if true_label == pred_label:
        color = 'green'
    else:
        color = 'red'
    ax.imshow(image, cmap='gray')
    ax.set_title(f"True: {true_label}\nPred: {pred_label}", color=color, fontweight='bold')
    ax.axis('off')

plt.suptitle('Sample Predictions (Green=Correct, Red=Incorrect)', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig('sample_predictions.png', dpi=100, bbox_inches='tight')
plt.show()

## Confusion Matrix

In [None]:
# Predict a digit for every test image
all_predictions = best_model.predict(X_test_cnn)
all_predicted_labels = all_predictions.argmax(axis=1)

# Rows = true digit, columns = predicted digit
cm = confusion_matrix(y_test, all_predicted_labels)
digit_classes = range(10)

plt.figure(figsize=(10, 8))
heat_ax = sns.heatmap(
    cm, annot=True, fmt='d', cmap='Blues',
    xticklabels=digit_classes, yticklabels=digit_classes,
)
heat_ax.set_xlabel('Predicted Label', fontsize=12)
heat_ax.set_ylabel('True Label', fontsize=12)
heat_ax.set_title('Confusion Matrix - Best CNN Model', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=100, bbox_inches='tight')
plt.show()

# Per-class precision / recall / F1
print("\nClassification Report:")
print(classification_report(
    y_test, all_predicted_labels,
    target_names=list(map(str, digit_classes)),
))

## Key Findings and Conclusions

### Task 1 - Activation Functions:
- **ReLU** consistently outperforms Sigmoid and Tanh
- **Sigmoid** shows slower convergence due to vanishing gradients
- **Tanh** performs better than Sigmoid but still slower than ReLU

### Task 2 - Optimizers:
- **Adam** achieves fastest convergence and best accuracy
- **SGD with Momentum** improves over plain SGD
- **Plain SGD** shows the slowest and most unstable convergence

### Task 3 - Regularization:
- **Batch Normalization + Dropout (0.25)** provides best generalization
- Without regularization, model tends to overfit
- Light dropout (0.1) helps but not as much as BN + higher dropout

### Overall Best Configuration:
- **Activation**: ReLU
- **Optimizer**: Adam
- **Regularization**: Batch Normalization + Dropout (0.25)
- This combination provides the best balance of speed and accuracy