# Experiment Notebook: CNN Architecture Exploration

This notebook follows a hypothesis-driven approach to explore CNN architectures with:
- Dropout layers
- Normalization techniques
- Various architectural patterns
- Hyperparameter interactions

We'll pause at each section to form hypotheses before running experiments.

## Part 1: Setup & Baseline Model

In [62]:
# Core imports
from pathlib import Path
import torch
import torch.nn as nn
import torch.optim as optim
from loguru import logger
import warnings
warnings.simplefilter("ignore", UserWarning)

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# MLflow for experiment tracking
import mlflow
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

# Dataset and training utilities
from mads_datasets import DatasetFactoryProvider, DatasetType
from mltrainer.preprocessors import BasePreprocessor
from mltrainer import metrics, Trainer, TrainerSettings, ReportTypes
from mltrainer.imagemodels import CNNConfig, CNNblocks
from torchinfo import summary
from datetime import datetime

In [63]:
# Device setup
# Pick the best available accelerator: Apple MPS first, then CUDA, then CPU.
# Every branch yields a torch.device so downstream code always sees the same
# type (previously MPS produced a torch.device while CUDA/CPU produced a str).
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
    print("Using MPS")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
    print("Using CUDA")
else:
    device = torch.device("cpu")
    print("Using CPU")

# str(torch.device(...)) renders as "mps" / "cuda:0" / "cpu", so the printed
# text is the same as before.
print(f"Device: {device}")

Using MPS
Device: mps


In [64]:
# MLflow setup - using the exact same pattern as 03_mlflow.py
# Runs are stored in a local SQLite file and grouped under one experiment name.
# NOTE(review): the saved output of this cell ends with an alembic
# "Can't locate revision" CommandError - presumably the existing mlflow.db was
# written by a different MLflow version; confirm before relying on this cell.
experiment_path = "cnn_architecture_exploration"
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment(experiment_path)
print(
    f"MLflow experiment: {experiment_path}",
    "MLflow UI available at: http://127.0.0.1:5001",
    sep="\n",
)

2025/09/22 01:51:58 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/09/22 01:51:58 INFO mlflow.store.db.utils: Updating database tables


INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.


CommandError: Can't locate revision identified by '71994744cf8e'

In [46]:
# Load FASHION dataset
fashionfactory = DatasetFactoryProvider.create_factory(DatasetType.FASHION)
batchsize = 64
preprocessor = BasePreprocessor()

# The factory returns a mapping of split name -> datastreamer.
streamers = fashionfactory.create_datastreamer(
    batchsize=batchsize,
    preprocessor=preprocessor,
)
train, valid = streamers["train"], streamers["valid"]
trainstreamer, validstreamer = train.stream(), valid.stream()

# Pull a single batch to sanity-check shapes and the label set.
x, y = next(iter(trainstreamer))
print(f"Input shape: {x.shape}")
print(f"Label shape: {y.shape}")
print(f"Number of classes: {y.unique().shape[0]}")

[32m2025-09-22 01:31:53.880[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m121[0m - [1mFolder already exists at /Users/DINGZEEFS/.cache/mads_datasets/fashionmnist[0m
[32m2025-09-22 01:31:53.884[0m | [1mINFO    [0m | [36mmads_datasets.base[0m:[36mdownload_data[0m:[36m124[0m - [1mFile already exists at /Users/DINGZEEFS/.cache/mads_datasets/fashionmnist/fashionmnist.pt[0m


Input shape: torch.Size([64, 1, 28, 28])
Label shape: torch.Size([64])
Number of classes: 10


### Baseline CNN Model

Let's start with our baseline model from the previous notebook.

In [47]:
# Baseline model configuration
# 28x28 single-channel inputs, 10 output classes; the remaining values are the
# architecture knobs this notebook varies in later sections.
baseline_config = CNNConfig(
    num_classes=10,
    num_layers=4,
    hidden=32,  # number of filters
    kernel_size=3,
    maxpool=3,
    input_channels=1,
    matrixshape=(28, 28),
    batchsize=batchsize,
)

baseline_model = CNNblocks(baseline_config)
# Kept as the last expression so the notebook renders the layer table.
summary(baseline_model, input_size=(batchsize, 1, 28, 28))

Calculated matrix size: 9
Caluclated flatten size: 288


Layer (type:depth-idx)                   Output Shape              Param #
CNNblocks                                [64, 10]                  --
├─ModuleList: 1-1                        --                        --
│    └─ConvBlock: 2-1                    [64, 32, 28, 28]          --
│    │    └─Sequential: 3-1              [64, 32, 28, 28]          9,568
│    └─ConvBlock: 2-2                    [64, 32, 28, 28]          --
│    │    └─Sequential: 3-2              [64, 32, 28, 28]          18,496
│    └─ReLU: 2-3                         [64, 32, 28, 28]          --
│    └─MaxPool2d: 2-4                    [64, 32, 9, 9]            --
│    └─ConvBlock: 2-5                    [64, 32, 9, 9]            --
│    │    └─Sequential: 3-3              [64, 32, 9, 9]            18,496
│    └─ReLU: 2-6                         [64, 32, 9, 9]            --
│    └─ConvBlock: 2-7                    [64, 32, 9, 9]            --
│    │    └─Sequential: 3-4              [64, 32, 9, 9]            18,496


In [48]:
# Training settings for quick experiments
def create_trainer_settings(logdir="models", epochs=5, train_steps=100, valid_steps=50):
    """Build the TrainerSettings used for the short experiments in this notebook.

    Args:
        logdir: Directory for trainer logs; resolved to an absolute path.
        epochs: Number of epochs to train.
        train_steps: Value forwarded as ``train_steps``.
        valid_steps: Value forwarded as ``valid_steps``.

    Returns:
        A TrainerSettings tracking accuracy and reporting to MLflow and TOML.
    """
    tracked_metrics = [metrics.Accuracy()]
    absolute_logdir = Path(logdir).resolve()
    report_targets = [ReportTypes.MLFLOW, ReportTypes.TOML]
    return TrainerSettings(
        epochs=epochs,
        metrics=tracked_metrics,
        logdir=absolute_logdir,
        train_steps=train_steps,
        valid_steps=valid_steps,
        reporttypes=report_targets,
    )

In [49]:
def train_model(model, settings, run_name="baseline", log_params=None):
    """Train ``model`` inside a named MLflow run and return the trainer.

    NOTE: a later cell in this notebook defines ``train_model`` again; on a
    top-to-bottom run that later definition replaces this one.

    Args:
        model: The torch module to train.
        settings: TrainerSettings passed straight to the Trainer.
        run_name: MLflow run name; the text before the first ``_`` is also
            stored as the ``experiment_phase`` tag.
        log_params: Optional dict of hyperparameters to log with the run.

    Returns:
        The Trainer instance after ``loop()`` has finished.
    """
    with mlflow.start_run(run_name=run_name):
        # Log parameters
        if log_params:
            mlflow.log_params(log_params)

        # Set tags
        mlflow.set_tag("model_type", model.__class__.__name__)
        mlflow.set_tag("experiment_phase", run_name.split("_")[0])

        # Create trainer; optimizer and scheduler are passed as classes, not
        # instances, exactly as before.
        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=nn.CrossEntropyLoss(),
            optimizer=optim.Adam,
            traindataloader=trainstreamer,
            validdataloader=validstreamer,
            scheduler=optim.lr_scheduler.ReduceLROnPlateau,
            device=device,
        )

        # Train
        trainer.loop()

        # Log final metrics. The trainer inspected in this notebook exposes
        # `test_loss` but no `train_loss` attribute, so reading it
        # unconditionally raised AttributeError after training had finished.
        # Log only the losses that are actually available.
        final_losses = (
            ("final_train_loss", "train_loss"),
            ("final_valid_loss", "test_loss"),
        )
        for metric_name, attr_name in final_losses:
            loss_value = getattr(trainer, attr_name, None)
            if loss_value is not None:
                mlflow.log_metric(metric_name, loss_value)

        return trainer

# Training settings for quick experiments - using MLflow reporting like 03_mlflow.py
def create_trainer_settings(logdir="models", epochs=5, train_steps=100, valid_steps=50):
    """Return TrainerSettings for quick runs, with MLflow and TOML reporting.

    NOTE: this repeats the ``create_trainer_settings`` defined in an earlier
    cell; being later in the notebook, this definition is the one in effect.

    Args:
        logdir: Log directory; converted to an absolute path.
        epochs: Number of epochs to train.
        train_steps: Forwarded as ``train_steps``.
        valid_steps: Forwarded as ``valid_steps``.
    """
    settings_kwargs = dict(
        epochs=epochs,
        metrics=[metrics.Accuracy()],
        logdir=Path(logdir).resolve(),
        train_steps=train_steps,
        valid_steps=valid_steps,
        # This is the key - automatic MLflow logging!
        reporttypes=[ReportTypes.MLFLOW, ReportTypes.TOML],
    )
    return TrainerSettings(**settings_kwargs)

In [None]:
def train_model(model, settings, run_name="baseline", log_params=None):
    """Helper function to train a model with MLflow tracking - following 03_mlflow.py pattern

    Args:
        model: The torch module to train.
        settings: TrainerSettings passed straight to the Trainer. Per the
            original notes, including ReportTypes.MLFLOW in its reporttypes is
            what makes the trainer log to MLflow during training.
        run_name: Name given to the MLflow run; the text before the first
            ``_`` is also stored as the ``experiment_phase`` tag.
        log_params: Optional dict of hyperparameters to log with the run.

    Returns:
        The Trainer instance after ``loop()`` has finished.
    """
    # Pass run_name through so the run is identifiable in the MLflow UI.
    # Previously the argument was accepted but never applied to the run, so
    # every run received an auto-generated name.
    with mlflow.start_run(run_name=run_name):
        # Set tags
        mlflow.set_tag("model_type", model.__class__.__name__)
        mlflow.set_tag("experiment_phase", run_name.split("_")[0])
        mlflow.set_tag("dev", "student")

        # Log parameters
        if log_params:
            mlflow.log_params(log_params)

        # Create trainer; optimizer and scheduler are passed as classes, not
        # instances.
        trainer = Trainer(
            model=model,
            settings=settings,
            loss_fn=nn.CrossEntropyLoss(),
            optimizer=optim.Adam,
            traindataloader=trainstreamer,
            validdataloader=validstreamer,
            scheduler=optim.lr_scheduler.ReduceLROnPlateau,
            device=device,
        )

        # Train
        trainer.loop()

        return trainer

In [52]:
# Run baseline experiment
print("Training baseline model...")
baseline_settings = create_trainer_settings(epochs=3)

# Hyperparameters recorded alongside the MLflow run.
baseline_params = dict(
    model="baseline_cnn",
    filters=32,
    layers=4,
    kernel_size=3,
    dropout=0.0,
    normalization="none",
    batch_size=batchsize,
)

baseline_trainer = train_model(
    baseline_model,
    baseline_settings,
    run_name="baseline_cnn",
    log_params=baseline_params,
)

print("\nBaseline Results:")
# List every public trainer attribute whose name mentions "loss"
print("Available trainer attributes:")
loss_attr_names = [
    name
    for name in dir(baseline_trainer)
    if "loss" in name.lower() and not name.startswith("_")
]
for name in loss_attr_names:
    print(f"  {name}: {getattr(baseline_trainer, name, 'N/A')}")

# Report whichever loss/metric histories the trainer happens to expose
try:
    train_history = getattr(baseline_trainer, "train_losses", None)
    if train_history:
        print(f"Final Train Loss: {train_history[-1]:.4f}")

    # Prefer `valid_losses`; only look at `test_losses` when that is missing or empty.
    valid_history = (
        getattr(baseline_trainer, "valid_losses", None)
        or getattr(baseline_trainer, "test_losses", None)
    )
    if valid_history:
        print(f"Final Valid Loss: {valid_history[-1]:.4f}")

    metric_history = getattr(baseline_trainer, "train_metrics", None)
    if metric_history:
        print(f"Final Accuracy: {metric_history[-1][0]:.4f}")
except Exception as e:
    print(f"Error accessing trainer attributes: {e}")
    print("Let's check all attributes:")
    print([name for name in dir(baseline_trainer) if not name.startswith("_")])

print("\nExperiment logged to MLflow. View at: http://127.0.0.1:5001")

Training baseline model...


[32m2025-09-22 01:32:32.098[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mdir_add_timestamp[0m:[36m24[0m - [1mLogging to /Users/DINGZEEFS/MADS-MachineLearning-course/notebooks/2_convolutions/models/20250922-013232[0m
[32m2025-09-22 01:32:32.103[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36m__init__[0m:[36m68[0m - [1mFound earlystop_kwargs in settings.Set to None if you dont want earlystopping.[0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:02<00:00, 34.82it/s]
[32m2025-09-22 01:32:35.467[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 0 train 0.7069 test 0.6736 metric ['0.7412'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:04<00:00, 20.61it/s]
[32m2025-09-22 01:32:40.787[0m | [1mINFO    [0m | [36mmltrainer.trainer[0m:[36mreport[0m:[36m209[0m - [1mEpoch 1 train 0.6389 test 0.6396 metric ['0.7550'][0m
100%|[38;2;30;71;6m██████████[0m| 100/100 [00:02<00:00, 36.26it/s]
[32m2025-09-22 0


Baseline Results:
Available trainer attributes:
  loss_fn: CrossEntropyLoss()
  test_loss: 0.6020043933391571

Experiment logged to MLflow. View at: http://127.0.0.1:5001





In [None]:
# Run baseline experiment
print("Training baseline model...")
baseline_settings = create_trainer_settings(epochs=3)

# Hyperparameters recorded alongside the MLflow run.
baseline_params = dict(
    model="baseline_cnn",
    filters=32,
    layers=4,
    kernel_size=3,
    dropout=0.0,
    normalization="none",
    batch_size=batchsize,
)

baseline_trainer = train_model(
    baseline_model,
    baseline_settings,
    run_name="baseline_cnn",
    log_params=baseline_params,
)

print("\nBaseline Results:")
# Use the exact same pattern as the working 03_mlflow.py example
final_test_loss = baseline_trainer.test_loss
print(f"Final Test Loss: {final_test_loss:.4f}")

print(
    "\nExperiment should now be visible in MLflow UI at: http://127.0.0.1:5001",
    "Check the 'cnn_architecture_exploration' experiment!",
    sep="\n",
)

---

## Part 2: Dropout Experiments

Now let's explore the effect of dropout layers.

### 🔬 Dropout Hypothesis

**Before we add dropout, form a hypothesis:**

Questions to consider:
- How will dropout affect the training/validation accuracy gap?
- What dropout rate will work best (0.1, 0.3, 0.5)?
- Where should dropout be placed for maximum effect?
- Will dropout slow down training?

**Please write your hypothesis below:**

### 📝 Your Dropout Hypothesis:

*(Double-click to edit and write your hypothesis here)*

- Effect on overfitting:
- Optimal dropout rate:
- Best placement:
- Training speed impact:

In [None]:
# Dropout CNN Model Implementation
class DropoutCNN(nn.Module):
    """CNN with dropout after every conv block and every hidden dense layer.

    Architecture: three blocks of conv(3x3, padding=1) -> ReLU -> 2x2 max-pool
    -> Dropout2d, followed by a classifier of Linear(128) -> Linear(64) ->
    Linear(10) with ReLU and Dropout after each hidden layer.

    Args:
        dropout_rate: Probability used for every Dropout2d / Dropout layer.
        filters: Number of output channels of each conv layer.
        input_size: ``(channels, height, width)`` of a single sample. The
            channel count sets the first conv's ``in_channels`` (it used to be
            hard-coded to 1, so any non-grayscale ``input_size`` failed).
    """

    def __init__(self, dropout_rate=0.3, filters=32, input_size=(1, 28, 28)):
        super().__init__()
        self.dropout_rate = dropout_rate
        self.filters = filters

        in_channels = input_size[0]

        # Layer order (and therefore state_dict indices) is kept as before.
        self.conv_layers = nn.Sequential(
            # First conv block
            nn.Conv2d(in_channels, filters, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),

            # Second conv block
            nn.Conv2d(filters, filters, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),

            # Third conv block
            nn.Conv2d(filters, filters, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout2d(dropout_rate),
        )

        # Calculate size after convolutions by pushing one dummy sample
        # through. Probe in eval mode so dropout is inactive: only the output
        # shape matters here, and no random masks need to be drawn for it.
        self.conv_layers.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, *input_size)
            flattened_size = self.conv_layers(dummy_input).flatten(1).size(1)
        self.conv_layers.train()  # back to the default training mode

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 10)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, 10)`` for a batch of images."""
        x = self.conv_layers(x)
        x = self.classifier(x)
        return x

# Dropout rates to compare; one result dict per rate is collected in
# dropout_results by the experiment loop.
dropout_rates = [0.0, 0.1, 0.3, 0.5]
dropout_results = list()

print("Running dropout experiments...")
print("This will test dropout rates: ", dropout_rates)
print()
print("Running experiments (this may take a few minutes)...")

---

## Part 3: Normalization Experiments

*(Note: the dropout experiment loop runs in the next cell; the normalization experiments are introduced in the Part 3 section that follows it.)*

In [None]:
# Run experiments for each dropout rate
# Single source of truth for the epoch count, so the value logged to MLflow
# cannot drift from the value actually used for training.
DROPOUT_EPOCHS = 3

# Start from an empty list so re-running this cell does not append duplicate
# rows to results left over from a previous execution.
dropout_results = []

for dropout_rate in dropout_rates:
    print(f"\n{'='*50}")
    print(f"Testing dropout rate: {dropout_rate}")
    print(f"{'='*50}")

    # Create model
    model = DropoutCNN(dropout_rate=dropout_rate)
    model.to(device)

    # Training parameters; `filters` is read from the model rather than
    # repeated as a literal.
    dropout_params = {
        "model": "dropout_cnn",
        "dropout_rate": dropout_rate,
        "filters": model.filters,
        "epochs": DROPOUT_EPOCHS,
        "batch_size": batchsize
    }

    # Train model
    settings = create_trainer_settings(epochs=DROPOUT_EPOCHS)
    trainer = train_model(
        model,
        settings,
        run_name=f"dropout_{dropout_rate}",
        log_params=dropout_params
    )

    # Store results
    result = {
        "dropout_rate": dropout_rate,
        "test_loss": trainer.test_loss,
        "model_size": sum(p.numel() for p in model.parameters())
    }
    dropout_results.append(result)

    print(f"Results: Test Loss = {trainer.test_loss:.4f}")

print(f"\n{'='*60}")
print("DROPOUT EXPERIMENTS SUMMARY")
print(f"{'='*60}")
for result in dropout_results:
    print(f"Dropout {result['dropout_rate']:.1f}: Test Loss = {result['test_loss']:.4f}")

In [None]:
---

## Part 3: Normalization Experiments

### 🔬 Normalization Hypothesis

**Before we test different normalization techniques, form a hypothesis:**

Questions to consider:
- How will BatchNorm affect training speed and stability?
- Will LayerNorm or InstanceNorm work better for image data?
- How will normalization affect the final accuracy?
- Should normalization be applied before or after activation functions?

**Please write your normalization hypothesis below:**

---

## Part 4: Combined Architecture

*(We'll add this section after completing normalization experiments)*

### 📝 Your Normalization Hypothesis:

*(Double-click to edit and write your hypothesis here)*

- BatchNorm impact on training:
- Best normalization technique for CNNs:
- Accuracy improvement expected:
- Placement strategy (before/after activation):

In [None]:
# Normalization CNN Models
class NormalizationCNN(nn.Module):
    """CNN whose conv blocks use a selectable normalization layer.

    Architecture: three blocks of conv(3x3, padding=1) -> norm -> ReLU ->
    2x2 max-pool, followed by a classifier of Linear(128) -> Linear(64) ->
    Linear(10).

    Args:
        norm_type: ``"batch"`` (BatchNorm2d), ``"instance"`` (InstanceNorm2d),
            ``"layer"`` (GroupNorm with a single group); any other value,
            including ``"none"``, uses ``nn.Identity``.
        filters: Number of output channels of each conv layer.
        input_size: ``(channels, height, width)`` of a single sample. The
            channel count sets the first conv's ``in_channels`` (it used to be
            hard-coded to 1).
    """

    def __init__(self, norm_type="batch", filters=32, input_size=(1, 28, 28)):
        super().__init__()
        self.norm_type = norm_type
        self.filters = filters

        # Define normalization layer factory
        def get_norm_layer(channels):
            if norm_type == "batch":
                return nn.BatchNorm2d(channels)
            elif norm_type == "instance":
                # NOTE(review): built with PyTorch defaults - confirm whether
                # learnable affine parameters are wanted for a fair comparison.
                return nn.InstanceNorm2d(channels)
            elif norm_type == "layer":
                return nn.GroupNorm(1, channels)  # LayerNorm equivalent for 2D
            else:  # "none"
                return nn.Identity()

        in_channels = input_size[0]

        # Layer order (and therefore state_dict indices) is kept as before.
        self.conv_layers = nn.Sequential(
            # First conv block
            nn.Conv2d(in_channels, filters, kernel_size=3, padding=1),
            get_norm_layer(filters),
            nn.ReLU(),
            nn.MaxPool2d(2),

            # Second conv block
            nn.Conv2d(filters, filters, kernel_size=3, padding=1),
            get_norm_layer(filters),
            nn.ReLU(),
            nn.MaxPool2d(2),

            # Third conv block
            nn.Conv2d(filters, filters, kernel_size=3, padding=1),
            get_norm_layer(filters),
            nn.ReLU(),
            nn.MaxPool2d(2),
        )

        # Calculate size after convolutions by pushing one dummy sample
        # through. The probe must run in eval mode: in training mode
        # BatchNorm2d would fold the all-zero dummy batch into its running
        # mean/variance (and bump num_batches_tracked) before any real data
        # has been seen.
        self.conv_layers.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, *input_size)
            flattened_size = self.conv_layers(dummy_input).flatten(1).size(1)
        self.conv_layers.train()  # back to the default training mode

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flattened_size, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, 10)`` for a batch of images."""
        x = self.conv_layers(x)
        x = self.classifier(x)
        return x

# Normalization variants to compare ("none" is the control); results for each
# variant are meant to be collected in norm_results.
norm_types = ["none", "batch", "instance", "layer"]
norm_results = list()

print("Running normalization experiments...")
print("This will test normalization types: ", norm_types)
print()
print("Running experiments (this may take a few minutes)...")

---

## Part 5: Hyperparameter Search

*(We'll add this section after completing architecture experiments)*

---

## Part 6: Analysis & Reflection

*(We'll complete this section at the end)*