In [None]:
import sys
import os
import functools
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

import wandb
import torch
from src.utils import load_env_vars
from src.dataset import Cifar10DataManager
from src.training import run_training_sweep
import json

# Setup: read W&B settings from the environment, with a default project name.
env = load_env_vars()
PROJECT_NAME = env.get("WANDB_PROJECT", "cifar10_mlops_project")
ENTITY = env.get("WANDB_ENTITY", None)

print(f"Project: {PROJECT_NAME}, Entity: {ENTITY}")

# 1. Download Dataset from W&B (Ensure consistency)
# We initialize a run just to fetch the data
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="training_prep")
# WANDB_ENTITY is optional. When it is unset, fall back to the entity W&B
# resolved for this run so the fully-qualified paths built below never contain
# the literal string "None".
ENTITY = ENTITY or run.entity
artifact = run.use_artifact(f'{ENTITY}/{PROJECT_NAME}/cifar10_dataset:latest', type='dataset')
artifact_dir = artifact.download(root="../data")
run.finish()

print(f"Data downloaded to {artifact_dir}")

# 2. Define Sweep Config
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_acc', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'min': 0.001, 'max': 0.1},
        'batch_size': {'values': [64, 128]},
        'optimizer': {'values': ['adam', 'sgd']},
        'architecture_option': {'values': ['standard', 'modified']},
        'epochs': {'value': 5}
    }
}

# 3. Initialize Sweep
# Pass the entity explicitly so the sweep is created where step 5 looks it up.
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME, entity=ENTITY)
print(f"Sweep ID: {sweep_id}")

# Save Sweep ID for next steps
os.makedirs("../artifacts", exist_ok=True)
with open("../artifacts/sweep_id.txt", "w") as f:
    f.write(sweep_id)

# 4. Run Agent
# We use partial to pass the correct data_dir to the training function
train_func = functools.partial(run_training_sweep, data_dir="../data")
wandb.agent(sweep_id, train_func, entity=ENTITY, project=PROJECT_NAME, count=5)

# 5. Extract Best Config
api = wandb.Api()
sweep = api.sweep(f"{ENTITY}/{PROJECT_NAME}/{sweep_id}")
best_run = sweep.best_run()
if best_run is None:
    # No run reported the sweep metric (e.g. every run crashed); fail with a
    # clear message instead of an AttributeError on `None.config`.
    raise RuntimeError(
        f"Sweep {sweep_id} has no run with a recorded 'val_acc'; cannot extract the best config."
    )
best_config = best_run.config
print(f"Best Run: {best_run.name} ({best_run.id}) with Val Acc: {best_run.summary.get('val_acc')}")

# Save best config for later Steps (Deployment & Retraining)
with open("../artifacts/best_config.json", "w") as f:
    json.dump(best_config, f)

print("Step 2 Complete: Sweep finished and best config saved.")

# Model Training and Hyperparameter Sweep

This notebook implements the training pipeline with Transfer Learning (ResNet18) and uses W&B Sweeps for hyperparameter optimization.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import wandb
import os
import copy
from pathlib import Path

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
# Sweep definition: Bayesian search that maximizes the logged `val_acc` metric.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_acc', 'goal': 'maximize'},
}

# Search space explored by the sweep.
sweep_config['parameters'] = {
    'learning_rate': {'min': 0.0001, 'max': 0.1},
    'batch_size': {'values': [32, 64, 128]},
    'optimizer': {'values': ['adam', 'sgd']},
    # Architecture variants understood by build_dataset / build_model:
    #   standard - unmodified ResNet18 on native 32x32 inputs (Option C)
    #   upsample - unmodified ResNet18 on inputs resized with Resize(224) (Option A)
    #   modified - ResNet18 with a 3x3 stem conv and no max-pool (Option B)
    'architecture_option': {'values': ['standard', 'upsample', 'modified']},
    # Fixed number of epochs for every run in the sweep.
    'epochs': {'value': 20},
}

PROJECT_NAME = "cifar10_mlops_project"
sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME)

# Persist the sweep ID on disk so later pipeline steps can look the sweep up.
os.makedirs("../artifacts", exist_ok=True)
with open("../artifacts/sweep_id.txt", "w") as sweep_id_file:
    sweep_id_file.write(sweep_id)
print(f"Sweep ID {sweep_id} saved to ../artifacts/sweep_id.txt")

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 2


[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Create sweep with ID: b8ld9rmu
Sweep URL: https://wandb.ai/esi-sba-dz/cifar10_mlops_project/sweeps/b8ld9rmu


In [None]:
def build_dataset(batch_size, architecture_option, data_root='../data/raw', num_workers=2):
    """Build the CIFAR-10 train and test data loaders for one run.

    Args:
        batch_size: Batch size used by both loaders.
        architecture_option: ``'upsample'`` prepends ``Resize(224)`` to both
            pipelines (Option A). Any other value keeps the native image size
            and prepends ``RandomCrop(32, padding=4)`` to the training
            pipeline only (Options B and C).
        data_root: Directory passed to ``torchvision.datasets.CIFAR10`` as
            ``root``; ``download=True`` is set for both splits.
        num_workers: Worker process count passed to both ``DataLoader``s.

    Returns:
        A ``(trainloader, testloader)`` tuple. The training loader shuffles;
        the test loader does not.
    """
    # Same normalization for both splits; defined once to keep them in sync.
    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

    # Option-specific transforms that run before the shared ones.
    if architecture_option == 'upsample':
        # Option A: Lazy Way (Resize to 224x224)
        train_prefix = [transforms.Resize(224)]
        test_prefix = [transforms.Resize(224)]
    else:
        # Option B (Modified) and C (Standard): keep 32x32, with padded random
        # crops as training-time augmentation.
        train_prefix = [transforms.RandomCrop(32, padding=4)]
        test_prefix = []

    transform_train = transforms.Compose(
        train_prefix + [transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize]
    )
    transform_test = transforms.Compose(
        test_prefix + [transforms.ToTensor(), normalize]
    )

    trainset = torchvision.datasets.CIFAR10(
        root=data_root, train=True, download=True, transform=transform_train
    )
    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )

    testset = torchvision.datasets.CIFAR10(
        root=data_root, train=False, download=True, transform=transform_test
    )
    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    return trainloader, testloader

def build_model(architecture_option):
    """Create an ImageNet-pretrained ResNet18 adapted to 10 output classes.

    Args:
        architecture_option: ``'modified'`` swaps the stem convolution for a
            3x3/stride-1 convolution and removes the max-pool (Option B). Any
            other value keeps the stock backbone (Options A and C).

    Returns:
        The model, moved to the notebook-level ``device``.
    """
    # Load pretrained ResNet18. Newer torchvision releases deprecate
    # `pretrained=True` in favour of explicit weight enums, so use the enum
    # when it exists and fall back to the legacy flag otherwise.
    weights_enum = getattr(torchvision.models, "ResNet18_Weights", None)
    if weights_enum is not None:
        model = torchvision.models.resnet18(weights=weights_enum.IMAGENET1K_V1)
    else:
        model = torchvision.models.resnet18(pretrained=True)

    if architecture_option == 'modified':
        # Option B: The "Pro" Way
        # Replace 7x7 conv with 3x3 conv, stride 1, padding 1.
        # The replacement layer is freshly initialized, not pretrained.
        model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        # Remove MaxPool
        model.maxpool = nn.Identity()

    # Note: Option A (Upsample) and C (Standard) use the unmodified backbone structure
    # Standard ResNet on 32x32 (Option C) is suboptimal but works.

    # Replace last layer for CIFAR-10 (10 classes)
    num_ftrs = model.fc.in_features
    model.fc = nn.Linear(num_ftrs, 10)

    return model.to(device)

def _select_optimizer(name, parameters, learning_rate):
    """Return the optimizer named by the sweep's ``optimizer`` value."""
    if name == "sgd":
        return optim.SGD(parameters, lr=learning_rate, momentum=0.9)
    if name == "adam":
        return optim.Adam(parameters, lr=learning_rate)
    # Fail immediately instead of leaving the optimizer unbound and crashing
    # with an UnboundLocalError inside the training loop.
    raise ValueError(f"Unsupported optimizer {name!r}; expected 'sgd' or 'adam'")


def _validation_accuracy(model, loader):
    """Return the top-1 accuracy (in percent) of ``model`` over ``loader``."""
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return 100 * correct / total


def train(config=None):
    """Train and evaluate one model inside a W&B run.

    Reads ``batch_size``, ``optimizer``, ``learning_rate``, ``epochs`` and the
    optional ``architecture_option`` (default ``'standard'``) from
    ``wandb.config``. Logs ``batch_loss`` per step and ``epoch`` / ``loss`` /
    ``val_acc`` per epoch. Whenever ``val_acc`` improves, the weights are saved
    under ``../models`` and logged as a W&B model artifact.

    Args:
        config: Optional configuration forwarded to ``wandb.init``.

    Raises:
        ValueError: If ``config.optimizer`` is neither ``'sgd'`` nor ``'adam'``.
    """
    # Initialize a new wandb run
    with wandb.init(config=config):
        config = wandb.config

        # Access the architecture option from config (with default if missing)
        arch_opt = getattr(config, 'architecture_option', 'standard')

        # NOTE: the second loader is built from the CIFAR-10 `train=False`
        # split, so `val_acc` below is measured on that split.
        trainloader, testloader = build_dataset(config.batch_size, arch_opt)
        model = build_model(arch_opt)

        criterion = nn.CrossEntropyLoss()
        optimizer = _select_optimizer(config.optimizer, model.parameters(), config.learning_rate)

        best_acc = 0.0

        print(f"Starting training with option: {arch_opt}, lr: {config.learning_rate}, batch: {config.batch_size}")

        for epoch in range(config.epochs):
            model.train()
            running_loss = 0.0

            for inputs, labels in trainloader:
                inputs, labels = inputs.to(device), labels.to(device)

                optimizer.zero_grad()
                outputs = model(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                # Read the scalar once; it is used for both the running sum
                # and the per-batch log entry.
                batch_loss = loss.item()
                running_loss += batch_loss

                # Log batch metrics
                wandb.log({"batch_loss": batch_loss})

            # Validation
            val_acc = _validation_accuracy(model, testloader)
            epoch_loss = running_loss / len(trainloader)

            # Log epoch metrics
            wandb.log({"epoch": epoch, "loss": epoch_loss, "val_acc": val_acc})
            print(f"Epoch {epoch}: Loss {epoch_loss:.3f}, Val Acc {val_acc:.2f}%")

            # Save best model to W&B
            if val_acc > best_acc:
                best_acc = val_acc

                # Create models directory
                Path("../models").mkdir(parents=True, exist_ok=True)

                # Save locally
                model_path = f"../models/model_best_{wandb.run.id}.pth"
                torch.save(model.state_dict(), model_path)

                # Log as artifact
                artifact = wandb.Artifact(f"model-best-{wandb.run.id}", type="model")
                artifact.add_file(model_path)
                wandb.log_artifact(artifact)
                print(f"New best model saved with acc: {best_acc}")

In [None]:
# Run the Sweep Agent
# count=6 means this agent runs 6 experiments, each one calling `train`;
# `sweep_id` is the ID returned by wandb.sweep in the sweep-configuration cell.
print(f"Starting sweep agent for ID: {sweep_id}")
wandb.agent(sweep_id, train, count=6)

[34m[1mwandb[0m: Agent Starting Run: dd3hx67d with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.027101044512480844
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mamirbnsl[0m ([33mesi-sba-dz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 170M/170M [00:05<00:00, 29.6MB/s]


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 194MB/s]


Epoch 0: Loss 1.645, Val Acc 53.17%
New best model saved with acc: 53.17
Epoch 1: Loss 1.224, Val Acc 68.57%
New best model saved with acc: 68.57
Epoch 2: Loss 0.892, Val Acc 73.83%
New best model saved with acc: 73.83
Epoch 3: Loss 0.803, Val Acc 75.15%
New best model saved with acc: 75.15
Epoch 4: Loss 0.701, Val Acc 77.60%
New best model saved with acc: 77.6


0,1
batch_loss,█▇▆▇▅▆▄▅▆▄▅▅▅▄▄▄▃▃▃▂▂▂▃▃▂▂▂▃▂▂▂▂▁▁▂▂▂▂▁▂
epoch,▁▃▅▆█
loss,█▅▂▂▁
val_acc,▁▅▇▇█

0,1
batch_loss,0.96517
epoch,4.0
loss,0.70097
val_acc,77.6


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ykcxlx5p with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0891925407881827
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 2.637, Val Acc 10.26%
New best model saved with acc: 10.26
Epoch 1: Loss 2.311, Val Acc 10.12%
Epoch 2: Loss 2.313, Val Acc 10.13%
Epoch 3: Loss 2.311, Val Acc 9.58%
Epoch 4: Loss 2.311, Val Acc 9.98%


0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▃▅▆█
loss,█▁▁▁▁
val_acc,█▇▇▁▅

0,1
batch_loss,2.29867
epoch,4.0
loss,2.31134
val_acc,9.98


[34m[1mwandb[0m: Agent Starting Run: 6b2vg8k4 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.037221685860906406
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 2.448, Val Acc 33.46%
New best model saved with acc: 33.46
Epoch 1: Loss 1.777, Val Acc 41.96%
New best model saved with acc: 41.96
Epoch 2: Loss 1.585, Val Acc 51.02%
New best model saved with acc: 51.02
Epoch 3: Loss 1.430, Val Acc 54.85%
New best model saved with acc: 54.85
Epoch 4: Loss 1.327, Val Acc 57.76%
New best model saved with acc: 57.76


0,1
batch_loss,█▄▇▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▂▁▁▁▁▁
epoch,▁▃▅▆█
loss,█▄▃▂▁
val_acc,▁▃▆▇█

0,1
batch_loss,1.4945
epoch,4.0
loss,1.32688
val_acc,57.76


[34m[1mwandb[0m: Agent Starting Run: ve4v73yi with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.031545026622292326
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 1.385, Val Acc 63.14%
New best model saved with acc: 63.14
Epoch 1: Loss 1.068, Val Acc 66.82%
New best model saved with acc: 66.82
Epoch 2: Loss 0.861, Val Acc 71.64%
New best model saved with acc: 71.64
Epoch 3: Loss 0.726, Val Acc 76.63%
New best model saved with acc: 76.63
Epoch 4: Loss 0.663, Val Acc 78.07%
New best model saved with acc: 78.07


0,1
batch_loss,█▇▇▅▆▅▄▅▄▃▄▃▅▄▄▂▃▃▃▃▂▁▃▂▃▂▂▂▂▂▁▂▂▂▂▂▂▁▁▂
epoch,▁▃▅▆█
loss,█▅▃▂▁
val_acc,▁▃▅▇█

0,1
batch_loss,0.62986
epoch,4.0
loss,0.66341
val_acc,78.07


[34m[1mwandb[0m: Agent Starting Run: n08k4dxv with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0023134244198814847
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 1.164, Val Acc 71.89%
New best model saved with acc: 71.89
Epoch 1: Loss 0.779, Val Acc 76.89%
New best model saved with acc: 76.89
Epoch 2: Loss 0.669, Val Acc 78.87%
New best model saved with acc: 78.87
Epoch 3: Loss 0.601, Val Acc 79.89%
New best model saved with acc: 79.89
Epoch 4: Loss 0.547, Val Acc 81.42%
New best model saved with acc: 81.42


0,1
batch_loss,█▆▅▅▄▄▄▃▃▄▂▁▄▂▃▁▂▂▁▁▂▁▂▃▂▂▂▂▁▂▂▁▁▂▂▁▁▁▁▁
epoch,▁▃▅▆█
loss,█▄▂▂▁
val_acc,▁▅▆▇█

0,1
batch_loss,0.61433
epoch,4.0
loss,0.54694
val_acc,81.42


[34m[1mwandb[0m: Agent Starting Run: mp6y2eff with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.00039469230291637653
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 1.551, Val Acc 58.11%
New best model saved with acc: 58.11
Epoch 1: Loss 1.094, Val Acc 66.10%
New best model saved with acc: 66.1
Epoch 2: Loss 0.948, Val Acc 68.86%
New best model saved with acc: 68.86
Epoch 3: Loss 0.869, Val Acc 71.62%
New best model saved with acc: 71.62
Epoch 4: Loss 0.808, Val Acc 73.22%
New best model saved with acc: 73.22


0,1
batch_loss,███▅▆▅▄▄▄▄▄▃▃▃▃▃▃▂▃▁▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▃▁▃
epoch,▁▃▅▆█
loss,█▄▂▂▁
val_acc,▁▅▆▇█

0,1
batch_loss,0.87027
epoch,4.0
loss,0.80751
val_acc,73.22
