In [None]:
import os
import functools
import json
import wandb
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import Subset, DataLoader

# ==========================================
# 1. Setup & Auth
# ==========================================
# SECURITY: credentials must never be committed inside a notebook. The key is
# read from the WANDB_API_KEY environment variable instead. When the variable
# is unset, `key=None` lets wandb.login() fall back to already-stored
# credentials (e.g. ~/.netrc) or an interactive prompt.
# Any key that was previously hard-coded here should be revoked and rotated.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
PROJECT_NAME = "cifar10_mlops_project"
ENTITY = "esi-sba-dz"

wandb.login(key=WANDB_API_KEY)
print(f"Project: {PROJECT_NAME}, Entity: {ENTITY}")

In [None]:
# ==========================================
# 2. Data Manager (Artifact-Based)
# ==========================================
class Cifar10DataManager:
    """Build CIFAR-10 transforms and DataLoaders from data already on disk.

    Nothing is downloaded here (``download=False``): the dataset must already
    exist under ``data_dir``, together with ``processed/test_indices.npy``,
    which selects the part of the CIFAR-10 test split used for evaluation.
    """

    def __init__(self, data_dir="./data"):
        """Store the dataset root and the normalisation constants."""
        self.data_dir = data_dir
        # Per-channel mean / std handed to transforms.Normalize.
        self.mean = (0.4914, 0.4822, 0.4465)
        self.std = (0.2023, 0.1994, 0.2010)

    def get_transforms(self, architecture_option='standard'):
        """Return ``(train_transform, test_transform)`` as ``Compose`` objects.

        Training applies a random horizontal flip and a padded 32x32 random
        crop before tensor conversion and normalisation; evaluation only
        converts and normalises.

        With ``architecture_option == 'upsample'`` both pipelines additionally
        resize to 224. For training the resize is placed *after* the 32x32
        crop: resizing first would let ``RandomCrop(32)`` cut the upsampled
        image back down to a 32x32 patch, so the network would train on tiny
        patches while being evaluated on full 224x224 images.
        """
        to_normalized_tensor = [
            transforms.ToTensor(),
            transforms.Normalize(self.mean, self.std),
        ]
        # Augmentations operate at the native 32x32 resolution.
        augmentations = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4),
        ]
        upsample = [transforms.Resize(224)] if architecture_option == 'upsample' else []

        train_steps = augmentations + upsample + to_normalized_tensor
        test_steps = upsample + to_normalized_tensor
        return transforms.Compose(train_steps), transforms.Compose(test_steps)

    def get_loaders(self, batch_size, architecture_option='standard', num_workers=2):
        """Return ``(train_loader, test_loader)`` for the on-disk dataset.

        Args:
            batch_size: batch size used for both loaders.
            architecture_option: forwarded to :meth:`get_transforms`.
            num_workers: DataLoader worker processes (defaults to 2, the
                value that used to be hard-coded).

        Raises:
            RuntimeError: re-raised from the CIFAR10 dataset constructor,
                after printing a hint about the missing artifact.
            FileNotFoundError: if ``processed/test_indices.npy`` is missing.
        """
        train_transform, test_transform = self.get_transforms(architecture_option)

        # download=False on purpose: the data is expected to be present via
        # the W&B artifact download. A failure here means the artifact was
        # not fetched correctly.
        try:
            train_set = torchvision.datasets.CIFAR10(
                root=self.data_dir, train=True, download=False, transform=train_transform
            )
            test_set = torchvision.datasets.CIFAR10(
                root=self.data_dir, train=False, download=False, transform=test_transform
            )
        except RuntimeError:
            print("CRITICAL: Data not found locally. Ensure Artifact is downloaded first.")
            raise

        # Only the samples listed in the index file are used for evaluation.
        indices_path = os.path.join(self.data_dir, "processed", "test_indices.npy")
        if not os.path.exists(indices_path):
            raise FileNotFoundError(f"Indices file missing: {indices_path}")

        test_indices = np.load(indices_path)
        real_test_set = Subset(test_set, test_indices)

        train_loader = DataLoader(
            train_set, batch_size=batch_size, shuffle=True, num_workers=num_workers
        )
        test_loader = DataLoader(
            real_test_set, batch_size=batch_size, shuffle=False, num_workers=num_workers
        )

        return train_loader, test_loader

In [None]:
# Run the Sweep Agent
# count=6 means the agent runs 6 experiments
# NOTE(review): no cell above this one defines `sweep_id` or `train`; in the
# visible notebook `sweep_id` is only assigned in the "Execute Sweep" cell and
# no function named `train` is defined at all. This cell presumably relies on
# state from an earlier kernel session -- confirm, or remove it, so the
# notebook survives Restart & Run All.
print(f"Starting sweep agent for ID: {sweep_id}")
wandb.agent(sweep_id, train, count=6)

[34m[1mwandb[0m: Agent Starting Run: dd3hx67d with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.027101044512480844
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.
[34m[1mwandb[0m: Currently logged in as: [33mamirbnsl[0m ([33mesi-sba-dz[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


100%|██████████| 170M/170M [00:05<00:00, 29.6MB/s]


Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth


100%|██████████| 44.7M/44.7M [00:00<00:00, 194MB/s]


Epoch 0: Loss 1.645, Val Acc 53.17%
New best model saved with acc: 53.17
Epoch 1: Loss 1.224, Val Acc 68.57%
New best model saved with acc: 68.57
Epoch 2: Loss 0.892, Val Acc 73.83%
New best model saved with acc: 73.83
Epoch 3: Loss 0.803, Val Acc 75.15%
New best model saved with acc: 75.15
Epoch 4: Loss 0.701, Val Acc 77.60%
New best model saved with acc: 77.6


0,1
batch_loss,█▇▆▇▅▆▄▅▆▄▅▅▅▄▄▄▃▃▃▂▂▂▃▃▂▂▂▃▂▂▂▂▁▁▂▂▂▂▁▂
epoch,▁▃▅▆█
loss,█▅▂▂▁
val_acc,▁▅▇▇█

0,1
batch_loss,0.96517
epoch,4.0
loss,0.70097
val_acc,77.6


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ykcxlx5p with config:
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0891925407881827
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 2.637, Val Acc 10.26%
New best model saved with acc: 10.26
Epoch 1: Loss 2.311, Val Acc 10.12%
Epoch 2: Loss 2.313, Val Acc 10.13%
Epoch 3: Loss 2.311, Val Acc 9.58%
Epoch 4: Loss 2.311, Val Acc 9.98%


0,1
batch_loss,█▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
epoch,▁▃▅▆█
loss,█▁▁▁▁
val_acc,█▇▇▁▅

0,1
batch_loss,2.29867
epoch,4.0
loss,2.31134
val_acc,9.98


[34m[1mwandb[0m: Agent Starting Run: 6b2vg8k4 with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.037221685860906406
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 2.448, Val Acc 33.46%
New best model saved with acc: 33.46
Epoch 1: Loss 1.777, Val Acc 41.96%
New best model saved with acc: 41.96
Epoch 2: Loss 1.585, Val Acc 51.02%
New best model saved with acc: 51.02
Epoch 3: Loss 1.430, Val Acc 54.85%
New best model saved with acc: 54.85
Epoch 4: Loss 1.327, Val Acc 57.76%
New best model saved with acc: 57.76


0,1
batch_loss,█▄▇▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▂▂▁▁▂▁▁▁▁▁
epoch,▁▃▅▆█
loss,█▄▃▂▁
val_acc,▁▃▆▇█

0,1
batch_loss,1.4945
epoch,4.0
loss,1.32688
val_acc,57.76


[34m[1mwandb[0m: Agent Starting Run: ve4v73yi with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.031545026622292326
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 1.385, Val Acc 63.14%
New best model saved with acc: 63.14
Epoch 1: Loss 1.068, Val Acc 66.82%
New best model saved with acc: 66.82
Epoch 2: Loss 0.861, Val Acc 71.64%
New best model saved with acc: 71.64
Epoch 3: Loss 0.726, Val Acc 76.63%
New best model saved with acc: 76.63
Epoch 4: Loss 0.663, Val Acc 78.07%
New best model saved with acc: 78.07


0,1
batch_loss,█▇▇▅▆▅▄▅▄▃▄▃▅▄▄▂▃▃▃▃▂▁▃▂▃▂▂▂▂▂▁▂▂▂▂▂▂▁▁▂
epoch,▁▃▅▆█
loss,█▅▃▂▁
val_acc,▁▃▅▇█

0,1
batch_loss,0.62986
epoch,4.0
loss,0.66341
val_acc,78.07


[34m[1mwandb[0m: Agent Starting Run: n08k4dxv with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.0023134244198814847
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 1.164, Val Acc 71.89%
New best model saved with acc: 71.89
Epoch 1: Loss 0.779, Val Acc 76.89%
New best model saved with acc: 76.89
Epoch 2: Loss 0.669, Val Acc 78.87%
New best model saved with acc: 78.87
Epoch 3: Loss 0.601, Val Acc 79.89%
New best model saved with acc: 79.89
Epoch 4: Loss 0.547, Val Acc 81.42%
New best model saved with acc: 81.42


0,1
batch_loss,█▆▅▅▄▄▄▃▃▄▂▁▄▂▃▁▂▂▁▁▂▁▂▃▂▂▂▂▁▂▂▁▁▂▂▁▁▁▁▁
epoch,▁▃▅▆█
loss,█▄▂▂▁
val_acc,▁▅▆▇█

0,1
batch_loss,0.61433
epoch,4.0
loss,0.54694
val_acc,81.42


[34m[1mwandb[0m: Agent Starting Run: mp6y2eff with config:
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 5
[34m[1mwandb[0m: 	learning_rate: 0.00039469230291637653
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


Epoch 0: Loss 1.551, Val Acc 58.11%
New best model saved with acc: 58.11
Epoch 1: Loss 1.094, Val Acc 66.10%
New best model saved with acc: 66.1
Epoch 2: Loss 0.948, Val Acc 68.86%
New best model saved with acc: 68.86
Epoch 3: Loss 0.869, Val Acc 71.62%
New best model saved with acc: 71.62
Epoch 4: Loss 0.808, Val Acc 73.22%
New best model saved with acc: 73.22


0,1
batch_loss,███▅▆▅▄▄▄▄▄▃▃▃▃▃▃▂▃▁▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▃▁▃
epoch,▁▃▅▆█
loss,█▄▂▂▁
val_acc,▁▅▆▇█

0,1
batch_loss,0.87027
epoch,4.0
loss,0.80751
val_acc,73.22


In [None]:
# ==========================================
# 3. Model & Training Logic
# ==========================================
def build_model(architecture_option='standard', num_classes=10, pretrained=True):
    """Create a ResNet-18 whose classifier head outputs ``num_classes`` logits.

    Args:
        architecture_option: ``'modified'`` swaps the stem convolution for a
            3x3 / stride-1 / padding-1 convolution (freshly initialised) and
            replaces the max-pool with ``nn.Identity``. Any other value,
            including ``'standard'`` and ``'upsample'`` (where the upsampling
            is done by the data transforms), leaves the backbone untouched.
        num_classes: output size of the replacement ``fc`` layer.
        pretrained: load the pretrained torchvision weights when true.

    Returns:
        The configured ``torch.nn.Module``.
    """
    models = torchvision.models
    if hasattr(models, "ResNet18_Weights"):
        # torchvision >= 0.13 deprecates `pretrained=` in favour of `weights=`.
        # IMAGENET1K_V1 is the checkpoint the legacy `pretrained=True` maps to.
        weights = models.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None
        model = models.resnet18(weights=weights)
    else:
        # Older torchvision releases only understand the legacy keyword.
        model = models.resnet18(pretrained=pretrained)

    if architecture_option == 'modified':
        model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        model.maxpool = nn.Identity()

    # Replace the classifier head so it matches the requested class count.
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

def train_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    The model is put in training mode, and for each batch the gradients are
    reset, the loss is back-propagated and the optimizer takes one step.

    Returns:
        The sum of the per-batch loss values divided by ``len(loader)``.
    """
    model.train()
    batch_losses = []
    for batch_inputs, batch_labels in loader:
        batch_inputs = batch_inputs.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()
        predictions = model(batch_inputs)
        batch_loss = criterion(predictions, batch_labels)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(loader)

def validate(model, loader, criterion, device):
    """Evaluate ``model`` on ``loader`` with gradients disabled.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the summed per-batch loss divided
        by ``len(loader)``, and the percentage (0-100) of samples whose
        highest-scoring output index equals the label.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for batch_inputs, batch_labels in loader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_inputs)
            loss_sum += criterion(logits, batch_labels).item()

            # Index of the maximum along dim 1 is the predicted class.
            predicted = logits.max(dim=1).indices
            n_seen += batch_labels.size(0)
            n_correct += predicted.eq(batch_labels).sum().item()
    return loss_sum / len(loader), 100 * n_correct / n_seen

def run_training_sweep(config=None, data_dir="./data"):
    """Train and evaluate one sweep trial inside a W&B run.

    Hyperparameters (``batch_size``, ``architecture_option``, ``optimizer``,
    ``learning_rate``, ``epochs``) are read from ``wandb.config``. After each
    epoch the train loss, validation loss and validation accuracy are logged;
    whenever the accuracy exceeds the best value seen so far, the model's
    state dict is saved under ``models/`` and logged as a W&B model artifact.

    Args:
        config: passed through to ``wandb.init``.
        data_dir: dataset root handed to ``Cifar10DataManager``.
    """
    with wandb.init(config=config, entity=ENTITY, project=PROJECT_NAME):
        hparams = wandb.config
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        data_manager = Cifar10DataManager(data_dir=data_dir)
        train_loader, test_loader = data_manager.get_loaders(
            hparams.batch_size, hparams.architecture_option
        )
        net = build_model(hparams.architecture_option).to(device)

        loss_fn = nn.CrossEntropyLoss()
        # Anything other than "sgd" selects Adam.
        if hparams.optimizer == "sgd":
            optimizer = optim.SGD(net.parameters(), lr=hparams.learning_rate, momentum=0.9)
        else:
            optimizer = optim.Adam(net.parameters(), lr=hparams.learning_rate)

        top_acc = 0.0
        for epoch in range(hparams.epochs):
            train_loss = train_epoch(net, train_loader, loss_fn, optimizer, device)
            val_loss, val_acc = validate(net, test_loader, loss_fn, device)

            metrics = {
                "epoch": epoch,
                "train_loss": train_loss,
                "val_loss": val_loss,
                "val_acc": val_acc,
            }
            wandb.log(metrics)

            # Only checkpoint on a strict improvement.
            if not val_acc > top_acc:
                continue

            top_acc = val_acc
            os.makedirs("models", exist_ok=True)
            checkpoint_path = f"models/model_{wandb.run.id}.pth"
            torch.save(net.state_dict(), checkpoint_path)

            model_artifact = wandb.Artifact(f"model-{wandb.run.id}", type="model")
            model_artifact.add_file(checkpoint_path)
            wandb.log_artifact(model_artifact)

In [None]:
# ==========================================
# 4. Download Source Data (Artifact)
# ==========================================
print("Fetch Data Artifact (Pre-Sweep)...")
# The context manager finishes the run even when the artifact lookup or the
# download raises, so a failed fetch does not leave a run open.
with wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="training_prep") as run:
    artifact = run.use_artifact(f'{ENTITY}/{PROJECT_NAME}/cifar10_dataset:latest', type='dataset')
    artifact_dir = artifact.download(root="./data")
print(f"Data verified at {artifact_dir}")

In [None]:
# ==========================================
# 5. Execute Sweep
# ==========================================
# Bayesian search maximising the `val_acc` value logged by run_training_sweep.
# NOTE(review): the recorded output of an earlier sweep shows an Adam run with
# learning_rate ~0.089 stuck at ~10% accuracy; consider a narrower or
# log-scaled learning-rate range -- left unchanged here.
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_acc', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'min': 0.001, 'max': 0.1},
        'batch_size': {'values': [64, 128]},
        'optimizer': {'values': ['adam', 'sgd']},
        'architecture_option': {'values': ['standard', 'upsample', 'modified']},
        'epochs': {'value': 5}
    }
}

sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME, entity=ENTITY)
print(f"Sweep ID: {sweep_id}")

# Run Agent (count=5 -> at most 5 trials from this agent)
train_func = functools.partial(run_training_sweep, data_dir="./data")
wandb.agent(sweep_id, train_func, count=5, project=PROJECT_NAME, entity=ENTITY)

# Save Best Config
api = wandb.Api()
best_run = api.sweep(f"{ENTITY}/{PROJECT_NAME}/{sweep_id}").best_run()
if best_run is None:
    # Presumably happens when no run in the sweep reported the metric
    # (e.g. every trial crashed) -- nothing to save in that case.
    print("No best run found for this sweep; best config not saved.")
else:
    # The output directory is not created anywhere else in the notebook;
    # without this, open() raises FileNotFoundError on a fresh checkout.
    os.makedirs("artifacts", exist_ok=True)
    with open("artifacts/best_config.json", "w") as f:
        json.dump(best_run.config, f)
print("Sweep Complete.")