In [5]:
import os
import functools
import json
import wandb
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
from torch.utils.data import Subset, DataLoader

# ==========================================
# 1. Setup & Auth
# ==========================================
WANDB_API_KEY = "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"
PROJECT_NAME = "cifar10_mlops_project"
ENTITY = "esi-sba-dz"

wandb.login(key=WANDB_API_KEY)
print(f"Project: {PROJECT_NAME}, Entity: {ENTITY}")

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Project: cifar10_mlops_project, Entity: esi-sba-dz


In [6]:
# ==========================================
# 2. Data Manager (Artifact-Based)
# ==========================================
class Cifar10DataManager:
    def __init__(self, data_dir="./data"):
        self.data_dir = data_dir
        self.mean = (0.4914, 0.4822, 0.4465)
        self.std = (0.2023, 0.1994, 0.2010)

    def get_transforms(self, architecture_option='standard'):
        transform_list = [
            transforms.ToTensor(),
            transforms.Normalize(self.mean, self.std)
        ]
        train_transforms = [
            transforms.RandomHorizontalFlip(),
            transforms.RandomCrop(32, padding=4)
        ] + transform_list

        if architecture_option == 'upsample':
            transform_list.insert(0, transforms.Resize(224))
            train_transforms.insert(0, transforms.Resize(224))

        return transforms.Compose(train_transforms), transforms.Compose(transform_list)

    def get_loaders(self, batch_size, architecture_option='standard'):
        train_transform, test_transform = self.get_transforms(architecture_option)

        # KEY CHANGE: download=False
        # We expect data to be present via W&B Artifact download.
        # If this fails, it means we didn't get the artifact correctly.
        try:
            train_set = torchvision.datasets.CIFAR10(root=self.data_dir, train=True, download=False, transform=train_transform)
            test_set = torchvision.datasets.CIFAR10(root=self.data_dir, train=False, download=False, transform=test_transform)
        except RuntimeError:
            print("CRITICAL: Data not found locally. Ensure Artifact is downloaded first.")
            raise

        # Load indices
        indices_path = os.path.join(self.data_dir, "processed", "test_indices.npy")
        if not os.path.exists(indices_path):
             raise FileNotFoundError(f"Indices file missing: {indices_path}")

        test_indices = np.load(indices_path)
        real_test_set = Subset(test_set, test_indices)

        train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=2)
        test_loader = DataLoader(real_test_set, batch_size=batch_size, shuffle=False, num_workers=2)

        return train_loader, test_loader

In [7]:
# ==========================================
# 3. Model & Training Logic
# ==========================================
def build_model(architecture_option='standard', num_classes=10, pretrained=True):
    model = torchvision.models.resnet18(pretrained=pretrained)
    if architecture_option == 'modified':
        model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
        model.maxpool = nn.Identity()
    elif architecture_option == 'upsample':
        # Upsample happens in transforms; model remains standard
        pass
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    return model

def train_epoch(model, loader, criterion, optimizer, device):
    model.train()
    running_loss = 0.0
    for inputs, labels in loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(loader)

def validate(model, loader, criterion, device):
    model.eval()
    correct = 0
    total = 0
    running_loss = 0.0
    with torch.no_grad():
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            running_loss += criterion(outputs, labels).item()
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    return running_loss / len(loader), 100 * correct / total

def run_training_sweep(config=None, data_dir="./data"):
    with wandb.init(config=config, entity=ENTITY, project=PROJECT_NAME):
        cfg = wandb.config
        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        dm = Cifar10DataManager(data_dir=data_dir)
        train_loader, test_loader = dm.get_loaders(cfg.batch_size, cfg.architecture_option)
        model = build_model(cfg.architecture_option).to(device)

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.SGD(model.parameters(), lr=cfg.learning_rate, momentum=0.9) if cfg.optimizer == "sgd" else optim.Adam(model.parameters(), lr=cfg.learning_rate)

        best_acc = 0.0
        for epoch in range(cfg.epochs):
            train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
            val_loss, val_acc = validate(model, test_loader, criterion, device)

            wandb.log({"epoch": epoch, "train_loss": train_loss, "val_loss": val_loss, "val_acc": val_acc})

            if val_acc > best_acc:
                best_acc = val_acc
                os.makedirs("models", exist_ok=True)
                model_path = f"models/model_{wandb.run.id}.pth"
                torch.save(model.state_dict(), model_path)

                art = wandb.Artifact(f"model-{wandb.run.id}", type="model")
                art.add_file(model_path)
                wandb.log_artifact(art)

In [8]:
# ==========================================
# 4. Download Source Data (Artifact)
# ==========================================
print("Fetch Data Artifact (Pre-Sweep)...")
run = wandb.init(project=PROJECT_NAME, entity=ENTITY, job_type="training_prep")
artifact = run.use_artifact(f'{ENTITY}/{PROJECT_NAME}/cifar10_dataset:latest', type='dataset')
artifact_dir = artifact.download(root="./data")
run.finish()
print(f"Data verified at {artifact_dir}")

Fetch Data Artifact (Pre-Sweep)...


[34m[1mwandb[0m: Downloading large artifact 'cifar10_dataset:latest', 340.26MB. 11 files...
[34m[1mwandb[0m:   11 of 11 files downloaded.  
Done. 00:00:00.3 (1224.5MB/s)


Data verified at ./data


In [10]:
# ==========================================
# 5. Execute Sweep
# ==========================================
sweep_config = {
    'method': 'bayes',
    'metric': {'name': 'val_acc', 'goal': 'maximize'},
    'parameters': {
        'learning_rate': {'min': 0.001, 'max': 0.1},
        'batch_size': {'values': [64, 128]},
        'optimizer': {'values': ['adam', 'sgd']},
        'architecture_option': {'values': ['standard', 'upsample', 'modified']},
        'epochs': {'value': 8}
    }
}

sweep_id = wandb.sweep(sweep_config, project=PROJECT_NAME, entity=ENTITY)
print(f"Sweep ID: {sweep_id}")

# Run Agent
train_func = functools.partial(run_training_sweep, data_dir="./data")
wandb.agent(sweep_id, train_func, count=10, project=PROJECT_NAME, entity=ENTITY)

# Save Best Config
api = wandb.Api()
best_run = api.sweep(f"{ENTITY}/{PROJECT_NAME}/{sweep_id}").best_run()
with open("artifacts/best_config.json", "w") as f:
    json.dump(best_run.config, f)
print("Sweep Complete.")

Create sweep with ID: r5gkaaez
Sweep URL: https://wandb.ai/esi-sba-dz/cifar10_mlops_project/sweeps/r5gkaaez
Sweep ID: r5gkaaez


[34m[1mwandb[0m: Agent Starting Run: 6l1duy57 with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.020646677587162453
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▃▃▂▁▁▁▁
val_acc,▁▃▅▆▇▇▆█
val_loss,█▆▄▃▃▂▄▁

0,1
epoch,7.0
train_loss,0.19967
val_acc,91.75
val_loss,0.23932


[34m[1mwandb[0m: Agent Starting Run: w2tnoy1u with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.014538103787406702
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▄▃▂▂▁▁▁
val_acc,▁▅▆▇█▇██
val_loss,█▃▂▂▁▂▁▁

0,1
epoch,7.0
train_loss,0.11511
val_acc,92.4375
val_loss,0.24529


[34m[1mwandb[0m: Agent Starting Run: d41umkfb with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.055418672958100275
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▄▃▂▂▁▁▁
val_acc,▁▄▆▇▇▇▇█
val_loss,█▅▃▃▂▁▂▁

0,1
epoch,7.0
train_loss,0.28414
val_acc,88.025
val_loss,0.38532


[34m[1mwandb[0m: Agent Starting Run: mllhv4do with config:
[34m[1mwandb[0m: 	architecture_option: upsample
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.01616694161250879
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▂▂▂▁▁▁▁
val_acc,▁▁▆▅▄█▆▇
val_loss,█▃▁▁▁▁▁▁

0,1
epoch,7.0
train_loss,2.08614
val_acc,21.9
val_loss,2.10696


[34m[1mwandb[0m: Agent Starting Run: 1i4d6i2b with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.023408166833061735
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▄▃▂▂▁▁▁
val_acc,▁▇▇▇█▇██
val_loss,█▂▁▂▁▁▂▁

0,1
epoch,7.0
train_loss,0.12429
val_acc,92.025
val_loss,0.25726


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: tc1q1y2p with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.08683034695159735
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▅▄▄▃▂▂▁
val_acc,▁▃▃▄▆▆▇█
val_loss,█▆▆▆▃▃▂▁

0,1
epoch,7.0
train_loss,0.77394
val_acc,74.5125
val_loss,0.73974


[34m[1mwandb[0m: Agent Starting Run: yy009na9 with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.06266031208099175
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▅▄▄▃▂▂▁
val_acc,▁▃▄▅▆▇▆█
val_loss,█▆▅▄▃▂▃▁

0,1
epoch,7.0
train_loss,0.75372
val_acc,75.3875
val_loss,0.72999


[34m[1mwandb[0m: Agent Starting Run: 0p5v9tk2 with config:
[34m[1mwandb[0m: 	architecture_option: standard
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.09688513470180304
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▄▃▂▂▂▁▁
val_acc,▁▄▅▆▇▇██
val_loss,█▅▄▃▂▂▁▁

0,1
epoch,7.0
train_loss,1.21821
val_acc,58.6375
val_loss,1.15509


[34m[1mwandb[0m: Agent Starting Run: siblgfgz with config:
[34m[1mwandb[0m: 	architecture_option: modified
[34m[1mwandb[0m: 	batch_size: 128
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.004361266727821469
[34m[1mwandb[0m: 	optimizer: sgd
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▃▂▂▂▁▁▁
val_acc,▁▄▆▇▇███
val_loss,█▅▃▂▁▁▁▁

0,1
epoch,7.0
train_loss,0.0913
val_acc,93.3125
val_loss,0.2129


[34m[1mwandb[0m: Agent Starting Run: 957647u5 with config:
[34m[1mwandb[0m: 	architecture_option: standard
[34m[1mwandb[0m: 	batch_size: 64
[34m[1mwandb[0m: 	epochs: 8
[34m[1mwandb[0m: 	learning_rate: 0.09820915428575348
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: [wandb.login()] Loaded credentials for https://api.wandb.ai from /root/.netrc.


0,1
epoch,▁▂▃▄▅▆▇█
train_loss,█▅▄▃▂▂▁▁
val_acc,▁▂▃▄▆▆▇█
val_loss,█▇▅▆▃▃▂▁

0,1
epoch,7.0
train_loss,1.27957
val_acc,59.0
val_loss,1.15864


[34m[1mwandb[0m: Sorting runs by -summary_metrics.val_acc


Sweep Complete.
