In [1]:
!pip install timm torchvision
!pip install wandb

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->tim

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader, random_split
import timm
from timm.loss import LabelSmoothingCrossEntropy
from timm.models.layers import DropPath
import data_preprocessing
from wandb_logger import WandBLogger
from torch.cuda.amp import autocast, GradScaler
import torch.nn as nn
import torch.optim as optim
import os
import random
import numpy as np
from datetime import datetime



In [3]:
SEED = 42

In [4]:
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(SEED)

In [5]:
def build_optimizer(optimizer_name, model):
    if optimizer_name.lower() == "adamw":
        config = {
            "learning_rate": LR,
            "betas": BETAS,
            "weight_decay": WEIGHT_DECAY
        }
        optimizer = optim.AdamW(
            model.parameters(),
            lr=LR,
            betas=BETAS,
            weight_decay=WEIGHT_DECAY
        )
    elif optimizer_name.lower() == "sgd":
        config = {
            "learning_rate": LR,
            "momentum": MOMENTUM,
            "weight_decay": WEIGHT_DECAY,
            "nesterov": False
        }
        optimizer = optim.SGD(
            model.parameters(),
            lr=LR,
            momentum=MOMENTUM,
            weight_decay=WEIGHT_DECAY,
            nesterov=False
        )
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    print(f"Optimizer '{optimizer_name}' initialized successfully.")
    return config, optimizer

def build_scheduler(scheduler_name, optimizer):
    if scheduler_name.lower() == "cosineannealinglr + warmup":
        config = {
            "warmup_epochs": WARMUP_EPOCHS,
            "start_factor": START_FACTOR,
            "t_max": T_MAX,
            "milestones": [WARMUP_EPOCHS]
        }
        scheduler = optim.lr_scheduler.SequentialLR(
            optimizer,
            schedulers=[
                optim.lr_scheduler.LinearLR(optimizer, start_factor=START_FACTOR, total_iters=WARMUP_EPOCHS),
                optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=T_MAX)
            ],
            milestones=[WARMUP_EPOCHS]
        )
    else:
        raise ValueError(f"Unsupported scheduler: {scheduler_name}")

    print(f"Scheduler '{scheduler_name}' initialized successfully.")
    return config, scheduler

In [6]:
!wandb login f8ad3703c9023ee3f86c7242a87d9280b6c031fb

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# THE WHOLE TRAINING SETTINGS ARE HERE!

# GENERAL
BATCH_SIZE = 256
EPOCHS = 30
VAL_SPLIT = 0.1
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BACKBONE_FREEZING = True

# LOSS
SMOOTHING=0.1
# OPTIMIZER
OPTIMIZER_NAME = "SGD"
LR=0.1
BETAS=(0.9, 0.999)
WEIGHT_DECAY=5e-4
WARMUP_EPOCHS=5
NESTEROV=False
MOMENTUM=0.9
# SCHEDULER
SCHEDULER_NAME = "CosineAnnealingLr + Warmup"
START_FACTOR=1e-6 / 5e-5
T_MAX=EPOCHS - WARMUP_EPOCHS
# Early Stopping
PATIENCE = 6

In [8]:
pipeline = data_preprocessing.CIFAR100Pipeline(val_split=VAL_SPLIT, use_augment=True)
trainset, valset, testset = pipeline.run_pipeline()
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, shuffle=True)
valloader = DataLoader(valset, batch_size=BATCH_SIZE)
testloader = DataLoader(testset, batch_size=BATCH_SIZE)

100%|██████████| 169M/169M [00:03<00:00, 43.3MB/s]


In [9]:
def create_dino_vit_s16_for_cifar100(freezing=False):
    model = timm.create_model("vit_small_patch16_224_dino", pretrained=True, num_classes=0, drop_path_rate=0.1)

    # Replace the head with CIFAR-100 classification head
    model.head = nn.Linear(model.num_features, 100)

    if freezing:
      # Freeze all parameters except head
      for param in model.parameters():
          param.requires_grad = False

      # Unfreeze only the head
      for param in model.head.parameters():
          param.requires_grad = True

    return model

model = create_dino_vit_s16_for_cifar100(BACKBONE_FREEZING).to(DEVICE)

OPTIMIZER_CONFIG, optimizer = build_optimizer(OPTIMIZER_NAME, model)
SCHEDULER_CONFIG, scheduler = build_scheduler(SCHEDULER_NAME, optimizer)

  model = create_fn(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

Optimizer 'SGD' initialized successfully.
Scheduler 'CosineAnnealingLr + Warmup' initialized successfully.


In [10]:
print(next(model.parameters()).device)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,}")

torch.backends.cudnn.benchmark = True

cuda:0
Trainable params: 38,500 / 21,704,164


In [11]:
run_name = f"CENTRALIZED_{datetime.now().strftime('%Y%m%d-%H%M%S')}"

logger = WandBLogger(
    project_name="federated-learning-project",
    run_name=run_name,
    config={
        # Training Setup
        "epochs": EPOCHS,
        "batch_size": BATCH_SIZE,
        "device": DEVICE,
        "backbone_freezing": BACKBONE_FREEZING,

        # Loss Configuration
        "label_smoothing": SMOOTHING,

        # Optimizer Configuration
        "optimizer": OPTIMIZER_NAME,
        "optimizer_config": OPTIMIZER_CONFIG,

        # Scheduler Configuration
        "scheduler": SCHEDULER_NAME,
        "scheduler_config": SCHEDULER_CONFIG,

        # Early Stopping
        "patience": PATIENCE
    }
)


[34m[1mwandb[0m: Currently logged in as: [33ms339170[0m ([33mpolito-fl[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [12]:
# Loss with label smoothing
criterion = LabelSmoothingCrossEntropy(smoothing=SMOOTHING)

scaler = GradScaler()

best_val_acc = 0.0
epochs_no_improve = 0

# Training loop
for epoch in range(EPOCHS):
    model.train()
    correct, total, train_loss = 0, 0, 0.0

    for x, y in trainloader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()

        with autocast():
            outputs = model(x)
            loss = criterion(outputs, y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loss += loss.item() * y.size(0)
        _, pred = torch.max(outputs, 1)
        correct += (pred == y).sum().item()
        total += y.size(0)

    scheduler.step()

    train_acc = correct / total
    train_loss /= total

    logger.log_metrics({
      "train_loss": train_loss,
      "train_acc": train_acc,
      "learning_rate": scheduler.get_last_lr()[0]
    }, step=epoch)


    # Validation
    model.eval()
    correct, total, val_loss = 0, 0, 0.0
    with torch.no_grad():
        for x, y in valloader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            outputs = model(x)
            loss = criterion(outputs, y)

            val_loss += loss.item() * y.size(0)
            _, pred = torch.max(outputs, 1)
            correct += (pred == y).sum().item()
            total += y.size(0)

    val_acc = correct / total
    val_loss /= total
    logger.log_metrics({
      "val_loss": val_loss,
      "val_acc": val_acc
    }, step=epoch)

    print(f"Epoch {epoch+1:02d}/{EPOCHS} — Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Early stopping logic
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_no_improve = 0
        best_model_state = model.state_dict()  # save best model
        logger.log_model(model, path="best_model.pth")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

model.load_state_dict(best_model_state)

  scaler = GradScaler()
  with autocast():


Epoch 01/30 — Train Acc: 0.4062 | Val Acc: 0.6162
Epoch 02/30 — Train Acc: 0.5638 | Val Acc: 0.6164
Epoch 03/30 — Train Acc: 0.5245 | Val Acc: 0.5634
Epoch 04/30 — Train Acc: 0.5150 | Val Acc: 0.5738




Epoch 05/30 — Train Acc: 0.5189 | Val Acc: 0.5696
Epoch 06/30 — Train Acc: 0.5214 | Val Acc: 0.5758
Epoch 07/30 — Train Acc: 0.5478 | Val Acc: 0.5598
Epoch 08/30 — Train Acc: 0.5535 | Val Acc: 0.6076
Early stopping triggered at epoch 8


<All keys matched successfully>

In [13]:
#  test
model.eval()
correct, total, test_loss = 0, 0, 0.0
with torch.no_grad():
    for x, y in testloader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        outputs = model(x)
        loss = criterion(outputs, y)

        test_loss += loss.item() * y.size(0)
        _, pred = torch.max(outputs, 1)
        correct += (pred == y).sum().item()
        total += y.size(0)

test_acc = correct / total
test_loss /= total

logger.log_metrics({
    "test_loss": test_loss,
    "test_acc": test_acc
})

logger.finish()

print(f"\n Final Test Accuracy: {test_acc:.4f} | Test Loss: {test_loss:.4f}")


0,1
learning_rate,▁▃▅▆████
test_acc,▁
test_loss,▁
train_acc,▁█▆▆▆▆▇█
train_loss,▁▁▃▄▆██▇
val_acc,██▁▃▂▃▁▇
val_loss,▁▁▃▄▆██▇

0,1
learning_rate,0.09649
test_acc,0.5979
test_loss,11.97108
train_acc,0.55349
train_loss,12.65455
val_acc,0.6076
val_loss,11.87792



 Final Test Accuracy: 0.5979 | Test Loss: 11.9711
