In [1]:
!pip install timm torchvision
!pip install wandb
!pip install torch_optimizer

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch->timm)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch->timm)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch->timm)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch->tim

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
from torch_optimizer import Ranger
from torchvision import transforms
from torchvision.datasets import CIFAR100
from torch.utils.data import DataLoader, random_split
import timm
from timm.loss import LabelSmoothingCrossEntropy
from timm.models.layers import DropPath
import data_preprocessing
from wandb_logger import WandBLogger
from torch.cuda.amp import autocast, GradScaler
import os
import random
import numpy as np
from datetime import datetime
import editing
from editing import *



In [3]:
SEED = 42


def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode with
    auto-tuning (benchmark) disabled.

    Args:
        seed: integer seed shared by all generators.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Every library keeps its own global generator, so each one is seeded explicitly.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


seed_everything(SEED)

In [4]:
# ---------------------------------------------------------------------------
# All training settings live in this cell.
# ---------------------------------------------------------------------------

# General
BATCH_SIZE = 128
EPOCHS = 30
VAL_SPLIT = 0.1  # forwarded to CIFAR100Pipeline(val_split=...)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BACKBONE_FREEZING = True  # True: only the classification head is trainable

# Loss
SMOOTHING = 0.1  # label-smoothing factor

# Optimizer
OPTIMIZER_NAME = "ssgd"  # "ssgd" or "sparseadamw"
LR = 0.001
BETAS = (0.9, 0.999)
WEIGHT_DECAY = 0.05
WARMUP_EPOCHS = 5
NESTEROV = False
MOMENTUM = 0.9
EPSILON = 1e-8

# Scheduler
SCHEDULER_NAME = "CosineAnnealingWarmRestarts"
START_FACTOR = 1e-6 / 5e-5  # warm-up start factor, written as a ratio of two learning rates
T_MAX = EPOCHS - WARMUP_EPOCHS  # epochs left for cosine annealing after the warm-up

# Early stopping
PATIENCE = 6

# TaLoS
FINAL_SPARSITY = 0.6
NUM_BATCHES = 4
ROUNDS = 4
CALIBRATION_MODE = "least_sensitive"  # "least_sensitive" or "most_sensitive"
# "full": include all layers, "head": head only,
# "pfededit": custom top-k client selection
MODE = "full"
K = 3  # pfededit setting
FACTOR = 0.5  # probability of not doing pfededit

TALOS_CONFIG = dict(
    final_sparsity=FINAL_SPARSITY,
    num_batches=NUM_BATCHES,
    rounds=ROUNDS,
    calibration_mode=CALIBRATION_MODE,
    mode=MODE,
    k=K,
    factor=FACTOR,
)

In [5]:
!wandb login 89e5fee022a3a1cf86f958ee0b3dff6f2aa57aad

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
# Build the CIFAR-100 train/val/test splits (augmentation enabled) with the project pipeline.
pipeline = data_preprocessing.CIFAR100Pipeline(
    val_split=VAL_SPLIT,
    use_augment=True,
)
trainset, valset, testset = pipeline.run_pipeline()

# Only the training loader shuffles; the evaluation loaders use DataLoader's default order.
trainloader = DataLoader(dataset=trainset, batch_size=BATCH_SIZE, shuffle=True)
valloader, testloader = (
    DataLoader(dataset=split, batch_size=BATCH_SIZE) for split in (valset, testset)
)

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 169M/169M [00:17<00:00, 9.72MB/s]


In [7]:
# Create model
def create_dino_vit_s16_for_cifar100(freezing=BACKBONE_FREEZING):
    """Build the pretrained DINO ViT-S/16 from timm with a new 100-class linear head.

    Args:
        freezing: when truthy, ``requires_grad`` is disabled for every parameter
            except those of the new head; otherwise every parameter is trainable.

    Returns:
        The timm model whose ``head`` is ``nn.Linear(model.num_features, 100)``.
    """
    # The model is created with num_classes=0 and the CIFAR-100 head is attached by hand.
    model = timm.create_model("vit_small_patch16_224_dino", pretrained=True, num_classes=0)
    model.head = nn.Linear(model.num_features, 100)

    # First apply the backbone policy to everything, then force the head to be trainable.
    backbone_trainable = not freezing
    for param in model.parameters():
        param.requires_grad = backbone_trainable
    for param in model.head.parameters():
        param.requires_grad = True

    return model


model = create_dino_vit_s16_for_cifar100(freezing=BACKBONE_FREEZING).to(DEVICE)


  model = create_fn(
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [8]:
def build_pruner():
    """Create a ``TaLoSPruner`` for the notebook-level ``model`` from the TaLoS settings.

    Returns:
        The newly constructed pruner (masks are not calibrated yet).
    """
    pruner_kwargs = {
        "model": model,
        "final_sparsity": FINAL_SPARSITY,
        "device": DEVICE,
        "num_batches": NUM_BATCHES,
        "rounds": ROUNDS,
        "mode": MODE,
    }
    return TaLoSPruner(**pruner_kwargs)


pruner = build_pruner()
# Calibrate the masks on batches from the training loader with the configured strategy.
pruner.calibrate_masks(trainloader, strategy=CALIBRATION_MODE)

üü¢ Pruning will be applied to the entire model.
üîé Starting multi-round calibration for mode 'full'.
üåÄ Calibration Round 1/4
üìù Calculating Fisher Information on 4 batches...


  with autocast():


‚úÖ Fisher Information Computation Completed.
üåÄ Calibration Round 2/4
üìù Calculating Fisher Information on 4 batches...
‚úÖ Fisher Information Computation Completed.
üåÄ Calibration Round 3/4
üìù Calculating Fisher Information on 4 batches...
‚úÖ Fisher Information Computation Completed.
üåÄ Calibration Round 4/4
üìù Calculating Fisher Information on 4 batches...
‚úÖ Fisher Information Computation Completed.
‚úÖ Mask Calibration Completed!


In [9]:
def build_optimizer(optimizer_name, model, masks=None):
    """Create the sparse optimizer selected by ``optimizer_name``.

    Args:
        optimizer_name: case-insensitive name, ``"ssgd"`` or ``"sparseadamw"``.
        model: module whose ``parameters()`` are handed to the optimizer.
        masks: masks forwarded to the optimizer. When ``None`` (the default) the
            notebook-level ``pruner.masks`` is used, which keeps existing
            ``build_optimizer(name, model)`` calls working unchanged.

    Returns:
        ``(config, optimizer)`` - the hyper-parameter dict (for logging) and the
        constructed optimizer.

    Raises:
        ValueError: if ``optimizer_name`` is not one of the supported names.
    """
    key = optimizer_name.lower()
    if key == "ssgd":
        optimizer_cls = editing.SparseSGDM
        config = {
            "lr": LR,
            "momentum": MOMENTUM,
            "weight_decay": WEIGHT_DECAY,
        }
    elif key == "sparseadamw":
        optimizer_cls = editing.SparseAdamW
        config = {
            "lr": LR,
            "betas": BETAS,
            "weight_decay": WEIGHT_DECAY,
            "eps": EPSILON,
        }
    else:
        raise ValueError(f"Unsupported optimizer: {optimizer_name}")

    # Honour the caller-supplied masks; previously this argument was silently ignored
    # and the global pruner was always used.
    if masks is None:
        masks = pruner.masks

    # The logged config and the constructor arguments come from the same dict,
    # so they cannot drift apart.
    optimizer = optimizer_cls(model.parameters(), masks=masks, **config)

    print(f"Optimizer '{optimizer_name}' initialized successfully.")
    return config, optimizer

def build_scheduler(scheduler_name, optimizer):
    """Create the learning-rate scheduler selected by ``scheduler_name``.

    Supported (case-insensitive) names: ``"ranger + onecyclelr"``,
    ``"cosineannealinglr + warmup"`` and ``"cosineannealingwarmrestarts"``.

    Args:
        scheduler_name: which schedule to build.
        optimizer: optimizer the scheduler is attached to.

    Returns:
        ``(config, scheduler)`` - the settings dict (for logging) and the scheduler.

    Raises:
        ValueError: if ``scheduler_name`` is not one of the supported names.
    """
    key = scheduler_name.lower()
    lr_sched = optim.lr_scheduler

    if key == "ranger + onecyclelr":
        config = dict(
            max_lr=LR,
            epochs=EPOCHS,
            steps_per_epoch=len(trainloader),
            pct_start=0.3,
            anneal_strategy="cos",
            div_factor=25.0,
            final_div_factor=1e4,
        )
        scheduler = OneCycleLR(optimizer, **config)
    elif key == "cosineannealinglr + warmup":
        config = dict(
            warmup_epochs=WARMUP_EPOCHS,
            start_factor=START_FACTOR,
            t_max=T_MAX,
            milestones=[WARMUP_EPOCHS],
        )
        # Linear warm-up for WARMUP_EPOCHS steps, then cosine annealing.
        warmup = lr_sched.LinearLR(
            optimizer, start_factor=START_FACTOR, total_iters=WARMUP_EPOCHS
        )
        cosine = lr_sched.CosineAnnealingLR(optimizer, T_max=T_MAX)
        scheduler = lr_sched.SequentialLR(
            optimizer,
            schedulers=[warmup, cosine],
            milestones=[WARMUP_EPOCHS],
        )
    elif key == "cosineannealingwarmrestarts":
        config = dict(T_0=10, T_mult=2)
        scheduler = lr_sched.CosineAnnealingWarmRestarts(optimizer, **config)
    else:
        raise ValueError(f"Unsupported scheduler: {scheduler_name}")

    print(f"Scheduler '{scheduler_name}' initialized successfully.")
    return config, scheduler



In [10]:
# The optimizer is built first because the scheduler is attached to it.
OPTIMIZER_CONFIG, optimizer = build_optimizer(
    optimizer_name=OPTIMIZER_NAME,
    model=model,
)
SCHEDULER_CONFIG, scheduler = build_scheduler(
    scheduler_name=SCHEDULER_NAME,
    optimizer=optimizer,
)

üîç Mapping parameters to their masks...
‚úÖ Mapped 150 parameters to masks.
Optimizer 'ssgd' initialized successfully.
Scheduler 'CosineAnnealingWarmRestarts' initialized successfully.


In [11]:
# Sanity check: which device the model lives on and how many parameters will be trained.
print(next(model.parameters()).device)
trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"Trainable params: {trainable:,} / {total:,}")

# Keep cuDNN auto-tuning off. seed_everything() disables it for reproducibility;
# this cell used to switch it back on, which lets cuDNN pick algorithms by timing
# and works against the deterministic setup.
torch.backends.cudnn.benchmark = False

cuda:0
Trainable params: 38,500 / 21,704,164


In [12]:
# Timestamped run name so repeated runs stay distinguishable.
run_timestamp = datetime.now().strftime('%Y%m%d-%H%M%S')
run_name = f"CENTRALIZED_TALOS_{run_timestamp}"

# Everything needed to reproduce the run is recorded in the run config.
run_config = {
    # Training setup
    "epochs": EPOCHS,
    "batch_size": BATCH_SIZE,
    "device": DEVICE,
    "backbone_freezing": BACKBONE_FREEZING,
    # Loss
    "label_smoothing": SMOOTHING,
    # Optimizer
    "optimizer": OPTIMIZER_NAME,
    "optimizer_config": OPTIMIZER_CONFIG,
    # Scheduler
    "scheduler": SCHEDULER_NAME,
    "scheduler_config": SCHEDULER_CONFIG,
    # TaLoS
    "talos_config": TALOS_CONFIG,
    # Early stopping
    "patience": PATIENCE,
}

logger = WandBLogger(
    project_name="federated-learning-project",
    run_name=run_name,
    config=run_config,
)


[34m[1mwandb[0m: Currently logged in as: [33ms348517giuseppe[0m ([33ms348517giuseppe-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [None]:
# Loss with label smoothing
criterion = LabelSmoothingCrossEntropy(smoothing=SMOOTHING)

# Mixed precision is used on CUDA only. torch.amp replaces the deprecated
# torch.cuda.amp.autocast / torch.cuda.amp.GradScaler entry points.
use_amp = DEVICE.type == "cuda"
scaler = torch.amp.GradScaler(DEVICE.type, enabled=use_amp)


def _snapshot_state(module):
    """Return a detached copy of ``module.state_dict()``.

    ``state_dict()`` returns references to the live tensors, so keeping it as-is
    would make the "best" checkpoint follow every later optimizer step.
    """
    return {name: tensor.detach().clone() for name, tensor in module.state_dict().items()}


best_val_acc = 0.0
epochs_no_improve = 0
# Defined up front so the final load_state_dict works even if validation
# accuracy never improves on the initial value.
best_model_state = _snapshot_state(model)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    correct, total, train_loss = 0, 0, 0.0

    for x, y in trainloader:
        x, y = x.to(DEVICE), y.to(DEVICE)
        optimizer.zero_grad()

        with torch.amp.autocast(device_type=DEVICE.type, enabled=use_amp):
            outputs = model(x)
            loss = criterion(outputs, y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)  # the sparse optimizer applies the masks
        scaler.update()

        # Weight the batch loss by the batch size so the division below yields a
        # per-sample average even when the last batch is smaller.
        train_loss += loss.item() * y.size(0)
        _, pred = torch.max(outputs, 1)
        correct += (pred == y).sum().item()
        total += y.size(0)

    # Read the learning rate that was in effect during this epoch *before* stepping;
    # after scheduler.step() get_last_lr() already reports the next epoch's value.
    epoch_lr = scheduler.get_last_lr()[0]
    scheduler.step()  # the schedule advances once per epoch

    train_acc = correct / total
    train_loss /= total

    logger.log_metrics({
        "train_loss": train_loss,
        "train_acc": train_acc,
        "learning_rate": epoch_lr
    }, step=epoch)

    # Validation
    model.eval()
    correct, total, val_loss = 0, 0, 0.0
    with torch.no_grad():
        for x, y in valloader:
            x, y = x.to(DEVICE), y.to(DEVICE)
            outputs = model(x)
            loss = criterion(outputs, y)

            val_loss += loss.item() * y.size(0)
            _, pred = torch.max(outputs, 1)
            correct += (pred == y).sum().item()
            total += y.size(0)

    val_acc = correct / total
    val_loss /= total
    logger.log_metrics({
        "val_loss": val_loss,
        "val_acc": val_acc
    }, step=epoch)

    print(f"Epoch {epoch+1:02d}/{EPOCHS} — Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")

    # Early stopping logic
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        epochs_no_improve = 0
        best_model_state = _snapshot_state(model)  # real copy of the best weights
        logger.log_model(model, path="best_model.pth")
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= PATIENCE:
            print(f"Early stopping triggered at epoch {epoch+1}")
            break

# Restore the weights of the best validation epoch before testing.
model.load_state_dict(best_model_state)

  scaler = GradScaler()
  with autocast():


Epoch 01/30 ‚Äî Train Acc: 0.1161 | Val Acc: 0.2398
Epoch 02/30 ‚Äî Train Acc: 0.3256 | Val Acc: 0.3840
Epoch 03/30 ‚Äî Train Acc: 0.4307 | Val Acc: 0.4502
Epoch 04/30 ‚Äî Train Acc: 0.4864 | Val Acc: 0.4890
Epoch 05/30 ‚Äî Train Acc: 0.5206 | Val Acc: 0.5090
Epoch 06/30 ‚Äî Train Acc: 0.5418 | Val Acc: 0.5232
Epoch 07/30 ‚Äî Train Acc: 0.5542 | Val Acc: 0.5344
Epoch 08/30 ‚Äî Train Acc: 0.5627 | Val Acc: 0.5388
Epoch 09/30 ‚Äî Train Acc: 0.5676 | Val Acc: 0.5408
Epoch 10/30 ‚Äî Train Acc: 0.5691 | Val Acc: 0.5414
Epoch 11/30 ‚Äî Train Acc: 0.5711 | Val Acc: 0.5586
Epoch 12/30 ‚Äî Train Acc: 0.5859 | Val Acc: 0.5730
Epoch 13/30 ‚Äî Train Acc: 0.5967 | Val Acc: 0.5782
Epoch 14/30 ‚Äî Train Acc: 0.6051 | Val Acc: 0.5832
Epoch 15/30 ‚Äî Train Acc: 0.6104 | Val Acc: 0.5902
Epoch 16/30 ‚Äî Train Acc: 0.6140 | Val Acc: 0.5940
Epoch 17/30 ‚Äî Train Acc: 0.6175 | Val Acc: 0.5948
Epoch 18/30 ‚Äî Train Acc: 0.6209 | Val Acc: 0.5974
Epoch 19/30 ‚Äî Train Acc: 0.6229 | Val Acc: 0.5986
Epoch 20/30 

In [None]:
# Final evaluation on the test loader.
model.eval()
test_loss, correct, total = 0.0, 0, 0
with torch.no_grad():
    for x, y in testloader:
        x = x.to(DEVICE)
        y = y.to(DEVICE)
        outputs = model(x)
        loss = criterion(outputs, y)
        batch_size = y.size(0)

        # Weight by batch size so the division below gives a per-sample average
        # (assumes the criterion returns a batch mean - TODO confirm).
        test_loss += loss.item() * batch_size
        pred = outputs.argmax(dim=1)
        correct += (pred == y).sum().item()
        total += batch_size

test_acc = correct / total
test_loss /= total

logger.log_metrics({"test_loss": test_loss, "test_acc": test_acc})
logger.finish()

print(f"\n Final Test Accuracy: {test_acc:.4f} | Test Loss: {test_loss:.4f}")
