In [1]:
# Check directory
import os
print("Current directory:", os.getcwd())
print("Files:", os.listdir())


Current directory: /Users/berkcalisir/fed_talos_project/experiment_notebooks
Files: ['federated_baseline.ipynb', 'centralized_baseline.ipynb', 'data', 'model_editing.ipynb', 'wandb']


In [2]:
import sys
from pathlib import Path

# Go one level up to project root
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [11]:
import torch
import torch.nn.functional as F
import wandb
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.utils
from models.dino_vits16 import DINO_ViT
from project_utils.data_split import load_cifar100
from torch.utils.data import DataLoader

# Setup wandb & hyperparameters

In [4]:
import project_utils.wandb_logger
print("Imported from:", project_utils.wandb_logger.__file__)


Imported from: /Users/berkcalisir/fed_talos_project/project_utils/wandb_logger.py


In [5]:
from project_utils.wandb_logger import load_config, init_wandb

raw_config = load_config("../config.yaml")
config = init_wandb(raw_config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahmetberk2000[0m ([33mahmetberk2000-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Core training loop

In [6]:
def train_one_epoch(model, loader, optimizer, device):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = F.cross_entropy(outputs, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x.size(0)
        _, predicted = outputs.max(1)
        total += y.size(0)
        correct += predicted.eq(y).sum().item()

    return total_loss / total, correct / total


# Evaluation

In [7]:
def evaluate(model, loader, device):
    model.eval()
    loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            loss += F.cross_entropy(outputs, y).item() * x.size(0)
            _, predicted = outputs.max(1)
            total += y.size(0)
            correct += predicted.eq(y).sum().item()

    return loss / total, correct / total


# Main Script

In [None]:
import torch
from models.dino_vits16 import DINO_ViT
from project_utils.data_split import load_cifar100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple MPS backend (GPU)")
else:
    device = torch.device("cpu")
    print("MPS not available. Falling back to CPU.")



# Load data
train_set, val_set, test_set = load_cifar100()
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=config.batch_size)
test_loader = DataLoader(test_set, batch_size=config.batch_size)

# Load model
model = DINO_ViT(num_classes=100).to(device)

# Optimizer & Scheduler
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=config.lr,
    momentum=config.momentum,
    weight_decay=config.weight_decay
)
scheduler = CosineAnnealingLR(optimizer, T_max=config.epochs)

best_val_acc = 0.0

# Check if backbone is frozen
frozen = all(not param.requires_grad for param in model.backbone.parameters())
print("Backbone frozen?" , frozen)


Using device: cpu
Using Apple MPS backend (GPU)


Using cache found in /Users/berkcalisir/.cache/torch/hub/facebookresearch_dino_main


In [15]:
# Training loop
for epoch in range(config.epochs):
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, device)

    wandb.log({
        "epoch": epoch,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "lr": scheduler.get_last_lr()[0]
    })
    
    print(
        f"[Epoch {epoch + 1}/{config.epochs}] "
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | "
        f"LR: {scheduler.get_last_lr()[0]:.6f}"
    )

    # save best model checkpoint
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Best model updated at epoch {epoch + 1} with Val Acc = {val_acc:.4f}")
    # periodic checkpoint
    if (epoch + 1) % 5 == 0:
        print(f"Checkpoint saved at epoch {epoch + 1}")
    
    scheduler.step()

    
# Final test evaluation
test_loss, test_acc = evaluate(model, test_loader, device)
wandb.log({"test_loss": test_loss, "test_acc": test_acc})
wandb.finish()

print(f"\n TRAINING COMPLETED | Final Test Accuracy: {test_acc:.4f}")

wandb-core(56936) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
wandb-core(56942) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


RuntimeError: MPS backend out of memory (MPS allocated: 9.03 GB, other allocations: 40.83 MB, max allowed: 9.07 GB). Tried to allocate 2.25 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).