In [1]:
# Check directory
import os
print("Current directory:", os.getcwd())
print("Files:", os.listdir())


Current directory: /Users/berkcalisir/fed_talos_project/experiment_notebooks
Files: ['federated_baseline.ipynb', 'centralized_baseline.ipynb', 'data', 'model_editing.ipynb', 'wandb']


In [2]:
import sys
from pathlib import Path

# Go one level up to project root
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

In [3]:
import torch
import torch.nn.functional as F
import wandb
from torch.optim.lr_scheduler import CosineAnnealingLR
import torch.utils
from models.dino_vits16 import DINO_ViT
from project_utils.data_split import load_cifar100
from torch.utils.data import DataLoader

# Setup wandb & hyperparameters

In [4]:
import project_utils.wandb_logger
print("Imported from:", project_utils.wandb_logger.__file__)


Imported from: /Users/berkcalisir/fed_talos_project/project_utils/wandb_logger.py


In [5]:
from project_utils.wandb_logger import load_config, init_wandb

raw_config = load_config("../config.yaml")
config = init_wandb(raw_config)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mahmetberk2000[0m ([33mahmetberk2000-politecnico-di-torino[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Core training loop

In [6]:
def train_one_epoch(model, loader, optimizer, device):
    model.train()
    total_loss, correct, total = 0, 0, 0

    for x, y in loader:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = F.cross_entropy(outputs, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item() * x.size(0)
        _, predicted = outputs.max(1)
        total += y.size(0)
        correct += predicted.eq(y).sum().item()

    return total_loss / total, correct / total


# Evaluation

In [7]:
def evaluate(model, loader, device):
    model.eval()
    loss, correct, total = 0, 0, 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            loss += F.cross_entropy(outputs, y).item() * x.size(0)
            _, predicted = outputs.max(1)
            total += y.size(0)
            correct += predicted.eq(y).sum().item()

    return loss / total, correct / total


# Main Script

In [8]:
import torch
from models.dino_vits16 import DINO_ViT
from project_utils.data_split import load_cifar100

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if device.type == "cuda":
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using Apple MPS backend (GPU)")
else:
    device = torch.device("cpu")
    print("MPS not available. Falling back to CPU.")



# Load data
train_set, val_set, test_set = load_cifar100()
train_loader = DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=config.batch_size)
test_loader = DataLoader(test_set, batch_size=config.batch_size)

# Load model
model = DINO_ViT(num_classes=100, frozen_backbone=True).to(device)

# Optimizer & Scheduler
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=config.lr,
    momentum=config.momentum,
    weight_decay=config.weight_decay
)
scheduler = CosineAnnealingLR(optimizer, T_max=config.epochs)

best_val_acc = 0.0

# Check if backbone is frozen
frozen = all(not param.requires_grad for param in model.backbone.parameters())
print("Backbone frozen?" , frozen)


Using device: cpu
Using Apple MPS backend (GPU)


100%|██████████| 169M/169M [04:51<00:00, 580kB/s]  
Using cache found in /Users/berkcalisir/.cache/torch/hub/facebookresearch_dino_main


Backbone frozen? True


In [9]:
import time

In [10]:
# Training loop
epoch_times = []
for epoch in range(config.epochs):
    start_time = time.time()
    
    train_loss, train_acc = train_one_epoch(model, train_loader, optimizer, device)
    val_loss, val_acc = evaluate(model, val_loader, device)

    epoch_time = time.time() - start_time
    epoch_times.append(epoch_time)
    avg_epoch_time = sum(epoch_times) / len(epoch_times)
    eta = avg_epoch_time * (config.epochs - epoch - 1)

    wandb.log({
        "epoch": epoch,
        "train_loss": train_loss,
        "train_acc": train_acc,
        "val_loss": val_loss,
        "val_acc": val_acc,
        "lr": scheduler.get_last_lr()[0],
        "epoch_time_sec": epoch_time,
        "eta_min": eta / 60
    })
    
    print(
        f"[Epoch {epoch + 1}/{config.epochs}] "
        f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | "
        f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f} | "
        f"LR: {scheduler.get_last_lr()[0]:.6f}"
        f" | Time: {epoch_time:.2f}s | ETA: {eta / 60:.2f} min"
    )

    # save best model checkpoint
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_model.pth")
        print(f"Best model updated at epoch {epoch + 1} with Val Acc = {val_acc:.4f}")
    # periodic checkpoint
    if (epoch + 1) % 5 == 0:
        print(f"Checkpoint saved at epoch {epoch + 1}")
    
    scheduler.step()

    
# Final test evaluation
test_loss, test_acc = evaluate(model, test_loader, device)
wandb.log({"test_loss": test_loss, "test_acc": test_acc})
wandb.finish()

print(f"\n TRAINING COMPLETED | Final Test Accuracy: {test_acc:.4f}")

[Epoch 1/50] Train Loss: 9.7967 | Train Acc: 0.5479 | Val Loss: 7.5045 | Val Acc: 0.6586 | LR: 0.010000 | Time: 99.65s | ETA: 81.38 min
Best model updated at epoch 1 with Val Acc = 0.6586
[Epoch 2/50] Train Loss: 9.6907 | Train Acc: 0.6291 | Val Loss: 8.5594 | Val Acc: 0.6606 | LR: 0.009990 | Time: 114.52s | ETA: 85.67 min
Best model updated at epoch 2 with Val Acc = 0.6606
[Epoch 3/50] Train Loss: 9.6898 | Train Acc: 0.6487 | Val Loss: 8.7978 | Val Acc: 0.6770 | LR: 0.009961 | Time: 234.46s | ETA: 117.14 min
Best model updated at epoch 3 with Val Acc = 0.6770
[Epoch 4/50] Train Loss: 9.5481 | Train Acc: 0.6583 | Val Loss: 8.2030 | Val Acc: 0.6874 | LR: 0.009911 | Time: 258.98s | ETA: 135.63 min
Best model updated at epoch 4 with Val Acc = 0.6874
[Epoch 5/50] Train Loss: 9.4870 | Train Acc: 0.6680 | Val Loss: 8.8004 | Val Acc: 0.6792 | LR: 0.009843 | Time: 195.53s | ETA: 135.47 min
Checkpoint saved at epoch 5
[Epoch 6/50] Train Loss: 9.2710 | Train Acc: 0.6710 | Val Loss: 8.4579 | Val 

KeyboardInterrupt: 