In [None]:
# Cell 1 – Environment Setup & Dependencies
import os, sys, subprocess
from pathlib import Path

print("📦 [DEBUG] Avvio configurazione ambiente...")

# --- Colab detection ---------------------------------------------------------#
IN_COLAB = Path("/content").exists()
if IN_COLAB:
    print("📍 [DEBUG] Ambiente Google Colab rilevato.")
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
else:
    print("💻 [DEBUG] Ambiente locale rilevato (VSCode o simile).")

# --- Project root ------------------------------------------------------------#
ENV_PATHS = {
    "colab": "/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project",
    "local": "/Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project",
}
PROJECT_ROOT = Path(ENV_PATHS["colab" if IN_COLAB else "local"]).resolve()
sys.path.append(str(PROJECT_ROOT / "src"))
print(f"📁 [DEBUG] PROJECT_ROOT → {PROJECT_ROOT}")

!pip install --quiet torch torchvision webdataset tqdm pillow

# --- Dependencies (installa solo se mancano) ---------------------------------#
def _pip_install(pkgs):
    import importlib.util, sys, subprocess
    missing = [p for p in pkgs if importlib.util.find_spec(p) is None]
    if missing:
        print(f"🔧 [DEBUG] Installazione pacchetti mancanti: {missing}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *missing])
    else:
        print("✅ [DEBUG] Tutti i pacchetti richiesti sono già installati.")

# Install whichever of the required packages is not available yet.
_pip_install([
    "torch",
    "torchvision",
    "webdataset",
    "tqdm",
    "pillow",
    "pyyaml",
    "joblib",
])

# Dataset location helper (the original note says it is used in Cell 2).
# NOTE(review): the name suggests a tarball, but the path has no archive
# suffix and ends at data/processed — confirm what Cell 2 expects.
DATA_TARBALL = PROJECT_ROOT.joinpath("data", "processed")


📦 [DEBUG] Avvio configurazione ambiente...
💻 [DEBUG] Ambiente locale rilevato (VSCode o simile).
📁 [DEBUG] PROJECT_ROOT → /Users/mimmo/Desktop/mimmo/MLA/project_FP03/wsi-ssrl-rcc_project
🔧 [DEBUG] Installazione pacchetti mancanti: ['pillow', 'pyyaml']


In [8]:
# Cell 3 – Dynamic import of utils.training_utils
import sys
import importlib.util
from pathlib import Path

# 1) locate & load the module file
utils_path = PROJECT_ROOT / "src" / "utils" / "training_utils.py"
if not utils_path.is_file():
    # Fail early with an actionable message instead of the bare errno raised
    # later from inside the loader.
    raise FileNotFoundError(
        f"training_utils.py not found at {utils_path} – check PROJECT_ROOT ({PROJECT_ROOT})"
    )

spec = importlib.util.spec_from_file_location("utils.training_utils", str(utils_path))
# Validate the spec *before* using it: module_from_spec(None) would fail with
# an unrelated AttributeError, so a guard placed after it can never trigger.
if spec is None or spec.loader is None:
    raise ImportError(f"Cannot load spec for {utils_path}")

utils_mod = importlib.util.module_from_spec(spec)
# Register in sys.modules before executing, as in the importlib recipe for
# importing a source file directly: code that looks the module up by name
# while it is being executed then finds it.
sys.modules["utils.training_utils"] = utils_mod
try:
    spec.loader.exec_module(utils_mod)
except BaseException:
    # Do not leave a half-initialised module behind.
    sys.modules.pop("utils.training_utils", None)
    raise
print(f"[DEBUG] Loaded utils.training_utils from {utils_path}")

# 2) import what we need (resolved from the sys.modules entry registered above)
from utils.training_utils import (
    TRAINER_REGISTRY,
    get_latest_checkpoint,
    load_checkpoint,
)

print("[DEBUG] Imported:")
print("  • TRAINER_REGISTRY keys:", list(TRAINER_REGISTRY.keys()))
print("  • get_latest_checkpoint →", get_latest_checkpoint)
print("  • load_checkpoint       →", load_checkpoint)


FileNotFoundError: [Errno 2] No such file or directory: '/Users/mimmo/Desktop/mimmo/MLA/project_FP03/wsi-ssrl-rcc_project/src/utils/training_utils.py'

In [None]:
# Cell 4 – Configuration & Directory Setup (formatted and absolute paths)
import yaml, datetime, os
from pathlib import Path


# —————— PROJECT_ROOT ——————
# Keep the environment-aware PROJECT_ROOT computed in Cell 1. The Drive
# location this cell used to hard-code is adopted only when it actually
# exists, so a local run no longer ends up pointing at a /content/... path.
_DRIVE_PROJECT_ROOT = Path("/content/drive/MyDrive/Colab Notebooks/MLA_PROJECT/wsi-ssrl-rcc_project")
if _DRIVE_PROJECT_ROOT.exists():
    PROJECT_ROOT = _DRIVE_PROJECT_ROOT

# ------------------------------------------------------------------ #
# 0) Load the general configuration from training.yaml (once)        #
# ------------------------------------------------------------------ #
cfg_path = PROJECT_ROOT / "config" / "training.yaml"
cfg      = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))

# ------------------------------------------------------------------ #
# 1) EXP_CODE: take it from YAML → environment → generate a new one  #
# ------------------------------------------------------------------ #
# YAML parses an unquoted all-digit code (exactly the timestamp format
# generated below) as an int; normalise to str, because os.environ values
# and path components must be strings.
yaml_exp = str(cfg.get("exp_code") or "").strip()
env_exp  = os.environ.get("EXP_CODE", "")

if yaml_exp:                                 # 1) the YAML file has priority
    EXP_CODE = yaml_exp
elif env_exp:                                # 2) then the environment variable
    EXP_CODE = env_exp
else:                                        # 3) otherwise a fresh timestamp
    EXP_CODE = datetime.datetime.now().strftime("%Y%m%d%H%M%S")

# Export it so that child processes see the same code.
os.environ["EXP_CODE"] = EXP_CODE

print(f"[DEBUG] EXP_CODE → {EXP_CODE}")

DATASET_ID = cfg["data"]["dataset_id"]
cfg["exp_code"] = EXP_CODE     # stored in the dict for downstream use

# ------------------------------------------------------------------ #
# 2) Absolute paths (train / val / test)                             #
# ------------------------------------------------------------------ #
# Each split entry is a template relative to PROJECT_ROOT containing a
# {dataset_id} placeholder; it is replaced in cfg by its absolute path.
for split in ("train", "val", "test"):
    rel = cfg["data"][split].format(dataset_id=DATASET_ID)
    abs_ = (PROJECT_ROOT / rel).resolve()
    cfg["data"][split] = str(abs_)
    print(f"[DEBUG] {split.upper()} → {abs_}")

# ------------------------------------------------------------------ #
# 3) Experiment directory layout                                     #
# ------------------------------------------------------------------ #
EXP_ROOT = PROJECT_ROOT / "data" / "processed" / str(DATASET_ID)
EXP_BASE = EXP_ROOT / "experiments" / EXP_CODE                   # one per run
EXP_BASE.mkdir(parents=True, exist_ok=True)
(EXP_ROOT / "experiments.md").touch(exist_ok=True)               # global index

print(f"[DEBUG] EXP_BASE → {EXP_BASE}")

# ------------------------------------------------------------------ #
# 4) Save the YAML copy only if it does not exist yet                #
# ------------------------------------------------------------------ #
exp_yaml = EXP_BASE / f"training_{EXP_CODE}.yaml"
if not exp_yaml.exists():                          # avoid duplicates
    exp_yaml.write_text(yaml.dump(cfg, sort_keys=False))
    print(f"[DEBUG] Scritto   {exp_yaml}")
else:
    print(f"[DEBUG] Config già presente → {exp_yaml}")

[DEBUG] EXP_CODE → 20250626011748
[DEBUG] TRAIN → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/train/patches-0000.tar
[DEBUG] VAL → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/val/patches-0000.tar
[DEBUG] TEST → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/test/patches-0000.tar
[DEBUG] EXP_BASE → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/experiments/20250626011748
[DEBUG] Scritto   /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/experiments/20250626011748/training_20250626011748.yaml


In [None]:
# Cell 5 – Import all trainer modules
import importlib, sys
from utils.training_utils import TRAINER_REGISTRY

# Trainer modules to import. Each one is reloaded when already present in
# sys.modules and imported otherwise, so re-running the cell re-executes them.
trainer_mods = [
    "trainers.simclr",
    "trainers.moco_v2",
    "trainers.rotation",
    # "trainers.jigsaw" is currently disabled
    "trainers.jepa",
    "trainers.supervised",
    "trainers.transfer",
]

for mod_name in trainer_mods:
    if mod_name in sys.modules:
        importlib.reload(sys.modules[mod_name])
    else:
        importlib.import_module(mod_name)

print(f"[DEBUG] Registered trainers: {list(TRAINER_REGISTRY.keys())}")


[DEBUG] Registered trainers: ['simclr', 'moco_v2', 'rotation', 'jigsaw', 'supervised', 'transfer']


In [None]:
# %% -------------------------------------------------------------------- #
# Cell 6 – Helper utilities (Tee, paths, selezione, …)                    #
# ----------------------------------------------------------------------- #
import contextlib, sys, time, inspect
from pathlib import Path
import torch
from utils.training_utils import get_latest_checkpoint, load_checkpoint
from trainers.train_classifier import train_classifier

# ──────────────────────────────────────────────────────────────────────── #
# I/O helpers                                                             #
# ──────────────────────────────────────────────────────────────────────── #
class _Tee:
    """Duplica stdout / stderr su console *e* file."""
    def __init__(self, *targets): self.targets = targets
    def write(self, data):  [t.write(data) and t.flush() for t in self.targets]
    def flush(self):        [t.flush()      for t in self.targets]

def _global_experiments_append(line: str):
    """Append one row to the global index file ``EXP_ROOT / "experiments.md"``.

    Trailing whitespace of ``line`` is stripped and exactly one newline is
    written after it.
    """
    index_file = EXP_ROOT / "experiments.md"
    entry = f"{line.rstrip()}\n"
    with open(index_file, mode="a") as handle:
        handle.write(entry)

# ──────────────────────────────────────────────────────────────────────── #
# Path builders & artefact checks                                         #
# ──────────────────────────────────────────────────────────────────────── #
def _paths(model_name: str) -> dict[str, Path]:
    """Build (and create on disk) the artefact layout of one model.

    The model directory ``EXP_BASE / model_name`` is created if missing.

    Returns a dict with the keys ``dir``, ``log``, ``features``, ``clf``
    (all ``Path`` objects) and ``ckpt_pref`` (a plain string prefix).
    """
    model_dir = EXP_BASE.joinpath(model_name)
    model_dir.mkdir(parents=True, exist_ok=True)

    artefacts = dict(
        dir=model_dir,
        log=model_dir / f"log_{EXP_CODE}.md",
        features=model_dir / f"{model_name}_features.pt",
        clf=model_dir / f"{model_name}_classifier.joblib",
        # checkpoint filename prefix; the original note says it is used by
        # save_checkpoint (not visible in this notebook)
        ckpt_pref=f"{model_name}_epoch",
    )
    return artefacts

def _completed(paths: dict, is_ssl: bool) -> bool:
    """Tell whether every artefact required for a model already exists.

    A checkpoint in ``paths["dir"]`` is always required; when ``is_ssl`` is
    true, the features file and the classifier file must exist as well.
    """
    if get_latest_checkpoint(paths["dir"]) is None:
        return False
    if not is_ssl:
        return True
    return paths["features"].exists() and paths["clf"].exists()

# ──────────────────────────────────────────────────────────────────────── #
# Selezione modelli & trainer helpers                                     #
# ──────────────────────────────────────────────────────────────────────── #
def _select_models(cfg: dict) -> dict[str, dict]:
    sel = cfg.get("run_model", "all").lower()
    return {
        n: c for n, c in cfg["models"].items()
        if sel in ("all", n) or
           (sel == "sl"  and c.get("type") == "sl") or
           (sel == "ssl" and c.get("type") == "ssl")
    }

def _init_trainer(name: str, m_cfg: dict, data_cfg: dict, ckpt_dir: Path):
    """Instantiate the trainer registered under ``name``.

    The class is looked up in ``TRAINER_REGISTRY`` (KeyError for an unknown
    name), built from ``(m_cfg, data_cfg)`` and given ``ckpt_dir`` as its
    ``ckpt_dir`` attribute, i.e. the directory for its .pt files.
    """
    trainer_cls = TRAINER_REGISTRY[name]
    instance = trainer_cls(m_cfg, data_cfg)
    setattr(instance, "ckpt_dir", ckpt_dir)
    return instance

# ──────────────────────────────────────────────────────────────────────── #
# Training / resume / artefacts                                           #
# ──────────────────────────────────────────────────────────────────────── #
def _run_full_training(trainer, epochs: int):
    """Loop di training (identico al codice esistente, solo incapsulato)."""
    has_val = hasattr(trainer, "validate_epoch")
    total_batches = getattr(trainer, "batches_train", None)
    if total_batches is None:
        try: total_batches = len(trainer.train_loader)
        except TypeError: total_batches = None

    for epoch in range(1, epochs + 1):
        t0 = time.time(); run_loss = run_corr = seen = 0
        print(f"--- Epoch {epoch}/{epochs} ---")
        for i, batch in enumerate(trainer.train_loader, 1):
            sig = inspect.signature(trainer.train_step)
            res = trainer.train_step(batch) if len(sig.parameters) == 1 \
                  else trainer.train_step(*batch)
            if len(res) == 4: _, loss, corr, bs = res
            else:             loss, bs = res; corr = 0
            run_loss += loss * bs; run_corr += corr; seen += bs

            # progress bar
            if total_batches:
                pct = (i/total_batches)*100
                eta = ((time.time()-t0)/i)*(total_batches-i)
                msg = (f"  Batch {i}/{total_batches} ({pct:.1f}%) | "
                       f"Loss: {run_loss/seen:.4f}")
                if has_val: msg += f" | Acc: {run_corr/seen:.3f}"
                msg += f" | Elapsed: {time.time()-t0:.1f}s | ETA: {eta:.1f}s"
            else:
                msg = f"  Batch {i} | Loss: {run_loss/seen:.4f}"
                if has_val: msg += f" | Acc: {run_corr/seen:.3f}"
                msg += f" | Elapsed: {time.time()-t0:.1f}s"
            print(msg)

        if has_val:
            v_loss, v_acc = trainer.validate_epoch()
            trainer.post_epoch(epoch, v_acc)
            print(f"Val -> Loss: {v_loss:.4f} | Acc: {v_acc:.3f}")
        else:
            trainer.post_epoch(epoch, run_loss/seen)

        print(f"Epoch {epoch} completed in {time.time()-t0:.1f}s\n")

def _resume_or_train(trainer, paths: dict, epochs: int):
    """Load the latest checkpoint if one exists, otherwise train from scratch.

    With a checkpoint in ``paths["dir"]`` the weights (and the optimizer
    state, when an optimizer is passed) are restored through
    ``load_checkpoint`` and the function returns without training.

    NOTE(review): no further epochs are run after loading, whatever epoch the
    checkpoint was saved at — confirm this is intended.

    Raises
    ------
    AttributeError
        If the trainer exposes none of the supported module layouts.
    """
    ckpt = get_latest_checkpoint(paths["dir"])
    if not ckpt:
        # no checkpoint available: train from scratch
        _run_full_training(trainer, epochs)
        return

    print(f"⏩ Resume da {ckpt.name}")

    # Two-module layouts, probed in priority order. Per the original notes:
    # MoCo v2, then SimCLR-like (encoder + projector), then Jigsaw.
    two_part_layouts = (
        ("encoder_q", "projector_q"),
        ("encoder", "projector"),
        ("encoder", "head"),
    )
    matched = next(
        (pair for pair in two_part_layouts
         if all(hasattr(trainer, attr) for attr in pair)),
        None,
    )

    if matched is not None:
        first, second = (getattr(trainer, attr) for attr in matched)
        model = torch.nn.Sequential(first, second)
        if matched == ("encoder", "projector"):
            # only this layout tolerates a trainer without an optimizer
            optim = getattr(trainer, "optimizer", None)
        else:
            optim = trainer.optimizer
    elif hasattr(trainer, "model"):
        # single-module trainers (RotationTrainer, per the original note)
        model = trainer.model
        optim = trainer.optimizer
    else:
        raise AttributeError(f"Impossibile fare resume su {trainer!r}")

    # restore weights (+ optimizer when present)
    load_checkpoint(ckpt, model, optim)


def _ensure_ssl_artifacts(trainer, paths: dict):
    if not paths["features"].exists():
        trainer.extract_features_to(str(paths["features"]))
    if not paths["clf"].exists():
        train_classifier(str(paths["features"]), str(paths["clf"]))

In [None]:
# %% -------------------------------------------------------------------- #
# Cell 7 – Modular Launch & Auto-Recover                                  #
# ----------------------------------------------------------------------- #
def launch_training(cfg: dict) -> None:
    """Launch (or recover) every model selected by ``cfg``.

    For each selected model:
      1. stdout / stderr are mirrored to the model's log file (append mode);
      2. the model is skipped when all of its artefacts already exist;
      3. otherwise a trainer is built and either restored from the latest
         checkpoint or trained from scratch;
      4. for models of type ``"ssl"`` the features / classifier artefacts
         are created if missing;
      5. a row is appended to the global ``experiments.md`` index.

    Parameters
    ----------
    cfg : dict
        Configuration with at least ``models`` (each entry providing
        ``training.epochs``) and ``data``.
    """
    for name, m_cfg in _select_models(cfg).items():
        paths   = _paths(name)
        is_ssl  = m_cfg.get("type") == "ssl"
        epochs  = int(m_cfg["training"]["epochs"])

        # ---------- logging (append) ----------------------------------- #
        # The messages below contain emoji: the encoding is pinned to UTF-8
        # so that writing the log cannot raise UnicodeEncodeError on systems
        # whose default locale encoding is not UTF-8.
        with open(paths["log"], "a", encoding="utf-8") as lf, \
             contextlib.redirect_stdout(_Tee(sys.stdout, lf)), \
             contextlib.redirect_stderr(_Tee(sys.stderr, lf)):

            if _completed(paths, is_ssl):
                print(f"✅ Artefatti completi per '{name}' – skip\n")
                continue

            trainer = _init_trainer(name, m_cfg, cfg["data"], paths["dir"])
            print(f"Device: {trainer.device} 🚀  Starting training for '{name}'")
            print(f"→ Model config: {m_cfg}\n")

            _resume_or_train(trainer, paths, epochs)

            if is_ssl:
                _ensure_ssl_artifacts(trainer, paths)

            # Update the global index; "-" when no checkpoint was produced.
            last_ckpt = get_latest_checkpoint(paths["dir"])
            rel = last_ckpt.relative_to(EXP_ROOT) if last_ckpt else "-"
            _global_experiments_append(f"| {EXP_CODE} | {name} | {epochs} | {rel} |")

# 🚀 Start right away, but ONLY when running on Colab; in any other
# environment just print the notice that training is delegated to SLURM.
if not IN_COLAB:
    print("⏩ Training delegato a SLURM: skip esecuzione locale.")
else:
    launch_training(cfg)


⏩ Training delegato a SLURM: skip esecuzione locale.


1. **Terminale VSCode (Remote SSH o locale)**
   Dopo aver sottoposto il job, prendi il `JOBID` dallo stdout di `sbatch`.
   Poi apri un terminale in VSCode e digiti:

   ```bash
   ssh mla_group_01@legionlogin.polito.it
   cd /home/mla_group_01/wsi-ssrl-rcc_project
   tail -f rcc_ssrl_launch_<JOBID>.out
   ```

   In questo modo vedrai **in tempo reale** tutti i `print` man mano che il training avanza.

2. **VSCode “Remote SSH” Extension**
   Se installi l’estensione Remote SSH di VSCode, puoi:

   * Connetterti direttamente al nodo di login (`legionlogin.polito.it`)
   * Aprire il file `rcc_ssrl_launch_<JOBID>.out` nell’editor
   * Abilitare “Auto Save” e “Follow Tail” (clic destro → *Tail Follow*), per visualizzare i nuovi messaggi senza uscire dall’IDE.