In [9]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=False)

!pip install --quiet torch torchvision webdataset tqdm pillow

python(18973) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [10]:
# Cell 1 – Environment Setup & Dependencies
import os, sys, subprocess, importlib
from pathlib import Path

print("📦 [DEBUG] Avvio configurazione ambiente...")

# --- Colab detection ---------------------------------------------------------#
IN_COLAB = Path("/content").exists()
if IN_COLAB:
    print("📍 [DEBUG] Ambiente Google Colab rilevato.")
    from google.colab import drive  # type: ignore
    drive.mount("/content/drive", force_remount=False)
else:
    print("💻 [DEBUG] Ambiente locale rilevato (VSCode o simile).")

# --- Project root ------------------------------------------------------------#
ENV_PATHS = {
    "colab": "/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project",
    "local": "/Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project",
}
PROJECT_ROOT = Path(ENV_PATHS["colab" if IN_COLAB else "local"]).resolve()
sys.path.append(str(PROJECT_ROOT / "src"))
print(f"📁 [DEBUG] PROJECT_ROOT impostato a: {PROJECT_ROOT}")

# --- Dependencies (installa solo se mancano) ---------------------------------#
def _pip_install(pkgs):
    import importlib.util
    missing = [p for p in pkgs if importlib.util.find_spec(p) is None]
    if missing:
        print(f"🔧 [DEBUG] Installazione pacchetti mancanti: {missing}")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "--quiet", *missing])
    else:
        print("✅ [DEBUG] Tutti i pacchetti richiesti sono già installati.")

_pip_install([
    "torch", "torchvision", "webdataset", "tqdm",
    "pillow", "pyyaml", "joblib"
])


📦 [DEBUG] Avvio configurazione ambiente...
💻 [DEBUG] Ambiente locale rilevato (VSCode o simile).
📁 [DEBUG] PROJECT_ROOT impostato a: /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project
🔧 [DEBUG] Installazione pacchetti mancanti: ['pillow', 'pyyaml']


python(18975) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


In [11]:
# Cell 2 – SLURM Submission via SSH per locale VSCode (debug .env + rsync)
import os, subprocess, traceback
from pathlib import Path
from textwrap import dedent

# Detect VSCode vs Colab
IN_COLAB  = Path("/content").exists()
# Use explicit check for not in Colab for VSCode specific logic
IN_VSCODE = not IN_COLAB and bool(os.environ.get("VSCODE_PID"))
print(f"🚀 Detected Colab={IN_COLAB}, VSCode={IN_VSCODE}")

if IN_VSCODE:
    from dotenv import load_dotenv, find_dotenv

    # 1) Carica .env (ricerca automatica)
    dotenv_path = find_dotenv()
    if not dotenv_path:
        raise FileNotFoundError("❌ Non ho trovato alcun .env! Mettilo nella root del progetto.")
    print(f"🔍 Carico .env da {dotenv_path}")
    load_dotenv(dotenv_path, override=True)

    # 2) Controlla le env vars
    REMOTE_USER      = os.getenv("CLUSTER_USER")
    REMOTE_HOST      = os.getenv("CLUSTER_HOST")
    REMOTE_BASE_PATH = os.getenv("REMOTE_BASE_PATH")
    SBATCH_MODULE    = os.getenv("SBATCH_MODULE", "python/3.9")
    SBATCH_PARTITION = os.getenv("SBATCH_PARTITION", "global")
    MAIL_USER        = os.getenv("RESPONSABILE_EMAIL", os.getenv("MEMBER_EMAIL"))

    missing = [v for v in ("CLUSTER_USER","CLUSTER_HOST","REMOTE_BASE_PATH") if not os.getenv(v)]
    if missing:
        raise KeyError(f"🌱 Mancano queste env vars: {missing}. Controlla il .env.")

    # 3) Prepara lo script sbatch locale per la sottomissione remota
    LOCAL_SCRIPT = Path.cwd() / "hpc_submit.sh"
    print(f"   • SSH target: {REMOTE_USER}@{REMOTE_HOST}:{REMOTE_BASE_PATH}")

    # 4) Genera sbatch script
    header = dedent(f"""\
        #!/bin/bash
        #SBATCH --job-name=rcc_ssrl_launch
        #SBATCH --ntasks=1
        #SBATCH --cpus-per-task=4
        #SBATCH --mem-per-cpu=4G
        #SBATCH --time=2:00:00
        #SBATCH --gres=gpu:1
        #SBATCH --partition={SBATCH_PARTITION}
        #SBATCH --output=%x_%j.out
        #SBATCH --mail-type=END,FAIL
        #SBATCH --mail-user={MAIL_USER}
        #SBATCH --workdir={REMOTE_BASE_PATH}

        module purge
        module load {SBATCH_MODULE}

        cd {REMOTE_BASE_PATH}
    """)
    header += f"\npython {PROJECT_ROOT}/4-launch_training.py --config config/training.yaml\n"

    LOCAL_SCRIPT.write_text(header)
    LOCAL_SCRIPT.chmod(0o755)
    print(f"📝 Wrote sbatch script: {LOCAL_SCRIPT}")

    try:
        # 5) Crea cartella remota
        subprocess.run(
            ["ssh", f"{REMOTE_USER}@{REMOTE_HOST}", f"mkdir -p {REMOTE_BASE_PATH}"],
            check=True
        )
        print("🔄 Remote directory ensured")

        # 6) Sync progetto (esclude dati pesanti)
        subprocess.run([
            "rsync","-avz","--delete",
            "--exclude","data/processed",
            f"{PROJECT_ROOT}/",
            f"{REMOTE_USER}@{REMOTE_HOST}:{REMOTE_BASE_PATH}/"
        ], check=True)
        print("🔄 Project synchronized via rsync")

        # 7) Sottometti job
        res = subprocess.run(
            ["ssh", f"{REMOTE_USER}@{REMOTE_HOST}",
             f"cd {REMOTE_BASE_PATH} && sbatch {LOCAL_SCRIPT.name}"],
            capture_output=True, text=True, check=True
        )
        print(f"🔍 sbatch stdout: {res.stdout.strip()}")
        print(f"📬 Job submitted: {res.stdout.strip().split()[-1]}")

    except subprocess.CalledProcessError as e:
        print("❌ SLURM submission failed:")
        print(e.stdout, e.stderr)
    except Exception:
        print("❌ Unexpected error:")
        traceback.print_exc()

else:
    print("⚠️ SLURM integration skipped: non in locale VSCode.")

python-dotenv could not parse statement starting at line 14
python-dotenv could not parse statement starting at line 20
python-dotenv could not parse statement starting at line 22
python-dotenv could not parse statement starting at line 24
python-dotenv could not parse statement starting at line 26
python-dotenv could not parse statement starting at line 33
python-dotenv could not parse statement starting at line 35
python-dotenv could not parse statement starting at line 37
python-dotenv could not parse statement starting at line 39
python-dotenv could not parse statement starting at line 40
python-dotenv could not parse statement starting at line 45
python-dotenv could not parse statement starting at line 47
python-dotenv could not parse statement starting at line 48
python-dotenv could not parse statement starting at line 53
python-dotenv could not parse statement starting at line 57
python-dotenv could not parse statement starting at line 59
python-dotenv could not parse statement 

🚀 Detected Colab=False, VSCode=True
🔍 Carico .env da /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/.env
   • SSH target: mla_group_19@hpc-legion.polito.it:/home/mla_group_19/wsi-ssrl-rcc_project
📝 Wrote sbatch script: /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/notebooks/hpc_submit.sh
❌ SLURM submission failed:
None None


python(18977) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
ssh: Could not resolve hostname hpc-legion.polito.it: nodename nor servname provided, or not known


In [12]:
# Cell 3 – Dynamic import of utils.training_utils
import sys
import importlib.util
from pathlib import Path

# 1) locate & load the module file
utils_path = PROJECT_ROOT / "src" / "utils" / "training_utils.py"
spec       = importlib.util.spec_from_file_location("utils.training_utils", str(utils_path))
utils_mod  = importlib.util.module_from_spec(spec)     # type: ignore[arg-type]
assert spec and spec.loader, f"Cannot load spec for {utils_path}"
spec.loader.exec_module(utils_mod)                     # type: ignore[assignment]
sys.modules["utils.training_utils"] = utils_mod        # register in sys.modules
print(f"[DEBUG] Loaded utils.training_utils from {utils_path}")

# 2) import what we need
from utils.training_utils import (
    TRAINER_REGISTRY,
    get_latest_checkpoint,
    load_checkpoint,
)

print("[DEBUG] Imported:")
print("  • TRAINER_REGISTRY keys:", list(TRAINER_REGISTRY.keys()))
print("  • get_latest_checkpoint →", get_latest_checkpoint)
print("  • load_checkpoint       →", load_checkpoint)


[DEBUG] Loaded utils.training_utils from /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/src/utils/training_utils.py
[DEBUG] Imported:
  • TRAINER_REGISTRY keys: []
  • get_latest_checkpoint → <function get_latest_checkpoint at 0x3021fa0e0>
  • load_checkpoint       → <function load_checkpoint at 0x3021fa050>


In [13]:
# %%
# Cell 4 – Configuration & Directory Setup (formatted and absolute paths)
import yaml, datetime, os
from pathlib import Path

# ------------------------------------------------------------------ #
# 0) EXP_CODE: ri-usa se esiste, altrimenti crealo e memorizzalo     #
# ------------------------------------------------------------------ #
if "EXP_CODE" in os.environ:          # questa variabile resta valida finché dura il kernel
    EXP_CODE = os.environ["EXP_CODE"]
else:
    EXP_CODE = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    os.environ["EXP_CODE"] = EXP_CODE            # rende il codice ri-utilizzabile

print(f"[DEBUG] EXP_CODE → {EXP_CODE}")

# ------------------------------------------------------------------ #
# 1) Carica configurazione generale                                  #
# ------------------------------------------------------------------ #
cfg_path  = PROJECT_ROOT / "config" / "training.yaml"
cfg       = yaml.safe_load(cfg_path.read_text())
DATASET_ID = cfg["data"]["dataset_id"]
cfg["experiment_code"] = EXP_CODE     # lo inseriamo nel dict per eventuali usi downstream

# ------------------------------------------------------------------ #
# 2) Percorsi assoluti (train / val / test)                          #
# ------------------------------------------------------------------ #
for split in ("train", "val", "test"):
    rel = cfg["data"][split].format(dataset_id=DATASET_ID)
    abs_ = (PROJECT_ROOT / rel).resolve()
    cfg["data"][split] = str(abs_)
    print(f"[DEBUG] {split.upper()} → {abs_}")

# ------------------------------------------------------------------ #
# 3) Struttura directory esperimento                                 #
# ------------------------------------------------------------------ #
EXP_ROOT = PROJECT_ROOT / "data" / "processed" / str(DATASET_ID)
EXP_BASE = EXP_ROOT / "experiments" / EXP_CODE                   # unica per tutta la run
EXP_BASE.mkdir(parents=True, exist_ok=True)
(EXP_ROOT / "experiments.md").touch(exist_ok=True)               # indice globale

print(f"[DEBUG] EXP_BASE → {EXP_BASE}")

# ------------------------------------------------------------------ #
# 4) Salva la **copia YAML** solo se non esiste già                  #
# ------------------------------------------------------------------ #
exp_yaml = EXP_BASE / f"training_{EXP_CODE}.yaml"
if not exp_yaml.exists():                          # evita duplicati
    exp_yaml.write_text(yaml.dump(cfg, sort_keys=False))
    print(f"[DEBUG] Scritto   {exp_yaml}")
else:
    print(f"[DEBUG] Config già presente → {exp_yaml}")

[DEBUG] EXP_CODE → 20250624121016
[DEBUG] TRAIN → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/train/patches-0000.tar
[DEBUG] VAL → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/val/patches-0000.tar
[DEBUG] TEST → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/webdataset/test/patches-0000.tar
[DEBUG] EXP_BASE → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/experiments/20250624121016
[DEBUG] Config già presente → /Users/stefanoroybisignano/Desktop/MLA/project/wsi-ssrl-rcc_project/data/processed/dataset_9f30917e/experiments/20250624121016/training_20250624121016.yaml


In [14]:
# Cell 5 – Import all trainer modules with debug prints
import importlib, sys
from utils.training_utils import TRAINER_REGISTRY

trainer_mods = [
    "trainers.simclr",
    "trainers.moco_v2",
    "trainers.rotation",
    "trainers.jigsaw",
    "trainers.supervised",
    "trainers.transfer",
]

for module_name in trainer_mods:
    if module_name in sys.modules:
        print(f"[DEBUG] Reloading module: {module_name}")
        importlib.reload(sys.modules[module_name])
    else:
        print(f"[DEBUG] Importing module: {module_name}")
        importlib.import_module(module_name)

print(f"[DEBUG] Registered trainers: {list(TRAINER_REGISTRY.keys())}")


[DEBUG] Reloading module: trainers.simclr
[DEBUG] Reloading module: trainers.moco_v2
[DEBUG] Reloading module: trainers.rotation
[DEBUG] Reloading module: trainers.jigsaw
[DEBUG] Reloading module: trainers.supervised
[DEBUG] Reloading module: trainers.transfer
[DEBUG] Registered trainers: ['simclr', 'moco_v2', 'rotation', 'jigsaw', 'supervised', 'transfer']


In [15]:
# %%
# Cell 6 – Helpers (tee-logger + path)
import contextlib, sys

class _Tee:
    """Duplica stdout/stderr su console e file."""
    def __init__(self, *targets): self.targets = targets
    def write(self, data):  [t.write(data) and t.flush() for t in self.targets]
    def flush(self):        [t.flush() for t in self.targets]

def _prepare_model_dir(model_name: str):
    """
    Crea   data/processed/<dataset_id>/experiments/<EXP_CODE>/<model_name>/ 
    e restituisce (model_dir, log_file_path).
    """
    model_dir = EXP_BASE / model_name
    model_dir.mkdir(parents=True, exist_ok=True)
    log_file  = model_dir / f"log_{EXP_CODE}.md"
    return model_dir, log_file

def _global_experiments_append(line: str):
    """Aggiunge una riga al file   data/processed/<dataset_id>/experiments.md"""
    exp_md = EXP_ROOT / "experiments.md"
    with exp_md.open("a") as f:
        f.write(line.rstrip() + "\n")


In [16]:
# %%
# Cell 7 – launch_training() con gestione WebDataset e nuova struttura
import torch, inspect, time, contextlib, sys
from utils.training_utils import TRAINER_REGISTRY, load_checkpoint, get_latest_checkpoint
from trainers.train_classifier import train_classifier

def launch_training(cfg: dict) -> None:
    sel = cfg.get("run_model", "all").lower()
    todo = {n: c for n, c in cfg["models"].items()
            if sel in ("all", n)
            or (sel == "supervised"     and c.get("type") == "supervised")
            or (sel == "selfsupervised" and c.get("type") == "selfsupervised")}

    for name, m_cfg in todo.items():
        # ---------- directory & log ---------------------------------------- #
        model_dir, log_file = _prepare_model_dir(name)
        with open(log_file, "w") as lf, \
             contextlib.redirect_stdout(_Tee(sys.stdout, lf)), \
             contextlib.redirect_stderr(_Tee(sys.stderr, lf)):

            # ---- trainer init --------------------------------------------- #
            TrainerCls = TRAINER_REGISTRY.get(name)
            if TrainerCls is None:
                raise KeyError(f"Trainer '{name}' non registrato")

            trainer   = TrainerCls(m_cfg, cfg["data"])
            has_val   = hasattr(trainer, "validate_epoch")
            epochs    = int(m_cfg["training"]["epochs"])
            batch_sz  = int(m_cfg["training"]["batch_size"])
            device    = getattr(trainer, "device", "cpu")

            print(f"Device: {device} 🚀  Starting training for model '{name}'")
            print(f"→ Model config: {m_cfg}")
            print(f"Epochs: {epochs} | Batch size: {batch_sz}\n")

            # ---- artefatti -------------------------------------------------- #
            ckpt_best = model_dir / f"{name}_best.pt"
            feat_path = model_dir / f"{name}_features.pt"
            clf_path  = model_dir / f"{name}_classifier.joblib"

            # ---- resume ----------------------------------------------------- #
            latest_ckpt = get_latest_checkpoint(model_dir, prefix=name)
            if latest_ckpt:
                print(f"⏭️  Checkpoint trovato: {latest_ckpt.name} – skip training")
                model = torch.nn.Sequential(
                    trainer.encoder,
                    getattr(trainer, "projector", torch.nn.Identity())
                )
                load_checkpoint(latest_ckpt, model=model)
                trainer.encoder = model[0].to(trainer.device)
            else:
                # ---------- training loop ----------------------------------- #
                total_batches = getattr(trainer, "batches_train", None)
                if total_batches is None:
                    try:
                        total_batches = len(trainer.train_loader)
                    except TypeError:
                        total_batches = None  # WebDataset / IterableDataset
                if total_batches:
                    print(f"TOTAL BATCHES {total_batches}")
                else:
                    print("TOTAL BATCHES unknown (IterableDataset)")

                for epoch in range(1, epochs + 1):
                    t0 = time.time()
                    run_loss = run_corr = seen = 0

                    print(f"--- Epoch {epoch}/{epochs} ---")
                    for i, batch in enumerate(trainer.train_loader, 1):
                        sig = inspect.signature(trainer.train_step)
                        res = trainer.train_step(batch) if len(sig.parameters) == 1 \
                              else trainer.train_step(*batch)

                        if len(res) == 4:
                            _, loss, corr, bs = res
                        else:
                            loss, bs = res; corr = 0

                        run_loss += loss * bs
                        run_corr += corr
                        seen     += bs

                        # --- progress bar only if tot batch noto ------------- #
                        if total_batches:
                            pct = (i / total_batches) * 100
                            eta = ((time.time() - t0) / i) * (total_batches - i)
                            msg = (f"  Batch {i}/{total_batches} ({pct:.1f}%) | "
                                   f"Loss: {run_loss/seen:.4f}")
                            if has_val:
                                msg += f" | Acc: {run_corr/seen:.3f}"
                            msg += f" | Elapsed: {time.time()-t0:.1f}s | ETA: {eta:.1f}s"
                        else:  # no len()
                            msg = (f"  Batch {i} | Loss: {run_loss/seen:.4f}")
                            if has_val:
                                msg += f" | Acc: {run_corr/seen:.3f}"
                            msg += f" | Elapsed: {time.time()-t0:.1f}s"
                        print(msg)

                    # ---------- validation / metric ------------------------- #
                    val_loss = val_acc = None
                    if has_val:
                        val_loss, val_acc = trainer.validate_epoch()
                        metric = val_acc
                        print(f"Val -> Loss: {val_loss:.4f} | Acc: {val_acc:.3f}")
                    else:
                        metric = run_loss / seen

                    trainer.post_epoch(epoch, metric)
                    print(f"Epoch {epoch} completed in {time.time()-t0:.1f}s\n")

                # ---------- save best checkpoint ---------------------------- #
                best = get_latest_checkpoint(model_dir, prefix=name)
                if best and best != ckpt_best:
                    best.replace(ckpt_best)

            # ---------- SSL: feature & classifier --------------------------- #
            if m_cfg.get("type") == "selfsupervised":
                if not feat_path.exists():
                    trainer.extract_features_to(str(feat_path))
                print(f"🔍 Extracting & training classifier for '{name}'")
                train_classifier(str(feat_path), str(clf_path))

            # ---------- aggiorna experiments.md globale ------------------------- #
            if ckpt_best.exists():
                rel = ckpt_best.relative_to(EXP_ROOT)
            elif latest_ckpt:
                rel = latest_ckpt.relative_to(EXP_ROOT)
            else:
                rel = "-"  # nessun checkpoint trovato

            _global_experiments_append(
                f"| {EXP_CODE} | {name} | {epochs} | {rel} |"
            )

# avvia
launch_training(cfg)




Device: mps 🚀  Starting training for model 'simclr'
→ Model config: {'backbone': 'resnet18', 'proj_dim': 128, 'augmentation': {'enabled': True, 'horizontal_flip': True, 'rotation': [0, 90, 180, 270], 'color_jitter': {'brightness': 0.4, 'contrast': 0.4, 'saturation': 0.4, 'hue': 0.1}}, 'training': {'epochs': 2, 'batch_size': 64, 'optimizer': 'adam', 'learning_rate': '1e-3', 'weight_decay': '1e-5', 'temperature': 0.5}}
Epochs: 2 | Batch size: 64

TOTAL BATCHES 24
--- Epoch 1/2 ---
  Batch 1/24 (4.2%) | Loss: 4.8470 | Elapsed: 9.1s | ETA: 210.0s
  Batch 2/24 (8.3%) | Loss: 4.9179 | Elapsed: 11.8s | ETA: 130.0s
  Batch 3/24 (12.5%) | Loss: 4.8866 | Elapsed: 16.4s | ETA: 114.6s
  Batch 4/24 (16.7%) | Loss: 4.8858 | Elapsed: 21.6s | ETA: 108.0s
  Batch 5/24 (20.8%) | Loss: 4.8779 | Elapsed: 26.2s | ETA: 99.6s
  Batch 6/24 (25.0%) | Loss: 4.8593 | Elapsed: 29.2s | ETA: 87.5s
  Batch 7/24 (29.2%) | Loss: 4.8455 | Elapsed: 32.0s | ETA: 77.7s
  Batch 8/24 (33.3%) | Loss: 4.8336 | Elapsed: 34.6s 

KeyboardInterrupt: 