In [1]:
# -------------------------------------------------- Cell 1 -----------------
import os, sys
from pathlib import Path

IN_COLAB = "google.colab" in sys.modules
if IN_COLAB:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=False)

# --- pacchetti -------------------------------------------------------------
!apt-get -qq update
!apt-get -qq install -y openslide-tools      # per OpenSlide (Intel + Apple)

# Pillow-SIMD (JPEG turbo) — se non già presente
try:
    import pillow_simd                       # noqa: F401
except ImportError:
    !pip -q install --upgrade pillow-simd==9.0.0.post1

!pip -q install --upgrade openslide-python webdataset tqdm


Mounted at /content/drive
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libopenslide0.
(Reading database ... 126109 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-5build1_amd64.deb ...
Unpacking libopenslide0 (3.4.1+dfsg-5build1) ...
Selecting previously unselected package openslide-tools.
Preparing to unpack .../openslide-tools_3.4.1+dfsg-5build1_amd64.deb ...
Unpacking openslide-tools (3.4.1+dfsg-5build1) ...
Setting up libopenslide0 (3.4.1+dfsg-5build1) ...
Setting up openslide-tools (3.4.1+dfsg-5build1) ...
Processing triggers for man-db (2.10.2-1) ...
Processing triggers for libc-bin (2.35-0ubuntu3.8) ...
/sbin/ldconfig.real: /usr/local/lib/libtbbmalloc_proxy.so.2 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtcm.so.1 is not a symbolic lin

In [2]:
# -------------------------------------------------- Cell 2 -----------------
import yaml, random, json
from pathlib import Path

# Preprocessing configuration stored on the mounted Drive.
yaml_path = Path("/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/config/preprocessing.yaml")
with open(yaml_path) as cfg_file:
    cfg = yaml.safe_load(cfg_file)

# Project root: use the Colab path when it exists, otherwise the local one.
env_paths  = cfg["env_paths"]
colab_root = Path(env_paths["colab"])
local_root = Path(env_paths["local"])
root = colab_root if colab_root.exists() else local_root
assert root.exists(), "❌ Project root non trovato!"

# The debug stage is selected only when its patient down-sampling is enabled.
stages = cfg["stages"]
if stages["debug"]["downsample_patients"]["enabled"]:
    stage_cfg = stages["debug"]
else:
    stage_cfg = stages["training"]

patching    = stage_cfg["patching"]
PATCH_SIZE  = patching["patch_size"]     # side of the square patch, in pixels
RANDOM_SEED = patching["random_seed"]
MAX_DBG     = 10                         # cap used when collecting/saving debug images
SHARD_SIZE  = 5_000                      # images written to a tar shard before rotating
rng         = random.Random(RANDOM_SEED)

stage_name = 'debug' if stage_cfg is stages["debug"] else 'training'
print("✅ root: ", root)
print('✅ stage:', stage_name)
print("✅ patch:", PATCH_SIZE, "px")


✅ root:  /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project
✅ stage: training
✅ patch: 224 px


In [3]:
# -------------------------------------------------- Cell 3 -----------------
import pandas as pd

# Load the patch table; every later cell relies on its "split" column.
patch_df_path = root / "data/processed/patch_df_5000.parquet"
patch_df = pd.read_parquet(patch_df_path)
assert "split" in patch_df.columns

split_counts = patch_df["split"].value_counts().to_dict()
print(f"✅ patch_df: {len(patch_df)} righe — splits:", split_counts)


✅ patch_df: 5000 righe — splits: {'train': 3000, 'val': 1000, 'test': 1000}


In [4]:
# -------------------------------------------------- Cell 4 -----------------
from functools import lru_cache
from openslide import OpenSlide
from PIL import Image

@lru_cache(maxsize=128)
def _open_slide(path: str) -> OpenSlide:
    """Return an OpenSlide handle for ``path``, memoised per path.

    The LRU cache keeps up to 128 entries, so repeated requests for the same
    path reuse the existing handle instead of opening the file again.
    """
    handle = OpenSlide(path)
    return handle

def extract_patch(row) -> Image.Image:
    """Read one square patch described by ``row`` and return it as RGB.

    Args:
        row: mapping-like record providing ``"wsi_path"``, ``"roi_file"``,
            ``"x"`` and ``"y"``. ``"roi_file"`` is used as the image source
            whenever ``"wsi_path"`` is missing (NaN/None).

    Returns:
        A ``PATCH_SIZE`` x ``PATCH_SIZE`` PIL image converted to RGB, read with
        level argument 0 at location ``(x, y)``.
    """
    wsi_path = row["wsi_path"]
    source = row["roi_file"] if pd.isna(wsi_path) else wsi_path
    slide = _open_slide(source)

    origin = (int(row["x"]), int(row["y"]))
    size = (PATCH_SIZE, PATCH_SIZE)
    return slide.read_region(origin, 0, size).convert("RGB")


In [5]:
# -------------------------------------------------- Cell 5 -----------------
import io, tarfile, os, threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
from tqdm.auto import tqdm

# --- output directories ----------------------------------------------------
splits   = ("train", "val", "test")
out_root = root / "data/processed/webdataset"
dbg_root = root / "data/visual_debug/extract_examples"

# 📁  create both base folders together with one sub-folder per split
for base_dir in (out_root, dbg_root):
    base_dir.mkdir(parents=True, exist_ok=True)
    for split_name in splits:
        (base_dir / split_name).mkdir(parents=True, exist_ok=True)


# --- initial setup, no checkpoint ------------------------------------------
# CKPT_PATH      = out_root / "extract_ckpt.json" # checkpoint path removed
SAVE_EVERY_N   = 100   # patches; only referenced by the disabled checkpoint code
start_idx      = 0     # extraction always starts from the first row
state          = {}    # per-split shard bookkeeping, filled in further down

def _open_shard(split: str, idx: int, mode: str = "w"):
    """Open the tar shard number ``idx`` of ``split``.

    Args:
        split: name of the sub-folder of ``out_root`` holding the shard.
        idx: shard number, zero-padded to six digits in the file name.
        mode: mode string handed to ``tarfile.open`` ('w' new, 'a' append).

    Returns:
        Tuple ``(tar_file, shard_path)``.
    """
    shard_path = out_root / split / f"patches-{idx:06d}.tar"
    tar_file = tarfile.open(shard_path, mode)
    return tar_file, shard_path

# ---- initial setup, no checkpoint restore ---------------------------------
print("🚀 Nuova estrazione")  # every run is a fresh extraction
for split_name in splits:
    # Shard 0 is always opened in write mode ('w'), so an existing file with
    # the same name is overwritten.
    # NOTE(review): shards with a higher index left by an earlier run are not
    # touched here — confirm whether they should be cleaned up.
    first_tar = _open_shard(split_name, 0, "w")[0]
    state[split_name] = {
        "tar": first_tar,
        "shard_idx": 0,
        "img_in_shard": 0,
    }


# ---- writer thread --------------------------------------------------------
q = Queue(maxsize=1024)
state_lock = threading.Lock()  # not strictly needed without checkpoints; kept for consistency

def writer():
    """Drain ``q`` and append every item to the current tar shard of its split.

    Items are ``(split, fname, data)`` tuples; a ``None`` item stops the loop.
    When the current shard of a split already holds ``SHARD_SIZE`` images it
    is closed and the next shard is opened before the item is written.
    ``q.task_done()`` is called once per written item (not for the ``None``
    sentinel).
    """
    for split, fname, data in iter(q.get, None):
        shard = state[split]
        # shard rotation
        if shard["img_in_shard"] >= SHARD_SIZE:
            shard["tar"].close()
            shard["shard_idx"] += 1
            shard["img_in_shard"] = 0
            shard["tar"], _ = _open_shard(split, shard["shard_idx"], "w")
        member = tarfile.TarInfo(fname)
        member.size = len(data)
        shard["tar"].addfile(member, io.BytesIO(data))
        shard["img_in_shard"] += 1
        q.task_done()

writer_thr = threading.Thread(target=writer, daemon=True)
writer_thr.start()

# ---- worker ---------------------------------------------------------------
def worker(i: int, row):
    """Extract the patch for ``row`` and encode it as a JPEG tar entry.

    Args:
        i: row index supplied by the caller; it is embedded, zero-padded to six
            digits, in the entry name.
        row: mapping-like record; ``"patient_id"``, ``"subtype"`` and
            ``"split"`` are read here, the rest by ``extract_patch``.

    Returns:
        ``((split, "<key>.jpg", jpeg_bytes), None)`` on success, or
        ``(None, warning_message)`` when ``extract_patch`` raises.
    """
    try:
        patch = extract_patch(row)
    except Exception as exc:
        return None, f"[SKIP] {row['patient_id']} ({exc})"

    # encode as JPEG in memory
    jpeg = io.BytesIO()
    patch.save(jpeg, format="JPEG", quality=85, optimize=True)

    # the subtype "not" is written out as "not_tumor"
    raw_subtype = row["subtype"]
    subtype = "not_tumor" if raw_subtype == "not" else raw_subtype

    # entry key: <subtype>_<patient_id>_<row index>
    key = f"{subtype}_{row['patient_id']}_{i:06d}"
    payload = (row["split"], f"{key}.jpg", jpeg.getvalue())
    return payload, None


# ---- parallel loop --------------------------------------------------------
# The whole dataframe is processed (no start_idx slicing). Rows are shuffled
# deterministically with RANDOM_SEED and re-indexed, so the index passed to
# each worker is the row position in the shuffled frame.
patch_df_sh = patch_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

num_workers = min(8, (os.cpu_count() or 2) * 2)
if "COLAB_TPU_ADDR" in os.environ:
    num_workers = 4

debug_buf = []
# The progress bar covers every row, extracted or skipped.
pbar = tqdm(total=len(patch_df_sh), desc="Extract", unit="patch", dynamic_ncols=True)

processed = 0
futures = {}
pool = ThreadPoolExecutor(max_workers=num_workers)
try:
    for i, row in patch_df_sh.iterrows():
        futures[pool.submit(worker, i, row)] = i

    for fut in as_completed(futures):
        data, warn = fut.result()
        if warn:
            pbar.write(warn)
        elif data:
            q.put(data)
            # keep the first few patches for visual debugging
            split, fname, jpg = data
            subtype = fname.split("_")[0]
            if len(debug_buf) < MAX_DBG * len(splits):
                debug_buf.append((split, subtype, jpg))
            processed += 1
        # Advance for skipped rows too, so the bar can actually reach its total.
        pbar.update(1)

    # Periodic checkpointing was removed: every run is a full extraction.

    # Wait until the writer has consumed everything queued so far.
    q.join()
finally:
    # ---- shutdown ---------------------------------------------------------
    # Runs on success AND on interruption/error, so tar shards are never left
    # open and the writer thread is always stopped.
    for fut in futures:
        fut.cancel()              # no-op for finished futures; drops pending ones
    pool.shutdown(wait=True)      # only tasks already running are awaited
    if writer_thr.is_alive():
        q.put(None)               # sentinel: writer exits after draining the queue
        writer_thr.join()
    for s in splits:
        state[s]["tar"].close()
    pbar.close()

print("✅ Estrazione completata")

🚀 Nuova estrazione


Extract:   0%|          | 0/5000 [00:00<?, ?patch/s]

KeyboardInterrupt: 

In [None]:
# -------------------------------------------------- Cell 6 -----------------
# Dump the buffered JPEGs to <dbg_root>/<split>/<subtype>_<n>.jpg, where n is
# the number of files already matching that subtype in the folder; nothing is
# written once MAX_DBG such files exist.
for split_name, label, jpeg_bytes in debug_buf:
    target_dir = dbg_root / split_name
    target_dir.mkdir(parents=True, exist_ok=True)
    existing = sum(1 for _ in target_dir.glob(f"{label}_*.jpg"))
    if existing >= MAX_DBG:
        continue
    (target_dir / f"{label}_{existing}.jpg").write_bytes(jpeg_bytes)

print(f"✅ Debug images (max {MAX_DBG} per subtype/split) salvate in {dbg_root}")


In [None]:
# -------------------------------------------------- Cell 7 -----------------
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path

# Fixed list of the 5 classes (change the order if you prefer). Each entry
# becomes one row of the figure and is used as the file-name prefix when
# looking up the saved debug images.
subtypes = ["not", "CHROMO", "ONCO", "ccRCC", "pRCC"]

def pretty_label(name: str) -> str:
    """Return a human-readable label for a class name.

    Args:
        name: raw class name, e.g. ``"not"``, ``"CHROMO"``, ``"ccRCC"``.

    Returns:
        ``"Not tumor"`` for ``"not"``. Names written entirely in lower or
        upper case are capitalised (``"CHROMO"`` -> ``"Chromo"``). Mixed-case
        names are returned unchanged, because ``str.capitalize()`` would
        lower-case the tail and mangle acronyms (``"ccRCC"`` -> ``"Ccrcc"``).
    """
    if name == "not":
        return "Not tumor"
    if name.islower() or name.isupper():
        return name.capitalize()
    return name

# Grid layout: one row per class, one column per debug image slot.
cols = MAX_DBG
rows = len(subtypes)

fig, axes = plt.subplots(rows, cols,
                         figsize=(cols * 2.2, max(2, rows) * 2.2),
                         squeeze=False)

for r, st in enumerate(subtypes):
    for c in range(cols):
        ax = axes[r][c]
        img_path = None
        # look for the debug image of this class/position; first split wins
        for split in splits:
            candidate = dbg_root / split / f"{st}_{c}.jpg"
            if candidate.exists():
                img_path = candidate
                break

        if img_path:
            # Image.open keeps the file open until the image is closed, so use
            # a context manager to release the handle once it has been drawn.
            with Image.open(img_path) as img:
                ax.imshow(img)
        else:
            ax.text(0.5, 0.5, "N/A",
                    ha="center", va="center",
                    color="gray", fontsize=9)
        ax.axis("off")

        # label each row once, on its first column
        if c == 0:
            ax.set_title(pretty_label(st), loc="left", fontsize=11)

plt.tight_layout()
plt.show()


In [None]:
# --------------------------------------------------------------------------- #
# Statistiche sui tar generati
# --------------------------------------------------------------------------- #
import glob

print("\n📦 Statistiche sui .tar generati per split:\n")
for split_name in splits:
    # shards of this split, in lexicographic (= numeric, zero-padded) order
    pattern = str(out_root / split_name / "patches-*.tar")
    shard_files = sorted(glob.glob(pattern))
    print(f"Split '{split_name}': {len(shard_files)} shard")
    for shard_file in shard_files:
        # count the archive members, then report the file size on disk in MB
        with tarfile.open(shard_file, "r") as archive:
            n_items = len(archive.getmembers())
        size_mb = os.path.getsize(shard_file) / (1024**2)
        print(f"  • {os.path.basename(shard_file)} → {n_items:5d} immagini, {size_mb:6.2f} MB")
    print()
