In [1]:
# ===== Cell 1 – mount / install (Colab only) =================================
# Mount Google Drive under /content/drive; the project files read by the
# following cells live below that mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

# Install the OpenSlide system package first, then the Python packages the
# notebook imports later (openslide-python, tqdm). -qq / -q reduce installer output.
!apt-get -qq update
!apt-get -qq install -y openslide-tools
!pip -q install openslide-python tqdm

Mounted at /content/drive
W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Selecting previously unselected package libopenslide0.
(Reading database ... 126102 files and directories currently installed.)
Preparing to unpack .../libopenslide0_3.4.1+dfsg-5build1_amd64.deb ...
Unpacking libopenslide0 (3.4.1+dfsg-5build1) ...
Selecting previously unselected package openslide-tools.
Preparing to unpack .../openslide-tools_3.4.1+dfsg-5build1_amd64.deb ...
Unpacking openslide-tools (3.4.1+dfsg-5build1) ...
Setting up libopenslide0 (3.4.1+dfsg-5build1) ...
Setting up openslide-tools (3.4.1+dfsg-5build1) ...
Processing triggers for man-db (2.10.2-1) ...
Processing triggers for libc-bin (2.35-0ubuntu3.8) ...
/sbin/ldconfig.real: /usr/local/lib/libumf.so.0 is not a symbolic link

/sbin/ldconfig.real: /usr/local/lib/libtcm_debug.so.1 is not a symbolic link

/sb

In [2]:
from pathlib import Path
import yaml, json, tarfile, random, io
from tqdm.auto import tqdm
import pandas as pd
from openslide import OpenSlide
from PIL import Image

In [3]:
# ===== Cell 2: Config & environment =========================================
# 1) Load the preprocessing configuration (YAML stored on Drive)
yaml_path = Path('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/config/preprocessing.yaml')
with open(yaml_path) as cfg_file:
    cfg = yaml.safe_load(cfg_file)

# 2) Resolve the project root: the Colab path wins when it exists,
#    otherwise fall back to the local path from the same config section.
env_paths  = cfg['env_paths']
colab_root = Path(env_paths['colab'])
local_root = Path(env_paths['local'])
if colab_root.exists():
    root = colab_root
else:
    root = local_root
if not root.exists():
    raise FileNotFoundError('Impossibile trovare project_root')

base_dir = root / 'data/RCC_WSIs'

# 3) Stage selection: the 'debug' stage is used only when its
#    downsample_patients.enabled flag is truthy; otherwise 'training'.
stages = cfg['stages']
if stages['debug']['downsample_patients']['enabled']:
    stage_cfg = stages['debug']
else:
    stage_cfg = stages['training']

patching_cfg = stage_cfg['patching']
PATCH_SIZE   = patching_cfg['patch_size']
SHARD_SIZE   = 5_000                   # images per tar
RANDOM_SEED  = patching_cfg['random_seed']
MAX_DBG      = 10                      # jpg per subtype per split
rng          = random.Random(RANDOM_SEED)

# Report the resolved settings; the stage label is derived by identity with
# the 'debug' sub-dict of the loaded config.
if stage_cfg is stages['debug']:
    stage_name = 'debug'
else:
    stage_name = 'training'

print('✅ root:', root)
print('✅ stage:', stage_name)
print('✅ patch:', PATCH_SIZE, 'px')
print('✅ shard size:', SHARD_SIZE)

✅ root: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project
✅ stage: training
✅ patch: 224 px
✅ shard size: 5000


In [4]:
# ===== Cell 3: Load patch_df ================================================
patch_df_path = root / 'data/processed/patch_df.parquet'
patch_df      = pd.read_parquet(patch_df_path)

# The extraction cell routes every patch by its 'split' value, so stop right
# here when the column is absent.
has_split_column = 'split' in patch_df.columns
if not has_split_column:
    raise RuntimeError("'split' column mancante nel parquet!")

split_counts = patch_df['split'].value_counts().to_dict()
print(f"✅ patch_df: {len(patch_df)} righe –  splits: {split_counts}")

✅ patch_df: 2500 righe –  splits: {'train': 1500, 'val': 500, 'test': 500}


In [5]:
# ===== Cell 4: Utility ======================================================
def extract_patch(row, cache, patch_size=None):
    """Return the requested patch as an RGB image.

    Parameters
    ----------
    row : mapping-like (e.g. one row of ``patch_df``)
        Must provide ``wsi_path``, ``roi_file``, ``x`` and ``y``.
        ``wsi_path`` is the slide source; when it is missing (NaN/None)
        ``roi_file`` is used instead.
    cache : dict
        Maps a source path to its already-opened slide. It is filled in place
        on a cache miss, so each source is opened only once per cache.
        Handles are NOT closed here; the owner of the cache must close them.
    patch_size : int, optional
        Side length (pixels) of the square region to read. Defaults to the
        module-level ``PATCH_SIZE`` so existing two-argument calls keep working.

    Returns
    -------
    The region read from the slide, converted with ``.convert('RGB')``.

    Raises
    ------
    ValueError
        If both ``wsi_path`` and ``roi_file`` are missing for this row.
    """
    src = row['wsi_path'] if pd.notna(row['wsi_path']) else row['roi_file']
    if pd.isna(src):
        # Fail with an explicit message instead of handing NaN/None to OpenSlide.
        raise ValueError("patch row has neither 'wsi_path' nor 'roi_file'")

    if patch_size is None:
        patch_size = PATCH_SIZE

    if src not in cache:
        cache[src] = OpenSlide(src)
    slide = cache[src]

    # (x, y) is passed as the location and 0 as the pyramid level.
    region = slide.read_region(
        (int(row['x']), int(row['y'])), 0, (patch_size, patch_size)
    )
    return region.convert('RGB')

In [6]:
# ===== Cell 5 – Estrazione patch → WebDataset, suddivisa per split ===========
import tarfile, io, os
from tqdm.auto import tqdm

# --------------------------------------------------------------------------- #
# output directories: .tar shards and debug .jpg files                        #
# --------------------------------------------------------------------------- #
out_root = root / "data/processed/webdataset"
dbg_root = root / "data/visual_debug/extract_examples"
splits   = ["train", "val", "test"]

# Create both roots and, under each of them, one sub-directory per split.
for parent_dir in (out_root, dbg_root):
    parent_dir.mkdir(parents=True, exist_ok=True)
    for s in splits:
        (parent_dir / s).mkdir(parents=True, exist_ok=True)

# --------------------------------------------------------------------------- #
# global parameters (already set in the config cell, repeated here for        #
# readability)                                                                #
# --------------------------------------------------------------------------- #
SHARD_SIZE    = 5_000   # number of images per .tar
MAX_DEBUG_JPG = 10      # max inspection jpgs per (split, class)

# --------------------------------------------------------------------------- #
# helper: open a shard                                                        #
# --------------------------------------------------------------------------- #
def _open_shard(split: str, idx: int):
    """Open shard number ``idx`` of ``split`` for writing.

    The archive lives at ``out_root / split / patches-<idx, 4 digits>.tar``.
    Returns the pair ``(open TarFile, path of the archive)``.
    """
    shard_path = out_root / split / f"patches-{idx:04d}.tar"
    shard_tar  = tarfile.open(shard_path, mode="w")
    return shard_tar, shard_path

# --------------------------------------------------------------------------- #
# per-split writer state                                                      #
# --------------------------------------------------------------------------- #
# For every split: the currently open tar (shard 0 to start with), its index,
# how many images it already holds, and the debug-jpg counter per subtype.
state = {
    split_name: {
        "tar":          _open_shard(split_name, 0)[0],
        "shard_idx":    0,
        "img_in_shard": 0,
        "dbg_cnt":      {},   # subtype -> number of debug images saved
    }
    for split_name in splits
}

def _next_shard(split: str):
    """Close the current shard of ``split`` and open the next one.

    Updates ``state[split]`` in place: the shard index is incremented, the
    per-shard image counter is reset and ``tar`` points to the new archive.
    """
    entry = state[split]
    entry["tar"].close()
    entry["shard_idx"]    = entry["shard_idx"] + 1
    entry["img_in_shard"] = 0
    entry["tar"]          = _open_shard(split, entry["shard_idx"])[0]

# --------------------------------------------------------------------------- #
# extract the patches – shuffled so that classes are spread across shards     #
# --------------------------------------------------------------------------- #
slide_cache       = {}   # source path -> open slide handle, filled by extract_patch
patch_df_shuffled = patch_df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

pbar = tqdm(total=len(patch_df_shuffled), desc="Extract", unit="patch")

# try/finally: whatever happens inside the loop (I/O error, unexpected split
# value, interrupt), the tar archives are finalised and the slide handles are
# released instead of being left open.
try:
    for idx, row in patch_df_shuffled.iterrows():
        split       = row["split"]
        split_state = state[split]

        # 1) extract the patch; a failure skips this row only
        try:
            img = extract_patch(row, slide_cache)
        except Exception as e:
            pbar.write(f"⚠️  skip {row['patient_id']} @({row['x']},{row['y']}): {e}")
            pbar.update(1)
            continue

        # 2) rotate the shard only now that there is an image to write, so a
        #    failed extraction never leaves an empty trailing shard behind
        if split_state["img_in_shard"] >= SHARD_SIZE:
            _next_shard(split)   # mutates state[split] in place

        # 3) write the JPEG bytes into the tar
        #    idx is the row position in the shuffled frame, so names are unique
        fname = f"{row['subtype']}_{row['patient_id']}_{idx:06d}.jpg"
        buf   = io.BytesIO()
        img.save(buf, format="JPEG", quality=90)
        ti = tarfile.TarInfo(fname)
        ti.size = buf.tell()     # bytes written so far = encoded image size
        buf.seek(0)
        split_state["tar"].addfile(ti, buf)
        split_state["img_in_shard"] += 1

        # 4) debug copy (at most MAX_DEBUG_JPG per class and split)
        cnt = split_state["dbg_cnt"].get(row["subtype"], 0)
        if cnt < MAX_DEBUG_JPG:
            img.save(dbg_root / split / f"{row['subtype']}_{cnt}.jpg")
            split_state["dbg_cnt"][row["subtype"]] = cnt + 1

        pbar.update(1)
finally:
    pbar.close()

    # ----------------------------------------------------------------------- #
    # final close of every shard and of every cached slide                    #
    # ----------------------------------------------------------------------- #
    for s in splits:
        state[s]["tar"].close()
    for slide in slide_cache.values():
        slide.close()
    slide_cache.clear()

print("\n✅ Estratti shard:")
for s in splits:
    print(f"  {s:<5}: {state[s]['shard_idx'] + 1} tar in {out_root / s}")

print(f"\n✅ Immagini di debug (max {MAX_DEBUG_JPG} / classe e split) in {dbg_root}")


Extract:   0%|          | 0/2500 [00:00<?, ?patch/s]


✅ Estratti shard:
  train: 1 tar in /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/webdataset/train
  val  : 1 tar in /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/webdataset/val
  test : 1 tar in /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/webdataset/test

✅ Immagini di debug (max 10 / classe e split) in /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/visual_debug/extract_examples


In [7]:
# ===== Cell 6: Visualizzazione esempi estratti ===============================
import matplotlib.pyplot as plt
from PIL import Image
import math, itertools

# For every subtype, keep the largest number of debug images saved in any
# split (the counters come from the per-split state of the extraction cell).
debug_count = {}
for s in splits:
    for subtype, count in state[s]["dbg_cnt"].items():
        debug_count[subtype] = max(debug_count.get(subtype, 0), count)

# Directory the extraction cell wrote the debug jpgs to
jpg_debug_dir = dbg_root

subtypes = sorted(debug_count.keys())
cols     = MAX_DEBUG_JPG
rows     = len(subtypes)

if rows == 0:
    # plt.subplots() rejects a grid with zero rows, so report and stop here.
    print("No debug images were saved – nothing to display.")
else:
    fig, axes = plt.subplots(rows, cols,
                             figsize=(cols*2.2, max(2, rows)*2.2),
                             squeeze=False)

    for r, subtype in enumerate(subtypes):
        for c in range(cols):
            ax = axes[r][c]

            # First split (in `splits` order) that has image number c for this subtype
            candidates = (jpg_debug_dir / s / f"{subtype}_{c}.jpg" for s in splits)
            img_p = next((p for p in candidates if p.exists()), None)

            if img_p is not None:
                # The context manager closes the file once imshow has read the pixels.
                with Image.open(img_p) as debug_img:
                    ax.imshow(debug_img)
            else:
                # Placeholder for grid cells without an image
                ax.text(0.5, 0.5, 'N/A', horizontalalignment='center', verticalalignment='center', transform=ax.transAxes, color='gray', fontsize=10)

            ax.axis("off")
            if c == 0:
                ax.set_title(subtype, fontsize=12, loc="left")

    plt.tight_layout()
    plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [8]:
# --------------------------------------------------------------------------- #
# Statistics on the generated tar shards
# --------------------------------------------------------------------------- #
import glob

print("\n📦 Statistiche sui .tar generati per split:\n")
for s in splits:
    shard_pattern = str(out_root / s / "patches-*.tar")
    tar_paths     = sorted(glob.glob(shard_pattern))
    print(f"Split '{s}': {len(tar_paths)} shard")
    for tp in tar_paths:
        # Number of archive members = number of images stored in the shard
        with tarfile.open(tp, "r") as shard:
            n_items = len(shard.getnames())
        size_mb = os.path.getsize(tp) / (1024**2)
        print(f"  • {os.path.basename(tp)} → {n_items:5d} immagini, {size_mb:6.2f} MB")
    print()



📦 Statistiche sui .tar generati per split:

Split 'train': 1 shard
  • patches-0000.tar →  1500 immagini,  26.16 MB

Split 'val': 1 shard
  • patches-0000.tar →   500 immagini,   8.60 MB

Split 'test': 1 shard
  • patches-0000.tar →   500 immagini,   8.98 MB

