In [19]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [20]:
!apt-get update -qq && apt-get install -qq -y libopenslide-dev
!pip install openslide-python

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [21]:
# ===== Cell 1: Monta Drive e carica YAML =====
from pathlib import Path
import os, yaml
import random
import numpy as np
import pandas as pd, json, random, yaml, numpy as np
import xml.etree.ElementTree as ET

from openslide import OpenSlide
from tqdm.notebook import tqdm  # usa tqdm in Colab


# 1) Carica raw YAML
yaml_path = Path('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/config/preprocessing.yaml')
if not yaml_path.exists():
    yaml_path = Path('config/preprocessing.yaml')
with open(yaml_path, 'r') as f:
    cfg = yaml.safe_load(f)

# 2) Rileva ambiente
colab_root = Path(cfg['env_paths']['colab'])
local_root = Path(cfg['env_paths']['local'])
if colab_root.exists():
    resolved = str(colab_root)
elif local_root.exists():
    resolved = str(local_root)
else:
    raise FileNotFoundError("Neither colab nor local root found.")

# 3) Sostituisci i placeholder
cfg['RESOLVED_BASE_DIR'] = resolved
cfg['project_root'] = resolved
cfg['base_dir'] = f"{resolved}/data/RCC_WSIs"

print("✅ Ambiente:", "Colab" if colab_root.exists() else "Locale")


✅ Ambiente: Colab


In [22]:
# ===== Cell 2: Imports, risolvi paths e stage =====
import random
from pathlib import Path

# 1) Risolvi base_dir e project_root
base_dir     = Path(cfg['base_dir'])
project_root = Path(cfg['project_root'])

# 2) Risolvi tutti i raw data paths
paths = {
    k: Path(v.replace('${base_dir}', str(base_dir)))
    for k, v in cfg['paths'].items()
}

# 3) Definisci anche WSI_DIR e XML_DIR per comodità
WSI_DIR = {
    'ccRCC':  paths['ccrcc_wsi'],
    'pRCC':   paths['prcc_wsi'],
    'CHROMO': paths['chromo_wsi'],
    'ONCO':   paths['onco_wsi'],
}
XML_DIR = {
    'ccRCC': paths['ccrcc_xml'],
    'pRCC':  paths['prcc_xml'],
}

# 4) Carica le percentuali di split e il seed dal YAML (non serve più JSON)
SPLIT_RATIOS     = cfg['split']['ratios']
SPLIT_RANDOM_SEED = cfg['split']['random_seed']

# 5) Determina lo stage attivo e definisci DEV_ENABLED e DEV_PER_CLASS
DEV_ENABLED    = cfg['stages']['debug']['downsample_patients']['enabled']
DEV_PER_CLASS  = cfg['stages']['debug']['downsample_patients']['per_class']

if DEV_ENABLED:
    stage_name = 'debug'
else:
    stage_name = 'training'
stage_cfg = cfg['stages'][stage_name]

# 6) Estrai parametri dal blocco attivo
PER_CLASS         = stage_cfg['downsample_patients']['per_class']
patch_cfg         = stage_cfg['patching']
PATCH_SIZE        = patch_cfg['patch_size']
STRIDE            = patch_cfg['stride']
PATCHES_PER_CLASS = patch_cfg['patches_per_class']
RANDOM_SEED       = patch_cfg['random_seed']

# 7) Imposta seed globale
random.seed(RANDOM_SEED)

# 8) Stampa riepilogo
print(f"✅ Stage attivo:       {stage_name}")
print(f"➡️  DEV_ENABLED:       {DEV_ENABLED}")
print(f"➡️  DEV_PER_CLASS:     {DEV_PER_CLASS}")
print(f"➡️  PER_CLASS:         {PER_CLASS}")
print(f"➡️  PATCHES_PER_CLASS: {PATCHES_PER_CLASS}")
print(f"➡️  PATCH_SIZE:        {PATCH_SIZE}")
print(f"➡️  STRIDE:            {STRIDE}")
print(f"➡️  RANDOM_SEED:       {RANDOM_SEED}")
print(f"➡️  SPLIT_RATIOS:      {SPLIT_RATIOS}")
print(f"➡️  SPLIT_RANDOM_SEED: {SPLIT_RANDOM_SEED}")

✅ Stage attivo:       training
➡️  DEV_ENABLED:       False
➡️  DEV_PER_CLASS:     5
➡️  PER_CLASS:         5
➡️  PATCHES_PER_CLASS: 500
➡️  PATCH_SIZE:        224
➡️  STRIDE:            112
➡️  RANDOM_SEED:       123
➡️  SPLIT_RATIOS:      {'train': 0.6, 'val': 0.2, 'test': 0.2}
➡️  SPLIT_RANDOM_SEED: 42


In [23]:
# ===== Cell 3: parse_rois & is_patch_informative =====
import xml.etree.ElementTree as ET
from pathlib import Path
import numpy as np

def parse_rois(xml_path: Path):
    """
    Estrae tutte le ROI da un file ASAP XML, restituendo una lista di
    tuple (minx, maxx, miny, maxy, label) dove label è 'tumor' o 'not_tumor'.
    """
    tree = ET.parse(str(xml_path))
    root = tree.getroot()
    rois = []
    annots = root.find('Annotations')
    if annots is None:
        return rois

    for a in annots.findall('Annotation'):
        group = a.attrib.get('PartOfGroup')
        if group is None:
            continue

        coords = a.find('Coordinates')
        xs = [float(c.attrib['X']) for c in coords.findall('Coordinate')]
        ys = [float(c.attrib['Y']) for c in coords.findall('Coordinate')]
        if not xs or not ys:
            continue

        minx, maxx = int(min(xs)), int(max(xs))
        miny, maxy = int(min(ys)), int(max(ys))
        # Raggruppa tutte le annotazioni non-"tumor" in 'not_tumor'
        label = 'tumor' if group.lower() == 'tumor' else 'not_tumor'
        rois.append((minx, maxx, miny, maxy, label))

    return rois

def is_patch_informative(pil_img, thresh=10):
    """
    Restituisce True se la patch (PIL) ha deviazione standard del canale
    in scala di grigi maggiore di thresh (evita aree troppo uniformi).
    """
    gray = pil_img.convert("L")
    return np.array(gray).std() > thresh


In [24]:
# ===== Cell 4: Load metadata & filter =====
import pandas as pd

# 1) Carica metadata (tutte le colonne come stringhe)
metadata = pd.read_csv(project_root / 'data/processed/metadata.csv', dtype=str)

# 2) Tieni solo le righe che hanno annotazioni XML o ROI
has_xml = metadata['annotation_xml'].notna() & (metadata['annotation_xml'] != '')
has_roi = metadata['roi_files'].notna()       & (metadata['roi_files']    != '')
metadata = metadata[has_xml | has_roi].reset_index(drop=True)

# 3) Se siamo in modalità debug, downsampling per pazienti distinti
if DEV_ENABLED:
    selected_pats = []
    for subtype, grp in metadata.groupby('subtype'):
        # prendi i primi DEV_PER_CLASS pazienti distinti per ogni sottotipo
        pats = grp['patient_id'].unique()[:DEV_PER_CLASS].tolist()
        print(f"→ {subtype}: selezionati pazienti {pats}")
        selected_pats.extend(pats)
    metadata = metadata[metadata['patient_id'].isin(selected_pats)].reset_index(drop=True)

# 4) Riepilogo
print(f"✅ Metadata totale: {len(metadata)} righe")
print(metadata['subtype'].value_counts())

✅ Metadata totale: 196 righe
subtype
ccRCC     124
pRCC       48
ONCO       13
CHROMO     11
Name: count, dtype: int64


In [25]:
# ===== Cell 5: Build patch_df with stage overrides =====
!apt-get update -qq && apt-get install -qq -y libopenslide-dev
!pip install openslide-python

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)


In [26]:
# ===== Cell 5 – patch sampling bilanciato per split =========================
import math, random, tarfile
from tqdm import tqdm
from collections import defaultdict, Counter
from openslide import OpenSlide

rng = random.Random(RANDOM_SEED)

# ---------------------------------------------------------------------#
# 1) patient_split (una sola volta qui, prima di estrarre le patch)    #
# ---------------------------------------------------------------------#
patient_split = {}
rng_ps = random.Random(SPLIT_RANDOM_SEED)

for sub in metadata['subtype'].unique():          # ccRCC, pRCC, CHROMO, ONCO
    pats = sorted(metadata.loc[metadata.subtype == sub, 'patient_id'].unique())
    rng_ps.shuffle(pats)

    n = len(pats)
    n_train = max(1, math.floor(n * SPLIT_RATIOS['train']))
    n_val   = max(1 if n >= 3 else 0, math.floor(n * SPLIT_RATIOS['val']))
    n_test  = n - n_train - n_val
    if n_test == 0 and n_val > 1:                 # garanzia minima
        n_val -= 1
        n_test += 1

    idx = 0
    for p in pats[idx: idx + n_train]:
        patient_split[p] = 'train'
    idx += n_train
    for p in pats[idx: idx + n_val]:
        patient_split[p] = 'val'
    idx += n_val
    for p in pats[idx: idx + n_test]:
        patient_split[p] = 'test'

# debug rapido
print("\n✅ Pazienti per split (check)")
for sub in sorted(metadata.subtype.unique()):
    c = Counter(patient_split[p] for p in metadata.loc[metadata.subtype==sub,'patient_id'])
    print(f"  {sub:7}: {dict(c)}")

# ---------------------------------------------------------------------#
# 2) pre-costruzione mappa ROI (come prima, ma riusabile)              #
# ---------------------------------------------------------------------#
def build_roi_map(sub_meta, sub):
    rm = defaultdict(list)
    for _, r in sub_meta.iterrows():
        pid  = r['patient_id']
        if sub in ('ccRCC', 'pRCC'):           # XML
            xml_dir = paths[f"{'pre_' if r['source_dir'].startswith('pre') else ''}{sub.lower()}_xml"]
            wsi_dir = paths[f"{'pre_' if r['source_dir'].startswith('pre') else ''}{sub.lower()}_wsi"]
            wsi_p   = wsi_dir / r['wsi_filename']
            for xm in r['annotation_xml'].split(';'):
                xml_p = xml_dir / xm
                try:
                    for *box, label in parse_rois(xml_p):
                        eff = sub if label == 'tumor' else 'not_tumor'
                        rm[pid].append(('xml', wsi_p, xml_p, box, eff))
                except FileNotFoundError:
                    continue
        else:                                  # ROI singole
            ann_dir = paths['onco_ann'] if sub == 'ONCO' else paths['chromo_ann']
            for f in r['roi_files'].split(';'):
                rm[pid].append(('roi', ann_dir/f, None, None, sub))
    return rm

roi_maps = {s: build_roi_map(metadata[metadata.subtype == s], s)
            for s in metadata.subtype.unique()}

# ---------------------------------------------------------------------#
# 3) sampling patch → rows (bilanciato)                                #
# ---------------------------------------------------------------------#
rows = []
split_names = ['train', 'val', 'test']
ratio_order = ['train', 'val', 'test']         # per somma = PATCHES_PER_CLASS

for sub in list(metadata.subtype.unique()) + ['not_tumor']:

    print(f"\n➡️  Sampling {sub}")
    # calcola quante patch servono per ciascuno split
    target_split = {sp: int(PATCHES_PER_CLASS * SPLIT_RATIOS[sp])
                    for sp in split_names}
    # aggiusta rounding → tutto torna a PATCHES_PER_CLASS
    diff = PATCHES_PER_CLASS - sum(target_split.values())
    target_split['train'] += diff                  # piccolo scarto sul train

    # costruiamo lista candidati
    if sub != 'not_tumor':
        roi_map = roi_maps[sub]
    else:  # not_tumor prende dalle ROI 'not_tumor' di ccRCC+pRCC
        roi_map = defaultdict(list)
        for orig in ('ccRCC', 'pRCC'):
            for pid, lst in roi_maps[orig].items():
                roi_map[pid] += [r for r in lst if r[4] == 'not_tumor']

    # distribuzione
    for split in ratio_order:
        need = target_split[split]
        if need == 0:
            continue

        # tutti i pid di questo split
        split_pids = [p for p in roi_map if patient_split.get(p) == split]
        if not split_pids:
            raise RuntimeError(f"Nessun paziente {sub} nel split {split}")

        # ciclo finché raccolgo need patch (con replacement se necessario)
        pbar = tqdm(total=need, desc=f"{sub}-{split}", leave=False, unit='p')
        collected = 0
        while collected < need:
            pid = rng.choice(split_pids)
            cand = [r for r in roi_map[pid] if r[4] == sub]  # not_tumor ok
            if not cand:
                continue
            kind, p1, p2, box, _ = rng.choice(cand)
            try:
                slide = OpenSlide(str(p1))
            except Exception:
                continue

            if kind == 'xml':
                minx, maxx, miny, maxy = box
                if maxx - minx < PATCH_SIZE or maxy - miny < PATCH_SIZE:
                    continue
                x = rng.randint(minx, maxx - PATCH_SIZE)
                y = rng.randint(miny, maxy - PATCH_SIZE)
                rec = {'wsi_path': str(p1), 'xml_path': str(p2), 'roi_file': None}
            else:
                W, H = slide.dimensions
                if W < PATCH_SIZE or H < PATCH_SIZE:
                    continue
                x = rng.randint(0, W - PATCH_SIZE)
                y = rng.randint(0, H - PATCH_SIZE)
                rec = {'wsi_path': None, 'xml_path': None, 'roi_file': str(p1)}

            rows.append({
                'subtype': sub,
                'patient_id': pid,
                **rec,
                'x': x, 'y': y,
                'patch_size': PATCH_SIZE,
                'split': split          # ← assegnato qui, già bilanciato
            })
            collected += 1
            pbar.update(1)
        pbar.close()


✅ Pazienti per split (check)
  CHROMO : {'train': 8, 'val': 1, 'test': 2}
  ONCO   : {'train': 9, 'test': 2, 'val': 2}
  ccRCC  : {'train': 64, 'test': 30, 'val': 30}
  pRCC   : {'train': 28, 'test': 15, 'val': 5}

➡️  Sampling ccRCC





➡️  Sampling pRCC





➡️  Sampling CHROMO





➡️  Sampling ONCO





➡️  Sampling not_tumor




In [27]:
# ===== Cell 6: compute patient_split over ALL patients, then build & save =====
import math, random
from collections import Counter
import pandas as pd

# 1) Costruisci patient_split usando tutti i pazienti disponibili per sottotipo
rng = random.Random(cfg['split']['random_seed'])
patient_split = {}
for sub in metadata['subtype'].unique():
    pats = sorted(metadata[metadata['subtype']==sub]['patient_id'].unique())
    rng.shuffle(pats)
    n = len(pats)

    # calcolo iniziale
    n_train = max(1, math.floor(n * SPLIT_RATIOS['train']))
    n_val   = math.floor(n * SPLIT_RATIOS['val'])
    n_test  = n - n_train - n_val

    # === Aggiustamento per avere almeno 1 val se possibile ===
    if n >= 3 and n_val == 0:
        n_val = 1
        # tolgo 1 da test se test > train altrimenti da train
        if n_test > n_train:
            n_test -= 1
        else:
            n_train -= 1

    # assegna i pazienti
    idx = 0
    for p in pats[idx: idx + n_train]:
        patient_split[p] = 'train'
    idx += n_train
    for p in pats[idx: idx + n_val]:
        patient_split[p] = 'val'
    idx += n_val
    for p in pats[idx: idx + n_test]:
        patient_split[p] = 'test'

# Debug: controllo quanti pazienti per split
print("\n✅ Pazienti per sottotipo e split:")
for sub in metadata['subtype'].unique():
    cnt = Counter(patient_split[p] for p in set(metadata[metadata['subtype']==sub]['patient_id']))
    print(f"  {sub}: {cnt}")

# 2) Ora costruisci il DataFrame dei patch sampling (Cell 5)

# 3) Applica le split e salva
patch_df = pd.DataFrame(rows)
patch_df['split'] = patch_df['patient_id'].map(patient_split).fillna('train')

print("\n✅ Patch per sottotipo e split:")
print(patch_df.groupby(['subtype','split']).size())



✅ Pazienti per sottotipo e split:
  ccRCC: Counter({'train': 34, 'test': 12, 'val': 11})
  pRCC: Counter({'train': 13, 'test': 5, 'val': 4})
  CHROMO: Counter({'train': 3, 'test': 1, 'val': 1})
  ONCO: Counter({'train': 3, 'test': 1, 'val': 1})

✅ Patch per sottotipo e split:
subtype    split
CHROMO     test     100
           train    300
           val      100
ONCO       test     100
           train    300
           val      100
ccRCC      test     100
           train    300
           val      100
not_tumor  test     100
           train    300
           val      100
pRCC       test     100
           train    300
           val      100
dtype: int64


In [28]:
# ===== Cell 7: Final DataFrame & save =====

out = project_root/'data/processed/patch_df.parquet'
out.parent.mkdir(parents=True, exist_ok=True)
patch_df.to_parquet(out, index=False)
print(f"\n✅ patch_df saved: {out}")




✅ patch_df saved: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/patch_df.parquet


In [29]:
# ===== Cell 8: Load DataFrame and print unique patients =====
from pathlib import Path
import pandas as pd

# 1) Percorso al parquet
# Use the project_root variable to reconstruct the correct path
patch_df_path = project_root / 'data/processed/patch_df.parquet'

# 2) Carica il DataFrame
df = pd.read_parquet(patch_df_path)

# 3) Raggruppa per sottotipo e ottieni i pazienti unici
grouped = (
    df
    .groupby('subtype')['patient_id']
    .unique()
    .reset_index()
    .rename(columns={'patient_id': 'patients'})
)

# 4) Stampa conteggio e lista
print("✅ Conteggio pazienti unici nel patch_df:")
for _, row in grouped.iterrows():
    subtype = row['subtype']
    pats = sorted(row['patients'])
    print(f"  {subtype}: {len(pats)} pazienti → {', '.join(pats)}")

✅ Conteggio pazienti unici nel patch_df:
  CHROMO: 5 pazienti → HP17008718, HP18014084, HP19012316, HP20.2506, HP20002300
  ONCO: 5 pazienti → HP18005453, HP18009209, HP20.5602, HP20001530, HP20002450
  ccRCC: 51 pazienti → H19.754, HP02.10180, HP10.2695, HP10.2986_A4_ccRCC, HP10.5813, HP11.12277, HP11.12318, HP12.13588, HP12.390, HP12.4271, HP12.5998, HP12.6073, HP12.6073_A5_ccRCC, HP12.6691, HP12.7225, HP12.8355, HP12.8793, HP12.9282, HP13.6992, HP13.7465, HP14.11034, HP14.13101, HP14.1749, HP14.1993, HP14.5347, HP14.69, HP14.7813, HP14.9097, HP14.9685, HP15.11259, HP15.12550, HP15.1480, HP15.2902, HP16.6209, HP16.6211, HP16.819, HP19.10064, HP19.2434, HP19.4075, HP19.4372, HP19.5254, HP19.5524, HP19.7421, HP19.754, HP19.7715, HP19.7840, HP19.7864, HP19.8394, HP19.9347, HP19.9421, HP19.999
  not_tumor: 77 pazienti → H19.754, HP02.10180, HP09.5392, HP10.2695, HP10.5813, HP10.9650, HP11.12277, HP11.12318, HP11.6090, HP12.13358, HP12.13588, HP12.3187, HP12.390, HP12.4271, HP12.5904, HP1