Preparing Unified Metadata for the RCC Dataset
This notebook performs the following steps:

1. Loads mapping JSON files for ccRCC, pRCC, CHROMO, and ONCO.
2. Loads dataset statistics.
3. Verifies that the number of mapped WSIs for each subtype matches the count recorded in the dataset statistics.
4. Constructs a unified table with one row per WSI, including annotations and ROIs.
5. Exports the result to a CSV file for the downstream training pipeline.

---

In [10]:
# Mount Google Drive when running on Colab. Outside Colab the google.colab
# package is not importable, so skip the mount instead of failing: the
# configuration cell below already falls back to local paths.
try:
    from google.colab import drive
except ImportError:
    drive = None  # not running on Colab

if drive is not None:
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [11]:
# Cell 1: Load the YAML configuration and resolve the project root
from pathlib import Path
import yaml

# Path to the YAML file on Drive; fall back to a path relative to the
# current working directory when the Drive location does not exist.
yaml_path = Path('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/config/preprocessing.yaml')
if not yaml_path.exists():
    yaml_path = Path('config/preprocessing.yaml')

# Read explicitly as UTF-8 so parsing does not depend on the platform's
# default locale encoding.
with open(yaml_path, 'r', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

# Resolve the environment (Colab vs local): the Colab root is preferred,
# the local root is used only when the Colab one does not exist.
colab_root = Path(cfg['env_paths']['colab'])
local_root = Path(cfg['env_paths']['local'])
if colab_root.exists():
    resolved_base = str(colab_root)
elif local_root.exists():
    resolved_base = str(local_root)
else:
    raise FileNotFoundError("Né root colab né root locale trovati.")

# Inject the resolved locations into the configuration
cfg['RESOLVED_BASE_DIR'] = resolved_base
cfg['base_dir']           = f"{resolved_base}/data/RCC_WSIs"
cfg['project_root']       = resolved_base

print("✅ Config caricata. Base dir:", cfg['base_dir'])


✅ Config caricata. Base dir: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs


In [12]:
# Cell 2: installa le dipendenze necessarie
!pip install pyyaml pandas



In [13]:
# Cell 3: Helper and main function definitions
import json
import subprocess
from pathlib import Path

import yaml
import pandas as pd

def load_json(path: Path) -> dict:
    """Load a JSON file and return its parsed content.

    The file is decoded as UTF-8 (the standard JSON encoding) rather than the
    platform's locale encoding, so non-ASCII file names inside the mappings
    load identically on every operating system.

    Args:
        path: Location of the JSON file (a ``Path`` or a path string).

    Returns:
        The decoded JSON document.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with Path(path).open("r", encoding="utf-8") as fh:
        return json.load(fh)

def check_equal(name: str, actual: int, expected: int):
    """Consistency guard: fail loudly when two counts disagree.

    Returns None when ``actual`` and ``expected`` agree; otherwise raises
    ValueError with a message that names the dataset ``name`` and both values.
    """
    mismatch = actual != expected
    if not mismatch:
        return None
    raise ValueError(f"Stat mismatch for {name}: mapping has {actual}, stats expect {expected}")

def rel(path: Path, base: Path) -> str:
    """Express ``path`` relative to ``base`` using forward slashes.

    Raises ValueError (from ``Path.relative_to``) when ``path`` is not
    located under ``base``.
    """
    relative = path.relative_to(base)
    return relative.as_posix()

def extract_patient_id(wsi_filename: str) -> str:
    """
    Derive the patient ID from a WSI filename.

    The ID is made of the first two dot-separated fields, e.g.
    'HP02.10180.1A2.ccRCC.scn' -> 'HP02.10180'. A name without any dot
    is returned unchanged.
    """
    leading_fields = wsi_filename.split(".")[:2]
    return ".".join(leading_fields)

def build_rows_from_map(mapping: dict, subtype: str, path_std: Path, path_pre: Path, base_dir: Path) -> list:
    """Create one metadata row per WSI for the XML-annotated subtypes (ccRCC/pRCC).

    ``mapping`` maps each WSI filename to the list of its annotation XML files.
    ``source_dir`` is the directory (relative to ``base_dir``) in which the
    slide file was found: ``path_std`` is tried first, then ``path_pre``; it
    is left empty when the file exists in neither. ROI columns are always
    empty for these subtypes.
    """
    def locate(wsi_name: str) -> str:
        # The first candidate directory that actually contains the slide wins.
        for candidate in (path_std, path_pre):
            if (candidate / wsi_name).exists():
                return rel(candidate, base_dir)
        return ""

    records = []
    for wsi_name, xml_files in mapping.items():
        source = locate(wsi_name)
        record = {
            "subtype": subtype,
            "patient_id": extract_patient_id(wsi_name),
            "wsi_filename": wsi_name,
            "annotation_xml": ";".join(xml_files),
            "num_annotations": len(xml_files),
            "roi_files": "",
            "num_rois": 0,
            "source_dir": source,
        }
        records.append(record)
    return records

def build_roi_rows(patient_map: dict, subtype: str, wsi_dir: Path, base_dir: Path) -> list:
    """Create one metadata row per WSI for the ROI-based subtypes (CHROMO/ONCO).

    ``patient_map`` maps a patient ID to a dict that may hold ``"wsi_files"``
    and ``"roi_files"`` lists (a missing key counts as an empty list). Every
    WSI of a patient receives that patient's full ROI list; the annotation
    columns are always empty, and ``source_dir`` is ``wsi_dir`` relative to
    ``base_dir``.
    """
    records = []
    for patient_id, entry in patient_map.items():
        rois = entry.get("roi_files", [])
        records.extend(
            {
                "subtype": subtype,
                "patient_id": patient_id,
                "wsi_filename": wsi_name,
                "annotation_xml": "",
                "num_annotations": 0,
                "roi_files": ";".join(rois),
                "num_rois": len(rois),
                "source_dir": rel(wsi_dir, base_dir),
            }
            for wsi_name in entry.get("wsi_files", [])
        )
    return records

def resolve_paths(cfg: dict):
    """Build Path objects for the WSI directories listed in ``cfg['paths']``.

    Each path template has its literal ``${base_dir}`` placeholder replaced
    by ``cfg['base_dir']`` before being wrapped in a ``Path``. Returns a dict
    with the keys ``ccrcc_wsi``, ``pre_ccrcc_wsi``, ``prcc_wsi``,
    ``pre_prcc_wsi``, ``chromo_wsi`` and ``onco_wsi``; a missing key in the
    configuration raises KeyError.
    """
    base_str = str(Path(cfg['base_dir']))
    templates = cfg['paths']
    wanted = (
        'ccrcc_wsi',
        'pre_ccrcc_wsi',
        'prcc_wsi',
        'pre_prcc_wsi',
        'chromo_wsi',
        'onco_wsi',
    )
    return {
        key: Path(templates[key].replace('${base_dir}', base_str))
        for key in wanted
    }


In [15]:
# Cell 4: Load the mappings, verify them, build the table and save the CSV
# Path settings: both locations live under <project_root>/data/processed
processed_dir = f"{cfg['project_root']}/data/processed"
mapdir  = f"{processed_dir}/mapping"
out_csv = f"{processed_dir}/metadata.csv"

/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/metadata.csv
# 2) Load the mapping and statistics JSON files
mapping_dir = Path(mapdir)
cc_map     = load_json(mapping_dir / "ccRCC_mapping.json")
pr_map     = load_json(mapping_dir / "pRCC_mapping.json")
chromo_map = load_json(mapping_dir / "CHROMO_patient_mapping.json")
onco_map   = load_json(mapping_dir / "ONCO_patient_mapping.json")
stats      = load_json(mapping_dir / "rcc_dataset_stats.json")

# 3) Consistency checks: mapped WSI counts must match the dataset statistics
check_equal("ccRCC", len(cc_map), stats["ccRCC"]["n_wsis"])
check_equal("pRCC", len(pr_map), stats["pRCC"]["n_wsis"])
# Count with .get(..., []) so that a patient entry without "wsi_files" is
# treated exactly as build_roi_rows treats it (no rows) instead of raising
# KeyError here; a real shortfall is still reported by check_equal.
actual_ch = sum(len(v.get("wsi_files", [])) for v in chromo_map.values())
check_equal("CHROMO", actual_ch, stats["CHROMO"]["n_wsis"])
actual_on = sum(len(v.get("wsi_files", [])) for v in onco_map.values())
check_equal("ONCO", actual_on, stats["ONCO"]["n_wsis"])

# 4) Build the rows, one per WSI, in the order ccRCC, pRCC, CHROMO, ONCO
paths = resolve_paths(cfg)
base_dir = Path(cfg['base_dir'])
rows = []
rows += build_rows_from_map(cc_map, "ccRCC", paths['ccrcc_wsi'], paths['pre_ccrcc_wsi'], base_dir)
rows += build_rows_from_map(pr_map, "pRCC",  paths['prcc_wsi'],  paths['pre_prcc_wsi'],  base_dir)
rows += build_roi_rows(chromo_map, "CHROMO", paths['chromo_wsi'], base_dir)
rows += build_roi_rows(onco_map,   "ONCO",   paths['onco_wsi'],   base_dir)

# 5) Save the CSV (the output directory is created when missing)
df = pd.DataFrame(rows)
out_path = Path(out_csv)
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(out_path, index=False)

print(f"✅ Consolidated metadata saved to {out_csv}")
print(f"   Total entries: {len(df)}")

# build_rows_from_map leaves source_dir empty when a slide is found in neither
# of its candidate directories; report that instead of letting it pass silently.
n_missing = sum(1 for row in rows if row["source_dir"] == "")
if n_missing:
    print(f"⚠️ {n_missing} WSI(s) not found on disk (empty source_dir)")


✅ Consolidated metadata saved to /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/metadata.csv
   Total entries: 197
