# Preparing Unified Metadata for the RCC Dataset
This notebook performs the following steps:

1. Regenerates the mapping JSON files for ccRCC, pRCC, CHROMO, and ONCO (via `scripts/build_mapping.py`) and loads them.
2. Loads dataset statistics.
3. Verifies that the number of mapped files matches the statistics.
4. Constructs a unified table with one row per WSI, including annotations and ROIs.
5. Exports the result to a CSV file for the downstream training pipeline.

---

In [85]:
"""
load_json(path: Path) -> dict

Reads a JSON file from the given path and returns its contents as a dictionary.
"""
import json
from pathlib import Path

# Project root, taken as the parent of the current working directory.
# NOTE(review): assumes the notebook is launched from a direct subfolder of the project root — confirm.
base_path = Path.cwd().parent  # main project folder

def load_json(path: Path) -> dict:
    """Read a JSON file and return its parsed contents.

    Args:
        path: Location of the JSON file to read.

    Returns:
        The decoded JSON document (the callers in this notebook expect a dict).

    Raises:
        OSError: If the file cannot be read (e.g. it does not exist).
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale
    # encoding, which is not UTF-8 on every system (notably Windows).
    return json.loads(path.read_text(encoding="utf-8"))

In [86]:
"""
resolve_config(cfg_path: Path) -> (Path, dict)

Reads a YAML configuration file and returns:
  - base_dir: the expanded base directory
  - paths: a dict mapping keys to Path objects with ${base_dir} resolved
"""
import yaml

def resolve_config(cfg_path: Path):
    """Load the YAML configuration and resolve its path entries.

    Args:
        cfg_path: Location of the YAML configuration file. It must contain a
            ``base_dir`` entry and a ``paths`` mapping.

    Returns:
        A ``(base_dir, paths)`` tuple: ``base_dir`` is the configured base
        directory with ``~`` expanded, and ``paths`` maps every key of the
        ``paths`` section to a ``Path`` in which the ``${base_dir}``
        placeholder has been substituted and ``~`` expanded.

    Raises:
        KeyError: If ``base_dir`` or ``paths`` is missing from the file.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform locale
    # encoding, which is not UTF-8 on every system (notably Windows).
    cfg = yaml.safe_load(cfg_path.read_text(encoding="utf-8"))
    base_dir = Path(cfg["base_dir"]).expanduser()

    def R(p: str) -> Path:
        # str() tolerates non-string YAML scalars before the placeholder substitution.
        return Path(str(p).replace("${base_dir}", str(base_dir))).expanduser()

    paths = {key: R(val) for key, val in cfg["paths"].items()}
    return base_dir, paths

In [87]:
"""
check_equal(name: str, actual: int, expected: int)

Raises a ValueError if actual and expected integers differ.
"""

def check_equal(name: str, actual: int, expected: int):
    """Ensure a mapped count agrees with the count recorded in the statistics.

    Args:
        name: Label of the quantity being compared; used in the error message.
        actual: Count obtained from the mapping.
        expected: Count reported by the statistics.

    Raises:
        ValueError: If ``actual`` and ``expected`` differ.
    """
    if actual == expected:
        return
    message = f"Statistics mismatch for {name}: mapped {actual} vs expected {expected}"
    raise ValueError(message)

In [88]:
"""
rel(path: Path, base: Path) -> str

Returns the POSIX-style relative path from base to path.
"""

def rel(path: Path, base: Path) -> str:
    """Express ``path`` relative to ``base`` using forward slashes.

    Args:
        path: Path located at or below ``base``.
        base: Directory the result is made relative to.

    Returns:
        The relative path as a POSIX-style string.

    Raises:
        ValueError: If ``path`` is not located under ``base``.
    """
    relative = path.relative_to(base)
    return relative.as_posix()

# Building the Metadata Table
The following cell defines `build_metadata`, which:
- Runs `scripts/build_mapping.py` to update mapping JSONs and statistics.
- Loads mapping and stats JSON files.
- Performs consistency checks.
- Aggregates entries into a list of rows for each subtype.

In [None]:
import subprocess
import sys

import pandas as pd


# Column order of the exported CSV. Passing it to the DataFrame constructor keeps
# the header row present even when no rows were collected.
_METADATA_COLUMNS = [
    "subtype",
    "patient_id",
    "wsi_filename",
    "annotation_xml",
    "num_annotations",
    "roi_files",
    "num_rois",
    "source_dir",
]


def _annotated_rows(subtype, wsi_map, paths, std_key, pre_key, base_dir):
    """Return one row per WSI for a subtype mapped as {wsi_filename: [xml_name, ...]}."""
    rows = []
    for wsi_name, xml_list in wsi_map.items():
        std_dir, pre_dir = paths[std_key], paths[pre_key]
        # Look in the standard folder first, then in the "pre" folder; the
        # source directory stays empty when the slide is in neither of them.
        if (std_dir / wsi_name).exists():
            source_dir = rel(std_dir, base_dir)
        elif (pre_dir / wsi_name).exists():
            source_dir = rel(pre_dir, base_dir)
        else:
            source_dir = ""

        rows.append({
            "subtype":        subtype,
            # The patient id is the part of the file name before the first dot.
            "patient_id":     wsi_name.split(".")[0],
            "wsi_filename":   wsi_name,
            "annotation_xml": ";".join(xml_list),
            "num_annotations": len(xml_list),
            "roi_files":      "",
            "num_rois":       0,
            "source_dir":     source_dir
        })
    return rows


def _roi_rows(subtype, patient_map, paths, wsi_key, base_dir):
    """Return one row per WSI for a subtype mapped as {patient_id: {"wsi_files", "roi_files"}}."""
    rows = []
    for patient_id, info in patient_map.items():
        roi_files = info["roi_files"]
        # Every slide of a patient carries that patient's full ROI list.
        for wsi_name in info["wsi_files"]:
            rows.append({
                "subtype":        subtype,
                "patient_id":     patient_id,
                "wsi_filename":   wsi_name,
                "annotation_xml": "",
                "num_annotations": 0,
                "roi_files":      ";".join(roi_files),
                "num_rois":       len(roi_files),
                "source_dir":     rel(paths[wsi_key], base_dir)
            })
    return rows


def build_metadata(cfg_file: str, mapdir: str, out_csv: str):
    """Build the unified per-WSI metadata table and write it to a CSV file.

    The function refreshes the mapping JSON files by running
    ``scripts/build_mapping.py``, loads them together with the dataset
    statistics, checks that the mapped WSI counts match the statistics, and
    exports one row per WSI for the ccRCC, pRCC, CHROMO and ONCO subtypes.

    Args:
        cfg_file: Path of the YAML configuration file.
        mapdir: Directory where the mapping and statistics JSON files are
            written by the mapping script and read back from.
        out_csv: Destination of the CSV file; missing parent directories are
            created.

    Raises:
        subprocess.CalledProcessError: If the mapping script exits with a
            non-zero status.
        ValueError: If a mapped WSI count differs from the statistics.
    """
    base_dir, paths = resolve_config(Path(cfg_file))
    map_dir = Path(mapdir)

    # Update mapping JSONs and statistics. sys.executable runs the script with
    # the interpreter of the current kernel rather than whichever "python3"
    # happens to be first on PATH (which may not exist at all on Windows).
    subprocess.run([
        sys.executable,
        str(base_path / "scripts/build_mapping.py"),
        "--cfg", str(cfg_file),
        "--out", str(mapdir)
    ], check=True)

    # Load mappings and stats
    cc_map     = load_json(map_dir / "ccRCC_mapping.json")
    pr_map     = load_json(map_dir / "pRCC_mapping.json")
    chromo_map = load_json(map_dir / "CHROMO_patient_mapping.json")
    onco_map   = load_json(map_dir / "ONCO_patient_mapping.json")
    stats      = load_json(map_dir / "rcc_dataset_stats.json")

    # Consistency checks: ccRCC/pRCC maps are keyed by WSI, while CHROMO/ONCO
    # maps are keyed by patient, so their WSIs are counted across patients.
    check_equal("ccRCC", len(cc_map), stats["ccRCC"]["n_wsis"])
    check_equal("pRCC", len(pr_map), stats["pRCC"]["n_wsis"])
    total_chromo = sum(len(v["wsi_files"]) for v in chromo_map.values())
    check_equal("CHROMO", total_chromo, stats["CHROMO"]["n_wsis"])
    total_onco = sum(len(v["wsi_files"]) for v in onco_map.values())
    check_equal("ONCO", total_onco, stats["ONCO"]["n_wsis"])

    # Row order matches the subtype order: ccRCC, pRCC, CHROMO, ONCO.
    rows = []
    rows.extend(_annotated_rows("ccRCC", cc_map, paths, "ccrcc_wsi", "pre_ccrcc_wsi", base_dir))
    rows.extend(_annotated_rows("pRCC", pr_map, paths, "prcc_wsi", "pre_prcc_wsi", base_dir))
    rows.extend(_roi_rows("CHROMO", chromo_map, paths, "chromo_wsi", base_dir))
    rows.extend(_roi_rows("ONCO", onco_map, paths, "onco_wsi", base_dir))

    # Create DataFrame and export to CSV
    df = pd.DataFrame(rows, columns=_METADATA_COLUMNS)
    output_path = Path(out_csv)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)

    print(f"✅ Unified metadata CSV saved to: {out_csv}")
    print(f"Total entries: {len(df)}")

# Running the Notebook
Adjust the configuration file path, mapping directory, and output CSV path below, then run the notebook.

In [90]:
# Input and output locations, all resolved against the project root.
config_path = base_path / 'config/preprocessing.yaml'
mapping_dir = base_path / 'data/processed/mapping'
metadata_csv = base_path / 'data/processed/metadata.csv'

# Build the metadata only when the configuration file is present;
# otherwise report the missing file instead of failing.
if config_path.exists():
    build_metadata(str(config_path), str(mapping_dir), str(metadata_csv))
else:
    print(f"❌ Config file not found: {config_path}")


⚠️  ccRCC slides without any XML:
    /Users/stefanoroybisignano/Library/CloudStorage/GoogleDrive-stefano2001roy@gmail.com/.shortcut-targets-by-id/1Vr1oH1irxY5UUXdrAgvX4dPpgb8-01Ma/RCC_WSIs/pre/ccRCC/HP10.2986.A4.ccRCC.scn
⚠️  CHROMO WSI without mapping:
    /Users/stefanoroybisignano/Library/CloudStorage/GoogleDrive-stefano2001roy@gmail.com/.shortcut-targets-by-id/1Vr1oH1irxY5UUXdrAgvX4dPpgb8-01Ma/RCC_WSIs/CHROMO/HP70605.svs
⚠️  ONCO WSI without mapping:
    /Users/stefanoroybisignano/Library/CloudStorage/GoogleDrive-stefano2001roy@gmail.com/.shortcut-targets-by-id/1Vr1oH1irxY5UUXdrAgvX4dPpgb8-01Ma/RCC_WSIs/ONCOCYTOMA/HP19008963-2-A-HE_1971.svs
    /Users/stefanoroybisignano/Library/CloudStorage/GoogleDrive-stefano2001roy@gmail.com/.shortcut-targets-by-id/1Vr1oH1irxY5UUXdrAgvX4dPpgb8-01Ma/RCC_WSIs/ONCOCYTOMA/HP19008963-2-B-HE_1972.svs
    /Users/stefanoroybisignano/Library/CloudStorage/GoogleDrive-stefano2001roy@gmail.com/.shortcut-targets-by-id/1Vr1oH1irxY5UUXdrAgvX4dPpgb8-01Ma/RCC_W