In [86]:
# ===== Cell 0: Mount Google Drive (only needed when running on Colab) =====
# The config cell below falls back to local paths, so a missing `google.colab`
# package must not abort the notebook: skip the mount instead of raising.
try:
    from google.colab import drive
except ImportError:
    IN_COLAB = False
    print("ℹ️ google.colab not available: skipping Drive mount (local run).")
else:
    IN_COLAB = True
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [87]:
# ===== Cell 1: Detect environment and load YAML config =====
import os
from pathlib import Path
import yaml

# 1. Locate and parse the YAML config: use the Drive copy when it exists,
#    otherwise the copy relative to the current working directory.
_drive_yaml = Path('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/config/preprocessing.yaml')
yaml_path = _drive_yaml if _drive_yaml.exists() else Path('config/preprocessing.yaml')

with yaml_path.open('r') as f:
    cfg = yaml.safe_load(f)

# 2. Choose the project root: the Colab path wins if present, then the local one.
_env_paths = cfg['env_paths']
colab_root = Path(_env_paths['colab'])
local_root = Path(_env_paths['local'])

_on_colab = colab_root.exists()
if not _on_colab and not local_root.exists():
    raise FileNotFoundError(f"Né percorso colab ({colab_root}) né local ({local_root}) trovati.")
resolved_root = str(colab_root if _on_colab else local_root)

# 3. Store the resolved locations back into the config dict so later cells
#    read them from `cfg` instead of from the raw YAML values.
cfg.update({
    'project_root': resolved_root,
    'base_dir':     f"{resolved_root}/data/RCC_WSIs",
})

_env_label = 'Colab' if _on_colab else 'Locale'
print(f"✅ Ambiente rilevato: {_env_label}")


✅ Ambiente rilevato: Colab


In [88]:
# ===== Cell 2: Imports and resolve all paths & settings =====
import pandas as pd
import json
import xml.etree.ElementTree as ET
import random

# Project directories taken from the config, with `~` expanded
base_dir, project_root = (
    Path(cfg[_key]).expanduser() for _key in ('base_dir', 'project_root')
)

# Raw-data locations: fill the ${base_dir} placeholder of every configured path
paths = {}
for _name, _template in cfg['paths'].items():
    _filled = _template.replace('${base_dir}', str(base_dir))
    paths[_name] = Path(_filled).expanduser()

# Patient-split JSON: fill the ${project_root} placeholder
_split_template = cfg['split_json']
split_json = Path(_split_template.replace('${project_root}', str(project_root))).expanduser()

# DEV mode: optional per-class downsampling (disabled unless configured)
down_cfg = cfg.get('downsample_patients', {})
DEV_ENABLED = down_cfg.get('enabled', False)
DEV_PER_CLASS = down_cfg.get('per_class', 1)

# Patching parameters; the stride defaults to the patch size
patch_cfg = cfg.get('patching', {})
PATCH_SIZE = patch_cfg.get('patch_size', 512)
STRIDE = patch_cfg.get('stride', PATCH_SIZE)
DEV_PATCHES_PER_WSI = patch_cfg.get('dev_patches_per_wsi', 10)
RANDOM_SEED = patch_cfg.get('random_seed', 42)

# Seed Python's RNG so the random patch coordinates drawn later are repeatable
random.seed(RANDOM_SEED)

# Echo the resolved settings for a quick visual check
print(f"✅ base_dir: {base_dir}")
print(f"✅ project_root: {project_root}")
print(f"✅ paths: {paths}")
print(f"✅ split_json: {split_json}")
print(f"✅ DEV_MODE: {DEV_ENABLED} ({DEV_PER_CLASS} pazienti per classe)")
print(f"✅ Patching: size={PATCH_SIZE}, stride={STRIDE}, dev_per_wsi={DEV_PATCHES_PER_WSI}, seed={RANDOM_SEED}")


✅ base_dir: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs
✅ project_root: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project
✅ paths: {'ccrcc_wsi': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/ccRCC'), 'ccrcc_xml': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/ccRCC/ccRCC_xml'), 'prcc_wsi': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/pRCC'), 'prcc_xml': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/pRCC/pRCC_xml'), 'chromo_wsi': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/CHROMO'), 'onco_wsi': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/ONCOCYTOMA'), 'chromo_ann': PosixPath('/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/RCC_WSIs/Annotations_chromo'), 'onco_ann': PosixPath('/content/drive/MyDrive/ColabNo

In [89]:
def parse_rois(xml_path: Path):
    """
    Read an ASAP-style annotation XML and collect the bounding boxes of its
    tumor annotations.

    Parameters
    ----------
    xml_path : Path
        Annotation file; passed straight to ``ET.parse``.

    Returns
    -------
    list of tuple
        One ``(min_x, max_x, min_y, max_y)`` tuple per ``Annotation`` element
        whose ``PartOfGroup`` attribute equals ``"tumor"`` and which holds at
        least one ``Coordinate``. Values are converted with ``int()`` (i.e.
        truncated). The list is empty when the root has no ``Annotations``
        child.
    """
    annotations = ET.parse(xml_path).getroot().find('Annotations')
    if annotations is None:
        return []  # nothing to parse

    boxes = []
    tumor_annotations = (
        node for node in annotations.findall('Annotation')
        if node.attrib.get('PartOfGroup') == 'tumor'
    )
    for annotation in tumor_annotations:
        coords = annotation.find('Coordinates')
        if coords is None:
            continue
        # Read X before Y for every vertex, as (x, y) pairs
        points = [
            (float(vertex.attrib['X']), float(vertex.attrib['Y']))
            for vertex in coords.findall('Coordinate')
        ]
        if not points:
            continue
        xs, ys = zip(*points)
        boxes.append((int(min(xs)), int(max(xs)), int(min(ys)), int(max(ys))))
    return boxes


In [90]:
# ===== Cell 4: Load metadata and splits =====
# 1) Load metadata.csv generated by step 1
metadata_csv = Path(project_root) / 'data/processed/metadata.csv'
metadata = pd.read_csv(metadata_csv)

# 2) Load patient splits (explicit encoding: do not depend on the platform default)
with open(split_json, 'r', encoding='utf-8') as f:
    splits_map = json.load(f)


def _non_empty(column):
    """Boolean mask of the entries that are neither missing nor the empty string."""
    return column.notna() & (column != '')


# 3) Keep only entries that have XML annotations OR ROI files
metadata = metadata[
    _non_empty(metadata['annotation_xml']) | _non_empty(metadata['roi_files'])
].reset_index(drop=True)

# 4) Apply DEV_MODE downsampling if enabled.
#    The setting is a number of *patients* per class, so pick the first
#    DEV_PER_CLASS distinct patients of every subtype and keep all their rows.
#    (Taking the first N rows per subtype would under-sample patients whenever
#    one patient owns several rows.)
#    NOTE(review): assumes one metadata row per slide — confirm against step 1.
if DEV_ENABLED:
    dev_patients = (
        metadata[['subtype', 'patient_id']]
        .drop_duplicates()
        .groupby('subtype')
        .head(DEV_PER_CLASS)
    )
    metadata = metadata.merge(
        dev_patients, on=['subtype', 'patient_id'], how='inner'
    ).reset_index(drop=True)

print(f"✅ Metadata entries to process: {len(metadata)}")
print(metadata['subtype'].value_counts())


✅ Metadata entries to process: 20
subtype
ccRCC     5
pRCC      5
CHROMO    5
ONCO      5
Name: count, dtype: int64


In [91]:
def is_patch_informative(patch, std_thresh=10):
    """Tell whether a patch shows enough intensity variation to be kept.

    The patch is converted with ``patch.convert("L")`` and the standard
    deviation of the resulting pixel values is compared with ``std_thresh``.

    Args:
        patch: Image object exposing ``convert("L")`` (a PIL image in this
            notebook).
        std_thresh: Threshold on the grayscale standard deviation.

    Returns:
        bool: ``True`` when the standard deviation is strictly greater than
        ``std_thresh``, ``False`` otherwise.
    """
    # Local import: NumPy is imported only by a later cell of this notebook,
    # so relying on a global `np` raises NameError after "Restart & Run All".
    import numpy as np

    gray = patch.convert("L")
    # Wrap in bool() so callers get a plain Python bool, not numpy.bool_.
    return bool(np.asarray(gray).std() > std_thresh)

In [92]:
# ===== Cell 5: Enumerate patch coordinates for every annotated slide =====
# Local import: `openslide` is otherwise imported only by a later cell, so a
# fresh "Restart & Run All" would hit NameError in the ROI-file branch below.
import openslide

# NOTE(review): WSI_DIR / XML_DIR are used below but defined in no visible
# cell. They are rebuilt here from the keys shown in the path-resolution
# printout — confirm this matches the mapping the notebook was last run with.
WSI_DIR = {
    'ccRCC':  paths['ccrcc_wsi'],
    'pRCC':   paths['prcc_wsi'],
    'CHROMO': paths['chromo_wsi'],
    'ONCO':   paths['onco_wsi'],
}
XML_DIR = {
    'ccRCC': paths['ccrcc_xml'],
    'pRCC':  paths['prcc_xml'],
}

rows = []
MAX_ROI_PER_CLASS = 2 if DEV_ENABLED else float('inf')
MAX_DEV_ATTEMPTS = 50  # random draws tried per ROI image in DEV mode before giving up
roi_counter = {cls: 0 for cls in ['ccRCC', 'pRCC', 'CHROMO', 'ONCO']}


def _roi_budget(subtype):
    """Return how many more ROIs `subtype` may contribute; None means no cap.

    A float('inf') budget cannot be used as a slice bound (TypeError), so the
    unlimited case is reported as None, which slices to "everything".
    """
    remaining = MAX_ROI_PER_CLASS - roi_counter[subtype]
    if remaining == float('inf'):
        return None
    return max(remaining, 0)


def _patch_row(subtype, pid, roi_idx, x, y, wsi_path=None, xml_path=None, roi_file=None):
    """Build one patch record; key order fixes the DataFrame column order."""
    return {
        'subtype':    subtype,
        'patient_id': pid,
        'wsi_path':   wsi_path,
        'xml_path':   xml_path,
        'roi_file':   roi_file,
        'roi_idx':    roi_idx,
        'x':          x,
        'y':          y,
        'patch_size': PATCH_SIZE,
    }


for _, row in metadata.iterrows():
    subtype = row['subtype']
    pid     = row['patient_id']

    annotation_xml = row.get('annotation_xml')
    if isinstance(annotation_xml, str) and annotation_xml.strip():
        # --- Whole-slide image with XML annotations: coordinates only, no pixel reads ---
        wsi_file = WSI_DIR[subtype] / row['wsi_filename']
        for xml_name in annotation_xml.split(';'):
            budget = _roi_budget(subtype)
            if budget == 0:
                break
            xml_path = XML_DIR[subtype] / xml_name

            # Drop boxes that cannot hold a single patch (random.randint would
            # raise on an empty range) while keeping each ROI's index in the XML.
            usable = [
                (roi_idx, box)
                for roi_idx, box in enumerate(parse_rois(xml_path))
                if box[1] - box[0] >= PATCH_SIZE and box[3] - box[2] >= PATCH_SIZE
            ]

            for roi_idx, (minx, maxx, miny, maxy) in usable[:budget]:
                roi_counter[subtype] += 1
                if DEV_ENABLED:
                    # x is drawn before y for every patch, so a given seed
                    # yields the same coordinates as before.
                    coords = [
                        (random.randint(minx, maxx - PATCH_SIZE),
                         random.randint(miny, maxy - PATCH_SIZE))
                        for _draw in range(DEV_PATCHES_PER_WSI)
                    ]
                else:
                    coords = [
                        (x, y)
                        for x in range(minx, maxx - PATCH_SIZE + 1, STRIDE)
                        for y in range(miny, maxy - PATCH_SIZE + 1, STRIDE)
                    ]
                rows.extend(
                    _patch_row(subtype, pid, roi_idx, x, y,
                               wsi_path=str(wsi_file), xml_path=str(xml_path))
                    for x, y in coords
                )
    else:
        # --- Pre-cropped ROI images: read pixels and keep informative patches only ---
        roi_files = row.get('roi_files', '')
        if not (isinstance(roi_files, str) and roi_files.strip()):
            continue
        budget = _roi_budget(subtype)
        if budget == 0:
            continue

        ann_dir = paths['onco_ann'] if subtype == 'ONCO' else paths['chromo_ann']
        for roi_idx, roi_rel_path in enumerate(roi_files.split(';')[:budget]):
            roi_counter[subtype] += 1
            roi_file = ann_dir / roi_rel_path

            # The context manager closes the slide handle even on error or interrupt.
            with openslide.OpenSlide(str(roi_file)) as slide:
                W, H = slide.dimensions
                if W < PATCH_SIZE or H < PATCH_SIZE:
                    continue  # image smaller than one patch: nothing to sample

                if DEV_ENABLED:
                    patches_added = 0
                    for _attempt in range(MAX_DEV_ATTEMPTS):
                        if patches_added >= DEV_PATCHES_PER_WSI:
                            break
                        x = random.randint(0, W - PATCH_SIZE)
                        y = random.randint(0, H - PATCH_SIZE)
                        patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                        if is_patch_informative(patch):
                            rows.append(_patch_row(subtype, pid, roi_idx, x, y,
                                                   roi_file=str(roi_file)))
                            patches_added += 1
                else:
                    for x in range(0, W - PATCH_SIZE + 1, STRIDE):
                        for y in range(0, H - PATCH_SIZE + 1, STRIDE):
                            patch = slide.read_region((x, y), 0, (PATCH_SIZE, PATCH_SIZE)).convert("RGB")
                            if is_patch_informative(patch):
                                rows.append(_patch_row(subtype, pid, roi_idx, x, y,
                                                       roi_file=str(roi_file)))


KeyboardInterrupt: 

In [None]:
# Build the patch table and tag each patch with the split looked up from
# splits_map by patient_id; ids missing from the map fall back to 'train'.
patch_df = pd.DataFrame(rows).assign(
    split=lambda frame: frame['patient_id'].map(splits_map).fillna('train')
)
print(f"✅ Total patches generated: {len(patch_df)}")
print(patch_df['subtype'].value_counts())


In [None]:
# ===== Cell 6: Save to Parquet =====
# Write the patch table under <project_root>/data/processed, creating the
# folder first if it does not exist yet.
out_path = Path(project_root, 'data/processed/patch_df.parquet')
out_path.parent.mkdir(exist_ok=True, parents=True)
patch_df.to_parquet(out_path, index=False)

print(f"✅ patch_df saved to {out_path}")

In [None]:
import pandas as pd

# Load the saved patch table
df = pd.read_parquet("/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/patch_df.parquet")

# Quick inspection: shape, column names, number of patches per class
for _summary in (df.shape, df.columns, df['subtype'].value_counts()):
    print(_summary)

# Show the first rows
df.head(100)

In [None]:
!pip install openslide-python

In [None]:
import pandas as pd
import numpy as np
from PIL import Image
import openslide
import os
import random
from pathlib import Path
from google.colab import drive
import matplotlib.pyplot as plt

In [None]:
# === CONFIG ===
# Input patch table and output folder for the per-class example images
patch_df_path = "/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/patch_df.parquet"
out_dir = Path("/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/visual_debug/class_examples")
out_dir.mkdir(exist_ok=True, parents=True)

# === LOAD patch_df AND LIST ITS CLASSES ===
df = pd.read_parquet(patch_df_path)
subtypes = pd.unique(df["subtype"])
print("✔️ Classi trovate:", subtypes)


In [None]:
# === FUNZIONE PER ESTRARRE UNA PATCH ===
def extract_patch(row):
    """
    Read one patch from the slide referenced by a ``patch_df`` row.

    The slide is ``row["wsi_path"]`` when that value is not null, otherwise
    ``row["roi_file"]``. The region starts at ``(row["x"], row["y"])``, is read
    at level 0 and is ``row["patch_size"]`` pixels on each side.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            ``wsi_path``, ``roi_file``, ``x``, ``y`` and ``patch_size``.

    Returns:
        The region returned by ``read_region`` converted to ``"RGB"``.
    """
    slide_path = row["wsi_path"] if pd.notna(row["wsi_path"]) else row["roi_file"]
    location = (int(row["x"]), int(row["y"]))
    side = int(row["patch_size"])
    # Close the slide after reading: this function is called once per patch,
    # and an unclosed OpenSlide object keeps its file handle open.
    with openslide.OpenSlide(str(slide_path)) as slide:
        return slide.read_region(location, 0, (side, side)).convert("RGB")


In [None]:
# === EXTRACT AND SAVE UP TO 10 IMAGES PER CLASS ===
N_EXAMPLES_PER_CLASS = 10

for subtype in subtypes:
    class_rows = df[df["subtype"] == subtype]
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request for classes that have fewer patches than requested.
    n_examples = min(N_EXAMPLES_PER_CLASS, len(class_rows))
    subtype_rows = class_rows.sample(n=n_examples, random_state=42)
    for i, (_, row) in enumerate(subtype_rows.iterrows()):
        img = extract_patch(row)
        save_path = out_dir / f"{subtype}_{i}.jpg"
        img.save(save_path)
        print(f"✅ Saved {save_path.name}")


In [None]:
# === SHOW A GRID: ONE ROW PER CLASS, n_per_class COLUMNS ===
n_per_class = 10
n_classes = len(subtypes)
# squeeze=False keeps `axes` two-dimensional even when there is a single class
fig, axes = plt.subplots(
    n_classes, n_per_class,
    figsize=(n_per_class * 2, n_classes * 2),
    squeeze=False,
)

for row_idx, subtype in enumerate(subtypes):
    for col_idx in range(n_per_class):
        ax = axes[row_idx, col_idx]
        ax.axis("off")
        ax.set_title(f"{subtype}" if col_idx == 0 else "")

        img_path = out_dir / f"{subtype}_{col_idx}.jpg"
        if not img_path.exists():
            # No example saved for this slot: leave the cell blank instead of crashing
            continue
        # Image.open is lazy; the context manager releases the file handle
        # once imshow has read the pixels.
        with Image.open(img_path) as img:
            ax.imshow(img)

plt.tight_layout()
plt.show()


In [None]:
from collections import defaultdict

# Maps each patient_id to the list of image filenames saved for that patient
images_by_patient = defaultdict(list)

# Extract up to 10 patches per class and save them under a name that
# includes the patient_id
N_EXAMPLES_PER_CLASS = 10

for subtype in subtypes:
    class_rows = df[df["subtype"] == subtype]
    # DataFrame.sample raises ValueError when n exceeds the number of rows,
    # so cap the request for classes that have fewer patches than requested.
    n_examples = min(N_EXAMPLES_PER_CLASS, len(class_rows))
    subtype_rows = class_rows.sample(n=n_examples, random_state=42)
    for i, (_, row) in enumerate(subtype_rows.iterrows()):
        img = extract_patch(row)
        patient_id = row["patient_id"]
        filename = f"{subtype}_{patient_id}_{i}.jpg"
        save_path = out_dir / filename
        img.save(save_path)
        images_by_patient[patient_id].append(filename)
        print(f"✅ Saved {filename}")


In [None]:
import matplotlib.pyplot as plt
from PIL import Image
from pathlib import Path
import re
from collections import defaultdict

# === CONFIG ===
out_dir = Path("/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/visual_debug/class_examples")
MIN_COLUMNS = 10  # width of each per-patient strip; it grows if a patient has more images

# === Group the saved images by patient ===
images_by_patient = defaultdict(list)
# Filenames look like <subtype>_<patient_id>_<idx>.jpg, e.g. ccRCC_HP19_0.jpg;
# only patient ids starting with "HP" are matched.
pattern = re.compile(r"(\w+)_(HP[\d.]+)_(\d+)\.jpg")

for img_path in sorted(out_dir.glob("*.jpg")):
    match = pattern.match(img_path.name)
    if match:
        subtype, patient_id, idx = match.groups()
        images_by_patient[patient_id].append((int(idx), img_path, subtype))

# === Show one strip of images per patient ===
for patient_id, images in images_by_patient.items():
    images = sorted(images)  # order by idx
    # At least MIN_COLUMNS axes, and never fewer than the number of images,
    # so indexing cannot run past the last axis.
    n_cols = max(MIN_COLUMNS, len(images))
    fig, axes = plt.subplots(1, n_cols, figsize=(2 * n_cols, 2))
    fig.suptitle(f"Paziente {patient_id} – {images[0][2]}", fontsize=14)

    # Hide every axis up front so slots without an image do not show empty frames
    for ax in axes:
        ax.axis("off")

    for ax, (idx, img_path, subtype) in zip(axes, images):
        # Image.open is lazy; release the file handle once imshow has read the pixels
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(f"{idx}", fontsize=10)

    plt.tight_layout()
    plt.show()
    plt.close(fig)  # free the figure: one is created per patient
