In [32]:
# ===== Cell 1: Monta Google Drive (solo se usi Colab) =====
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
import os
from pathlib import Path
import pandas as pd
import yaml
from PIL import Image
import openslide
import tarfile
import io
from collections import defaultdict

In [34]:
# === CONFIG ===
yaml_path = Path("/content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/config/preprocessing.yaml")
with open(yaml_path, "r") as f:
    cfg = yaml.safe_load(f)

# Paths
root_path = Path(cfg["env_paths"]["colab"])
patch_df_path = root_path / "data/processed/patch_df.parquet"
tar_out_path  = root_path / "data/processed/dev_patches.tar"
jpg_out_dir   = root_path / "data/visual_debug/class_examples"
jpg_out_dir.mkdir(parents=True, exist_ok=True)

# Modalità DEV (subset)
patch_df = pd.read_parquet(patch_df_path)

# Corretto: forza esattamente 10 per ciascun sottotipo se presenti
patch_df = patch_df.groupby("subtype").apply(lambda g: g.head(10)).reset_index(drop=True)


print(f"✅ Loaded patch_df: {len(patch_df)} patches")


✅ Loaded patch_df: 40 patches


  patch_df = patch_df.groupby("subtype").apply(lambda g: g.head(10)).reset_index(drop=True)


In [35]:
def extract_patch(row):
    """
    Extracts a patch from WSI (or ROI) based on info from patch_df.
    Returns a PIL.Image in RGB.
    """
    path = row["wsi_path"] if pd.notna(row["wsi_path"]) else row["roi_file"]
    slide = openslide.OpenSlide(str(path))
    region = slide.read_region((int(row["x"]), int(row["y"])), 0, (int(row["patch_size"]), int(row["patch_size"])))
    return region.convert("RGB")


In [37]:
import tarfile
from PIL import Image
from collections import defaultdict
from io import BytesIO
import matplotlib.pyplot as plt
import os # Import os for path manipulation

# Parametri
tar_out_path.parent.mkdir(parents=True, exist_ok=True)
img_counts = defaultdict(int)
MAX_JPG_PER_CLASS = 10

# Crea il file tar
print(f"ℹ️ Creating WebDataset tar: {tar_out_path}")
with tarfile.open(tar_out_path, "w") as tar:
    for index, row in patch_df.iterrows():
        # Genera un nome univoco per il file nel tar
        # Usa subtype, patient_id e un contatore per garantire l'unicità
        filename = f"{row['subtype']}_{row['patient_id']}_{index}.jpg"

        try:
            # Estrai il patch come immagine PIL
            patch_img = extract_patch(row)

            # Salva l'immagine in un buffer di memoria
            buffer = BytesIO()
            patch_img.save(buffer, format="JPEG")
            buffer.seek(0)

            # Crea un oggetto TarInfo
            tar_info = tarfile.TarInfo(name=filename)
            tar_info.size = len(buffer.getvalue())

            # Aggiungi il buffer al tar file
            tar.addfile(tar_info, buffer)

        except Exception as e:
            print(f"⚠️ Error processing patch {index}: {e}")
            continue # Salta questo patch e continua con il prossimo

print(f"✅ WebDataset tar creato: {tar_out_path}")


# Estrai immagini da .tar e salvale in .jpg per debug
# Questa parte rimane per la visualizzazione di debug
print(f"ℹ️ Extracting images for debug from: {tar_out_path}")
with tarfile.open(tar_out_path, "r") as tar:
    members = sorted(tar.getmembers(), key=lambda m: m.name)
    for member in members:
        if not member.name.endswith(".jpg"):
            continue

        # Estrai subtype e patient_id
        # Assicurati che il nome del file nel tar sia compatibile con questo parsing
        # es. "subtype_patientid_index.jpg"
        parts = member.name.split("/")[-1].replace(".jpg", "").split("_")
        if len(parts) < 3: # Modificato per considerare il nome "subtype_patientid_index"
            print(f"⚠️ Skipping member with unexpected name format: {member.name}")
            continue
        subtype, pid = parts[0], parts[1] # Il terzo elemento parts[2] è l'indice, non usato qui

        # Salva solo le prime 10 per classe
        if img_counts[subtype] < MAX_JPG_PER_CLASS:
            extracted = tar.extractfile(member)
            if extracted is not None:
                img = Image.open(BytesIO(extracted.read())).convert("RGB")
                # Usa il conteggio per nominare i file jpg estratti per evitare sovrascritture
                out_path = jpg_out_dir / f"{subtype}_{pid}_{img_counts[subtype]}.jpg"
                try:
                    img.save(out_path)
                    img_counts[subtype] += 1
                except Exception as e:
                    print(f"⚠️ Error saving debug image {out_path}: {e}")


print(f"✅ Immagini per ispezione salvate in: {jpg_out_dir}")
print(f"✅ Immagini .jpg salvate direttamente da {tar_out_path}")

ℹ️ Creating WebDataset tar: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/dev_patches.tar
✅ WebDataset tar creato: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/dev_patches.tar
ℹ️ Extracting images for debug from: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/dev_patches.tar
✅ Immagini per ispezione salvate in: /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/visual_debug/class_examples
✅ Immagini .jpg salvate direttamente da /content/drive/MyDrive/ColabNotebooks/wsi-ssrl-rcc_project/data/processed/dev_patches.tar


In [38]:
# === Visualizzazione in griglia ===
subtypes = sorted(img_counts.keys())
print(img_counts.keys())
fig, axes = plt.subplots(len(subtypes), MAX_JPG_PER_CLASS,
                         figsize=(MAX_JPG_PER_CLASS * 2, len(subtypes) * 2))

for r, subtype in enumerate(subtypes):
    for c in range(MAX_JPG_PER_CLASS):
        # Trova l'immagine salvata
        matches = list(jpg_out_dir.glob(f"{subtype}_*_{c}.jpg"))
        if not matches:
            continue
        img = Image.open(matches[0])
        axes[r][c].imshow(img)
        axes[r][c].axis("off")
        if c == 0:
            axes[r][c].set_title(subtype, fontsize=12)

plt.tight_layout()
plt.show()

Output hidden; open in https://colab.research.google.com to view.

In [39]:
patch_df["subtype"].value_counts()


Unnamed: 0_level_0,count
subtype,Unnamed: 1_level_1
CHROMO,10
ONCO,10
ccRCC,10
pRCC,10


In [40]:
with tarfile.open(tar_out_path, "r") as tar:
    all_subtypes = set()
    for member in tar.getmembers():
        if member.name.endswith(".jpg"):
            subtype = member.name.split("/")[-1].split("_")[0]
            all_subtypes.add(subtype)
print("✅ Sottotipi presenti nel .tar:", all_subtypes)


✅ Sottotipi presenti nel .tar: {'ccRCC', 'ONCO', 'pRCC', 'CHROMO'}
