This script handles the organization of the preprocessed MRI data into a structured folder hierarchy. Data are separated by modality and patient, with each patient folder containing the corresponding image slices and segmentation masks. It also filters out slices containing only background to reduce class imbalance, generating the final dataset structure used in subsequent experiments.

In [None]:
import os
import shutil
from tqdm import tqdm

# Source tree (min-max normalized slices) and destination tree (2D dataset)
preprocessed_base = "/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/preprocessed_data/min_max_normalized_images"
dataset_ready_2D = "/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/dataset_ready_2D"

modalities = ["T2SPIR", "T1DUAL_InPhase", "T1DUAL_OutPhase"]

# Make sure <dataset_ready_2D>/<modality>/images exists for every modality,
# even if no patient ends up being copied into it.
for modality in modalities:
    os.makedirs(os.path.join(dataset_ready_2D, modality, "images"), exist_ok=True)

# The patient list is read from the T2SPIR folder only and reused for all modalities.
reference_dir = os.path.join(preprocessed_base, "T2SPIR")
patients = [
    entry for entry in os.listdir(reference_dir)
    if os.path.isdir(os.path.join(reference_dir, entry))
]
patients.sort()

# Copy each patient's .dcm slices to <modality>/images/<patient>/
for modality in modalities:
    images_root = os.path.join(dataset_ready_2D, modality, "images")

    for patient in tqdm(patients, desc=f"Copying DICOM slices for {modality}"):
        patient_src_dir = os.path.join(preprocessed_base, modality, patient)

        # A patient with no folder, or no .dcm files, for this modality is skipped
        # without creating an output folder.
        slice_names = []
        if os.path.isdir(patient_src_dir):
            slice_names = sorted(
                name for name in os.listdir(patient_src_dir)
                if name.lower().endswith(".dcm")
            )
        if not slice_names:
            continue

        # One output folder per patient inside the modality's images folder
        patient_dst_dir = os.path.join(images_root, patient)
        os.makedirs(patient_dst_dir, exist_ok=True)

        # copy2 keeps file metadata along with the contents
        for name in slice_names:
            shutil.copy2(
                os.path.join(patient_src_dir, name),
                os.path.join(patient_dst_dir, name),
            )

print("✅ Dataset ready 2D: each patient has its own folder per modality.")

In [None]:
# Patients whose masks are read from resized_data/<patient>/<base_mod>/Ground
# rather than from converted_masks/<patient>/<base_mod>.
special_patients = ["13", "19", "2", "20", "3", "38", "8"]

resized_masks_path = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/resized_data"
converted_masks_path = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/converted_masks"
dataset_ready_2D = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/dataset_ready_2D"

modalities = ["T2SPIR", "T1DUAL_InPhase", "T1DUAL_OutPhase"]

# Handle one modality at a time
for modality in modalities:
    modality_dir = os.path.join(dataset_ready_2D, modality)

    # Destination root for this modality's masks
    masks_dir = os.path.join(modality_dir, "masks")
    os.makedirs(masks_dir, exist_ok=True)

    # Both T1DUAL variants (InPhase / OutPhase) read masks from the same "T1DUAL" folder
    base_mod = modality if "T1DUAL" not in modality else "T1DUAL"

    # Patients are the sub-folders already present under <modality>/images
    images_dir = os.path.join(modality_dir, "images")
    patients = [
        entry for entry in os.listdir(images_dir)
        if os.path.isdir(os.path.join(images_dir, entry))
    ]
    patients.sort()

    for patient in tqdm(patients, desc=f"Organizing masks for {modality}"):

        # Pick the mask source folder for this patient
        mask_path = (
            os.path.join(resized_masks_path, patient, base_mod, "Ground")
            if patient in special_patients
            else os.path.join(converted_masks_path, patient, base_mod)
        )

        if not os.path.isdir(mask_path):
            print(f"Warning: mask folder not found: {mask_path}")
            continue

        # Only .png / .jpg files (any letter case) count as masks
        mask_files = sorted(
            name for name in os.listdir(mask_path)
            if name.lower().endswith((".png", ".jpg"))
        )
        if not mask_files:
            print(f"No mask files for patient {patient} in {mask_path}")
            continue

        # One output folder per patient inside the modality's masks folder
        patient_mask_dir = os.path.join(masks_dir, patient)
        os.makedirs(patient_mask_dir, exist_ok=True)

        # Masks keep their original file names in the destination
        for name in mask_files:
            shutil.copy2(
                os.path.join(mask_path, name),
                os.path.join(patient_mask_dir, name),
            )

print("✅ All masks are organized into patient folders per modality.")

In [None]:
import cv2
import numpy as np

def mask_has_foreground(mask_path: str) -> bool:
    """Return True if the mask image contains at least one non-zero pixel.

    The file is loaded as a single-channel grayscale image.

    Args:
        mask_path: Path of the mask image file to inspect.

    Returns:
        True when any pixel value is greater than 0. False when every pixel
        is 0, or when the image could not be loaded (``cv2.imread`` returned
        ``None``), so unreadable masks are treated as background-only.
    """
    mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    if mask is None:
        return False
    # np.any yields a numpy.bool_; convert so both return paths give a plain bool.
    return bool(np.any(mask > 0))

In [None]:
# Patients whose masks are read from resized_data/<patient>/<base_mod>/Ground
# rather than from converted_masks/<patient>/<base_mod>.
special_patients = ['13', '19', '2', '20', '3', '38', '8']

resized_masks_path = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/resized_data"
converted_masks_path = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/converted_masks"

# Unfiltered dataset (input) and filtered dataset (output)
dataset_src = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/dataset_ready_2D"
dataset_dst = r"/content/drive/My Drive/CHAOS_Train_Sets/Train_Sets/dataset_ready_2D_filtered"

modalities = ["T2SPIR", "T1DUAL_InPhase", "T1DUAL_OutPhase"]

for modality in modalities:
    print(f"\nProcessing modality: {modality}")

    # Both T1DUAL variants (InPhase / OutPhase) read masks from the same "T1DUAL" folder
    base_mod = "T1DUAL" if "T1DUAL" in modality else modality

    src_img_root = os.path.join(dataset_src, modality, "images")
    dst_img_root = os.path.join(dataset_dst, modality, "images")
    dst_mask_root = os.path.join(dataset_dst, modality, "masks")

    os.makedirs(dst_img_root, exist_ok=True)
    os.makedirs(dst_mask_root, exist_ok=True)

    # Patients are the sub-folders present under the source images folder
    patients = sorted(
        p for p in os.listdir(src_img_root)
        if os.path.isdir(os.path.join(src_img_root, p))
    )

    for patient in tqdm(patients, desc=f"Filtering {modality}"):

        # Source folders
        img_src_dir = os.path.join(src_img_root, patient)

        if patient in special_patients:
            mask_src_dir = os.path.join(resized_masks_path, patient, base_mod, "Ground")
        else:
            mask_src_dir = os.path.join(converted_masks_path, patient, base_mod)

        if not os.path.isdir(mask_src_dir):
            # Report the skip instead of dropping the patient silently
            print(f"Warning: mask folder not found: {mask_src_dir}")
            continue

        # Match extensions case-insensitively for images as well as masks, so a
        # slice named e.g. "x.DCM" is not ignored here.
        img_files = sorted(f for f in os.listdir(img_src_dir) if f.lower().endswith(".dcm"))
        mask_files = sorted(f for f in os.listdir(mask_src_dir) if f.lower().endswith((".png", ".jpg")))

        if not img_files or not mask_files:
            print(
                f"Warning: patient {patient} skipped "
                f"({len(img_files)} images, {len(mask_files)} masks)"
            )
            continue

        # Images and masks are paired purely by sorted order. zip() stops at the
        # shorter list, so a count mismatch would otherwise go unnoticed and may
        # indicate misaligned pairs.
        if len(img_files) != len(mask_files):
            print(
                f"Warning: patient {patient} has {len(img_files)} images but "
                f"{len(mask_files)} masks; pairing only the first "
                f"{min(len(img_files), len(mask_files))} by sorted order"
            )

        # Keep only the pairs whose mask has at least one non-zero pixel
        kept_pairs = [
            (img_f, mask_f)
            for img_f, mask_f in zip(img_files, mask_files)
            if mask_has_foreground(os.path.join(mask_src_dir, mask_f))
        ]

        # Nothing to keep -> no output folders are created for this patient
        if not kept_pairs:
            continue

        # Output folders are created only once there is something to copy
        img_dst_dir = os.path.join(dst_img_root, patient)
        mask_dst_dir = os.path.join(dst_mask_root, patient)
        os.makedirs(img_dst_dir, exist_ok=True)
        os.makedirs(mask_dst_dir, exist_ok=True)

        for img_f, mask_f in kept_pairs:
            shutil.copy2(
                os.path.join(img_src_dir, img_f),
                os.path.join(img_dst_dir, img_f)
            )
            shutil.copy2(
                os.path.join(mask_src_dir, mask_f),
                os.path.join(mask_dst_dir, mask_f)
            )

        print(f"Patient {patient}: kept {len(kept_pairs)} slices")

print("\n✅ Images AND masks filtered correctly (no empty folders).")