In [14]:
import os
import numpy as np
import pickle
from PIL import Image
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Paths to image folders (adjust if needed)
# The two input folders hold one modality each; files are matched across them
# by the key that pairing_key() extracts from the file name.
LOW_ENERGY_DIR = '/kaggle/input/pkg-cdd-cesm/PKG - CDD-CESM/CDD-CESM/Low energy images of CDD-CESM'
SUBTRACTED_DIR = '/kaggle/input/pkg-cdd-cesm/PKG - CDD-CESM/CDD-CESM/Subtracted images of CDD-CESM'
# Destination of the pickled {'train', 'valid', 'test'} dictionary.
OUTPUT_PATH = '/kaggle/working/cdd_cesm_dataset.pkl'

# Image processing parameters
# Both tuples are (width, height) in pixels, matching how they are indexed in
# center_crop_and_downscale() and passed to PIL's resize().
CROP_SIZE = (2048, 2048)    # Center crop to this size first
DOWNSCALE_SIZE = (256, 256)  # Final output size

def center_crop_and_downscale(pil_img):
    """Cut a CROP_SIZE window from the middle of an image and shrink it to DOWNSCALE_SIZE.

    Args:
        pil_img: PIL image to process; it is not modified.

    Returns:
        A new PIL image of size DOWNSCALE_SIZE, resampled with LANCZOS.

    NOTE(review): if the input is smaller than CROP_SIZE along an axis, the
    crop box extends beyond the image. Pillow is expected to fill the
    out-of-bounds area with zeros rather than raise -- confirm this padding is
    the intended treatment for undersized images.
    """
    img_w, img_h = pil_img.size
    crop_w = CROP_SIZE[0]
    crop_h = CROP_SIZE[1]

    # Top-left corner that centres the crop window inside the image.
    x0 = (img_w - crop_w) // 2
    y0 = (img_h - crop_h) // 2
    box = (x0, y0, x0 + crop_w, y0 + crop_h)

    return pil_img.crop(box).resize(DOWNSCALE_SIZE, Image.Resampling.LANCZOS)

def pairing_key(filename):
    """Build the key used to match a low-energy image with its subtracted twin.

    File names are expected to look like 'P100_L_DM_CC.jpg', i.e. four
    underscore-separated fields. The third field is ignored so that files that
    differ only in that field map to the same key.

    Args:
        filename: Bare file name (no directory component).

    Returns:
        Tuple (patient, laterality, view), e.g. ('P100', 'L', 'CC').

    Raises:
        IndexError: If the name has fewer than four underscore-separated fields.
    """
    fields = filename.split('_')
    patient = fields[0]
    laterality = fields[1]
    # Keep only the text before the first dot, which drops the extension.
    view = fields[3].partition('.')[0]
    return (patient, laterality, view)

# List and pair files
# os.listdir() returns entries in arbitrary order. Sorting makes the order of
# `pairs` reproducible, and with it any seeded split performed on the result.
le_files = sorted(f for f in os.listdir(LOW_ENERGY_DIR) if f.lower().endswith('.jpg'))
sub_files = sorted(f for f in os.listdir(SUBTRACTED_DIR) if f.lower().endswith('.jpg'))

# Map (patient, laterality, view) -> file name for each modality.
le_dict = {pairing_key(f): f for f in le_files}
sub_dict = {pairing_key(f): f for f in sub_files}

# A dict keeps only the last file per key, so report collisions instead of
# dropping images silently.
for label, files, by_key in (('low-energy', le_files, le_dict),
                             ('subtracted', sub_files, sub_dict)):
    n_collisions = len(files) - len(by_key)
    if n_collisions:
        print(f"Warning: {n_collisions} {label} file(s) share a pairing key with another file and were dropped")

# Keep only keys present in both modalities, in low-energy (sorted) order.
pairs = [(le_dict[k], sub_dict[k], k) for k in le_dict if k in sub_dict]

# Report files that have no counterpart in the other folder.
n_le_unpaired = len(le_dict) - len(pairs)
n_sub_unpaired = len(sub_dict) - len(pairs)
if n_le_unpaired or n_sub_unpaired:
    print(f"Warning: {n_le_unpaired} low-energy and {n_sub_unpaired} subtracted image(s) have no counterpart")

print(f"Successfully paired {len(pairs)} images")

# Process and save dataset
def _load_normalized(directory, fname):
    """Load one image as a float32 array of shape [1, H, W] scaled to [0, 1].

    The file is opened in a context manager so its handle is released as soon
    as the pixel data has been converted to 8-bit grayscale.
    """
    with Image.open(os.path.join(directory, fname)) as raw:
        gray = raw.convert('L')  # convert() returns a new, fully loaded image
    resized = center_crop_and_downscale(gray)
    # Add a leading channel axis and map 0..255 to 0..1.
    return np.array(resized, dtype=np.float32)[np.newaxis, ...] / 255.0


samples = []
for le_fname, sub_fname, key in tqdm(pairs, desc="Processing images"):
    samples.append({
        'image': _load_normalized(LOW_ENERGY_DIR, le_fname),   # low-energy input
        'mask': _load_normalized(SUBTRACTED_DIR, sub_fname),   # subtracted target
        'class': 0,  # Update with actual labels if available
        'metadata': {
            'patient': key[0],
            'laterality': key[1],
            'view': key[2],
            'filenames': (le_fname, sub_fname)
        }
    })

# Split and save
# Split by patient, not by image: one patient contributes several images
# (different laterality/view), and letting them fall on both sides of a split
# would leak information from train into valid/test.
patient_ids = sorted({s['metadata']['patient'] for s in samples})  # sorted for determinism
train_ids, holdout_ids = train_test_split(patient_ids, test_size=0.2, random_state=42)
valid_ids, test_ids = train_test_split(holdout_ids, test_size=0.5, random_state=42)

# Patient -> split name lookup, then route every sample through it.
split_of = {pid: 'train' for pid in train_ids}
split_of.update({pid: 'valid' for pid in valid_ids})
split_of.update({pid: 'test' for pid in test_ids})

splits = {'train': [], 'valid': [], 'test': []}
for sample in samples:
    splits[split_of[sample['metadata']['patient']]].append(sample)

train, valid, test = splits['train'], splits['valid'], splits['test']

# Every sample must land in exactly one split.
assert len(train) + len(valid) + len(test) == len(samples)

with open(OUTPUT_PATH, 'wb') as f:
    pickle.dump({'train': train, 'valid': valid, 'test': test}, f)

print(f"Dataset saved to {OUTPUT_PATH}")


Successfully paired 1002 images


Processing images: 100%|██████████| 1002/1002 [02:53<00:00,  5.78it/s]


Dataset saved to /kaggle/working/cdd_cesm_dataset.pkl


In [12]:
# NOTE(review): removes the pickle by relative name from the current working
# directory. If that directory is /kaggle/working, this deletes the dataset
# written to OUTPUT_PATH. This cell's execution count (12) is lower than the
# dataset cell's (14), so it looks like cleanup of an earlier run -- confirm
# before using Run All, which would execute it after the dataset is created.
!rm cdd_cesm_dataset.pkl