In [6]:
import os
import shutil
import pandas as pd

# ---- User paths ----
# All three locations are relative paths, resolved against the current
# working directory when the cell runs.
metadata_path = "Downloads/archive/infant_retinal_database_info.xlsx"
image_dir = "Downloads/archive/images_stack_without_captions/images_stack_without_captions"
out_dir = "Downloads/archive/Organized_dataset"

# ---- Step 1: Load metadata ----
# Read the spreadsheet and give the two columns used below shorter names.
metadata = pd.read_excel(metadata_path).rename(
    columns={"ID": "PatientID", "DIAGNOSIS CODE (DG)": "Diagnosis"},
)

# ---- Step 2: Map patient ID to stage label ----
def label_rop_stage(dg):
    """Map a numeric diagnosis code to one of three class labels.

    Whole-number codes keep their original mapping: 0 -> 'No_ROP',
    1 or 2 -> 'Mild_ROP', everything else -> 'Severe_ROP'.

    The caller passes a per-patient *median*, so fractional values such as
    0.5 or 1.5 can occur. These are classified by range instead of falling
    through to 'Severe_ROP' as they did with exact-match comparisons.

    Args:
        dg: A diagnosis code, or an aggregate of codes such as a median.

    Returns:
        One of 'No_ROP', 'Mild_ROP' or 'Severe_ROP'.

    Raises:
        ValueError: If ``dg`` is missing (None / NaN). A missing diagnosis
            must not be silently filed under a disease class.
    """
    if pd.isna(dg):
        raise ValueError("diagnosis code is missing (NaN); cannot assign a ROP stage")
    if dg == 0:
        return 'No_ROP'
    # NOTE(review): this treats the codes as ordinal, so any value strictly
    # between 0 and 2 counts as mild - confirm against the dataset codebook.
    if 0 < dg <= 2:
        return 'Mild_ROP'
    # Catch-all kept from the original: codes above 2 (and any negative
    # value) are labelled severe.
    return 'Severe_ROP'
# Use the median diagnosis per patient.
# A patient whose diagnosis is missing in every row gets a NaN median; such
# patients are dropped here so they are left unlabelled instead of being
# assigned a class they were never diagnosed with.
patient_stage = (
    metadata.groupby("PatientID")["Diagnosis"]
    .median()
    .dropna()
    .apply(label_rop_stage)
    .to_dict()
)

# ---- Step 3: Prepare output folders ----
# One sub-folder per class label under out_dir; folders that already exist
# are reused rather than raising an error.
class_names = ['No_ROP', 'Mild_ROP', 'Severe_ROP']
for cls in class_names:
    class_dir = os.path.join(out_dir, cls)
    os.makedirs(class_dir, exist_ok=True)

# ---- Step 4: Classify and copy images ----
def extract_patient_id(filename):
    """Return the integer patient ID encoded at the start of a file name.

    The ID is the text before the first underscore, e.g. "001" in
    "001_F_GA41_BW2905_...", which is returned as the int 1.

    Args:
        filename: Image file name, such as an entry from ``os.listdir``.

    Returns:
        The patient ID as an int, or None when the leading part is empty or
        is not made up solely of decimal digits.
    """
    prefix = filename.partition("_")[0]
    # isdecimal() rejects characters such as superscript digits, which
    # isdigit() accepts but int() cannot parse; this keeps int() from raising.
    return int(prefix) if prefix.isdecimal() else None

# Match the extension case-insensitively so files such as "x.JPG" or
# "x.jpeg" are not left out, and ignore anything that is not a regular file.
image_exts = ('.jpg', '.jpeg')
all_images = [
    f for f in os.listdir(image_dir)
    if f.lower().endswith(image_exts)
    and os.path.isfile(os.path.join(image_dir, f))
]

# Images whose patient ID cannot be parsed or has no stage label.
skipped = []
for img in all_images:
    # .get() returns None both for an unparseable ID (pid is None) and for
    # an ID that is absent from the metadata.
    stage = patient_stage.get(extract_patient_id(img))
    if stage is None:
        skipped.append(img)
        continue
    src = os.path.join(image_dir, img)
    dst = os.path.join(out_dir, stage, img)
    # Never overwrite a file that an earlier run already copied.
    if not os.path.exists(dst):
        shutil.copy2(src, dst)

# Report skipped images only when there are some, so a clean run prints the
# same single line as before.
if skipped:
    print("Skipped", len(skipped), "image(s) with no matching patient label")
print("Dataset reorganized into:", out_dir)


Dataset reorganized into: Downloads/archive/Organized_dataset
