In [None]:
import os
import pandas as pd
import random
from collections import defaultdict

# Folders holding the images of each quality level, plus the folder used for CSV output.
low_quality_dir = "/data/ob_process/low_quality_images"
high_quality_dir = "/data/ob_process/high_quality_images"
output_dir = "/data/ob_process/csv"

# Collect the PNG file names found in each image folder, in sorted order.
low_images = sorted(
    name for name in os.listdir(low_quality_dir) if name.endswith(".png")
)
high_images = sorted(
    name for name in os.listdir(high_quality_dir) if name.endswith(".png")
)

# Extract patient IDs
def extract_patient_id(filename):
    """Return the patient identifier embedded in an image file name.

    The identifier is the fourth underscore-separated field of the name,
    e.g. "low_quality_image_001_1.png" -> "001".

    Raises:
        IndexError: If the name has fewer than four underscore-separated fields.
    """
    fields = filename.split("_")
    patient_id = fields[3]
    return patient_id

# Group image file names by patient ID, separately for each quality level.
patients_low = defaultdict(list)
patients_high = defaultdict(list)

for file_names, by_patient in ((low_images, patients_low), (high_images, patients_high)):
    for img in file_names:
        by_patient[extract_patient_id(img)].append(img)

# Unique patient IDs per quality level, in sorted order.
low_patient_ids = sorted(patients_low)
high_patient_ids = sorted(patients_high)

# Seed the RNG so the shuffles are reproducible; the low-quality list is shuffled first.
# NOTE(review): the two ID lists are shuffled independently, so if the same patient IDs
# occur in both folders a patient may land in different partitions for low and high
# quality — confirm that this is acceptable.
random.seed(42)
random.shuffle(low_patient_ids)
random.shuffle(high_patient_ids)

# Data split (70% train, 15% validation, 15% test)
def split_patients(patient_list):
    """Cut a patient list into consecutive train, validation and test slices.

    The cut points are ``int(0.7 * n)`` and ``int(0.85 * n)`` (truncated), so
    any remainder from rounding ends up in the test slice.

    Args:
        patient_list: Sequence of patient IDs, already in the desired order.

    Returns:
        A ``(train, validation, test)`` tuple of slices of ``patient_list``.
    """
    total = len(patient_list)
    train_end, val_end = int(0.7 * total), int(0.85 * total)

    train_part = patient_list[:train_end]
    val_part = patient_list[train_end:val_end]
    test_part = patient_list[val_end:]
    return train_part, val_part, test_part

# Partition the patient IDs of each quality level into train / validation / test.
(low_train, low_val, low_test), (high_train, high_val, high_test) = (
    split_patients(low_patient_ids),
    split_patients(high_patient_ids),
)

# Function to get files from selected patients
def get_images_from_patients(patients_dict, patient_list):
    """Return the images of the given patients as one flat list.

    Args:
        patients_dict: Mapping from patient ID to that patient's image list.
        patient_list: Patient IDs to look up, in the order their images
            should appear in the result.

    Returns:
        A new list with every image of every listed patient, patient by patient.
    """
    return [
        image
        for patient in patient_list
        for image in patients_dict[patient]
    ]

# Expand each low-quality patient partition into the image file names that belong to it.
train_low, val_low, test_low = (
    get_images_from_patients(patients_low, ids)
    for ids in (low_train, low_val, low_test)
)

# Sanity check: number of patient IDs across the three partitions vs. number of distinct IDs.
total = [*low_train, *low_val, *low_test]
print(len(total), len(set(total)), sep="\n")


# Same expansion and sanity check for the high-quality partitions.
train_high, val_high, test_high = (
    get_images_from_patients(patients_high, ids)
    for ids in (high_train, high_val, high_test)
)

total = [*high_train, *high_val, *high_test]
print(len(total), len(set(total)), sep="\n")

# Trim both quality levels to the same number of samples per partition.
# Images are matched by list position only; no pairing by patient is attempted.
min_train, min_val, min_test = (
    min(len(low_part), len(high_part))
    for low_part, high_part in (
        (train_low, train_high),
        (val_low, val_high),
        (test_low, test_high),
    )
)

train_data = list(zip(train_low[:min_train], train_high[:min_train]))
val_data = list(zip(val_low[:min_val], val_high[:min_val]))
test_data = list(zip(test_low[:min_test], test_high[:min_test]))
print(len(train_data), len(val_data), len(test_data))

# Make sure the CSV output folder exists.
os.makedirs(output_dir, exist_ok=True)

def save_csv(data, filename):
    """Write low/high-quality pairs to a CSV file inside ``output_dir``.

    Args:
        data: Rows to write; each row supplies the values of the
            ``low_quality`` and ``high_quality`` columns.
        filename: Name of the CSV file, joined onto ``output_dir``.
    """
    table = pd.DataFrame(data, columns=["low_quality", "high_quality"])
    destination = os.path.join(output_dir, filename)
    # index=False leaves the DataFrame's row index out of the file.
    table.to_csv(destination, index=False)

# The calls that would write the three partitions to CSV are currently commented out.
# save_csv(train_data, "train.csv")
# save_csv(val_data, "validation.csv")
# save_csv(test_data, "test.csv")

# NOTE(review): this message is printed even while the save_csv calls above are
# disabled, so these lines write no CSV — confirm whether that is intended.
print(f"✅ CSVs guardados en {output_dir}")  # Message reads "CSVs saved in <output_dir>"