In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import shutil
from sklearn.model_selection import train_test_split
import yaml
from pathlib import Path
import os

In [3]:
# Project root is resolved as two directory levels above the current working directory.
PROJECT_ROOT = Path.cwd().joinpath("..", "..").resolve()
DATA_PATH = PROJECT_ROOT.joinpath("datasets")
ORIGINAL_DATA = DATA_PATH.joinpath("raw", "images")
PROCESSED_DATA = DATA_PATH.joinpath("processed", "images")

# Specific locations inside the raw image folder
TRAIN_IMAGES = ORIGINAL_DATA.joinpath("train")
TEST_IMAGES = ORIGINAL_DATA.joinpath("test")
CSV_PATH = ORIGINAL_DATA.joinpath("train.csv")

In [4]:
# Classes kept for this task; a class's position in the list becomes its numeric id.
CANCER_CLASSES = ["Nodule/Mass", "Other lesion"]
CLASS_MAPPING = dict(zip(CANCER_CLASSES, range(len(CANCER_CLASSES))))

# Echo the resolved configuration, one item per line.
print(
    f"Dataset: {DATA_PATH}",
    f"Raw images: {ORIGINAL_DATA}",
    f"Processed: {PROCESSED_DATA}",
    f"Clases: {CANCER_CLASSES}",
    sep="\n",
)

Dataset: C:\Users\Sebastian\Desktop\MachineLearningPC2\TF_Machine_Learning_1888\datasets
Raw images: C:\Users\Sebastian\Desktop\MachineLearningPC2\TF_Machine_Learning_1888\datasets\raw\images
Processed: C:\Users\Sebastian\Desktop\MachineLearningPC2\TF_Machine_Learning_1888\datasets\processed\images
Clases: ['Nodule/Mass', 'Other lesion']


In [5]:
# Load the annotation table; an image_id can appear on several rows.
df = pd.read_csv(CSV_PATH)

print(
    f"Total registros: {len(df):,}",
    f"Imágenes únicas: {df['image_id'].nunique():,}",
    sep="\n",
)

# Last expression of the cell: preview of the first rows.
df.head()


Total registros: 67,914
Imágenes únicas: 15,000


Unnamed: 0.1,Unnamed: 0,image_id,class_name,class_id,rad_id,x_min,y_min,x_max,y_max,raw_x_min,raw_x_max,raw_y_min,raw_y_max,raw_width,raw_height,scale_x,scale_y
0,0,78aa8415fbf1c792f7d7c53349d44d4f,No finding,14,R16,,,,,,,,,3000.0,3000.0,0.341333,0.341333
1,1,78aa8415fbf1c792f7d7c53349d44d4f,No finding,14,R17,,,,,,,,,3000.0,3000.0,0.341333,0.341333
2,2,78aa8415fbf1c792f7d7c53349d44d4f,No finding,14,R11,,,,,,,,,3000.0,3000.0,0.341333,0.341333
3,3,183015e171f5159d7e60d43578632a3f,Aortic enlargement,0,R8,567.0,295.0,671.0,417.0,1134.0,1342.0,721.0,1019.0,2048.0,2500.0,0.5,0.4096
4,4,183015e171f5159d7e60d43578632a3f,Pleural thickening,11,R9,58.0,794.0,116.0,851.0,117.0,232.0,1938.0,2077.0,2048.0,2500.0,0.5,0.4096


In [6]:
# Keep only rows of the target classes that have a complete bounding box,
# then replace class_id with the compact ids from CLASS_MAPPING.
cancer_df = (
    df.loc[df['class_name'].isin(CANCER_CLASSES)]
    .dropna(subset=['x_min', 'y_min', 'x_max', 'y_max'])
    .assign(class_id=lambda frame: frame['class_name'].map(CLASS_MAPPING))
)

print(
    f"Registros cáncer: {len(cancer_df):,}",
    f"Imágenes cáncer: {cancer_df['image_id'].nunique():,}",
    sep="\n",
)

# Annotation count per class (cell output)
cancer_df['class_name'].value_counts()

Registros cáncer: 4,783
Imágenes cáncer: 1,632


class_name
Nodule/Mass     2580
Other lesion    2203
Name: count, dtype: int64

In [7]:
# This cell previously rebuilt `cancer_df` with the exact same three statements as the
# preceding cell. The rebuild is dropped so the filtering logic lives in one place;
# this cell now only validates and reports on the frame built above.

# Invariants guaranteed by the filter/dropna step that produced `cancer_df`.
assert set(cancer_df['class_name']) <= set(CANCER_CLASSES), "unexpected class in cancer_df"
assert cancer_df[['x_min', 'y_min', 'x_max', 'y_max']].notna().all().all(), "missing box coordinates"

print(f"Registros cáncer: {len(cancer_df):,}")
print(f"Imágenes cáncer: {cancer_df['image_id'].nunique():,}")

# Distribution per class
cancer_df['class_name'].value_counts()

Registros cáncer: 4,783
Imágenes cáncer: 1,632


class_name
Nodule/Mass     2580
Other lesion    2203
Name: count, dtype: int64

In [8]:
# Image ids (file stems) of the .jpg files present in each raw folder
train_images = {img.stem for img in TRAIN_IMAGES.glob("*.jpg") if img.is_file()}
test_images = {img.stem for img in TEST_IMAGES.glob("*.jpg") if img.is_file()}

print(f"Archivos en train: {len(train_images)}")
print(f"Archivos en test: {len(test_images)}")

# Image ids referenced by the filtered annotations
csv_images = set(cancer_df['image_id'].unique())
print(f"Imágenes en CSV: {len(csv_images)}")

# Overlap between the files on disk and the annotated ids
train_matches = train_images & csv_images
test_matches = test_images & csv_images

print(f"Coincidencias train: {len(train_matches)}")
print(f"Coincidencias test: {len(test_matches)}")

# Annotation rows whose image exists in the train / test folder
train_cancer = cancer_df.loc[cancer_df['image_id'].isin(train_images)].copy()
test_cancer = cancer_df.loc[cancer_df['image_id'].isin(test_images)].copy()

print(f"Train: {train_cancer['image_id'].nunique():,} imágenes")
print(f"Test: {test_cancer['image_id'].nunique():,} imágenes")

Archivos en train: 15000
Archivos en test: 3000
Imágenes en CSV: 1632
Coincidencias train: 1632
Coincidencias test: 0
Train: 1,632 imágenes
Test: 0 imágenes


In [9]:
# Split at image level so every annotation of an image lands in the same split.
unique_train_images = train_cancer['image_id'].unique()

# 90% train, 10% val (fixed seed for a repeatable split)
train_imgs, val_imgs = train_test_split(
    unique_train_images,
    test_size=0.1,
    random_state=42,
)

train_final = train_cancer.loc[train_cancer['image_id'].isin(train_imgs)].copy()
val_final = train_cancer.loc[train_cancer['image_id'].isin(val_imgs)].copy()

print(f"Train final: {train_final['image_id'].nunique():,} imágenes ({len(train_final):,} anotaciones)")
print(f"Val final: {val_final['image_id'].nunique():,} imágenes ({len(val_final):,} anotaciones)")

# Check the class distribution inside each split
for split_name, split_df in {'Train': train_final, 'Val': val_final}.items():
    print(f"\n{split_name}:")
    print(split_df['class_name'].value_counts())

Train final: 1,468 imágenes (4,341 anotaciones)
Val final: 164 imágenes (442 anotaciones)

Train:
class_name
Nodule/Mass     2349
Other lesion    1992
Name: count, dtype: int64

Val:
class_name
Nodule/Mass     231
Other lesion    211
Name: count, dtype: int64


In [10]:
def to_yolo(df, img_w=1024, img_h=1024):
    """Convert pixel corner boxes to normalized YOLO (center, size) boxes.

    Args:
        df: DataFrame with ``x_min``, ``y_min``, ``x_max`` and ``y_max`` columns,
            expressed in pixels of an ``img_w`` x ``img_h`` image.
        img_w: Image width in pixels used to normalize x values.
        img_h: Image height in pixels used to normalize y values.

    Returns:
        A copy of ``df`` with four added columns: ``x_center``, ``y_center``,
        ``width`` and ``height``, normalized by the image size. The input frame
        and its corner columns are left unchanged; missing coordinates stay NaN.
    """
    df = df.copy()

    # Clamp the corners to the image *before* deriving center and size.
    # Clipping center and size independently (as done previously) keeps the full
    # width of a box that crosses the border, so the normalized box still sticks
    # out of the image and its center is shifted.
    x_lo = df['x_min'].clip(0, img_w)
    x_hi = df['x_max'].clip(0, img_w)
    y_lo = df['y_min'].clip(0, img_h)
    y_hi = df['y_max'].clip(0, img_h)

    df['x_center'] = ((x_lo + x_hi) / 2) / img_w
    df['y_center'] = ((y_lo + y_hi) / 2) / img_h
    df['width'] = (x_hi - x_lo) / img_w
    df['height'] = (y_hi - y_lo) / img_h

    # Final guard: inverted corners (min > max) would give a negative size; floor it at 0.
    for coord in ['x_center', 'y_center', 'width', 'height']:
        df[coord] = df[coord].clip(0, 1)

    return df

In [11]:
def save_yolo_labels(df, split_name):
    """Write one YOLO ``.txt`` label file per image for the given split.

    Files go to ``PROCESSED_DATA / split_name / "labels"`` (created if missing).
    Each line is ``class_id x_center y_center width height`` with six decimals
    for the four box values.

    Returns:
        The number of label files written (one per distinct ``image_id``).
    """
    out_dir = PROCESSED_DATA / split_name / "labels"
    out_dir.mkdir(parents=True, exist_ok=True)

    n_written = 0
    for image_id, boxes in df.groupby('image_id'):
        # One text file per image, one line per annotation of that image.
        with open(out_dir / f"{image_id}.txt", 'w') as handle:
            handle.writelines(
                f"{int(row['class_id'])} {row['x_center']:.6f} {row['y_center']:.6f} "
                f"{row['width']:.6f} {row['height']:.6f}\n"
                for _, row in boxes.iterrows()
            )
        n_written += 1

    return n_written

In [12]:
def copy_images_split(df, split_name):
    """Copy the ``.jpg`` of every image in ``df`` into the split's image folder.

    Sources are always read from ``TRAIN_IMAGES``; the destination is
    ``PROCESSED_DATA / split_name / "images"`` (created if missing).
    Images whose source file does not exist are skipped without notice.

    Returns:
        The number of files actually copied.
    """
    dest_dir = PROCESSED_DATA / split_name / "images"
    dest_dir.mkdir(parents=True, exist_ok=True)

    n_copied = 0
    for image_id in df['image_id'].unique():
        filename = f"{image_id}.jpg"
        src = TRAIN_IMAGES / filename  # every split is drawn from the train folder

        if not src.exists():
            continue

        shutil.copy2(src, dest_dir / filename)
        n_copied += 1

    return n_copied

# Convert both splits to YOLO coordinates
train_yolo, val_yolo = map(to_yolo, (train_final, val_final))

print("✅ Coordenadas convertidas a YOLO")

✅ Coordenadas convertidas a YOLO


In [13]:
# Write the .txt label files for each split
train_labels = save_yolo_labels(train_yolo, 'train')
val_labels = save_yolo_labels(val_yolo, 'val')

# Report how many label files each split produced
print(
    f"📄 Labels guardados:",
    f"  Train: {train_labels} archivos .txt",
    f"  Val: {val_labels} archivos .txt",
    sep="\n",
)

📄 Labels guardados:
  Train: 1468 archivos .txt
  Val: 164 archivos .txt


In [14]:
# Copy the images that belong to each split next to their labels
train_copied = copy_images_split(train_yolo, 'train')
val_copied = copy_images_split(val_yolo, 'val')

# Report how many images were copied per split
print(
    f"📁 Imágenes copiadas:",
    f"  Train: {train_copied} imágenes",
    f"  Val: {val_copied} imágenes",
    sep="\n",
)

📁 Imágenes copiadas:
  Train: 1468 imágenes
  Val: 164 imágenes


In [15]:
# Destination of the dataset description file
yaml_path = PROCESSED_DATA / "dataset.yaml"

# Dataset description: root folder, image sub-folder per split, class count and class names
dataset_config = {
    'path': str(PROCESSED_DATA),
    'train': 'train/images',
    'val': 'val/images',
    'nc': len(CANCER_CLASSES),
    'names': CANCER_CLASSES
}

# Serialize in block style (default_flow_style=False)
with yaml_path.open('w') as f:
    yaml.dump(dataset_config, f, default_flow_style=False)

print(f"⚙️ Dataset YAML creado: {yaml_path}")

⚙️ Dataset YAML creado: C:\Users\Sebastian\Desktop\MachineLearningPC2\TF_Machine_Learning_1888\datasets\processed\images\dataset.yaml
