# Phase 2 model, trained on 716 pages

In [None]:
import os
import shutil
import json
from sklearn.model_selection import train_test_split
from ultralytics import YOLO
import yaml

In [None]:
# ============ Configuration ============
# Project folder shared by the three paths below.
_PROJECT_DIR = r'C:\Users\juans\Documents\proarchitecg\version_2_docker\model_clasification_image_v2'

# Path to the JSON exported from Label Studio (Common Raw JSON format)
LABEL_JSON = _PROJECT_DIR + r'\train.json'

# Folder that holds the original images
IMAGES_ROOT = _PROJECT_DIR + r'\images'

# Output folder (prepared train/val dataset)
OUTPUT_ROOT = _PROJECT_DIR + r'\dataset_cls'

# Fraction of the stratified samples held out for validation
VAL_SPLIT = 0.2
# Pretrained weights passed to YOLO()
PRETRAINED = 'yolov8n-cls.pt'
# Training settings passed to model.train()
EPOCHS = 50
BATCH_SIZE = 16
IMG_SIZE = 224

In [None]:
# =======================================

def load_annotations(label_json_path):
    """
    Read a Label Studio JSON export and return a list of (image_name, label) tuples.

    For each item the function reads:
      - item['data']['image']: path or URL of the image; only its basename is kept
      - the first entry of item['annotations'] (or of item['predictions'] when
        there are no annotations), then result[0]['value']['choices'][0] as the
        class name

    Items that lack a usable image reference, annotation, result or choice are
    skipped instead of raising, so one malformed task does not abort the load.

    Args:
        label_json_path: path to the exported JSON file (read as UTF-8).

    Returns:
        List of (image_basename, label) tuples in file order.
    """
    with open(label_json_path, encoding='utf-8') as f:
        raw = json.load(f)

    samples = []
    for item in raw:
        if not isinstance(item, dict):
            continue

        # Image reference lives in data.image; without it the sample is unusable.
        data = item.get('data')
        img_ref = data.get('image') if isinstance(data, dict) else None
        if not img_ref or not isinstance(img_ref, str):
            continue
        img_name = os.path.basename(img_ref)
        if not img_name:
            continue

        # Annotated class: first annotation -> first result -> choices[0]
        ann = item.get('annotations') or item.get('predictions')
        if not ann:
            continue
        result = ann[0].get('result') if isinstance(ann[0], dict) else None
        if not isinstance(result, list) or not result:
            continue
        value = result[0].get('value') if isinstance(result[0], dict) else None
        if not isinstance(value, dict) or not value.get('choices'):
            continue

        samples.append((img_name, value['choices'][0]))
    return samples

def prepare_dataset_from_labels():
    """Build the train/val classification folder tree from the Label Studio export.

    Reads the annotations from LABEL_JSON, splits them into train and
    validation sets and copies the images from IMAGES_ROOT into
    OUTPUT_ROOT/<split>/<label>/. Any existing OUTPUT_ROOT is deleted first.
    A data.yaml describing the dataset is written into OUTPUT_ROOT and a
    summary is printed.

    Labels with a single image cannot be stratified, so they are kept out of
    the split and placed in the training set only.
    """
    from collections import defaultdict

    # 1) Load annotations. Exact duplicate (image, label) pairs are dropped
    #    (order preserved) so the same file cannot land in both train and val.
    samples = list(dict.fromkeys(load_annotations(LABEL_JSON)))

    # 2) Group image names by label
    by_label = defaultdict(list)
    for img_name, label in samples:
        by_label[label].append(img_name)

    # 3) Separate single-image labels from multi-image labels
    multi, single = [], []
    for label, imgs in by_label.items():
        target = multi if len(imgs) > 1 else single
        target.extend((img, label) for img in imgs)

    # 4) Stratified split over the multi-image labels only
    if multi:
        train_m, val_m = train_test_split(
            multi,
            test_size=VAL_SPLIT,
            stratify=[lbl for _, lbl in multi],
            random_state=42,
        )
    else:
        train_m, val_m = [], []

    # 5) Every single-image label goes to train
    # NOTE(review): such classes have no folder under val/ — confirm the
    # trainer maps class indices consistently between train and val.
    train_samples = train_m + single
    val_samples = val_m

    # 6) Start from a clean output folder and copy the images
    if os.path.exists(OUTPUT_ROOT):
        shutil.rmtree(OUTPUT_ROOT)
    os.makedirs(OUTPUT_ROOT)

    def copy_split(split, split_name):
        """Copy one split into place; return (images placed, missing image names)."""
        placed, missing = 0, []
        for img_name, label in split:
            src = os.path.join(IMAGES_ROOT, img_name)
            if not os.path.exists(src):
                # Record the gap instead of dropping it silently; the class
                # folder is only created once there is a file to put in it.
                missing.append(img_name)
                continue
            dst_dir = os.path.join(OUTPUT_ROOT, split_name, label)
            os.makedirs(dst_dir, exist_ok=True)
            dst = os.path.join(dst_dir, img_name)
            if not os.path.exists(dst):
                shutil.copy(src, dst)
            placed += 1
        return placed, missing

    train_count, train_missing = copy_split(train_samples, 'train')
    val_count, val_missing = copy_split(val_samples, 'val')

    # 7) Write data.yaml
    labels = sorted(by_label.keys())
    data_dict = {
        'path': OUTPUT_ROOT,
        'train': 'train',
        'val': 'val',
        'nc': len(labels),
        'names': labels,
    }
    with open(os.path.join(OUTPUT_ROOT, 'data.yaml'), 'w', encoding='utf-8') as f:
        yaml.dump(data_dict, f, sort_keys=False)

    # Summary reports the images actually placed on disk, not just annotated.
    print("Dataset preparado:")
    print(f" • Clases totales: {labels}")
    print(f" • Train: {train_count} imágenes")
    print(f" • Val:   {val_count} imágenes")
    not_found = train_missing + val_missing
    if not_found:
        print(f" • Aviso: {len(not_found)} imágenes no encontradas en {IMAGES_ROOT}: {not_found}")
    print(f" • data.yaml generado en {OUTPUT_ROOT}")
def train_model():
    """Train the pretrained YOLO classifier on the prepared dataset folder.

    Loads the weights named by PRETRAINED and trains with the dataset
    directory OUTPUT_ROOT (the folder itself, not a YAML path), using EPOCHS,
    IMG_SIZE and BATCH_SIZE. Prints a completion message afterwards.
    """
    classifier = YOLO(PRETRAINED)
    train_kwargs = {
        'data': OUTPUT_ROOT,  # folder that holds data.yaml, train/ and val/
        'epochs': EPOCHS,
        'imgsz': IMG_SIZE,
        'batch': BATCH_SIZE,
        'name': 'person_cls',
    }
    classifier.train(**train_kwargs)
    print("✅ Entrenamiento finalizado. Revisa runs/classify/person_cls")

if __name__ == "__main__":
    # Script entry point: build the dataset folders first, then train on them.
    for step in (prepare_dataset_from_labels, train_model):
        step()