<a href="https://colab.research.google.com/github/0somens/Analysis/blob/main/Correcci%C3%B3n_Dataset_Ropa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import os, shutil, random, shutil, traceback
from sklearn.model_selection import train_test_split

from pathlib import Path
from collections import defaultdict


In [1]:
# Mount Google Drive at /content/drive so the dataset paths configured below
# (which live under /content/drive/MyDrive) are reachable from this runtime.
# The google.colab package is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
# Source root: main() treats every sub-folder of this directory as one class.
SRC_DIR = "/content/drive/MyDrive/Clothes"  # folder where you will upload the clothing dataset
# Destination roots for each split; process_class() creates one sub-folder per
# class inside each of them. main() aborts if any of these roots is missing.
DEST_TRAIN = "/content/drive/MyDrive/DatasetImg/Train"
DEST_VAL   = "/content/drive/MyDrive/DatasetImg/Validation"
DEST_TEST  = "/content/drive/MyDrive/DatasetImg/Test"

In [2]:
# Fractions of each class assigned to every split; split_counts() asserts that
# there are exactly three values and that they sum to 1.0.
RATIOS = (0.70, 0.15, 0.15)  # train, val, test
MOVE_INSTEAD_OF_COPY = False  # True => move files; False => copy them
# Seed for Python's global random generator (see random.seed(SEED)).
SEED = 42
# Lower-case file suffixes (including the dot) accepted by gather_files().
VALID_EXT = {".jpg", ".jpeg", ".png", ".bmp", ".gif", ".tif", ".tiff"}

In [6]:
# Seed Python's global random generator; process_class() shuffles each class's
# file list with random.shuffle(), which draws from this generator.
random.seed(SEED)

In [7]:
def ensure_dir(path):
    """Create ``path`` as a directory, including any missing parents.

    An already existing directory is left untouched and does not raise.
    """
    os.makedirs(name=path, mode=0o777, exist_ok=True)


In [8]:
def unique_dest_path(dest_dir, filename):
    """Return a path inside ``dest_dir`` that does not exist yet.

    The plain ``filename`` is used when it is free. Otherwise a numeric suffix
    is inserted before the extension (``name_1.ext``, ``name_2.ext``, ...) and
    the first candidate that does not exist on disk is returned.

    Args:
        dest_dir: Directory the file is going to be written to.
        filename: Desired file name (no directory part).

    Returns:
        str: ``dest_dir`` joined with the first free candidate name.
    """
    target = os.path.join(dest_dir, filename)
    if not os.path.exists(target):
        return target

    stem, suffix = os.path.splitext(filename)
    counter = 1
    while True:
        target = os.path.join(dest_dir, f"{stem}_{counter}{suffix}")
        if not os.path.exists(target):
            return target
        counter += 1


In [9]:
def gather_files(folder, valid_ext=None):
    """Return the image files located directly inside ``folder``, sorted by name.

    Only regular files whose suffix (compared case-insensitively) is contained
    in ``valid_ext`` are returned; sub-directories are not descended into.

    Args:
        folder: Directory to scan.
        valid_ext: Optional collection of lower-case suffixes including the
            leading dot (e.g. ``{".jpg", ".png"}``). When omitted, the
            module-level ``VALID_EXT`` is used.

    Returns:
        list[str]: ``folder`` joined with every matching entry name, in sorted
        order of the entry names.
    """
    if valid_ext is None:
        valid_ext = VALID_EXT

    files = []
    # os.listdir() yields entries in arbitrary, filesystem-dependent order.
    # Sorting gives a stable starting order, so shuffling the result with a
    # seeded generator produces the same split on every run.
    for entry in sorted(os.listdir(folder)):
        full = os.path.join(folder, entry)
        if os.path.isfile(full) and Path(entry).suffix.lower() in valid_ext:
            files.append(full)
    return files

In [10]:
def split_counts(n, ratios):
    """Split ``n`` items into (train, val, test) sizes that add up to ``n``.

    Each size starts as ``int(n * ratio)``; whatever is left over after that
    truncation is handed out one item at a time, cycling through train, val,
    test in that order.

    Args:
        n: Total number of items to distribute.
        ratios: Three fractions (train, val, test) that must sum to 1.0
            (checked with an ``assert`` and a tolerance of 1e-6).

    Returns:
        tuple[int, int, int]: ``(train, val, test)`` sizes.
    """
    fractions = list(ratios)
    assert len(fractions) == 3 and abs(sum(fractions) - 1.0) < 1e-6, "Ratios must sum to 1.0"

    # Truncated share for every split, in train / val / test order.
    sizes = [int(n * fraction) for fraction in fractions]

    # Distribute the remainder round-robin, starting with train.
    assigned = sum(sizes)
    slot = 0
    while assigned < n:
        sizes[slot] += 1
        slot = (slot + 1) % 3
        assigned += 1

    return tuple(sizes)

In [11]:
def process_class(class_name, src_class_folder, dest_train, dest_val, dest_test, move=False):
    """Distribute the images of one class over the train/val/test folders.

    The files returned by ``gather_files(src_class_folder)`` are shuffled with
    the global ``random`` generator and sliced according to the module-level
    ``RATIOS``. Each slice is copied (or moved) into
    ``<dest_*>/<class_name>/``; name clashes in the destination are resolved
    through ``unique_dest_path``. A failure on an individual file is printed
    and the remaining files are still processed.

    Args:
        class_name: Name of the class; used as the sub-folder name in every
            destination root.
        src_class_folder: Folder containing the images of this class.
        dest_train: Root folder of the training split.
        dest_val: Root folder of the validation split.
        dest_test: Root folder of the test split.
        move: If true, files are moved with ``shutil.move``; otherwise they are
            copied with ``shutil.copy2``.

    Returns:
        dict: ``{"class", "total", "train", "val", "test"}`` where ``total`` is
        the number of files found and the split entries count the files that
        were transferred without raising.
    """
    images = gather_files(src_class_folder)
    total = len(images)
    if not total:
        print(f"[WARN] Clase '{class_name}' no tiene archivos en: {src_class_folder}")
        return {"class": class_name, "total":0, "train":0, "val":0, "test":0}

    random.shuffle(images)
    n_train, n_val, _ = split_counts(total, RATIOS)
    val_end = n_train + n_val

    # Per-class target folders inside each destination root.
    train_dir = os.path.join(dest_train, class_name)
    val_dir = os.path.join(dest_val, class_name)
    test_dir = os.path.join(dest_test, class_name)
    for folder in (train_dir, val_dir, test_dir):
        ensure_dir(folder)

    # (split name, target folder, files of that split) in processing order;
    # the test split receives everything after the train and val slices.
    plan = (
        ("train", train_dir, images[:n_train]),
        ("val", val_dir, images[n_train:val_end]),
        ("test", test_dir, images[val_end:]),
    )

    counts = {"class": class_name, "total": total, "train":0, "val":0, "test":0}

    for split_name, target_dir, batch in plan:
        for src_path in batch:
            dest_path = unique_dest_path(target_dir, os.path.basename(src_path))
            try:
                transfer = shutil.move if move else shutil.copy2
                transfer(src_path, dest_path)
                counts[split_name] += 1
            except Exception as e:
                # Best effort: report the failing file and carry on.
                print(f"[ERROR] al copiar/mover {src_path} -> {dest_path}: {e}")

    return counts

In [12]:
def main():
    """Split every class folder under ``SRC_DIR`` into train/val/test sets.

    Steps:
      1. Verify that ``SRC_DIR`` and the three destination roots exist; print
         an error and return if any of them is missing.
      2. Collect the immediate sub-folders of ``SRC_DIR`` (one per class), in
         sorted order; print an error and return if there are none.
      3. Reseed the global ``random`` generator from ``SEED``.
      4. Run ``process_class`` for each class and print per-class counts
         followed by a global summary.

    Returns:
        None. All results are reported through ``print``.
    """
    # Initial checks: every configured path must already exist.
    for p in (SRC_DIR, DEST_TRAIN, DEST_VAL, DEST_TEST):
        if not os.path.exists(p):
            print(f"[ERROR] No existe ruta: {p}")
            return

    # os.listdir() returns entries in arbitrary order. Classes consume the
    # shared random generator one after another, so a stable class order is
    # required for the split to be repeatable.
    class_folders = sorted(
        d for d in os.listdir(SRC_DIR)
        if os.path.isdir(os.path.join(SRC_DIR, d))
    )

    if not class_folders:
        print(f"[ERROR] No se encontraron subcarpetas en {SRC_DIR}. Asegúrate de que cada clase tiene su propia carpeta.")
        return

    # Reseed here so that calling main() again in the same kernel starts from
    # the same generator state instead of continuing where the last run ended.
    random.seed(SEED)

    summary = []
    totals = defaultdict(int)

    print(f"Procesando {len(class_folders)} clases desde: {SRC_DIR}\nRatios (train/val/test): {RATIOS}\nMove mode: {MOVE_INSTEAD_OF_COPY}\n")

    for cls in class_folders:
        src_cls_folder = os.path.join(SRC_DIR, cls)
        counts = process_class(cls, src_cls_folder, DEST_TRAIN, DEST_VAL, DEST_TEST, move=MOVE_INSTEAD_OF_COPY)
        summary.append(counts)
        # Accumulate the per-class numbers into the global totals.
        for key in ("total", "train", "val", "test"):
            totals[key] += counts[key]
        print(f"Clase: {cls}  -> total: {counts['total']}  train: {counts['train']}  val: {counts['val']}  test: {counts['test']}")

    print("\n=== RESUMEN GLOBAL ===")
    print(f"Clases procesadas: {len(summary)}")
    print(f"Total archivos procesados: {totals['total']}")
    print(f"Train: {totals['train']}  |  Validation: {totals['val']}  |  Test: {totals['test']}")
    print("\nHecho.")


In [16]:
# Run the split when this cell/script is executed directly
# (i.e. when __name__ is "__main__").
if __name__ == "__main__":
    main()

Procesando 12 clases desde: /content/drive/MyDrive/Clothes
Ratios (train/val/test): (0.7, 0.15, 0.15)
Move mode: False

Clase: Jaket_Denim  -> total: 500  train: 350  val: 75  test: 75
Clase: Rok  -> total: 500  train: 350  val: 75  test: 75
Clase: Jeans  -> total: 500  train: 350  val: 75  test: 75
Clase: Jaket  -> total: 500  train: 350  val: 75  test: 75
Clase: Jaket_Olahraga  -> total: 500  train: 350  val: 75  test: 75
Clase: Polo  -> total: 500  train: 350  val: 75  test: 75
Clase: Sweter  -> total: 500  train: 350  val: 75  test: 75
Clase: Hoodie  -> total: 500  train: 350  val: 75  test: 75
Clase: Gaun  -> total: 500  train: 350  val: 75  test: 75
Clase: Mantel  -> total: 500  train: 350  val: 75  test: 75
Clase: Kaos  -> total: 500  train: 350  val: 75  test: 75
Clase: Kemeja  -> total: 500  train: 350  val: 75  test: 75

=== RESUMEN GLOBAL ===
Clases procesadas: 12
Total archivos procesados: 6000
Train: 4200  |  Validation: 900  |  Test: 900

Hecho.
