# 02 - Entrenar modelo con AutoGluon

Este notebook entrena un clasificador tabular para predecir la clase de transición (`EI`, `IE`, `ZE`, `EZ`) usando `transition_dataset.csv`.

In [None]:
from pathlib import Path
import json

import pandas as pd
from autogluon.tabular import TabularPredictor

In [None]:
def find_project_root(start: Path) -> Path:
    """Return the nearest directory that looks like the project root.

    The search begins at the resolved ``start`` path and then walks through
    each of its ancestors, closest first. A directory qualifies when it
    contains both a ``data`` entry and a ``modeling`` entry.

    Args:
        start: Path from which the upward search begins.

    Returns:
        The first qualifying directory (``start`` itself is checked too).

    Raises:
        FileNotFoundError: If neither ``start`` nor any ancestor qualifies.
    """
    required_entries = ("data", "modeling")
    origin = start.resolve()

    for folder in (origin, *origin.parents):
        # all() short-circuits, so "modeling" is only probed when "data" exists.
        if all((folder / entry).exists() for entry in required_entries):
            return folder

    raise FileNotFoundError("No se encontro la raiz del proyecto.")


# Locate the repository root by walking up from the current working directory
# (find_project_root raises FileNotFoundError when no ancestor qualifies).
PROJECT_ROOT = find_project_root(Path.cwd())
# CSV that the data-loading cell reads with pandas.
DATASET_PATH = PROJECT_ROOT / "modeling" / "data" / "processed" / "transition_dataset.csv"
# Directory handed to TabularPredictor as its `path`.
MODEL_DIR = PROJECT_ROOT / "modeling" / "artifacts" / "autogluon_model"
# Directory where leaderboard.csv and metrics.json are written.
REPORT_DIR = PROJECT_ROOT / "modeling" / "outputs"

# Name of the target column to predict.
LABEL_COLUMN = "transition_label"
# Columns removed from the frame before training, when they are present.
# NOTE(review): presumably identifier/coordinate columns that should not be
# used as features -- confirm against the dataset-building notebook.
DROP_COLUMNS = ["gene_id", "chromosome", "global_position", "local_position"]
# Forwarded to predictor.fit(time_limit=...); AutoGluon documents this value
# as wall-clock seconds.
TIME_LIMIT = 900
# Forwarded to predictor.fit(presets=...).
PRESETS = "medium_quality"
# Forwarded to TabularPredictor(eval_metric=...) and recorded in metrics.json.
EVAL_METRIC = "accuracy"
# Fraction of each class sent to the test split by split_train_test;
# a value <= 0 disables the split entirely.
TEST_SIZE = 0.2
# random_state used for every shuffle in split_train_test.
SEED = 42

# Bare expression: shows the resolved dataset path as the cell output.
DATASET_PATH

In [None]:
def split_train_test(
    df: pd.DataFrame,
    label_column: str,
    test_size: float,
    seed: int,
) -> "tuple[pd.DataFrame, pd.DataFrame | None]":
    """Split ``df`` into shuffled, label-stratified train and test frames.

    Each class (including the group of rows whose label is missing)
    contributes roughly ``test_size`` of its rows to the test frame, with at
    least one row on each side of the split.

    Args:
        df: Frame to split; it is not modified.
        label_column: Column whose values define the strata.
        test_size: Fraction of every class to place in the test frame.
            Values ``<= 0`` disable the split.
        seed: ``random_state`` used for every shuffle, for reproducibility.

    Returns:
        ``(train_df, test_df)``, both with a fresh ``RangeIndex``.
        ``test_df`` is ``None`` (and ``train_df`` is all of ``df``, shuffled)
        when no split is requested, ``df`` has no rows, or some class has
        fewer than two rows and so cannot appear on both sides.

    Raises:
        KeyError: If ``label_column`` is not a column of ``df``.
    """
    label_counts = df[label_column].value_counts(dropna=False)

    # Cases where a stratified split is impossible or not wanted: hand back
    # the whole frame as training data. The explicit emptiness check avoids
    # falling through to pd.concat([]) below, which raises an unrelated error.
    if test_size <= 0 or label_counts.empty or label_counts.min() < 2:
        return df.sample(frac=1, random_state=seed).reset_index(drop=True), None

    train_parts = []
    test_parts = []

    # dropna=False keeps rows with a missing label as their own stratum, in
    # line with value_counts(dropna=False) above. The groupby default would
    # silently discard those rows from both output frames.
    for _, group in df.groupby(label_column, dropna=False):
        shuffled = group.sample(frac=1, random_state=seed)
        # At least one test row per class, but always leave one for training
        # (every group has >= 2 rows thanks to the guard above).
        requested = int(round(len(shuffled) * test_size))
        test_count = min(max(1, requested), len(shuffled) - 1)

        test_parts.append(shuffled.iloc[:test_count])
        train_parts.append(shuffled.iloc[test_count:])

    # Concatenation leaves rows grouped by class, so shuffle once more.
    train_df = pd.concat(train_parts, ignore_index=True).sample(frac=1, random_state=seed).reset_index(drop=True)
    test_df = pd.concat(test_parts, ignore_index=True).sample(frac=1, random_state=seed).reset_index(drop=True)
    return train_df, test_df


def json_safe(value):
    """Recursively convert ``value`` into something ``json.dump`` accepts.

    Conversion rules:
        * dicts are rebuilt with converted values (keys are left untouched);
        * lists and tuples become lists of converted elements;
        * objects exposing ``.item()`` (numpy-style scalars, one-element
          arrays) are unwrapped to the value ``.item()`` returns;
        * objects whose ``.item()`` fails but that expose ``.tolist()``
          (multi-element arrays) become converted nested lists;
        * if both fail, the object's ``str()`` form is used as a last resort;
        * anything else is returned unchanged.

    Args:
        value: Arbitrary object, typically a metrics dictionary.

    Returns:
        The converted structure.
    """
    if isinstance(value, dict):
        return {k: json_safe(v) for k, v in value.items()}
    # Tuples are included so that numpy scalars nested inside them are also
    # unwrapped; JSON has no tuple type, so a list is the natural target.
    if isinstance(value, (list, tuple)):
        return [json_safe(v) for v in value]
    if hasattr(value, "item"):
        # `.item` is duck-typed, so the exception type depends on the object
        # (ValueError for numpy, others for other libraries). Catching broadly
        # here is a deliberate best-effort: serialisation must not crash.
        try:
            return value.item()
        except Exception:
            pass
        to_list = getattr(value, "tolist", None)
        if callable(to_list):
            try:
                return json_safe(to_list())
            except Exception:
                pass
        return str(value)
    return value

In [None]:
# Load the raw table and fail fast on unusable input.
dataset = pd.read_csv(DATASET_PATH, low_memory=False)
if dataset.empty:
    raise ValueError(f"Dataset vacio: {DATASET_PATH}")

if LABEL_COLUMN not in dataset.columns:
    raise ValueError(f"No existe la columna objetivo '{LABEL_COLUMN}'")

# Only drop configured columns that actually exist, and never the label.
columns_to_drop = [
    column
    for column in DROP_COLUMNS
    if column != LABEL_COLUMN and column in dataset.columns
]
modeling_df = dataset.drop(columns=columns_to_drop, errors="ignore")

# Everything except the label is a model input.
feature_columns = [column for column in modeling_df.columns if column != LABEL_COLUMN]
if not feature_columns:
    raise ValueError("No quedan columnas de entrada para entrenar.")
# Missing labels count as a class here (dropna=False).
if modeling_df[LABEL_COLUMN].nunique(dropna=False) < 2:
    raise ValueError("Se requieren al menos 2 clases para entrenar.")

train_df, test_df = split_train_test(modeling_df, LABEL_COLUMN, TEST_SIZE, SEED)

# Quick summary of the split and the class balance.
print("Train rows:", len(train_df))
print("Test rows:", len(test_df) if test_df is not None else 0)
print(modeling_df[LABEL_COLUMN].value_counts(dropna=False).to_string())

In [None]:
# Make sure both output directories exist before anything is written to them.
MODEL_DIR.mkdir(parents=True, exist_ok=True)
REPORT_DIR.mkdir(parents=True, exist_ok=True)

# Configure the predictor: target column, on-disk location for its artifacts,
# and the metric named by EVAL_METRIC.
predictor = TabularPredictor(
    label=LABEL_COLUMN,
    path=str(MODEL_DIR),
    eval_metric=EVAL_METRIC,
)

# Train on the training split only, using the configured preset and time limit.
predictor.fit(
    train_data=train_df,
    presets=PRESETS,
    time_limit=TIME_LIMIT,
)

# Prefer held-out data for scoring. split_train_test returns None for the test
# frame when no split was made; in that case the scores are computed on the
# training rows themselves and the report is tagged "train" so readers know
# the numbers are not an out-of-sample estimate.
if test_df is not None and not test_df.empty:
    metrics = predictor.evaluate(test_df, silent=True)
    leaderboard = predictor.leaderboard(test_df, silent=True)
    evaluated_on = "test"
    evaluated_rows = len(test_df)
else:
    metrics = predictor.evaluate(train_df, silent=True)
    # NOTE(review): leaderboard() is called without data here; presumably it
    # then reports AutoGluon's internal validation scores -- confirm.
    leaderboard = predictor.leaderboard(silent=True)
    evaluated_on = "train"
    evaluated_rows = len(train_df)

leaderboard_path = REPORT_DIR / "leaderboard.csv"
metrics_path = REPORT_DIR / "metrics.json"

leaderboard.to_csv(leaderboard_path, index=False)

# Everything needed to interpret the run later: inputs, split sizes, which
# split was scored, and the resulting metrics.
metrics_payload = {
    "dataset_path": str(DATASET_PATH),
    "label_column": LABEL_COLUMN,
    "dropped_columns": columns_to_drop,
    "train_rows": len(train_df),
    "test_rows": 0 if test_df is None else len(test_df),
    "evaluated_on": evaluated_on,
    "evaluated_rows": evaluated_rows,
    "eval_metric": EVAL_METRIC,
    # json_safe unwraps values exposing .item() so json.dump can serialise them.
    "metrics": json_safe(metrics),
}

# ensure_ascii=True escapes any non-ASCII characters in the written file.
with metrics_path.open("w", encoding="utf-8") as f:
    json.dump(metrics_payload, f, indent=2, ensure_ascii=True)

print(f"Modelo guardado en: {MODEL_DIR}")
print(f"Leaderboard: {leaderboard_path}")
print(f"Metricas: {metrics_path}")

In [None]:
# Bare expression: shows the first 20 leaderboard rows as the cell output.
leaderboard.head(20)