# 01 - Construir dataset de entrenamiento

Este notebook unifica los 4 CSV de transiciones (`EI`, `IE`, `ZE`, `EZ`) en un solo dataset etiquetado para entrenar AutoGluon.

## Salida esperada

- `modeling/data/processed/transition_dataset.csv`
- Columna objetivo: `transition_label`

In [None]:
from pathlib import Path
import re

import pandas as pd

In [None]:
def find_project_root(start: Path) -> Path:
    """Return the nearest directory at or above *start* that is the project root.

    A directory qualifies as the project root when it contains both a
    ``data`` entry and a ``modeling`` entry. The search begins at the
    resolved *start* path and proceeds upward through its parents.

    Args:
        start: Path from which to begin the upward search.

    Returns:
        The first (deepest) qualifying directory.

    Raises:
        FileNotFoundError: If no directory on the way up to the filesystem
            root contains both entries.
    """
    origin = start.resolve()
    required_entries = ("data", "modeling")
    root = next(
        (
            folder
            for folder in (origin, *origin.parents)
            if all((folder / entry).exists() for entry in required_entries)
        ),
        None,
    )
    if root is None:
        raise FileNotFoundError("No se encontro la raiz del proyecto.")
    return root


# Project root, located by searching upward from the current working directory.
PROJECT_ROOT = find_project_root(Path.cwd())
# Directory that holds the per-transition input CSV files.
INPUT_DIR = PROJECT_ROOT.joinpath("data")
# Destination of the merged, labeled dataset.
OUTPUT_PATH = PROJECT_ROOT.joinpath(
    "modeling", "data", "processed", "transition_dataset.csv"
)

# Input file name -> value written to the `transition_label` column.
LABEL_MAPPING = {
    "data_ei.csv": "EI",
    "data_ie.csv": "IE",
    "data_ze.csv": "ZE",
    "data_ez.csv": "EZ",
}

# Bare expression so the notebook cell displays the resolved root.
PROJECT_ROOT

In [None]:
# Candidate names for the local-position column of an input CSV.
POSITION_COLUMNS = ["Intron_Start", "Exon_Start", "First_Exon_Start", "Last_Exon_End"]
# Matches column names of the form "B<digits>" (B1, B2, ...) and captures the
# number. Inside a raw string the digit class is written with a single
# backslash; r"\\d" would match a literal backslash followed by "d".
B_COLUMN_REGEX = re.compile(r"^B(\d+)$")


def get_b_columns(columns: list[str]) -> list[str]:
    """Return the ``B<number>`` column names sorted by their numeric suffix.

    Names that do not match ``B_COLUMN_REGEX`` are ignored. Sorting is
    numeric, so ``B2`` comes before ``B10``; names with the same number
    (e.g. ``B1`` and ``B01``) keep their input order.

    Args:
        columns: Column names to inspect.

    Returns:
        The matching names ordered by ascending number; an empty list when
        nothing matches.
    """
    numbered = [
        (int(match.group(1)), name)
        for name in columns
        if (match := B_COLUMN_REGEX.match(name))
    ]
    # Sort on the number only so that ties preserve the original column order.
    return [name for _, name in sorted(numbered, key=lambda item: item[0])]


def normalize_position_column(df: pd.DataFrame, source_name: str) -> pd.DataFrame:
    """Rename the frame's single position column to ``local_position``.

    Args:
        df: Frame expected to contain exactly one of ``POSITION_COLUMNS``.
        source_name: Name of the data source, used only in the error message.

    Returns:
        A renamed copy of *df*, as produced by ``DataFrame.rename``.

    Raises:
        ValueError: If zero or more than one of ``POSITION_COLUMNS`` is
            present in *df*.
    """
    matches = []
    for name in POSITION_COLUMNS:
        if name in df.columns:
            matches.append(name)

    if len(matches) == 1:
        (position_column,) = matches
        return df.rename(columns={position_column: "local_position"})

    raise ValueError(
        f"Se esperaba exactamente una columna de posicion local en {source_name}. Encontradas: {matches}"
    )


def load_labeled_dataframe(csv_path: Path, label: str) -> pd.DataFrame:
    """Load one transition CSV, normalize it and attach its class label.

    The position column is renamed to ``local_position``, every ``B<number>``
    column is cleaned (missing values become ``"n"``, values are converted to
    pandas strings, stripped and lower-cased), and a ``transition_label``
    column holding *label* is added.

    Args:
        csv_path: Location of the CSV file to read.
        label: Value stored in ``transition_label`` for every row.

    Returns:
        The normalized, labeled frame.

    Raises:
        FileNotFoundError: If *csv_path* does not exist.
        ValueError: If the parsed frame is empty, if it does not contain
            exactly one position column, or if it has no ``B<number>``
            columns.
    """
    if not csv_path.exists():
        raise FileNotFoundError(f"No existe el archivo: {csv_path}")

    raw = pd.read_csv(csv_path, low_memory=False)
    if raw.empty:
        raise ValueError(f"El archivo esta vacio: {csv_path}")

    table = normalize_position_column(raw, csv_path.name)
    sequence_columns = get_b_columns(list(table.columns))
    if not sequence_columns:
        raise ValueError(f"No se encontraron columnas B1..Bn en {csv_path}")

    for name in sequence_columns:
        as_text = table[name].fillna("n").astype("string")
        table[name] = as_text.str.strip().str.lower()

    table["transition_label"] = label
    return table

In [None]:
# Load every labeled input file, in the order declared by LABEL_MAPPING.
frames = [
    load_labeled_dataframe(INPUT_DIR / file_name, label)
    for file_name, label in LABEL_MAPPING.items()
]

merged = pd.concat(frames, ignore_index=True, sort=False)
b_columns = get_b_columns(merged.columns.tolist())

# Normalize again after merging: a B column that is absent from some of the
# input frames is filled with NaN by the concatenation.
for b_name in b_columns:
    text_values = merged[b_name].fillna("n").astype("string")
    merged[b_name] = text_values.str.strip().str.lower()

# Final layout: identifying columns that exist, then B columns, then the
# label, then whatever other columns remain (in their current order).
ordered_prefix = ["gene_id", "chromosome", "global_position", "local_position"]
ordered_columns = [
    *(name for name in ordered_prefix if name in merged.columns),
    *b_columns,
    "transition_label",
]
extra_columns = [name for name in merged.columns if name not in ordered_columns]

column_order = ordered_columns + extra_columns
dataset = merged[column_order]
# Bare expression so the notebook cell displays a preview.
dataset.head()

In [None]:
# Make sure the destination directory exists, then write the merged dataset.
output_dir = OUTPUT_PATH.parent
output_dir.mkdir(parents=True, exist_ok=True)
dataset.to_csv(OUTPUT_PATH, index=False)

# Summary: output location, row count and per-class row counts
# (missing labels, if any, are counted as well).
class_counts = dataset["transition_label"].value_counts(dropna=False)
row_total = len(dataset)

print(f"Dataset guardado en: {OUTPUT_PATH}")
print(f"Total de filas: {row_total}")
print("Distribucion de clases:")
print(class_counts.to_string())