# 01 - Preparación de datos para entrenamiento

Este notebook prepara los CSV de transiciones genómicas para el entrenamiento de modelos.

Objetivos:
- Cargar `data_ei.csv`, `data_ie.csv`, `data_ze.csv`, `data_ez.csv`.
- Construir una columna `sequence` concatenando `B1..Bn`.
- Soportar datasets con o sin columna `label` binaria (`true/false`, `1/0`, etc.).
- Agregar `transition_type` (EI/IE/ZE/EZ).
- Exportar datasets listos para modelado.

In [None]:
from pathlib import Path
import re

import numpy as np
import pandas as pd

# Directory holding both the input CSVs and the generated datasets,
# relative to the notebook's working directory.
DATA_DIR = Path("../data")

# Output files written by the export cell.
OUTPUT_FULL = DATA_DIR.joinpath("training_dataset_long.csv")
OUTPUT_MULTICLASS = DATA_DIR.joinpath("training_dataset_multiclass.csv")

# Transition code -> CSV that holds the samples for that transition.
TRANSITION_FILES = {
    "EI": DATA_DIR.joinpath("data_ei.csv"),
    "IE": DATA_DIR.joinpath("data_ie.csv"),
    "ZE": DATA_DIR.joinpath("data_ze.csv"),
    "EZ": DATA_DIR.joinpath("data_ez.csv"),
}

# Lower-cased column names that are recognised as the label column.
LABEL_CANDIDATES = {
    "label",
    "target",
    "is_transition",
    "y",
}


def find_label_column(df: pd.DataFrame) -> str | None:
    """Return the first column of ``df`` whose name is a known label alias.

    Column names are compared case-insensitively against
    ``LABEL_CANDIDATES``. Each name is passed through ``str`` before
    lower-casing (the same treatment ``sort_b_columns`` applies), so a
    non-string column label is simply not matched instead of raising
    ``AttributeError``.

    Args:
        df: Frame whose columns are inspected, in their existing order.

    Returns:
        The matching column label exactly as it appears in ``df.columns``,
        or ``None`` when no column matches.
    """
    return next(
        (col for col in df.columns if str(col).lower() in LABEL_CANDIDATES),
        None,
    )


def sort_b_columns(columns: list[str]) -> list[str]:
    """Select the ``B<number>`` columns and return them in numeric order.

    Only names that match ``B`` followed by digits in full are kept (the
    match is case-sensitive); every other name is dropped. Ordering uses the
    integer value of the suffix, so ``B2`` sorts before ``B10``. Names with
    the same numeric value keep their input order.

    Args:
        columns: Candidate column names; each is converted with ``str``
            before matching.

    Returns:
        The matching names, sorted by their numeric suffix.
    """
    pattern = re.compile(r"B(\d+)")
    indexed = [
        (int(hit.group(1)), name)
        for name in columns
        if (hit := pattern.fullmatch(str(name)))
    ]
    # Sort on the number only so ties keep their original relative order.
    indexed.sort(key=lambda pair: pair[0])
    return [name for _, name in indexed]


def normalize_binary_label(series: pd.Series) -> pd.Series:
    """Convert a binary label column to ``int64`` values 0 and 1.

    Boolean-dtype input is cast directly. Anything else is compared as
    stripped, lower-cased text against the spellings ``true/false``,
    ``1/0``, ``t/f`` and ``yes/no``. Values not covered by that table are
    parsed as numbers and accepted only when they equal 0 or 1 (for example
    ``"1.0"``).

    Args:
        series: Raw label column.

    Returns:
        A series of dtype ``int64`` containing only 0 and 1.

    Raises:
        ValueError: If any value is not a recognised binary label. This
            includes missing values and numbers other than 0 and 1.
    """
    if pd.api.types.is_bool_dtype(series):
        return series.astype("int64")

    text = series.astype(str).str.strip().str.lower()
    mapped = text.map(
        {
            "true": 1,
            "false": 0,
            "1": 1,
            "0": 0,
            "t": 1,
            "f": 0,
            "yes": 1,
            "no": 0,
        }
    )

    numeric = pd.to_numeric(text, errors="coerce")
    # Keep the numeric fallback strictly binary: without this filter a value
    # such as "2" would pass through unchanged and "0.5" would be truncated
    # to 0 by the final integer cast.
    numeric = numeric.where(numeric.isin([0, 1]))
    merged = mapped.where(mapped.notna(), numeric)

    if merged.isna().any():
        unknown = sorted(text[merged.isna()].unique().tolist())
        raise ValueError(f"Valores de label no reconocidos: {unknown}")

    return merged.astype("int64")


def build_transition_dataframe(file_path: Path, transition_type: str) -> pd.DataFrame:
    """Load one transition CSV and reshape it into the common training layout.

    The ``B1..Bn`` columns are concatenated, in numeric order, into a single
    lower-cased ``sequence`` string. The optional metadata columns
    ``gene_id``, ``chromosome`` and ``global_position`` are copied when
    present. The first remaining column that is not metadata, a ``B`` column
    or the label is treated as the local position; its name is recorded in
    ``local_position_col`` and its values in ``local_position``.

    Args:
        file_path: CSV file to read.
        transition_type: Transition code stored in ``transition_type`` and
            used as the positive class name in ``target_multiclass``.

    Returns:
        One row per input row with the metadata columns that exist, plus
        ``local_position_col``, ``local_position``, ``sequence``,
        ``sequence_len``, ``transition_type``, ``source_file``, ``label``
        and ``target_multiclass``. An empty input yields an empty frame with
        the full column set.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the file has no ``B<number>`` columns, or if its
            label column holds values that cannot be normalised.
    """
    if not file_path.exists():
        raise FileNotFoundError(f"No existe archivo: {file_path}")

    df = pd.read_csv(file_path)
    if df.empty:
        print(f"Advertencia: {file_path.name} esta vacio")
        return pd.DataFrame(
            columns=[
                "gene_id",
                "chromosome",
                "global_position",
                "local_position_col",
                "local_position",
                "sequence",
                "sequence_len",
                "transition_type",
                "label",
                "target_multiclass",
                "source_file",
            ]
        )

    b_cols = sort_b_columns(df.columns.tolist())
    if not b_cols:
        raise ValueError(f"{file_path.name} no tiene columnas B1..Bn")

    label_col = find_label_column(df)

    # Missing bases become empty strings so they do not end up as "nan"
    # inside the joined sequence.
    sequence = (
        df[b_cols]
        .fillna("")
        .astype(str)
        .agg("".join, axis=1)
        .str.lower()
    )

    metadata_cols = [c for c in ["gene_id", "chromosome", "global_position"] if c in df.columns]

    known_cols = set(metadata_cols + b_cols)
    if label_col is not None:
        known_cols.add(label_col)

    # Heuristic: the first column that is not otherwise accounted for is
    # taken to be the local position.
    position_candidates = [c for c in df.columns if c not in known_cols]
    local_position_col = position_candidates[0] if position_candidates else None

    # Start from the input's index so scalar assignments below broadcast to
    # every row even when none of the metadata columns exist. An index-less
    # frame would create a zero-row column that turns into NaN as soon as
    # the first Series is assigned.
    out = pd.DataFrame(index=df.index)
    for col in metadata_cols:
        out[col] = df[col]

    if local_position_col is not None:
        out["local_position_col"] = local_position_col
        out["local_position"] = df[local_position_col]
    else:
        out["local_position_col"] = pd.NA
        out["local_position"] = pd.Series([pd.NA] * len(df), index=df.index, dtype="object")

    out["sequence"] = sequence
    out["sequence_len"] = sequence.str.len()
    out["transition_type"] = transition_type
    out["source_file"] = file_path.name

    if label_col is not None:
        out["label"] = normalize_binary_label(df[label_col])
    else:
        # Without a label column every row is assumed to be a positive
        # sample for this transition.
        out["label"] = 1

    out["target_multiclass"] = np.where(out["label"] == 1, transition_type, "NONE")

    return out

In [None]:
# Build one normalised frame per transition file and report its label mix.
frames = []
for transition_type, file_path in TRANSITION_FILES.items():
    transition_df = build_transition_dataframe(file_path, transition_type)

    if transition_df.empty:
        label_dist = {}
    else:
        label_dist = transition_df["label"].value_counts(dropna=False).to_dict()
    print(f"{transition_type}: {len(transition_df)} filas | labels={label_dist}")

    frames.append(transition_df)

# Stack all transitions into a single frame with a fresh 0..n-1 index.
dataset = pd.concat(frames, ignore_index=True)
print(f"\nTotal filas: {len(dataset)}")

# Last expression of the cell: shows a preview in the notebook.
dataset.head(5)

In [None]:
# Sanity checks on the unified dataset: class balance, duplicates,
# unexpected sequence characters and sequence lengths.
print("Distribucion por transition_type:")
print(dataset["transition_type"].value_counts(dropna=False))

print("\nDistribucion por label:")
print(dataset["label"].value_counts(dropna=False))

print("\nCruce transition_type x label:")
print(pd.crosstab(dataset["transition_type"], dataset["label"], margins=True))

# The metadata columns are optional in the input files, so restrict the
# duplicate key to the columns that actually exist; a missing one would
# otherwise raise KeyError.
duplicate_key = [
    col
    for col in ["gene_id", "chromosome", "global_position", "transition_type", "sequence", "label"]
    if col in dataset.columns
]
duplicates = dataset.duplicated(subset=duplicate_key).sum()
print(f"\nDuplicados exactos: {duplicates}")

# Sequences are lower-cased when built, so only a/c/g/t/n are expected.
invalid_chars = dataset["sequence"].str.contains(r"[^acgtn]", regex=True, na=False).sum()
print(f"Secuencias con caracteres fuera de A/C/G/T/N: {invalid_chars}")

print("\nLongitud de secuencias por transition_type:")
print(dataset.groupby("transition_type")["sequence_len"].describe()[["min", "max", "mean"]])

In [None]:
# Ensure the output directory exists; this is a no-op when it already does.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# Long format: the complete, unfiltered dataset.
dataset.to_csv(OUTPUT_FULL, index=False)

# Multiclass format: only the rows whose label is 1.
is_positive = dataset["label"] == 1
dataset_multiclass = dataset.loc[is_positive].copy()
dataset_multiclass.to_csv(OUTPUT_MULTICLASS, index=False)

print(f"Guardado: {OUTPUT_FULL} ({len(dataset)} filas)")
print(
    "Guardado: "
    f"{OUTPUT_MULTICLASS} ({len(dataset_multiclass)} filas, solo label=1 para multiclase)"
)

## Próximo paso

Con los CSV unificados ya puedes crear `02_baseline_modelo.ipynb`.

Recomendaciones:
1. Si `label` es binario por archivo, puedes entrenar 4 modelos binarios (uno por tipo de transición).
2. Si quieres un solo modelo multiclase EI/IE/ZE/EZ, usa `training_dataset_multiclass.csv`.
3. Mantén un conjunto de prueba separado y usa `stratify` en el split.