# 06 - Single-Class Confusion Matrices (EI, IE, ZE, EZ)

Genera una matriz de confusión binaria por cada zona de transición
(EI, IE, ZE, EZ) usando el conjunto de **test** (misma partición
`GroupShuffleSplit` agrupada por `gene_id` que el entrenamiento, con
`test_size=0.2` y `random_state=42`) y el modelo campeón de cada
transición registrado en `model_registry.json`.

In [None]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.model_selection import GroupShuffleSplit
from sklearn.metrics import confusion_matrix, classification_report
from autogluon.tabular import TabularPredictor

In [None]:
# The project root is taken as the grandparent of the current working
# directory, so this cell depends on where the notebook is launched from.
project_root = Path().resolve().parents[1]

# The registry lists, under "models", one entry per transition with the
# location ("path") of the model to evaluate.
registry_path = project_root / "training/reports/model_registry.json"
with registry_path.open(encoding="utf-8") as f:
    registry = json.load(f)

# Per-transition configuration. "csv" is fixed here; "model_path" is filled
# in from the registry below.
TRANSITIONS = {
    "EI": {"csv": project_root / "data/data_ei.csv"},
    "IE": {"csv": project_root / "data/data_ie.csv"},
    "ZE": {"csv": project_root / "data/data_ze.csv"},
    "EZ": {"csv": project_root / "data/data_ez.csv"},
}

for t, cfg in registry["models"].items():
    if t not in TRANSITIONS:
        # The registry may list models this notebook has no dataset for;
        # skip them instead of failing with a bare KeyError.
        print(f"  (skipping registry entry without dataset: {t})")
        continue
    model_path = Path(cfg["path"])
    # Relative registry paths are interpreted relative to the project root.
    if not model_path.is_absolute():
        model_path = project_root / model_path
    TRANSITIONS[t]["model_path"] = model_path

print("Transitions configuradas:")
for k, v in TRANSITIONS.items():
    # "?" marks a transition for which the registry provided no model.
    print(f"  {k}: csv={v['csv'].name}, model={v.get('model_path', '?')}")

In [None]:
def split_by_gene(data, test_size=0.2, random_state=42):
    """Split ``data`` into train and test frames, grouping rows by gene.

    A single ``GroupShuffleSplit`` draw is made using the ``gene_id`` column
    as the group key.

    Args:
        data: DataFrame containing a ``gene_id`` column.
        test_size: Forwarded to ``GroupShuffleSplit``.
        random_state: Seed forwarded to ``GroupShuffleSplit``.

    Returns:
        A ``(train, test)`` tuple of independent DataFrame copies.
    """
    gss = GroupShuffleSplit(
        test_size=test_size,
        n_splits=1,
        random_state=random_state,
    )
    # Only one split is configured, so the first yielded pair is the split.
    train_rows, test_rows = next(gss.split(data, groups=data["gene_id"]))
    train_part = data.iloc[train_rows].copy()
    test_part = data.iloc[test_rows].copy()
    return train_part, test_part


def prepare_test_set(csv_path: Path) -> pd.DataFrame:
    """Rebuild the held-out test set for one transition CSV.

    The data is split with ``split_by_gene(test_size=0.2, random_state=42)``,
    which is intended to reproduce exactly the partition used in training.

    Args:
        csv_path: CSV file with a ``gene_id`` column, feature columns whose
            names start with ``"B"``, and a boolean-like ``label`` column.

    Returns:
        The test rows with the feature columns and an integer ``label``
        column (1 = positive, 0 = negative); ``gene_id`` is dropped.

    Raises:
        ValueError: If a label cannot be interpreted as true/false or 1/0.
    """
    df = pd.read_csv(csv_path)
    seq_cols = [c for c in df.columns if c.startswith("B")]
    df_model = df[["gene_id"] + seq_cols + ["label"]].copy()

    # Accept the same spellings as normalize_preds so labels and predictions
    # are decoded consistently.
    label_map = {"true": 1, "false": 0, "1": 1, "0": 0}
    labels = df_model["label"].astype(str).str.strip().str.lower().map(label_map)
    unknown = labels.isna()
    if unknown.any():
        # Fail here with the offending values rather than letting NaN labels
        # reach the metric functions.
        bad = sorted(df_model.loc[unknown, "label"].astype(str).unique())
        raise ValueError(f"Unrecognised label value(s) in {csv_path}: {bad}")
    df_model["label"] = labels.astype(int)

    # Only the test partition is needed; the split itself depends on the row
    # order and gene_id groups, not on the label encoding.
    _, test_data = split_by_gene(df_model, test_size=0.2, random_state=42)
    return test_data.drop(columns=["gene_id"])


def normalize_preds(preds):
    """Convert model predictions to a NumPy array of 0/1 integers.

    Predictions are compared as case-insensitive, whitespace-stripped
    strings, so booleans, integers, floats (``1.0``/``0.0``) and their string
    forms are all accepted.

    Args:
        preds: Any sequence or Series of predictions.

    Returns:
        Integer NumPy array with 1 for positive and 0 for negative.

    Raises:
        ValueError: If a prediction is not a recognised true/false value.
    """
    label_map = {
        "1": 1, "0": 0,
        "true": 1, "false": 0,
        # Float-typed predictions stringify as "1.0"/"0.0".
        "1.0": 1, "0.0": 0,
    }
    raw = pd.Series(preds)
    mapped = raw.astype(str).str.strip().str.lower().map(label_map)
    unknown = mapped.isna()
    if unknown.any():
        # Report the offending values instead of an opaque NaN-to-int
        # casting error.
        bad = sorted(raw[unknown].astype(str).unique())
        raise ValueError(f"Unrecognised prediction value(s): {bad}")
    return mapped.astype(int).values

In [None]:
# Per-transition evaluation results: confusion matrix plus the raw label and
# prediction arrays, keyed by transition name.
results = {}

for transition, cfg in TRANSITIONS.items():
    print(f"\n{'='*50}")
    print(f"Transition: {transition}")
    print(f"{'='*50}")

    if "model_path" not in cfg:
        # Make a missing registry entry obvious instead of a bare
        # KeyError('model_path').
        raise KeyError(
            f"no model registered for transition {transition} in model_registry.json"
        )

    test = prepare_test_set(cfg["csv"])
    X_test = test.drop(columns=["label"])
    y_test = test["label"].values

    predictor = TabularPredictor.load(str(cfg["model_path"]), require_py_version_match=False)
    y_pred = normalize_preds(predictor.predict(X_test))

    # labels=[1, 0] puts the positive class first, so the matrix reads
    # [[TP, FN], [FP, TN]].
    cm = confusion_matrix(y_test, y_pred, labels=[1, 0])
    results[transition] = {"cm": cm, "y_test": y_test, "y_pred": y_pred}

    print(f"Test samples: {len(y_test)}")
    print("Confusion matrix (rows=actual, cols=predicted):")
    print(f"  TP={cm[0,0]}  FN={cm[0,1]}")
    print(f"  FP={cm[1,0]}  TN={cm[1,1]}")
    print()
    # Pin the label order explicitly: without `labels`, the report infers the
    # classes from the data and fails against two target_names when only one
    # class is present in y_test and y_pred.
    print(
        classification_report(
            y_test,
            y_pred,
            labels=[0, 1],
            target_names=["Negative", "Positive"],
        )
    )

In [None]:
# One heatmap per transition on a 2x2 grid, saved as a single PNG.
order = ["EI", "IE", "ZE", "EZ"]

fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle("Confusion Matrices — Transition Zones", fontsize=18, fontweight="bold", y=1.01)

# Tick labels used on both axes of every heatmap.
class_names = ["Positive", "Negative"]

# Styling shared by all four heatmaps.
heatmap_style = {
    "annot": True,
    "fmt": "d",
    "cmap": "Blues",
    "annot_kws": {"size": 16, "fontweight": "bold"},
    "linewidths": 0.5,
    "linecolor": "white",
}

for ax, transition in zip(axes.flat, order):
    cm = results[transition]["cm"]

    sns.heatmap(
        cm,
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
        **heatmap_style,
    )
    ax.set_title(f"Confusion Matrix - {transition}", fontsize=14, fontweight="bold")
    ax.set_ylabel("Actual", fontsize=12)
    ax.set_xlabel("Predicted", fontsize=12)

plt.tight_layout()
figure_path = project_root / "training/reports/confusion_matrices.png"
plt.savefig(figure_path, dpi=150, bbox_inches="tight")
plt.show()
print("Saved to training/reports/confusion_matrices.png")

In [None]:
# Build one summary row per transition from its confusion matrix, write the
# table to CSV, and display it.
rows = []
for transition in order:
    cm = results[transition]["cm"]
    # The matrix is unpacked row by row as (tp, fn) then (fp, tn).
    tp, fn, fp, tn = cm[0][0], cm[0][1], cm[1][0], cm[1][1]

    total = tp + tn + fp + fn
    predicted_pos = max(tp + fp, 1)  # floor of 1 avoids dividing by zero
    actual_pos = max(tp + fn, 1)

    accuracy = (tp + tn) / total
    precision = tp / predicted_pos
    recall = tp / actual_pos
    f1 = 2 * precision * recall / max(precision + recall, 1e-12)

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }
    row = {"transition": transition, "TP": tp, "FN": fn, "FP": fp, "TN": tn}
    # All metrics are stored rounded to four decimals.
    row.update({name: round(value, 4) for name, value in metrics.items()})
    rows.append(row)

summary_df = pd.DataFrame(rows)
summary_path = project_root / "training/reports/confusion_matrix_summary.csv"
summary_df.to_csv(summary_path, index=False)
summary_df