# 05 - Inference Smoke Test
Este notebook valida que los modelos campeones (EI, IE, ZE, EZ) cargan correctamente desde `training/reports/model_registry.json` y pueden inferir sobre muestras reales.

Checks principales:

- Carga de `TabularPredictor` por transición.
- `predict` devuelve el mismo número de filas que la entrada.
- `predict_proba` devuelve probabilidades válidas en [0, 1].
- Exporta un reporte consolidado a `training/reports/inference_smoke_test.csv`.

In [51]:
import json
import pandas as pd
from pathlib import Path
from autogluon.tabular import TabularPredictor


# All paths are resolved from the project root, taken here as two levels
# above the current working directory.
project_root = Path.cwd().parent.parent
registry_path = project_root / "training/reports/model_registry.json"
report_path = project_root / "training/reports/inference_smoke_test.csv"

# Dataset (relative to project_root) used to exercise each transition's model.
data_path_map = {
    "EI": "data/data_ei.csv",
    "IE": "data/data_ie.csv",
    "ZE": "data/data_ze.csv",
    "EZ": "data/data_ez.csv",
}

# JSON is UTF-8 by specification; pass the encoding explicitly so the
# platform's locale encoding is never used to decode the registry.
with registry_path.open(encoding="utf-8") as f:
    registry = json.load(f)

models_cfg = registry.get("models", {})

if not models_cfg:
    raise ValueError("No models found in the registry.")

# Transitions we expect to find in the registry; a gap is reported, not fatal.
expected = {"EI", "IE", "ZE", "EZ"}
missing = expected - set(models_cfg.keys())
if missing:
    print(f"Warning: missing transitions in registry: {sorted(missing)}")

# A registry entry without a dataset mapping cannot be sampled later on,
# so surface it here instead of letting it show up as a bare KeyError.
unmapped = set(models_cfg.keys()) - set(data_path_map)
if unmapped:
    print(f"Warning: registry transitions without a dataset mapping: {sorted(unmapped)}")

# Tabular overview of what the registry declares for each transition.
registry_df = pd.DataFrame(
    [
        {
            "transition": transition,
            "model_path": cfg.get("path"),
            "registry_f1": cfg.get("f1"),
        }
        for transition, cfg in sorted(models_cfg.items())
    ]
)
project_root, registry_path, report_path




(Path('/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon'),
 Path('/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/reports/model_registry.json'),
 Path('/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/reports/inference_smoke_test.csv'))

In [31]:
def normalize_binary(preds):
    """Convert binary labels or predictions into a 0/1 integer Series.

    Each value is stringified, stripped and lower-cased, then looked up in a
    small token table ("1"/"0", "true"/"false", "yes"/"no"). Values that are
    not in the table are parsed as numbers and accepted only when they are
    exactly 0 or 1 (so ``1.0`` and ``"0.0"`` are fine, ``2`` or ``0.7`` are
    not).

    Args:
        preds: Any sequence accepted by ``pd.Series``.

    Returns:
        An integer ``pd.Series`` of 0/1 values with a fresh ``RangeIndex``.

    Raises:
        ValueError: If any value cannot be interpreted as 0 or 1.
    """
    token_to_bit = {
        "1": 1,
        "0": 0,
        "true": 1,
        "false": 0,
        "yes": 1,
        "no": 0,
    }

    # Reset the index first so the per-element fill below aligns on unique labels.
    raw = pd.Series(preds).reset_index(drop=True)
    tokens = raw.astype(str).str.strip().str.lower()
    mapped = tokens.map(token_to_bit)

    unresolved = mapped.isna()
    if unresolved.any():
        numeric = pd.to_numeric(tokens[unresolved], errors="coerce")
        # Only exact 0/1 is binary. Casting arbitrary numbers to int would
        # silently truncate 0.7 to 0 and let values such as 2 through.
        if not numeric.isin([0, 1]).all():
            raise ValueError("Could not normalize binary values to 0/1")
        mapped = mapped.fillna(numeric)

    return mapped.astype(int).reset_index(drop=True)


def get_positive_proba(proba_output):
    """Extract the positive-class probability column as a float Series.

    Resolution order for a DataFrame input:
      1. the first column whose label is one of the known positive-class names;
      2. otherwise, the second column when there are exactly two columns;
      3. otherwise, the row-wise maximum across all columns.

    The returned Series is numeric (unparseable entries become NaN), cast to
    float, and carries a fresh ``RangeIndex``.

    Raises:
        TypeError: If ``proba_output`` is not a ``pd.DataFrame``.
    """
    if not isinstance(proba_output, pd.DataFrame):
        raise TypeError(f"Unsupported probability output type: {type(proba_output)}")

    def _as_float_series(values):
        # Shared post-processing applied to whichever column gets selected.
        return pd.to_numeric(values, errors="coerce").astype(float).reset_index(drop=True)

    positive_labels = (1, "1", True, "True", "true", "positive", "pos", "label_1")
    known = [label for label in positive_labels if label in proba_output.columns]
    if known:
        return _as_float_series(proba_output[known[0]])

    n_columns = proba_output.shape[1]
    if n_columns == 2:
        return _as_float_series(proba_output.iloc[:, 1])

    return _as_float_series(proba_output.max(axis=1))

def sample_transition_dataframe(transition, sample_size=300, random_state=42):
    """Load a transition's CSV and return a (possibly sampled) feature/label pair.

    The CSV location comes from ``data_path_map[transition]`` resolved against
    ``project_root``. Feature columns are those whose name starts with "B";
    labels are read from the "label" column and normalized with
    ``normalize_binary``.

    Args:
        transition: Key into ``data_path_map``.
        sample_size: Maximum number of rows to return; larger datasets are
            randomly sampled down to this size.
        random_state: Seed passed to ``DataFrame.sample``.

    Returns:
        Tuple ``(X, y, csv_path)`` where ``X`` and ``y`` share a fresh
        ``RangeIndex``.

    Raises:
        FileNotFoundError: If the CSV does not exist.
        ValueError: If the CSV has no column starting with "B".
    """
    csv_path = project_root / data_path_map[transition]
    if not csv_path.exists():
        raise FileNotFoundError(f"CSV for {transition} not found at {csv_path}")

    frame = pd.read_csv(csv_path)
    feature_cols = [name for name in frame.columns if name.startswith("B")]
    if len(feature_cols) == 0:
        raise ValueError(f"No feature columns (B*) found in {csv_path}")

    features = frame[feature_cols].copy()
    labels = normalize_binary(frame["label"])

    # Small datasets are returned whole; only the index is renumbered.
    if len(features) <= sample_size:
        return features.reset_index(drop=True), labels.reset_index(drop=True), csv_path

    # Sample the features once and reuse the drawn index to pick matching labels.
    picked = features.sample(n=sample_size, random_state=random_state)
    X = picked.reset_index(drop=True)
    y = labels.loc[picked.index].reset_index(drop=True)
    return X, y, csv_path

def compute_binary_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary predictions.

    Both inputs are passed through ``normalize_binary`` before counting the
    confusion-matrix cells. Denominators are floored (at 1 for the count-based
    ratios, at 1e-12 for F1) so an empty class yields 0.0 rather than a
    division error.

    Returns:
        Dict with float values under "accuracy", "precision", "recall", "f1".
    """
    truth = normalize_binary(y_true)
    guess = normalize_binary(y_pred)

    guess_pos, guess_neg = guess == 1, guess == 0
    truth_pos, truth_neg = truth == 1, truth == 0

    # Confusion-matrix cells.
    tp = int((guess_pos & truth_pos).sum())
    tn = int((guess_neg & truth_neg).sum())
    fp = int((guess_pos & truth_neg).sum())
    fn = int((guess_neg & truth_pos).sum())

    total = tp + tn + fp + fn
    accuracy = (tp + tn) / max(total, 1)
    precision = tp / max(tp + fp, 1)
    recall = tp / max(tp + fn, 1)
    f1 = (2 * precision * recall) / max(precision + recall, 1e-12)

    scores = {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}
    return {name: float(value) for name, value in scores.items()}


In [35]:
# Column order of the consolidated smoke-test report.
RESULT_COLUMNS = [
    "transition",
    "model_path",
    "data_csv",
    "n_rows",
    "proba_min",
    "proba_max",
    "proba_mean",
    "accuracy",
    "precision",
    "recall",
    "f1_smoke",
    "error",
]

rows = []

for transition, cfg in sorted(models_cfg.items()):

    model_path_raw = cfg.get("path")
    if not model_path_raw:
        raise ValueError(f"Model path missing for transition '{transition}'")

    model_path = Path(model_path_raw)
    if not model_path.is_absolute():
        model_path = project_root / model_path

    # Register the row up front so every transition produces exactly one
    # report line, whether it succeeds or fails at any later stage.
    row = dict.fromkeys(RESULT_COLUMNS)
    row.update({"transition": transition, "model_path": str(model_path), "error": ""})
    rows.append(row)

    # `stage` names the step in progress so a failure is attributed correctly.
    stage = "loading model"
    try:
        predictor = TabularPredictor.load(str(model_path))

        stage = "sampling data"
        X_test, y_test, csv_path = sample_transition_dataframe(transition)
        row["data_csv"] = str(csv_path)
        row["n_rows"] = int(len(X_test))

        stage = "during prediction"
        y_pred = normalize_binary(predictor.predict(X_test, as_pandas=True))
        if len(y_pred) != len(X_test):
            raise ValueError(
                f"predict returned {len(y_pred)} rows for {len(X_test)} input rows"
            )

        stage = "during probability prediction"
        y_score = get_positive_proba(predictor.predict_proba(X_test, as_pandas=True))
        if len(y_score) != len(X_test):
            raise ValueError(
                f"predict_proba returned {len(y_score)} rows for {len(X_test)} input rows"
            )
        # `between` is False for NaN, so missing probabilities fail this check too.
        if not y_score.between(0.0, 1.0).all():
            raise ValueError("predict_proba returned values outside [0, 1]")

        stage = "computing metrics"
        metrics = compute_binary_metrics(y_test, y_pred)
    except Exception as e:
        # Record the failure on this transition's own row and move on; never
        # reuse values left over from a previous iteration.
        print(f"Error {stage} for {transition}: {e}")
        row["error"] = f"{stage}: {e!r}"
        continue

    row.update(
        {
            "proba_min": float(y_score.min()),
            "proba_max": float(y_score.max()),
            "proba_mean": float(y_score.mean()),
            "accuracy": metrics["accuracy"],
            "precision": metrics["precision"],
            "recall": metrics["recall"],
            "f1_smoke": metrics["f1"],
        }
    )

results_df = (
    pd.DataFrame(rows, columns=RESULT_COLUMNS)
    .sort_values("transition")
    .reset_index(drop=True)
)
results_df

    

Unnamed: 0,transition,model_path,data_csv,n_rows,proba_min,proba_max,proba_mean,accuracy,precision,recall,f1_smoke,error
0,EI,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ei,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/data/data_ei.csv,300,1.206287e-06,1.0,0.388918,0.973333,0.965517,0.965517,0.965517,
1,EZ,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ez,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/data/data_ez.csv,300,0.04923027,0.783399,0.513558,0.74,0.666667,0.980392,0.793651,
2,IE,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ie,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/data/data_ie.csv,300,5.657446e-07,1.0,0.381892,0.99,0.991304,0.982759,0.987013,
3,ZE,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/models/autogluon_ze,/mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/data/data_ze.csv,300,0.1469623,0.717682,0.477071,0.723333,0.710843,0.771242,0.739812,


In [52]:
# Ensure the destination folder exists, then write the report without the index column.
report_dir = report_path.parent
report_dir.mkdir(parents=True, exist_ok=True)
results_df.to_csv(report_path, index=False)
print(f"Smoke test report saved to: {report_path}")

Smoke test report saved to: /mnt/c/Users/Carlos/OneDrive/Documentos/Genome-Transition_AutoGluon/training/reports/inference_smoke_test.csv
