# Evaluación de métricas adicionales del mejor modelo AutoML

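Este notebook lee el resumen de ejecución más reciente guardado en `data/results/automl/*_run_summary.json`, reentrena el mejor modelo con los hiperparámetros registrados y calcula métricas adicionales con validación cruzada de 5 particiones ($R^2$, MAE, MSE, RMSE).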


In [1]:
from __future__ import annotations

import json
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import KFold


In [2]:
# Resolve the project root whether the notebook is run from /notebooks or from the repository root:
# if the current directory has no `src` folder, its parent is used instead.
cwd = Path.cwd().resolve()
project_root = cwd if (cwd / 'src').exists() else cwd.parent

# Put the project root on sys.path so the `src` package can be imported below.
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src import models

results_dir = project_root / 'data' / 'results' / 'automl'
processed_dir = project_root / 'data' / 'processed'

# Single source of truth for the summary file-name suffix (used by the glob and by the stem extraction).
SUMMARY_SUFFIX = '_run_summary.json'

# Run summaries ordered by modification time, newest first.
summary_candidates = sorted(
    results_dir.glob(f'*{SUMMARY_SUFFIX}'),
    key=lambda p: p.stat().st_mtime,
    reverse=True,
)

if not summary_candidates:
    raise FileNotFoundError(f'No se encontraron run summaries en {results_dir}')

# The most recently modified summary is the one evaluated.
summary_path = summary_candidates[0]
# Strip only the trailing suffix: str.replace would also delete the marker if it
# happened to appear earlier in the file name, yielding a wrong dataset stem.
dataset_stem = summary_path.name.removesuffix(SUMMARY_SUFFIX)
# The processed dataset is expected to share the summary's stem.
dataset_path = processed_dir / f'{dataset_stem}.csv'

if not dataset_path.exists():
    raise FileNotFoundError(f'No se encontro el dataset esperado: {dataset_path}')

print(f'Summary usado: {summary_path.name}')
print(f'Dataset usado: {dataset_path.name}')


Summary usado: StabIndex_run_summary.json
Dataset usado: StabIndex.csv


In [3]:
# Read the AutoML run summary selected in the previous cell.
with summary_path.open('r', encoding='utf-8') as f:
    run_summary = json.load(f)

# Normalise the stored name so it matches the lower-case keys of model_map.
best_model_name = run_summary['best_model'].strip().lower()
# `.get('params', {})` would return None for a summary containing `"params": null`,
# and None cannot be unpacked with ** when the model is instantiated; fall back to
# an empty dict so the model is built with its defaults in that case.
best_model_params = run_summary.get('params') or {}

# Supported model names mapped to their classes in src.models.
model_map = {
    'ridge': models.RidgeRegressor,
    'knn': models.KNNRegressor,
    'randomforest': models.RandomForestRegressorSA,
    'mlp': models.MLP,
}

if best_model_name not in model_map:
    raise ValueError(f"Modelo no soportado: {best_model_name}")

print('Best model:', best_model_name)
print('Params:', best_model_params)


Best model: mlp
Params: {'hidden_layer_sizes': [224, 240, 224], 'activation': 'relu', 'alpha': 5.700325715527988e-05, 'learning_rate_init': 0.008301071071690645}


In [4]:
# Load the dataset; the last column is taken as the target and every other column as a feature.
df = pd.read_csv(dataset_path)
target_col = df.columns[-1]
y = df[target_col]
X = df.drop(columns=[target_col])

model_cls = model_map[best_model_name]
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Metric name -> scoring function, listed in the column order of the resulting table.
metric_fns = {
    'r2': r2_score,
    'mae': mean_absolute_error,
    'mse': mean_squared_error,
    'rmse': root_mean_squared_error,
}

rows = []
for fold_idx, (train_idx, val_idx) in enumerate(cv.split(X, y), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # A new model instance is built for every fold from the stored hyperparameters.
    model = model_cls(**best_model_params)
    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    # One record per fold: the 1-based fold number followed by each metric as a plain float.
    fold_metrics = {name: float(score(y_val, pred)) for name, score in metric_fns.items()}
    rows.append({'fold': fold_idx, **fold_metrics})

metrics_df = pd.DataFrame(rows)
metrics_df


Unnamed: 0,fold,r2,mae,mse,rmse
0,1,0.859105,0.282492,0.152918,0.391048
1,2,0.872253,0.261248,0.112643,0.335623
2,3,0.875423,0.272439,0.125786,0.354664
3,4,0.878192,0.273356,0.130269,0.360928
4,5,0.868308,0.269708,0.125415,0.35414


In [5]:
# Aggregate the per-fold metrics into one row per metric with its mean and
# standard deviation across folds (pandas' default `std`).
metric_columns = ['r2', 'mae', 'mse', 'rmse']
fold_stats = metrics_df.loc[:, metric_columns].agg(['mean', 'std'])
summary_df = fold_stats.transpose().set_axis(['mean', 'std'], axis='columns')
summary_df


Unnamed: 0,mean,std
r2,0.870656,0.007433
mae,0.271848,0.00763
mse,0.129406,0.014694
rmse,0.359281,0.02012
