# 04b - Modelos de Regresión para Score de Riesgo Compuesto

**Objetivo:** Desarrollar y comparar modelos de regresión para predecir el `composite_risk_score` continuo
 
**Modalidades integradas:**
- Genética (APOE, biomarcadores genéticos)
- Biomarcadores (tau, ptau, ABETA)
- Neuroimagen (PET, datos de patología tau)
- Actividad/Sueño (patrones de actividad y sueño)
- Demografía (edad, género, factores de riesgo)

---

## Importar Librerías

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import lightgbm as lgb

# MLflow
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm
from mlflow.tracking import MlflowClient

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Utils
import joblib
import os
from datetime import datetime
import json

## Configuraciones

In [None]:
# MLflow configuration: every run of this notebook is grouped under one
# named experiment.
experiment_name = "alzheimer_regression_models"
mlflow.set_experiment(experiment_name)

for banner in (
    f"üî¨ Experimento MLflow: {experiment_name}",
    f"üìä Tracking URI: {mlflow.get_tracking_uri()}",
):
    print(banner)

# %%
# Plot styling: apply the matplotlib style sheet first, then set the seaborn
# palette on top of it (the order is kept exactly as before).
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [None]:
# Path configuration (relative to the working directory of the notebook).
DATA_PATH = "../data/processed/"
MODELS_PATH = "../models/regression/"
RESULTS_PATH = "../results/regression/"

# Create the two output directories up front; existing ones are left alone.
for output_dir in (MODELS_PATH, RESULTS_PATH):
    os.makedirs(output_dir, exist_ok=True)

## Cargar datos procesados

In [None]:
# Load the processed data
print("üìÅ Cargando dataset procesado...")

# The final dataset written by the feature-engineering step.
features_csv = f"{DATA_PATH}alzheimer_features_final.csv"
df = pd.read_csv(features_csv)

# One column is the target, so it is excluded from the feature count.
n_available_features = df.shape[1] - 1
print(f"üìä Dataset cargado: {df.shape}")
print(f"üéØ Target: composite_risk_score")
print(f"üìã Features disponibles: {n_available_features}")

## Análisis inicial del target

In [None]:
# Initial analysis of the target variable
print("üéØ AN√ÅLISIS DEL TARGET VARIABLE")
print("="*50)

target_col = 'composite_risk_score'
target_series = df[target_col]
target_stats = target_series.describe()

print(f"üìà Estad√≠sticas del {target_col}:")
for stat, value in target_stats.items():
    print(f"   {stat}: {value:.4f}")

# Counts of present/absent target values and the observed range.
n_valid = target_series.notna().sum()
n_missing = target_series.isna().sum()
target_min, target_max = target_series.min(), target_series.max()

print(f"\nüìä Distribuci√≥n del score:")
print(f"   Registros v√°lidos: {n_valid:,}")
print(f"   Registros faltantes: {n_missing:,}")
print(f"   Rango: [{target_min:.3f}, {target_max:.3f}]")

# %%
# Target distribution: 2x2 grid of diagnostic plots
from scipy import stats

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# All panels work on the target with missing values removed.
target_values = df[target_col].dropna()

# Histogram
hist_ax = axes[0, 0]
hist_ax.hist(target_values, bins=50, alpha=0.7, color='skyblue', edgecolor='black')
hist_ax.set_title('Distribuci√≥n del Composite Risk Score')
hist_ax.set_xlabel('Risk Score')
hist_ax.set_ylabel('Frecuencia')

# Box plot
box_ax = axes[0, 1]
box_ax.boxplot(target_values)
box_ax.set_title('Box Plot del Risk Score')
box_ax.set_ylabel('Risk Score')

# Q-Q plot against a normal distribution
qq_ax = axes[1, 0]
stats.probplot(target_values, dist="norm", plot=qq_ax)
qq_ax.set_title('Q-Q Plot (Normalidad)')

# Mean score per risk category; only drawn when that column is present,
# otherwise the fourth panel stays empty.
if 'risk_category' in df.columns:
    category_ax = axes[1, 1]
    risk_cats = df.groupby('risk_category')[target_col].mean().sort_values()
    category_ax.bar(risk_cats.index, risk_cats.values, alpha=0.7)
    category_ax.set_title('Risk Score promedio por Categor√≠a')
    category_ax.set_xlabel('Categor√≠a de Riesgo')
    category_ax.set_ylabel('Risk Score promedio')
    category_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(f"{RESULTS_PATH}target_distribution_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

## Preparación de datos para modelado

In [None]:
# Data preparation for modelling
print("üîß PREPARACI√ìN DE DATOS PARA MODELADO")
print("="*50)

# Every column except the target and `risk_category` is used as a feature.
non_feature_cols = {target_col, 'risk_category'}
feature_cols = [col for col in df.columns if col not in non_feature_cols]

# Keep only the rows whose target value is present.
valid_mask = df[target_col].notna()
X = df.loc[valid_mask, feature_cols].copy()
y = df.loc[valid_mask, target_col].copy()

n_samples, n_features = X.shape
print(f"üìä Datos finales para modelado:")
print(f"   Samples: {n_samples:,}")
print(f"   Features: {n_features:,}")
print(f"   Target v√°lido: {y.notna().sum():,}")

In [None]:
# Missing-value analysis of the feature matrix
missing_count = X.isnull().sum()
missing_analysis = pd.DataFrame({
    'feature': X.columns,
    'missing_count': missing_count,
    'missing_pct': (missing_count / len(X)) * 100,
})
# Worst-affected features first.
missing_analysis = missing_analysis.sort_values('missing_pct', ascending=False)

print(f"\nüìã Top 10 features con m√°s missing values:")
print(missing_analysis.head(10))

# %%
# Missing-value handling
print("üîß Manejo de valores faltantes...")

# Features are imputed with the per-column median.
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(X)

# With the default settings SimpleImputer discards columns that have no
# observed value at all (their fitted statistic is NaN), so the output can be
# narrower than X. Label the result with the surviving columns only; reusing
# X.columns would raise a shape-mismatch ValueError in that case.
empty_column_mask = np.isnan(imputer.statistics_)
kept_columns = X.columns[~empty_column_mask]
dropped_columns = X.columns[empty_column_mask]

X_imputed = pd.DataFrame(
    imputed_values,
    columns=kept_columns,
    index=X.index
)

if len(dropped_columns) > 0:
    # Make the silent column loss visible to whoever runs the notebook.
    print(f"   Columnas sin valores observados descartadas: {list(dropped_columns)}")

print(f"‚úÖ Missing values imputados")
print(f"   Missing antes: {X.isnull().sum().sum():,}")
print(f"   Missing despu√©s: {X_imputed.isnull().sum().sum():,}")


In [None]:
# Data split
print("üîÑ Divisi√≥n de datos...")

from sklearn.impute import SimpleImputer

quartile_labels = ['Q1', 'Q2', 'Q3', 'Q4']

# Stratify on the quartiles of the continuous target.
y_quartiles = pd.qcut(y, q=4, labels=quartile_labels)

# Split the raw feature matrix `X` rather than a copy imputed on the whole
# dataset: imputation statistics must be learned from the training rows only,
# otherwise information from validation/test rows leaks into training.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y_quartiles
)

# Second split: carve the validation set out of the remaining training rows.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=pd.qcut(y_train, q=4, labels=quartile_labels)
)

# Fit the median imputer on the training split and reuse it for the others.
imputer = SimpleImputer(strategy='median').fit(X_train)

# Columns without any observed value in the training split get a NaN
# statistic and are discarded by `transform`; keep only the surviving names
# so the rebuilt frames have matching shapes.
kept_columns = X_train.columns[~np.isnan(imputer.statistics_)]


def _impute_split(frame):
    """Impute `frame` with the train-fitted imputer, preserving its index."""
    return pd.DataFrame(
        imputer.transform(frame),
        columns=kept_columns,
        index=frame.index
    )


X_train = _impute_split(X_train)
X_val = _impute_split(X_val)
X_test = _impute_split(X_test)

print(f"üìä Divisi√≥n de datos completada:")
print(f"   Train: {X_train.shape[0]:,} samples")
print(f"   Validation: {X_val.shape[0]:,} samples")
print(f"   Test: {X_test.shape[0]:,} samples")

# Check the target distribution in each split
print(f"\nüìà Distribuci√≥n del target por conjunto:")
for name, target_set in [('Train', y_train), ('Validation', y_val), ('Test', y_test)]:
    print(f"   {name}: Œº={target_set.mean():.4f}, œÉ={target_set.std():.4f}")


In [None]:
# Feature scaling
print("‚öñÔ∏è Escalado de features...")

# RobustScaler is chosen to cope with outliers. It is fitted on the training
# split only; validation and test reuse the fitted parameters.
scaler = RobustScaler()
scaler.fit(X_train)


def _scaled_frame(frame):
    """Scale `frame` with the fitted scaler, keeping column names and index."""
    return pd.DataFrame(
        scaler.transform(frame),
        columns=frame.columns,
        index=frame.index
    )


X_train_scaled = _scaled_frame(X_train)
X_val_scaled = _scaled_frame(X_val)
X_test_scaled = _scaled_frame(X_test)

print("‚úÖ Escalado completado con RobustScaler")


## Definición de modelos de regresión

In [None]:
print("ü§ñ DEFINICI√ìN DE MODELOS DE REGRESI√ìN")
print("="*50)


def _model_entry(estimator, *, scaled, description):
    """Bundle an estimator with its scaled-input flag and display text."""
    return {'model': estimator, 'scaled': scaled, 'description': description}


# Hyperparameters shared by the two bagged tree ensembles.
_forest_kwargs = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

# Hyperparameters shared by the three boosting implementations.
_boosting_kwargs = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)

# Registry of candidate models with basic hyperparameters. The `scaled` flag
# selects whether the training loop feeds the model scaled or raw features.
# Insertion order is the order in which the models are trained.
models = {
    'linear_regression': _model_entry(
        LinearRegression(),
        scaled=True,
        description='Regresi√≥n Lineal cl√°sica',
    ),
    'ridge_regression': _model_entry(
        Ridge(alpha=1.0, random_state=42),
        scaled=True,
        description='Ridge Regression con regularizaci√≥n L2',
    ),
    'lasso_regression': _model_entry(
        Lasso(alpha=0.1, random_state=42, max_iter=2000),
        scaled=True,
        description='Lasso Regression con regularizaci√≥n L1',
    ),
    'elastic_net': _model_entry(
        ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42, max_iter=2000),
        scaled=True,
        description='Elastic Net con regularizaci√≥n L1 + L2',
    ),
    'random_forest': _model_entry(
        RandomForestRegressor(**_forest_kwargs),
        scaled=False,
        description='Random Forest con ensemble de √°rboles',
    ),
    'extra_trees': _model_entry(
        ExtraTreesRegressor(**_forest_kwargs),
        scaled=False,
        description='Extra Trees con randomizaci√≥n adicional',
    ),
    'gradient_boosting': _model_entry(
        GradientBoostingRegressor(**_boosting_kwargs),
        scaled=False,
        description='Gradient Boosting secuencial',
    ),
    'xgboost': _model_entry(
        xgb.XGBRegressor(n_jobs=-1, **_boosting_kwargs),
        scaled=False,
        description='XGBoost optimizado',
    ),
    'lightgbm': _model_entry(
        lgb.LGBMRegressor(n_jobs=-1, verbose=-1, **_boosting_kwargs),
        scaled=False,
        description='LightGBM r√°pido y eficiente',
    ),
    'svr': _model_entry(
        SVR(kernel='rbf', C=1.0, gamma='scale'),
        scaled=True,
        description='Support Vector Regression',
    ),
    'knn': _model_entry(
        KNeighborsRegressor(n_neighbors=5, weights='distance'),
        scaled=True,
        description='K-Nearest Neighbors',
    ),
    'mlp': _model_entry(
        MLPRegressor(
            hidden_layer_sizes=(100, 50),
            learning_rate_init=0.001,
            max_iter=500,
            random_state=42,
            early_stopping=True,
            validation_fraction=0.1,
        ),
        scaled=True,
        description='Multi-Layer Perceptron (Red Neuronal)',
    ),
}

print(f"üîß {len(models)} modelos definidos:")
for name, config in models.items():
    print(f"   ‚Ä¢ {name}: {config['description']}")

In [None]:
# Model evaluation helper
def evaluate_model(model, X_train, X_val, y_train, y_val, model_name):
    """Fit a regressor and score it on the training and validation data.

    Calls ``model.fit`` on the training data, predicts both sets and computes
    MSE, RMSE, MAE and R² for each. A 5-fold cross-validated R² obtained with
    ``cross_val_score`` on the training data is added as ``cv_r2_mean`` and
    ``cv_r2_std``.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training target.
        y_val: Validation target.
        model_name: Accepted for the caller's convenience; not used here.

    Returns:
        Tuple ``(metrics, y_train_pred, y_val_pred)`` where ``metrics`` maps
        names such as ``train_mse`` or ``val_r2`` to their values.
    """
    model.fit(X_train, y_train)

    targets = {'train': y_train, 'val': y_val}
    predictions = {
        'train': model.predict(X_train),
        'val': model.predict(X_val),
    }

    # Metric keys are inserted metric by metric, train before validation, so
    # the resulting column order of any table built from them stays stable.
    scorers = (
        ('mse', mean_squared_error),
        ('rmse', lambda truth, pred: np.sqrt(mean_squared_error(truth, pred))),
        ('mae', mean_absolute_error),
        ('r2', r2_score),
    )
    metrics = {}
    for suffix, scorer in scorers:
        for split in ('train', 'val'):
            metrics[f'{split}_{suffix}'] = scorer(targets[split], predictions[split])

    # Cross-validated R² on the training data only.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='r2', n_jobs=-1)
    metrics['cv_r2_mean'] = cv_scores.mean()
    metrics['cv_r2_std'] = cv_scores.std()

    return metrics, predictions['train'], predictions['val']

## Entrenamiento y evaluación de modelos

In [None]:
print("üöÄ ENTRENAMIENTO Y EVALUACI√ìN DE MODELOS")
print("="*50)

results = {}
all_predictions = {}

# MLflow flavour used to log each fitted model. Anything not listed here is
# logged through the generic scikit-learn flavour.
flavour_loggers = {
    'xgboost': mlflow.xgboost.log_model,
    'lightgbm': mlflow.lightgbm.log_model,
}

for model_name, config in models.items():
    print(f"\nüîÑ Entrenando {model_name}...")

    with mlflow.start_run(run_name=model_name):
        # Each model declares whether it is trained on scaled or raw features.
        if config['scaled']:
            X_train_model, X_val_model = X_train_scaled, X_val_scaled
        else:
            X_train_model, X_val_model = X_train, X_val

        try:
            # Record the run parameters before training so that a run whose
            # training fails still documents what was attempted.
            mlflow.log_params({
                "model_type": model_name,
                "scaled_data": config['scaled'],
                "description": config['description'],
                "n_features": X_train_model.shape[1],
                "n_train_samples": X_train_model.shape[0],
            })

            # Train and evaluate
            metrics, y_train_pred, y_val_pred = evaluate_model(
                config['model'], X_train_model, X_val_model,
                y_train, y_val, model_name
            )

            # Keep the results in memory for the comparison cells
            results[model_name] = metrics
            all_predictions[model_name] = {
                'train_pred': y_train_pred,
                'val_pred': y_val_pred
            }

            # Log metrics and the fitted model to MLflow
            mlflow.log_metrics(metrics)
            log_model = flavour_loggers.get(model_name, mlflow.sklearn.log_model)
            log_model(config['model'], f"model_{model_name}")

            # Save the fitted model locally as well
            joblib.dump(config['model'], f"{MODELS_PATH}{model_name}_model.pkl")

            mlflow.set_tag("run_status", "completed")

            print(f"   ‚úÖ {model_name} completado")
            print(f"      Val R¬≤: {metrics['val_r2']:.4f}")
            print(f"      Val RMSE: {metrics['val_rmse']:.4f}")

        except Exception as e:
            # Deliberately best-effort: one failing model must not abort the
            # remaining ones. Because the exception is handled inside the
            # `with` block, MLflow would otherwise close the run as if it had
            # succeeded; the tag makes failed runs easy to filter out.
            mlflow.set_tag("run_status", "failed")
            print(f"   ‚ùå Error en {model_name}: {str(e)}")

print(f"\nüéØ Entrenamiento completado para {len(results)} modelos")


In [None]:
# Results comparison
print("üìä COMPARACI√ìN DE RESULTADOS")
print("="*50)

# One row per model, one column per metric, best validation R² first.
results_df = pd.DataFrame(results).T.sort_values('val_r2', ascending=False)

print("üèÜ Ranking de modelos por R¬≤ de validaci√≥n:")
print("-" * 70)
for rank, entry in enumerate(results_df.itertuples(), 1):
    print(f"{rank:2d}. {entry.Index:20s} | R¬≤: {entry.val_r2:.4f} | RMSE: {entry.val_rmse:.4f} | MAE: {entry.val_mae:.4f}")


## Guardar resultados

In [None]:
# Persist the model comparison table (index = model name) as CSV.
comparison_csv = f"{RESULTS_PATH}regression_models_comparison.csv"
results_df.to_csv(comparison_csv)


## Visualización de comparación de modelos

In [None]:
# Model comparison dashboard (2x2 grid)
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

models_names = results_df.index[:10]  # at most the 10 best-ranked models
top_results = results_df.loc[models_names]

x = np.arange(len(models_names))
width = 0.35


def _train_vs_val_bars(ax, train_values, val_values, metric, ylabel, title):
    """Draw side-by-side train/validation bars for one metric."""
    ax.bar(x - width/2, train_values, width, label=f'Train {metric}', alpha=0.8)
    ax.bar(x + width/2, val_values, width, label=f'Validation {metric}', alpha=0.8)
    ax.set_xlabel('Modelos')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.set_xticks(x)
    ax.set_xticklabels(models_names, rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)


def _overfit_color(gap):
    """Map a train-minus-validation R² gap to a traffic-light colour."""
    if gap > 0.1:
        return 'red'
    if gap > 0.05:
        return 'orange'
    return 'green'


# R² comparison
train_r2, val_r2 = top_results['train_r2'], top_results['val_r2']
_train_vs_val_bars(axes[0, 0], train_r2, val_r2, 'R¬≤', 'R¬≤ Score', 'Comparaci√≥n R¬≤ Score')

# RMSE comparison
train_rmse, val_rmse = top_results['train_rmse'], top_results['val_rmse']
_train_vs_val_bars(axes[0, 1], train_rmse, val_rmse, 'RMSE', 'RMSE', 'Comparaci√≥n RMSE')

# Overfitting analysis: gap between train and validation R²
overfit_score = train_r2 - val_r2
colors = [_overfit_color(gap) for gap in overfit_score]

overfit_ax = axes[1, 0]
overfit_ax.bar(models_names, overfit_score, color=colors, alpha=0.7)
overfit_ax.set_xlabel('Modelos')
overfit_ax.set_ylabel('Train R¬≤ - Val R¬≤')
overfit_ax.set_title('An√°lisis de Overfitting')
overfit_ax.tick_params(axis='x', rotation=45)
overfit_ax.grid(True, alpha=0.3)
# Reference lines at the same thresholds used for the bar colours.
overfit_ax.axhline(y=0.05, color='orange', linestyle='--', alpha=0.5)
overfit_ax.axhline(y=0.1, color='red', linestyle='--', alpha=0.5)

# Cross-validation scores with their standard deviation as error bars
cv_means = top_results['cv_r2_mean']
cv_stds = top_results['cv_r2_std']

cv_ax = axes[1, 1]
cv_ax.bar(models_names, cv_means, yerr=cv_stds, capsize=5, alpha=0.7)
cv_ax.set_xlabel('Modelos')
cv_ax.set_ylabel('CV R¬≤ Score')
cv_ax.set_title('Cross-Validation R¬≤ (5-fold)')
cv_ax.tick_params(axis='x', rotation=45)
cv_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{RESULTS_PATH}models_comparison_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

## Análisis de predicciones del mejor modelo

In [None]:
# The comparison table is sorted by validation R², so its first row is the
# best model.
best_model_name = results_df.index[0]
best_predictions = all_predictions[best_model_name]

print(f"üèÜ AN√ÅLISIS DEL MEJOR MODELO: {best_model_name}")
print("="*50)

fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# Residuals = true value minus prediction, per split.
train_residuals = y_train - best_predictions['train_pred']
val_residuals = y_val - best_predictions['val_pred']

# One column of panels per split: (label, truth, prediction, residuals,
# extra marker styling). Training keeps the default colour.
split_views = (
    ('Training', y_train, best_predictions['train_pred'], train_residuals, {}),
    ('Validation', y_val, best_predictions['val_pred'], val_residuals, {'color': 'orange'}),
)

for column, (split_label, y_true, y_pred, residuals, marker_style) in enumerate(split_views):
    # Top row: true vs. predicted, with the identity line as reference.
    scatter_ax = axes[0, column]
    scatter_ax.scatter(y_true, y_pred, alpha=0.6, s=20, **marker_style)
    scatter_ax.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()], 'r--', lw=2)
    scatter_ax.set_xlabel('Valores Reales')
    scatter_ax.set_ylabel('Valores Predichos')
    scatter_ax.set_title(f'{best_model_name} - {split_label} Set')
    scatter_ax.grid(True, alpha=0.3)

    # Bottom row: residuals against predictions, with a zero reference line.
    residual_ax = axes[1, column]
    residual_ax.scatter(y_pred, residuals, alpha=0.6, s=20, **marker_style)
    residual_ax.axhline(y=0, color='r', linestyle='--')
    residual_ax.set_xlabel('Valores Predichos')
    residual_ax.set_ylabel('Residuales')
    residual_ax.set_title(f'Residuales - {split_label} Set')
    residual_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{RESULTS_PATH}best_model_predictions_analysis.png", dpi=300, bbox_inches='tight')
plt.show()

# %%
# Feature importance of the best model; only runs when the fitted estimator
# exposes `feature_importances_`.
best_config = models[best_model_name]
best_estimator = best_config['model']

if hasattr(best_estimator, 'feature_importances_'):
    print(f"üåü FEATURE IMPORTANCE - {best_model_name}")
    print("="*50)

    # Column names come from whichever feature matrix the model was trained on.
    training_frame = X_train_scaled if best_config['scaled'] else X_train
    feature_names = training_frame.columns
    importances = best_estimator.feature_importances_

    # Importance table, most important feature first
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    })
    feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

    # The 20 most important features
    top_features = feature_importance_df.head(20)

    plt.figure(figsize=(12, 8))
    sns.barplot(data=top_features, x='importance', y='feature', palette='viridis')
    plt.title(f'Top 20 Features m√°s importantes - {best_model_name}')
    plt.xlabel('Importancia')
    plt.tight_layout()
    plt.savefig(f"{RESULTS_PATH}feature_importance_{best_model_name}.png", dpi=300, bbox_inches='tight')
    plt.show()

    # Save the full importance table
    feature_importance_df.to_csv(f"{RESULTS_PATH}feature_importance_{best_model_name}.csv", index=False)

    print("üîù Top 10 features m√°s importantes:")
    top_ten = top_features.head(10)
    for rank, (feature, importance) in enumerate(zip(top_ten['feature'], top_ten['importance']), 1):
        print(f"   {rank:2d}. {feature:30s} | {importance:.4f}")



## Guardado de objetos importantes

In [None]:
print("üíæ GUARDADO DE RESULTADOS")
print("="*50)

# Persist the fitted preprocessing objects. The models were trained on
# imputed (and, for some, scaled) features, so both the imputer and the
# scaler are needed to reproduce the preprocessing later.
joblib.dump(scaler, f"{MODELS_PATH}scaler.pkl")
joblib.dump(imputer, f"{MODELS_PATH}imputer.pkl")

# Save the split configuration
split_info = {
    'train_indices': X_train.index.tolist(),
    'val_indices': X_val.index.tolist(),
    'test_indices': X_test.index.tolist(),
    'feature_columns': X_train.columns.tolist(),
    'target_column': target_col,
    'best_model': best_model_name
}

with open(f"{RESULTS_PATH}split_configuration.json", 'w') as f:
    json.dump(split_info, f, indent=2)

# Save the predictions. The frame is indexed by the union of the training and
# validation rows; each column is NaN on the rows of the other split.
prediction_columns = {
    'y_train_true': y_train,
    'y_val_true': y_val
}

# Predictions are plain arrays in the row order of their split. Wrap each one
# in a Series carrying that split's index so pandas aligns it with the right
# rows; assigning the bare array to the combined train+validation frame
# raises a length-mismatch ValueError.
for model_name, preds in all_predictions.items():
    prediction_columns[f'{model_name}_train_pred'] = pd.Series(
        np.ravel(preds['train_pred']), index=y_train.index
    )
    prediction_columns[f'{model_name}_val_pred'] = pd.Series(
        np.ravel(preds['val_pred']), index=y_val.index
    )

predictions_df = pd.DataFrame(prediction_columns)
predictions_df.to_csv(f"{RESULTS_PATH}all_predictions.csv")

print("‚úÖ Resultados guardados:")
print(f"   ‚Ä¢ Modelos: {MODELS_PATH}")
print(f"   ‚Ä¢ Resultados: {RESULTS_PATH}")
print(f"   ‚Ä¢ Configuraci√≥n de splits: split_configuration.json")
print(f"   ‚Ä¢ Predicciones: all_predictions.csv")


## Resumen final

In [None]:
# Final summary
print("üéØ RESUMEN FINAL DEL DESARROLLO DE MODELOS DE REGRESI√ìN")
print("="*60)

n_records = len(X)
n_evaluated = len(results)

print(f"üìä Dataset procesado:")
print(f"   ‚Ä¢ Registros totales: {n_records:,}")
print(f"   ‚Ä¢ Features utilizadas: {X.shape[1]:,}")
print(f"   ‚Ä¢ Target: {target_col}")
print(f"   ‚Ä¢ Rango del target: [{y.min():.3f}, {y.max():.3f}]")

# Size of each split and its share of all modelled records.
print(f"\nüîÑ Divisi√≥n de datos:")
for split_label, split_frame in (('Training', X_train), ('Validation', X_val), ('Test', X_test)):
    split_size = len(split_frame)
    print(f"   ‚Ä¢ {split_label}: {split_size:,} samples ({split_size/n_records*100:.1f}%)")

# Top five models of the (already sorted) comparison table.
print(f"\nü§ñ Modelos evaluados: {n_evaluated}")
for rank, entry in enumerate(results_df.head(5).itertuples(), 1):
    print(f"   {rank}. {entry.Index:20s} | R¬≤: {entry.val_r2:.4f} | RMSE: {entry.val_rmse:.4f}")

print(f"\nüèÜ Mejor modelo: {best_model_name}")
best_metrics = results_df.loc[best_model_name]
print(f"   ‚Ä¢ R¬≤ Validaci√≥n: {best_metrics['val_r2']:.4f}")
print(f"   ‚Ä¢ RMSE Validaci√≥n: {best_metrics['val_rmse']:.4f}")
print(f"   ‚Ä¢ MAE Validaci√≥n: {best_metrics['val_mae']:.4f}")
print(f"   ‚Ä¢ CV R¬≤ (5-fold): {best_metrics['cv_r2_mean']:.4f} ¬± {best_metrics['cv_r2_std']:.4f}")

print(f"\nüíæ Archivos generados:")
for artifact_line in (
    f"   ‚Ä¢ Modelos entrenados: {n_evaluated} archivos .pkl",
    f"   ‚Ä¢ Comparaci√≥n de resultados: regression_models_comparison.csv",
    f"   ‚Ä¢ Predicciones completas: all_predictions.csv",
    f"   ‚Ä¢ Configuraci√≥n de splits: split_configuration.json",
    f"   ‚Ä¢ Visualizaciones: 4+ gr√°ficos PNG",
):
    print(artifact_line)

print(f"\nüî¨ MLflow:")
print(f"   ‚Ä¢ Experimento: {experiment_name}")
print(f"   ‚Ä¢ Runs registrados: {n_evaluated}")
print(f"   ‚Ä¢ Tracking URI: {mlflow.get_tracking_uri()}")

# Suggested follow-up work, printed as a numbered list.
next_steps = (
    "Evaluaci√≥n detallada en conjunto de test",
    "Optimizaci√≥n de hiperpar√°metros del mejor modelo",
    "An√°lisis de feature importance y selecci√≥n",
    "Ensemble methods con top modelos",
    "An√°lisis de explicabilidad (SHAP/LIME)",
)
print(f"\nüìà Pr√≥ximos pasos sugeridos:")
for step_number, step in enumerate(next_steps, 1):
    print(f"   {step_number}. {step}")

print(f"\n‚úÖ NOTEBOOK 04b_regression_models.ipynb COMPLETADO")
print("="*60)

---

__Abraham Tartalos__