# 04b - Regression Models for Risk Score Prediction

**Objetivo**: Desarrollar modelos de regresión para predecir el `composite_risk_score` continuo
 
**Modelos a desarrollar**:
- Linear Regression (baseline)
- Random Forest Regressor
- XGBoost Regressor
- Support Vector Regressor
- Neural Network Regressor

---

## Importar Librerías

In [8]:
import json
import os
import sys
import warnings
from datetime import datetime

warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import seaborn as sns

# Make the project's modeling modules importable before the local imports below.
sys.path.append('../src/modeling')

# Import the regression pipeline and shared modeling helpers
from regression_pipeline import RegressionPipeline
from model_utils import load_processed_data, save_model_artifacts

## Configuraciones

In [15]:
# Plot styling: seaborn-flavoured matplotlib style sheet plus the "husl" colour palette.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Confirm the setup succeeded and stamp the output with the execution time.
print("‚úÖ Librer√≠as y m√≥dulos importados correctamente")
_run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print(f"üìÖ Fecha de ejecuci√≥n: {_run_timestamp}")

‚úÖ Librer√≠as y m√≥dulos importados correctamente
üìÖ Fecha de ejecuci√≥n: 2025-06-15 08:20:51


In [10]:
# Select the MLflow experiment that groups every run logged from this notebook.
# The recorded log output shows MLflow creating the experiment when it does not exist yet.
mlflow.set_experiment("alzheimer_multimodal_monitoring")

print(
    "üîß MLflow configurado para modelos de regresi√≥n\n"
    "üéØ Target: composite_risk_score (regresi√≥n continua)"
)


2025/06/14 11:49:21 INFO mlflow.tracking.fluent: Experiment with name 'alzheimer_multimodal_monitoring' does not exist. Creating a new experiment.


üîß MLflow configurado para modelos de regresi√≥n
üéØ Target: composite_risk_score (regresi√≥n continua)


In [16]:
# Look up the experiment selected above and show its id and artifact location.
experiment = mlflow.get_experiment_by_name("alzheimer_multimodal_monitoring")

# get_experiment_by_name returns None for an unknown name; fail with a clear message
# instead of an AttributeError on the attribute accesses below.
if experiment is None:
    raise RuntimeError(
        "Experimento MLflow 'alzheimer_multimodal_monitoring' no encontrado; "
        "ejecuta primero la celda con mlflow.set_experiment(...)"
    )

print(f"ID del experimento: {experiment.experiment_id}")
print(f"Ubicaci√≥n: {experiment.artifact_location}")

ID del experimento: 503314857977141831
Ubicaci√≥n: file:///E:/usuarios/alumno/Escritorio/Alzheimer-Multimodal-Monitoring/notebooks/mlruns/503314857977141831


## Cargar datos procesados

In [17]:
# Load the selected-features dataset and report basic statistics of the regression target.
# Only the file read is guarded, so unrelated errors (e.g. a missing column) are not
# mistaken for a missing file.
try:
    df = pd.read_csv('../data/processed/features/alzheimer_features_selected_20250605.csv')
except FileNotFoundError:
    print("‚ùå Error: Archivo no encontrado")
    print("üí° Ejecuta primero el notebook de feature engineering")
    # Re-raise after the hint: every later cell needs `df`, so continuing would only
    # move the failure to a less informative NameError further down.
    raise

print(f"üìä Dataset cargado: {df.shape}")
print(f"üìà Registros con score v√°lido: {df['composite_risk_score'].notna().sum()}")

# Summary statistics of the target column
target_stats = df['composite_risk_score'].describe()
print(f"\nüìä ESTAD√çSTICAS DEL TARGET:")
print(f"   ‚Ä¢ Media: {target_stats['mean']:.3f}")
print(f"   ‚Ä¢ Desviaci√≥n est√°ndar: {target_stats['std']:.3f}")
print(f"   ‚Ä¢ Rango: [{target_stats['min']:.3f}, {target_stats['max']:.3f}]")

üìä Dataset cargado: (48466, 189)
üìà Registros con score v√°lido: 48466

üìä ESTAD√çSTICAS DEL TARGET:
   ‚Ä¢ Media: 0.367
   ‚Ä¢ Desviaci√≥n est√°ndar: 0.213
   ‚Ä¢ Rango: [0.000, 0.929]


In [14]:
# Initialise the regression pipeline with the target column, hold-out fraction and seed.
# NOTE(review): the output recorded for this cell is
#   TypeError: RegressionPipeline.__init__() got an unexpected keyword argument 'target_column'
# so at least `target_column` does not match the constructor in regression_pipeline.
# TODO confirm the real signature and update this call; every later cell depends on
# `regression_pipeline` being created here.
regression_pipeline = RegressionPipeline(
    target_column='composite_risk_score',
    test_size=0.2,
    random_state=42
)

# Echo the configuration back; this reads `target_column` and `test_size` as attributes
# of the pipeline object (presumably set by the constructor; verify against the class).
print("üîß Pipeline de regresi√≥n inicializado")
print(f"   ‚Ä¢ Target: {regression_pipeline.target_column}")
print(f"   ‚Ä¢ Test size: {regression_pipeline.test_size}")

TypeError: RegressionPipeline.__init__() got an unexpected keyword argument 'target_column'

## Preparación de datos para modelado

In [None]:
# Let the pipeline split the dataset into train/test features and targets.
X_train, X_test, y_train, y_test = regression_pipeline.prepare_data(df)

# Report the shape of each split and the number of feature columns.
print(f"üìä DIVISI√ìN DE DATOS:")
print(f"   ‚Ä¢ Training set: {X_train.shape}")
print(f"   ‚Ä¢ Test set: {X_test.shape}")
print(f"   ‚Ä¢ Features utilizadas: {X_train.shape[1]}")

# Report the target range per split so a skewed split is easy to spot.
for split_name, split_target in (("Train", y_train), ("Test", y_test)):
    print(f"   ‚Ä¢ Target range - {split_name}: [{split_target.min():.3f}, {split_target.max():.3f}]")


## Definición de modelos

In [None]:
def _train_and_report(run_name, start_message, display_name, train_fn, **hyperparams):
    """Train one regressor inside its own MLflow run and print its test metrics.

    Replaces five copy-pasted training sections that differed only in the run name,
    the messages, the pipeline method and the hyperparameters.

    Args:
        run_name: Name given to the MLflow run.
        start_message: Line printed before training starts.
        display_name: Model name used in the completion message.
        train_fn: Pipeline training method; called as
            ``train_fn(X_train, y_train, X_test, y_test, **hyperparams)``.
        **hyperparams: Keyword arguments forwarded unchanged to ``train_fn``.

    Returns:
        Whatever ``train_fn`` returns. It is indexed here with the keys
        ``'r2_score'``, ``'rmse'`` and ``'mae'``.
    """
    # Training and reporting both happen inside the run context, as in the original cells.
    with mlflow.start_run(run_name=run_name):
        print(start_message)

        results = train_fn(X_train, y_train, X_test, y_test, **hyperparams)

        print(f"‚úÖ {display_name} completado")
        print(f"   ‚Ä¢ R¬≤ Score: {results['r2_score']:.4f}")
        print(f"   ‚Ä¢ RMSE: {results['rmse']:.4f}")
        print(f"   ‚Ä¢ MAE: {results['mae']:.4f}")

    return results


# %%
# 1. Linear Regression (baseline)
lr_results = _train_and_report(
    "linear_regression_baseline",
    "\nüîπ Entrenando Linear Regression...",
    "Linear Regression",
    regression_pipeline.train_linear_regression,
)

# %%
# 2. Random Forest Regressor
rf_results = _train_and_report(
    "random_forest_regressor",
    "\nüå≤ Entrenando Random Forest...",
    "Random Forest",
    regression_pipeline.train_random_forest,
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
)

# %%
# 3. XGBoost Regressor
xgb_results = _train_and_report(
    "xgboost_regressor",
    "\nüöÄ Entrenando XGBoost...",
    "XGBoost",
    regression_pipeline.train_xgboost,
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
)

# %%
# 4. Support Vector Regressor
svr_results = _train_and_report(
    "svr_regressor",
    "\n‚ö° Entrenando Support Vector Regressor...",
    "SVR",
    regression_pipeline.train_svr,
    kernel='rbf',
    C=1.0,
    gamma='scale',
)

# %%
# 5. Neural Network Regressor
nn_results = _train_and_report(
    "neural_network_regressor",
    "\nüß† Entrenando Neural Network...",
    "Neural Network",
    regression_pipeline.train_neural_network,
    hidden_layer_sizes=(100, 50),
    max_iter=500,
    learning_rate_init=0.001,
)

## Comparación de modelos

In [None]:
# Collect each trained model's result dictionary under its display name.
models_comparison = {
    'Linear Regression': lr_results,
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
    'SVR': svr_results,
    'Neural Network': nn_results
}


def _metric_row(model_results):
    """Pick the metrics shown in the comparison table from one result dictionary."""
    return {
        'R¬≤ Score': model_results['r2_score'],
        'RMSE': model_results['rmse'],
        'MAE': model_results['mae'],
        # Training time is optional in the result dictionary; fall back to 0 when absent.
        'Training Time (s)': model_results.get('training_time', 0)
    }


# Build the table with models as columns, then transpose so each model is a row.
comparison_df = pd.DataFrame(
    {name: _metric_row(model_results) for name, model_results in models_comparison.items()}
).transpose()

print("\nüìä COMPARACI√ìN DE MODELOS DE REGRESI√ìN")
print("=" * 60)
print(comparison_df.round(4))


In [None]:
# 2x2 grid of bar charts comparing the models on each metric.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))


def _bar_panel(ax, column, color, title, ylabel):
    """Draw one metric column of comparison_df as a bar chart on the given axes."""
    comparison_df[column].plot(kind='bar', ax=ax, color=color)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.tick_params(axis='x', rotation=45)


# Error and fit metrics occupy the first three panels.
_bar_panel(axes[0, 0], 'R¬≤ Score', 'skyblue', 'R¬≤ Score por Modelo', 'R¬≤ Score')
_bar_panel(axes[0, 1], 'RMSE', 'lightcoral', 'RMSE por Modelo', 'RMSE')
_bar_panel(axes[1, 0], 'MAE', 'lightgreen', 'MAE por Modelo', 'MAE')

# Training time is drawn only when the column is present in the table.
if 'Training Time (s)' in comparison_df.columns:
    _bar_panel(axes[1, 1], 'Training Time (s)', 'orange', 'Tiempo de Entrenamiento', 'Segundos')

plt.tight_layout()
plt.show()

## Identificar mejor modelo

In [None]:
# Winner per metric: highest R², lowest RMSE, lowest MAE.
best_model_r2 = comparison_df['R¬≤ Score'].idxmax()
best_model_rmse = comparison_df['RMSE'].idxmin()
best_model_mae = comparison_df['MAE'].idxmin()

print(f"\nüèÜ MEJORES MODELOS POR M√âTRICA:")
for metric_name, metric_winner in (
    ('R¬≤ Score', best_model_r2),
    ('RMSE', best_model_rmse),
    ('MAE', best_model_mae),
):
    print(f"   ‚Ä¢ Mejor {metric_name}: {metric_winner} ({comparison_df.loc[metric_winner, metric_name]:.4f})")

# Recommended model: weighted blend of R² (40%) and the two error metrics (30% each).
# Each error is divided by the worst (largest) value among the models and inverted,
# so a lower error gives a higher contribution.
rmse_relative = comparison_df['RMSE'] / comparison_df['RMSE'].max()
mae_relative = comparison_df['MAE'] / comparison_df['MAE'].max()
comparison_df['Score_Compuesto'] = (
    comparison_df['R¬≤ Score'] * 0.4 +
    (1 - rmse_relative) * 0.3 +
    (1 - mae_relative) * 0.3
)

best_overall = comparison_df['Score_Compuesto'].idxmax()
best_overall_score = comparison_df.loc[best_overall, 'Score_Compuesto']
print(f"\n‚≠ê MODELO RECOMENDADO: {best_overall}")
print(f"   ‚Ä¢ Score Compuesto: {best_overall_score:.4f}")


In [None]:
# Residual analysis for the recommended model (residual = prediction - true value).
best_model_results = models_comparison[best_overall]
actual_values = best_model_results['y_true']
predicted_values = best_model_results['predictions']
residuals = predicted_values - actual_values

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
scatter_ax, hist_ax = axes[0], axes[1]

# Left panel: predictions against true values, with the identity line as reference.
scatter_ax.scatter(actual_values, predicted_values, alpha=0.6, color='blue')
identity_span = [actual_values.min(), actual_values.max()]
scatter_ax.plot(identity_span, identity_span, 'r--', lw=2)
scatter_ax.set_xlabel('Valores Reales')
scatter_ax.set_ylabel('Predicciones')
scatter_ax.set_title(f'Predicciones vs Reales - {best_overall}')

# Right panel: residual histogram with a marker at zero error.
hist_ax.hist(residuals, bins=30, alpha=0.7, color='green')
hist_ax.set_xlabel('Residuos')
hist_ax.set_ylabel('Frecuencia')
hist_ax.set_title(f'Distribuci√≥n de Residuos - {best_overall}')
hist_ax.axvline(x=0, color='red', linestyle='--')

plt.tight_layout()
plt.show()

# Numeric summary: mean, spread, and share of residuals whose absolute value is
# at most one standard deviation.
residual_std = residuals.std()
share_within_one_std = (np.abs(residuals) <= residual_std).mean() * 100
print(f"\nüìä AN√ÅLISIS DE RESIDUOS ({best_overall}):")
print(f"   ‚Ä¢ Media de residuos: {residuals.mean():.6f}")
print(f"   ‚Ä¢ Desviaci√≥n est√°ndar: {residual_std:.4f}")
print(f"   ‚Ä¢ Residuos dentro de ¬±1œÉ: {share_within_one_std:.1f}%")

# %%
# Feature-importance analysis, run only when the pipeline exposes get_feature_importance.
if hasattr(regression_pipeline, 'get_feature_importance'):
    print("\nüîç AN√ÅLISIS DE IMPORTANCIA DE FEATURES")
    print("=" * 50)

    try:
        feature_importance = regression_pipeline.get_feature_importance(best_overall)

        # A None result is skipped silently.
        if feature_importance is not None:
            # Keep the first 15 entries; presumably already sorted by importance
            # (verify against the pipeline).
            top_features = feature_importance.head(15)

            plt.figure(figsize=(12, 8))
            top_features.plot(kind='barh', color='steelblue')
            plt.title(f'Top 15 Features M√°s Importantes - {best_overall}')
            plt.xlabel('Importancia')
            plt.tight_layout()
            plt.show()

            print("üîù TOP 10 FEATURES M√ÅS IMPORTANTES:")
            ranked_features = top_features.head(10).items()
            for rank, (feature, importance) in enumerate(ranked_features, start=1):
                print(f"   {rank:2d}. {feature}: {importance:.4f}")

    except Exception as e:
        # Best-effort section: report the problem and let the notebook continue.
        print(f"‚ö†Ô∏è  No se pudo obtener importancia de features: {e}")


## Guardar resultados

In [None]:
# Persist the model comparison table (CSV) and a summary of the recommended model (JSON).
# `json` is imported in the notebook's import cell rather than mid-notebook.
comparison_csv_path = '../data/processed/regression_models_comparison.csv'
summary_json_path = '../data/processed/regression_results_summary.json'

# Metrics read from the DataFrame are numpy scalars; convert them to built-in numbers
# so json.dump does not depend on the column dtype being JSON-serialisable.
results_summary = {
    'experiment_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'best_model': best_overall,
    'best_r2_score': float(comparison_df.loc[best_overall, 'R¬≤ Score']),
    'best_rmse': float(comparison_df.loc[best_overall, 'RMSE']),
    'best_mae': float(comparison_df.loc[best_overall, 'MAE']),
    'total_features': int(X_train.shape[1]),
    'training_samples': int(X_train.shape[0]),
    'test_samples': int(X_test.shape[0])
}

# Save the model comparison table
comparison_df.to_csv(comparison_csv_path)
print(f"üìÅ Comparaci√≥n guardada en: {comparison_csv_path}")

# Save the summary
with open(summary_json_path, 'w') as f:
    json.dump(results_summary, f, indent=2)

print(f"üìÅ Resumen guardado en: {summary_json_path}")


In [None]:
# Closing summary of the run, assembled as lines and printed in one call.
separator = "=" * 60
summary_lines = [
    "\n" + separator,
    "üéØ RESUMEN DE MODELOS DE REGRESI√ìN",
    separator,
    f"‚úÖ Modelos entrenados: {len(models_comparison)}",
    f"üèÜ Mejor modelo: {best_overall}",
    # Both metrics below are those of the recommended model (best_overall).
    f"üìä Mejor R¬≤ Score: {comparison_df.loc[best_overall, 'R¬≤ Score']:.4f}",
    f"üìâ Mejor RMSE: {comparison_df.loc[best_overall, 'RMSE']:.4f}",
    f"üìà Features utilizadas: {X_train.shape[1]}",
    f"üîÑ Listo para optimizaci√≥n en Fase 5",
    separator,
]
print("\n".join(summary_lines))

---

__Abraham Tartalos__