# 04b - Regression Models for Risk Score Prediction

**Objetivo**: Desarrollar modelos de regresión para predecir el `composite_risk_score` continuo
 
**Modelos a desarrollar**:
- Linear Regression (baseline)
- Random Forest Regressor
- XGBoost Regressor
- Support Vector Regressor
- Neural Network Regressor

---

## Importar Librerías

In [17]:
import sys
import os
sys.path.append('../src/modeling')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
import gc
warnings.filterwarnings('ignore')
from sklearn.impute import SimpleImputer

# Importar pipeline de regresi√≥n
from regression_pipeline import RegressionPipeline
from model_utils import load_processed_data, save_model_artifacts
import mlflow
import mlflow.sklearn

## Configuraciones

In [2]:
# Plot configuration: apply matplotlib's 'seaborn-v0_8' style sheet first, then
# set seaborn's default colour palette to "husl". The order is kept as written
# because both calls touch the plotting defaults.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

In [3]:
# Confirm that the import cell ran and record when this execution happened.
print("‚úÖ Librer√≠as y m√≥dulos importados correctamente")
print(f"üìÖ Fecha de ejecuci√≥n: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

‚úÖ Librer√≠as y m√≥dulos importados correctamente
üìÖ Fecha de ejecuci√≥n: 2025-06-15 09:52:31


In [3]:
# MLflow configuration: select "alzheimer_multimodal_monitoring" as the active
# experiment for this kernel session.
mlflow.set_experiment("alzheimer_multimodal_monitoring")

print("üîß MLflow configurado para modelos de regresi√≥n")
print("üéØ Target: composite_risk_score (regresi√≥n continua)")


üîß MLflow configurado para modelos de regresi√≥n
üéØ Target: composite_risk_score (regresi√≥n continua)


In [4]:
# Look up the experiment by name and print its ID and artifact location, so it
# is clear where MLflow stores the results of this notebook.
# NOTE(review): presumably get_experiment_by_name returns None for an unknown
# name, in which case the attribute accesses below would fail - confirm the
# experiment is always created before this cell runs.
experiment = mlflow.get_experiment_by_name("alzheimer_multimodal_monitoring")
print(f"ID del experimento: {experiment.experiment_id}")
print(f"Ubicaci√≥n: {experiment.artifact_location}")

ID del experimento: 503314857977141831
Ubicaci√≥n: file:///E:/usuarios/alumno/Escritorio/Alzheimer-Multimodal-Monitoring/notebooks/mlruns/503314857977141831


## Cargar datos procesados

In [5]:
# Load the processed feature table into `df`.
# Only the read itself sits inside the try block, so the FileNotFoundError
# handler can only be triggered by a missing CSV.
try:
    df = pd.read_csv('../data/processed/features/alzheimer_features_selected_20250605.csv')
except FileNotFoundError:
    print("‚ùå Error: Archivo no encontrado")
    print("üí° Ejecuta primero el notebook de feature engineering")
    # Re-raise after printing the hint: later cells read `df`, so carrying on
    # without it would only postpone the failure to a less helpful NameError.
    raise

print(f"üìä Dataset cargado: {df.shape}")
print(f"üìà Registros con score v√°lido: {df['composite_risk_score'].notna().sum()}")

# Summary statistics of the regression target
target_stats = df['composite_risk_score'].describe()
print(f"\nüìä ESTAD√çSTICAS DEL TARGET:")
print(f"   ‚Ä¢ Media: {target_stats['mean']:.3f}")
print(f"   ‚Ä¢ Desviaci√≥n est√°ndar: {target_stats['std']:.3f}")
print(f"   ‚Ä¢ Rango: [{target_stats['min']:.3f}, {target_stats['max']:.3f}]")

üìä Dataset cargado: (48466, 189)
üìà Registros con score v√°lido: 48466

üìä ESTAD√çSTICAS DEL TARGET:
   ‚Ä¢ Media: 0.367
   ‚Ä¢ Desviaci√≥n est√°ndar: 0.213
   ‚Ä¢ Rango: [0.000, 0.929]


In [6]:
# Inspect the columns whose dtype is not numeric: count them and list their
# names.
col_nonnum = df.select_dtypes(exclude=['number'])
cant_nonnum = len(col_nonnum.columns)
print(f"cantidad de columnas no numericas: {cant_nonnum}")
print(col_nonnum.columns.tolist())

cantidad de columnas no numericas: 9
['gender', 'update_stamp_apoe_normalized', 'update_stamp_apoe', 'PTGENDER', 'subject_id_activity', 'PTID_apoe', 'EDUCATION_LEVEL', 'gender_normalized', 'risk_category']


In [7]:
# Inicializar pipeline de regresi√≥n
regression_pipeline = RegressionPipeline(
    target_col='composite_risk_score',
    test_size=0.2,
    random_state=42
)

print("üîß Pipeline de regresi√≥n inicializado")
print(f"   ‚Ä¢ Target: {regression_pipeline.target_col}")
print(f"   ‚Ä¢ Test size: {regression_pipeline.test_size}")

üîß Pipeline de regresi√≥n inicializado
   ‚Ä¢ Target: composite_risk_score
   ‚Ä¢ Test size: 0.2


## Separación de datos en características (X) y variable objetivo (y)

In [8]:
# Separar caracter√≠sticas (X) y variable objetivo (y)
X = df.drop(columns=['composite_risk_score'])  # Todas las columnas excepto el target
y = df['composite_risk_score']  # Solo la columna objetivo

# Verificar las dimensiones
print(f"üîç Dimensiones de los datos:")
print(f"   ‚Ä¢ X: {X.shape} (features)")
print(f"   ‚Ä¢ y: {y.shape} (target)")

üîç Dimensiones de los datos:
   ‚Ä¢ X: (48466, 188) (features)
   ‚Ä¢ y: (48466,) (target)


In [9]:
# Total number of missing cells in the feature table and in the target.
print("Valores faltantes en X:", X.isna().sum().sum())
print("Valores faltantes en y:", y.isna().sum())

Valores faltantes en X: 1077109
Valores faltantes en y: 0


## Ejecución del Pipeline y Manejo de NaN

In [16]:
# Housekeeping before the training cell.
from joblib import Memory
import mlflow  # already imported in the first code cell; repeated here

# Clear joblib's cache.
# NOTE(review): this Memory object is created with location=None, i.e. without
# a cache directory - presumably there is nothing on disk for clear() to
# remove. Confirm whether a real cache location was intended.
mem = Memory(location=None)
mem.clear()

# End the MLflow run that is currently active, if any (optional). Despite the
# original wording this closes a run; it does not clear a cache.
mlflow.end_run()

[Memory(location=None)]: Flushing completely the cache


In [19]:
# Clean the feature table and run the full regression pipeline.
#
# The cleaning steps work on an intermediate frame (X_clean) and assign the
# result back to X only once, at step 5, instead of mutating X in place.

# 1. Drop non-numeric columns. They are detected from the dtypes, using the
#    same select_dtypes(exclude=['number']) rule as the inspection cell above,
#    rather than from a hand-maintained list of names.
non_numeric = X.select_dtypes(exclude=['number']).columns.tolist()
X_clean = X.drop(columns=non_numeric)

# 2. Replace infinities with NaN so they are imputed like any other gap
print(f"Valores infinitos antes: {np.isinf(X_clean.values).sum()}")
X_clean = X_clean.replace([np.inf, -np.inf], np.nan)

# 3. Identify and drop columns that are entirely empty: there is no mean to
#    impute for them, and the column labels must match the imputer's output
#    when the DataFrame is rebuilt in step 5.
empty_cols = X_clean.columns[X_clean.isna().all()].tolist()
print(f"\nüî• Columnas completamente vac√≠as ({len(empty_cols)}): {empty_cols}")
X_clean = X_clean.drop(columns=empty_cols)

# 4. Impute the remaining missing values with the column mean.
# NOTE(review): the imputer is fitted on every row before the pipeline makes
# its train/test split (the pipeline prints the train/test dimensions after
# receiving X), so test rows contribute to the imputed means. Consider moving
# the imputation inside the pipeline.
print(f"\nValores NaN en X antes de imputar: {X_clean.isna().sum().sum()}")
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_clean)

# 5. Rebuild the DataFrame from the imputed array. The original row index is
#    kept so X stays aligned with y label-by-label, not just by position.
X = pd.DataFrame(X_imputed, columns=X_clean.columns, index=X_clean.index)
assert len(X) == len(y), "X and y must have the same number of rows"

# 6. Verify the result
print(f"\nValores infinitos despu√©s: {np.isinf(X.values).sum()}")
print(f"Valores NaN despu√©s de imputar: {X.isna().sum().sum()}")
print(f"Nuevas dimensiones: {X.shape}")

# Release the intermediate frame explicitly
del X_clean
gc.collect()

# Run the complete regression pipeline for all models.
# NOTE(review): the last recorded run of this cell stopped with an
# MlflowException ("Failed to convert metric value to float ...") right after
# the linear_regression metrics were printed. The failing call is not in this
# notebook - presumably a metric is handed to MLflow as an array instead of a
# scalar inside RegressionPipeline; confirm there.
# NOTE(review): that same run reported an R2 of 0.9997 for plain linear
# regression, which may indicate that some features encode the target - worth
# checking for leakage.
results = regression_pipeline.run_regression_pipeline(
    X,  # features
    y,  # target
    optimize_hyperparams=False,  # no hyperparameter optimisation
    cross_validate=True          # run cross-validation
)

print("‚úÖ Pipeline completo ejecutado!")

Valores infinitos antes: 0

üî• Columnas completamente vac√≠as (0): []

Valores NaN en X antes de imputar: 0

Valores infinitos despu√©s: 0
Valores NaN despu√©s de imputar: 0
Nuevas dimensiones: (48466, 169)

üöÄ INICIANDO PIPELINE DE REGRESI√ìN
   ‚Ä¢ Dimensiones entrenamiento: (38772, 169)
   ‚Ä¢ Dimensiones test: (9694, 169)

 Entrenando linear_regression...
üìä RMSE: 0.0034
üìä MAE: 0.0014
üìä R¬≤: 0.9997
üìä CV RMSE: 0.0034 ¬± 0.0001


MlflowException: Failed to convert metric value to float: can only convert an array of size 1 to a Python scalar

## Comparación de modelos

In [None]:
# Model comparison: map a display label to each model's result dictionary.
# NOTE(review): lr_results, rf_results, xgb_results, svr_results and nn_results
# are not assigned anywhere in the visible notebook; the pipeline call stores
# its return value in `results` only. As written this cell would raise a
# NameError on a fresh kernel - presumably these should be taken from
# `results`; confirm its structure.
models_comparison = {
    'Linear Regression': lr_results,
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
    'SVR': svr_results,
    'Neural Network': nn_results
}

# Build the comparison DataFrame: the dict comprehension yields one column per
# model, and .T transposes it to one row per model with one column per metric.
# The `results` name inside the comprehension is local to it and does not
# overwrite the notebook-level `results` variable.
comparison_df = pd.DataFrame({
    model_name: {
        'R¬≤ Score': results['r2_score'],
        'RMSE': results['rmse'],
        'MAE': results['mae'],
        'Training Time (s)': results.get('training_time', 0)  # 0 when no time was recorded
    }
    for model_name, results in models_comparison.items()
}).T

print("\nüìä COMPARACI√ìN DE MODELOS DE REGRESI√ìN")
print("=" * 60)
print(comparison_df.round(4))


In [None]:
# Comparison charts: a 2x2 grid of bar plots, one panel per metric.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

# One entry per panel, in row-major grid order:
# (column, panel title, y-axis label, bar colour, skip when column is absent)
_panel_specs = [
    ('R¬≤ Score', 'R¬≤ Score por Modelo', 'R¬≤ Score', 'skyblue', False),
    ('RMSE', 'RMSE por Modelo', 'RMSE', 'lightcoral', False),
    ('MAE', 'MAE por Modelo', 'MAE', 'lightgreen', False),
    ('Training Time (s)', 'Tiempo de Entrenamiento', 'Segundos', 'orange', True),
]

for _panel_ax, (_column, _title, _ylabel, _colour, _optional) in zip(axes.flat, _panel_specs):
    # Only the training-time panel is optional; the metric panels are always drawn.
    if _optional and _column not in comparison_df.columns:
        continue
    comparison_df[_column].plot(kind='bar', ax=_panel_ax, color=_colour)
    _panel_ax.set_title(_title)
    _panel_ax.set_ylabel(_ylabel)
    _panel_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## Identificar mejor modelo

In [None]:
# Identify the best model per metric: highest R2, lowest RMSE, lowest MAE.
best_model_r2 = comparison_df['R¬≤ Score'].idxmax()
best_model_rmse = comparison_df['RMSE'].idxmin()
best_model_mae = comparison_df['MAE'].idxmin()

print(f"\nüèÜ MEJORES MODELOS POR M√âTRICA:")
print(f"   ‚Ä¢ Mejor R¬≤ Score: {best_model_r2} ({comparison_df.loc[best_model_r2, 'R¬≤ Score']:.4f})")
print(f"   ‚Ä¢ Mejor RMSE: {best_model_rmse} ({comparison_df.loc[best_model_rmse, 'RMSE']:.4f})")
print(f"   ‚Ä¢ Mejor MAE: {best_model_mae} ({comparison_df.loc[best_model_mae, 'MAE']:.4f})")

# Recommended model (best balance): a weighted sum of the R2 score (weight 0.4)
# and of each error metric rescaled as 1 - value / worst value (weight 0.3
# each), so the model with the largest RMSE or MAE contributes 0 for that term.
# The score is stored as a new 'Score_Compuesto' column on comparison_df.
# NOTE(review): if the largest RMSE or MAE were 0, the division would produce
# NaN for every model - confirm this cannot happen in practice.
comparison_df['Score_Compuesto'] = (
    comparison_df['R¬≤ Score'] * 0.4 +
    (1 - comparison_df['RMSE'] / comparison_df['RMSE'].max()) * 0.3 +
    (1 - comparison_df['MAE'] / comparison_df['MAE'].max()) * 0.3
)

best_overall = comparison_df['Score_Compuesto'].idxmax()
print(f"\n‚≠ê MODELO RECOMENDADO: {best_overall}")
print(f"   ‚Ä¢ Score Compuesto: {comparison_df.loc[best_overall, 'Score_Compuesto']:.4f}")


In [None]:
# Residual analysis for the recommended model.
# Residuals are computed as prediction minus true value, so a positive residual
# is an over-prediction.
# NOTE(review): assumes 'predictions' and 'y_true' are same-length arrays or
# Series that line up element by element - confirm against the pipeline output.
best_model_results = models_comparison[best_overall]
residuals = best_model_results['predictions'] - best_model_results['y_true']

fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Scatter plot: predictions against true values
axes[0].scatter(best_model_results['y_true'], best_model_results['predictions'], 
               alpha=0.6, color='blue')
# Red dashed diagonal from (min, min) to (max, max) of the true values: points
# on this line are perfect predictions.
axes[0].plot([best_model_results['y_true'].min(), best_model_results['y_true'].max()], 
             [best_model_results['y_true'].min(), best_model_results['y_true'].max()], 
             'r--', lw=2)
axes[0].set_xlabel('Valores Reales')
axes[0].set_ylabel('Predicciones')
axes[0].set_title(f'Predicciones vs Reales - {best_overall}')

# Histogram of the residuals, with a vertical reference line at zero
axes[1].hist(residuals, bins=30, alpha=0.7, color='green')
axes[1].set_xlabel('Residuos')
axes[1].set_ylabel('Frecuencia')
axes[1].set_title(f'Distribuci√≥n de Residuos - {best_overall}')
axes[1].axvline(x=0, color='red', linestyle='--')

plt.tight_layout()
plt.show()

# Summary numbers. The last line reports the share of residuals whose absolute
# value is at most one standard deviation, i.e. a band centred on 0 rather than
# on the residual mean.
print(f"\nüìä AN√ÅLISIS DE RESIDUOS ({best_overall}):")
print(f"   ‚Ä¢ Media de residuos: {residuals.mean():.6f}")
print(f"   ‚Ä¢ Desviaci√≥n est√°ndar: {residuals.std():.4f}")
print(f"   ‚Ä¢ Residuos dentro de ¬±1œÉ: {(np.abs(residuals) <= residuals.std()).mean()*100:.1f}%")

# %%
# Feature-importance analysis, run only when the pipeline object exposes a
# get_feature_importance method.
# NOTE(review): best_overall is a display label from models_comparison (for
# example 'Linear Regression'), while the pipeline log prints names such as
# 'linear_regression' - confirm which form get_feature_importance expects.
if hasattr(regression_pipeline, 'get_feature_importance'):
    print("\nüîç AN√ÅLISIS DE IMPORTANCIA DE FEATURES")
    print("=" * 50)
    
    try:
        feature_importance = regression_pipeline.get_feature_importance(best_overall)
        
        if feature_importance is not None:
            # Keep the first 15 entries.
            # NOTE(review): assumes the result is a pandas Series indexed by
            # feature name and already sorted by descending importance - confirm.
            top_features = feature_importance.head(15)
            
            plt.figure(figsize=(12, 8))
            top_features.plot(kind='barh', color='steelblue')
            plt.title(f'Top 15 Features M√°s Importantes - {best_overall}')
            plt.xlabel('Importancia')
            plt.tight_layout()
            plt.show()
            
            print("üîù TOP 10 FEATURES M√ÅS IMPORTANTES:")
            for i, (feature, importance) in enumerate(top_features.head(10).items(), 1):
                print(f"   {i:2d}. {feature}: {importance:.4f}")
        
    # Any failure here is reported as a warning and the notebook carries on
    # without the importance plot.
    except Exception as e:
        print(f"‚ö†Ô∏è  No se pudo obtener importancia de features: {e}")


## Guardar resultados

In [None]:
# Collect the headline numbers of this experiment into one dictionary.
# NOTE(review): X_train and X_test are not assigned anywhere in the visible
# notebook (the train/test split appears to happen inside the pipeline), so
# this cell would raise a NameError on a fresh kernel - confirm where these
# should come from.
results_summary = {
    'experiment_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    'best_model': best_overall,
    'best_r2_score': comparison_df.loc[best_overall, 'R¬≤ Score'],
    'best_rmse': comparison_df.loc[best_overall, 'RMSE'],
    'best_mae': comparison_df.loc[best_overall, 'MAE'],
    'total_features': X_train.shape[1],
    'training_samples': X_train.shape[0],
    'test_samples': X_test.shape[0]
}

# Save the model comparison table as CSV
comparison_df.to_csv('../data/processed/regression_models_comparison.csv')
print(f"üìÅ Comparaci√≥n guardada en: ../data/processed/regression_models_comparison.csv")

# Save the summary dictionary as indented JSON
import json  # imported here rather than in the import cell at the top
with open('../data/processed/regression_results_summary.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

print(f"üìÅ Resumen guardado en: ../data/processed/regression_results_summary.json")


In [None]:
# Closing banner: number of models compared, the recommended model and its
# R2 / RMSE, and the number of features used.
# NOTE(review): X_train is not assigned anywhere in the visible notebook -
# confirm where it should come from before relying on this cell.
print("\n" + "="*60)
print("üéØ RESUMEN DE MODELOS DE REGRESI√ìN")
print("="*60)
print(f"‚úÖ Modelos entrenados: {len(models_comparison)}")
print(f"üèÜ Mejor modelo: {best_overall}")
print(f"üìä Mejor R¬≤ Score: {comparison_df.loc[best_overall, 'R¬≤ Score']:.4f}")
print(f"üìâ Mejor RMSE: {comparison_df.loc[best_overall, 'RMSE']:.4f}")
print(f"üìà Features utilizadas: {X_train.shape[1]}")
print(f"üîÑ Listo para optimizaci√≥n en Fase 5")
print("="*60)

---

__Abraham Tartalos__