# 09e - Risk Stratification Models
 
**Objetivo**: Desarrollar modelos de estratificación de riesgo utilizando los pipelines especializados.
 
**Componentes**:
- Clustering para identificación de fenotipos
- Estratificación genética y demográfica
- Modelos probabilísticos GMM
- Análisis jerárquico de subestratos

---

## Importar librerías y configuraciones

In [None]:
import sys
import os
sys.path.append('../scripts/modeling')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Importar scripts especializados
from risk_stratification import RiskStratificationPipeline
from model_utils import ModelEvaluator, DataPreprocessor
import ensemble_methods

In [None]:
# Plot styling: apply the 'seaborn-v0_8' matplotlib style sheet, then select
# the "husl" seaborn palette.
# NOTE(review): the palette is set after the style sheet, presumably so the
# style sheet does not override it — keep this order unless confirmed otherwise.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [None]:
# Report that the import cell completed, then print the current time
# formatted as YYYY-MM-DD HH:MM:SS.
print("‚úÖ Librer√≠as y scripts importados correctamente")
execution_time = datetime.now()
formatted_time = execution_time.strftime('%Y-%m-%d %H:%M:%S')
print(f"üìÖ Fecha de ejecuci√≥n: {formatted_time}")

In [None]:
# MLflow configuration: make "alzheimer_risk_stratification" the active
# experiment before any of the runs further down are started.
mlflow.set_experiment("alzheimer_risk_stratification")

## Cargar datos procesados

In [None]:
# Load the processed feature table and print a short overview of it.
#
# Only the read itself sits inside the ``try`` so that unrelated errors (for
# example a missing column in the overview below) are not mistaken for a
# missing file.
try:
    df = pd.read_csv('../data/processed/integrated_features_final.csv')
except FileNotFoundError:
    # Keep the user-facing hint, then re-raise: every later cell needs ``df``,
    # and continuing would either fail with an unrelated NameError or silently
    # reuse a stale ``df`` left in the kernel by an earlier run.
    print("‚ùå Error: Archivo de features no encontrado")
    print("üí° Ejecuta primero el notebook 03_feature_engineering_master.ipynb")
    raise
else:
    # Overview: table shape, number of non-null risk scores, category counts.
    print(f"üìä Dataset cargado: {df.shape}")
    print(f"üìà Score de riesgo disponible: {df['composite_risk_score'].notna().sum()} registros")
    print("üéØ Distribuci√≥n de categor√≠as:")
    print(df['risk_category'].value_counts())


## Inicializar pipeline de estratificación y preparar datos usando utilities

In [None]:
# Build the stratification pipeline, telling it which columns hold the
# risk score ('composite_risk_score') and the risk label ('risk_category').
pipeline_options = {
    'target_column': 'composite_risk_score',
    'category_column': 'risk_category',
}
stratifier = RiskStratificationPipeline(**pipeline_options)

# Obtain the prepared feature matrix and its feature names from the shared
# preprocessing utilities.
preprocessor = DataPreprocessor()
X_processed, feature_names = preprocessor.prepare_stratification_features(df)

# Risk scores with missing values removed.
# NOTE(review): rows are dropped here independently of X_processed, so the two
# may not be row-aligned — confirm before pairing them.
risk_scores = df['composite_risk_score']
y_risk = risk_scores.dropna()

n_features = len(feature_names)
print(f"üîß Features preparadas: {n_features}")
n_records = len(X_processed)
print(f"üìä Registros v√°lidos: {n_records}")

## 1. Identificación de Fenotipos con K-Means

In [None]:
# 1. Phenotype identification with K-Means
phenotype_run = mlflow.start_run(run_name="phenotype_identification")
with phenotype_run:
    # Tag the run so it can be filtered by phase and method.
    run_tags = (
        ("phase", "risk_stratification"),
        ("method", "kmeans_clustering"),
    )
    for tag_name, tag_value in run_tags:
        mlflow.set_tag(tag_name, tag_value)

    # Cluster the prepared features.
    # NOTE(review): k_range presumably bounds the number of clusters the
    # pipeline tries — confirm against RiskStratificationPipeline.
    phenotype_results = stratifier.identify_phenotypes(
        X_processed, method='kmeans', k_range=(2, 8)
    )

    # Log parameters, metrics and the fitted model to the active run.
    mlflow.log_params(phenotype_results['params'])
    phenotype_metrics = phenotype_results['metrics']
    mlflow.log_metrics(phenotype_metrics)
    mlflow.sklearn.log_model(phenotype_results['model'], "phenotype_model")

    n_phenotypes = phenotype_results['n_clusters']
    print(f"üß¨ Fenotipos identificados: {n_phenotypes}")
    silhouette = phenotype_metrics['silhouette_score']
    print(f"üìä Silhouette Score: {silhouette:.3f}")


## 2. Estratificación Genética por APOE

In [None]:
# 2. Genetic stratification by APOE
#
# Only markers that actually exist in the dataset are handed to the pipeline.
# The cell still runs only when 'APOE_e4_carrier' is present, but the second
# marker ('APOE_e4_present') is now dropped instead of being passed blindly
# when its column is missing.
apoe_markers = [
    marker
    for marker in ('APOE_e4_carrier', 'APOE_e4_present')
    if marker in df.columns
]

if 'APOE_e4_carrier' in apoe_markers:
    with mlflow.start_run(run_name="genetic_stratification"):
        mlflow.set_tag("stratification_type", "genetic_apoe")

        # Run the genetic stratification on the available markers.
        genetic_results = stratifier.genetic_stratification(
            df,
            genetic_markers=apoe_markers
        )

        # Log the resulting metrics to the active run.
        mlflow.log_metrics(genetic_results['metrics'])

        print("üß¨ ESTRATIFICACI√ìN GEN√âTICA COMPLETADA")
        print(f"üìä APOE4 Carriers: {genetic_results['apoe_carrier_stats']['count']}")
        print(f"üìà Diferencia de riesgo: {genetic_results['metrics']['risk_difference']:.3f}")


## 3. Estratificación Probabilística con GMM

In [None]:
# 3. Probabilistic stratification with a Gaussian mixture model (GMM)
gmm_run = mlflow.start_run(run_name="probabilistic_stratification")
with gmm_run:
    mlflow.set_tag("method", "gaussian_mixture")

    # Fit the mixture model on the prepared features.
    # NOTE(review): n_components_range presumably bounds the number of
    # components the pipeline tries — confirm against the pipeline code.
    gmm_results = stratifier.probabilistic_stratification(
        X_processed, method='gmm', n_components_range=(2, 6)
    )

    # Log parameters, metrics and the fitted model to the active run.
    mlflow.log_params(gmm_results['params'])
    gmm_metrics = gmm_results['metrics']
    mlflow.log_metrics(gmm_metrics)
    mlflow.sklearn.log_model(gmm_results['model'], "gmm_model")

    n_gmm_components = gmm_results['n_components']
    print(f"üé≤ Componentes GMM √≥ptimos: {n_gmm_components}")
    bic = gmm_metrics['bic_score']
    print(f"üìä BIC Score: {bic:.1f}")


## 4. Estratificación Jerárquica

In [None]:
# 4. Hierarchical stratification
def _all_of(*conditions):
    """Return a criterion dict joining the given conditions with 'and'."""
    return {'conditions': list(conditions), 'operator': 'and'}


with mlflow.start_run(run_name="hierarchical_stratification"):
    mlflow.set_tag("method", "hierarchical")

    # Sub-strata definitions: each entry names a sub-stratum and lists the
    # condition strings that define it.
    hierarchical_criteria = {
        'high_risk_apoe': _all_of(
            'risk_category == "High"',
            'APOE_e4_carrier == 1',
        ),
        'moderate_risk_high_bio': _all_of(
            'risk_category == "Moderate"',
            'biomarker_risk_score > 0.7',
        ),
    }

    # Apply the criteria to the full dataset.
    hierarchical_results = stratifier.hierarchical_stratification(
        df, criteria=hierarchical_criteria
    )

    # Log the resulting metrics to the active run.
    hierarchical_metrics = hierarchical_results['metrics']
    mlflow.log_metrics(hierarchical_metrics)

    print("üèóÔ∏è ESTRATIFICACI√ìN JER√ÅRQUICA COMPLETADA")
    n_substrata = hierarchical_results['n_substrata']
    print(f"üìä Subestratos identificados: {n_substrata}")


## 5. Estratificación por Biomarcadores

In [None]:
# 5. Biomarker stratification
def _is_biomarker_column(column_name):
    """Return True when the lower-cased column name contains a biomarker token."""
    lowered = column_name.lower()
    return any(token in lowered for token in ['tau', 'abeta', 'ptau'])


# Columns whose name mentions one of the biomarker tokens.
biomarker_features = [name for name in df.columns if _is_biomarker_column(name)]

# Skip the whole step when the dataset has no biomarker columns.
if biomarker_features:
    with mlflow.start_run(run_name="biomarker_stratification"):
        mlflow.set_tag("stratification_type", "biomarker")

        # Cluster on the biomarker columns only, with three clusters requested.
        biomarker_results = stratifier.biomarker_stratification(
            df, biomarker_columns=biomarker_features, n_clusters=3
        )

        # Log parameters, metrics and the fitted model to the active run.
        mlflow.log_params(biomarker_results['params'])
        biomarker_metrics = biomarker_results['metrics']
        mlflow.log_metrics(biomarker_metrics)
        mlflow.sklearn.log_model(biomarker_results['model'], "biomarker_model")

        print("üß™ ESTRATIFICACI√ìN POR BIOMARCADORES COMPLETADA")
        n_biomarker_clusters = biomarker_results['n_clusters']
        print(f"üìä Clusters de biomarcadores: {n_biomarker_clusters}")


## 6. Ensemble de Estratificación

In [None]:
# 6. Stratification ensemble
with mlflow.start_run(run_name="ensemble_stratification"):
    mlflow.set_tag("method", "ensemble")

    # Container that combines the results of the individual methods.
    ensemble_stratifier = ensemble_methods.StratificationEnsemble()

    # Phenotypes and GMM are always registered; the genetic result only exists
    # when the genetic stratification cell actually ran.
    member_results = [
        ('phenotypes', phenotype_results),
        ('gmm', gmm_results),
    ]
    if 'genetic_results' in locals():
        member_results.append(('genetic', genetic_results))

    for member_name, member in member_results:
        ensemble_stratifier.add_method(member_name, member)

    # Combine the registered methods into a single ensemble result.
    ensemble_results = ensemble_stratifier.combine_stratifications(
        method='voting', weights='performance'
    )

    # Log the ensemble metrics to the active run.
    ensemble_metrics = ensemble_results['metrics']
    mlflow.log_metrics(ensemble_metrics)

    print("üéØ ENSEMBLE DE ESTRATIFICACI√ìN COMPLETADO")
    consensus = ensemble_results['consensus_score']
    print(f"üìä Score de consenso: {consensus:.3f}")


## Visualización de Resultados

In [None]:
# Results visualization
evaluator = ModelEvaluator()

# A single 2x3 grid holds every panel of the integrated figure.
fig, axes = plt.subplots(2, 3, figsize=(18, 12))

# Panels 1-2: phenotype distribution and GMM components use the same helper.
stratification_panels = (
    (phenotype_results, "Fenotipos Identificados", axes[0, 0]),
    (gmm_results, "Componentes GMM", axes[0, 1]),
)
for panel_results, panel_title, panel_ax in stratification_panels:
    evaluator.plot_stratification_results(
        panel_results, title=panel_title, ax=panel_ax
    )

# Panel 3: risk distribution per stratum.
stratifier.plot_risk_distribution_by_strata(ax=axes[0, 2])

# Panels 4-5: method comparison and consensus matrix, drawn only when the
# ensemble cell produced a result.
if 'ensemble_results' in locals():
    compared_results = [phenotype_results, gmm_results, ensemble_results]
    compared_names = ['K-Means', 'GMM', 'Ensemble']
    evaluator.plot_method_comparison(
        compared_results, method_names=compared_names, ax=axes[1, 0]
    )
    evaluator.plot_consensus_matrix(ensemble_results, ax=axes[1, 1])

# Panel 6: stratification quality metrics.
quality_metrics = stratifier.compute_stratification_quality()
evaluator.plot_quality_metrics(quality_metrics, ax=axes[1, 2])

plt.tight_layout()
plt.show()

print("üìä Visualizaciones generadas correctamente")


## Resumen final, exportación y guardado de resultados

In [None]:
# Final summary
banner_rule = "=" * 60
print("\n" + banner_rule)
print("üìä RESUMEN DE ESTRATIFICACI√ìN DE RIESGO")
print(banner_rule)

# Ask the pipeline for its summary report: a mapping of category name to a
# mapping of statistic name to value, as consumed by the loops below.
summary_stats = stratifier.generate_summary_report()

# One upper-cased header per category, followed by one bullet per statistic.
for section, section_stats in summary_stats.items():
    section_header = f"\nüéØ {section.upper()}:"
    print(section_header)
    for stat_name, stat_value in section_stats.items():
        print(f"   ‚Ä¢ {stat_name}: {stat_value}")


In [None]:
# Persist the consolidated stratification table and the fitted models.
import joblib

results_path = '../data/processed/risk_stratification_results.csv'
models_path = '../models/stratification_models.pkl'

# Create the output directories up front: writing into a directory that does
# not exist fails, and '../models' is not created anywhere else in this
# notebook.
for output_path in (results_path, models_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the consolidated results table.
consolidated_results = stratifier.consolidate_results()
consolidated_results.to_csv(results_path, index=False)

# Collect the artifacts to persist. The two clustering models are always
# saved; the genetic and ensemble results are optional because their cells
# may not have run.
model_artifacts = {
    'phenotype_model': phenotype_results['model'],
    'gmm_model': gmm_results['model'],
}

if 'genetic_results' in locals():
    model_artifacts['genetic_stratification'] = genetic_results

if 'ensemble_results' in locals():
    model_artifacts['ensemble_model'] = ensemble_results

# Write every collected artifact into a single joblib file.
joblib.dump(model_artifacts, models_path)

print("\n‚úÖ Estratificaci√≥n de riesgo completada exitosamente")
print("üìÅ Modelos guardados en MLflow y ../models/")
print("üìä Resultados exportados a ../data/processed/")
print("üîÑ Listo para Fase 5: Evaluaci√≥n y Optimizaci√≥n")

---

__Abraham Tartalos__