# 04a - Model Development Master
# 
# **Coordinador Principal de la Fase 4: Desarrollo de Modelos**
# 
# Este notebook orquesta todo el desarrollo de modelos para el proyecto de Monitorización Multimodal de Alzheimer:
# 
# **Objetivos principales**:
# - Coordinar la ejecución de todos los pipelines de modelado
# - Integrar resultados de regresión, clasificación, análisis temporal y estratificación
# - Gestionar experimentos MLflow de manera centralizada
# - Generar resumen ejecutivo de todos los modelos desarrollados
# 
# **Notebooks integrados**:
# - 04b_regression_models.ipynb (composite_risk_score)
# - 04c_classification_models.ipynb (risk_category)
# - 04d_temporal_analysis.ipynb (series temporales)
# - 04e_risk_stratification.ipynb (estratificación)

## Importar librerías

In [1]:
import sys
import os
sys.path.append('../src/modeling')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow
import mlflow.sklearn
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Importar scripts de modelado
from regression_pipeline import RegressionPipeline
from classification_pipeline import ClassificationPipeline
from temporal_modeling import TemporalModeling
from risk_stratification import RiskStratification
from ensemble_methods import EnsembleMethods
from model_utils import ModelUtils

In [2]:
# Visualization defaults used by the plots in this notebook: a matplotlib
# style sheet followed by a seaborn colour palette.
PLOT_STYLE = 'seaborn-v0_8'
COLOR_PALETTE = "husl"

plt.style.use(PLOT_STYLE)
sns.set_palette(COLOR_PALETTE)

Iniciando Fase 4: Desarrollo de Modelos


In [None]:
# Banner announcing the start of Phase 4, stamped with the current local time.
BANNER_WIDTH = 60
execution_stamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

print("üöÄ FASE 4: DESARROLLO DE MODELOS - COORDINADOR MASTER")
print("=" * BANNER_WIDTH)
print(f"üìÖ Fecha de ejecuci√≥n: {execution_stamp}")
print("‚úÖ Librer√≠as y scripts importados correctamente")

## Configuración MLflow Master

In [3]:
# Select the MLflow experiment that the runs started in this notebook log to.
MASTER_EXPERIMENT = "alzheimer_model_development_master"
mlflow.set_experiment(MASTER_EXPERIMENT)

# Project-wide settings read by the later cells: target column names,
# input/output locations and the random seed.
PROJECT_CONFIG = dict(
    project_name='Alzheimer_Multimodal_Monitoring',
    phase='model_development',
    target_regression='composite_risk_score',
    target_classification='risk_category',
    data_path='../data/processed/integrated_features_final.csv',
    results_path='../results/model_development/',
    random_state=42,
)

# Create the results directory if it does not exist yet.
results_dir = PROJECT_CONFIG['results_path']
os.makedirs(results_dir, exist_ok=True)

print("üîß Configuraci√≥n MLflow Master establecida")
for target_label, config_key in (
    ("Regresi√≥n", 'target_regression'),
    ("Clasificaci√≥n", 'target_classification'),
):
    print(f"üéØ Target {target_label}: {PROJECT_CONFIG[config_key]}")


📁 Configurando rutas y cargando datos...
📊 Cargando dataset final...
📋 Cargando metadatos de feature engineering...
✅ Dataset cargado: (48466, 189)
✅ Features seleccionadas: 192
✅ Registros válidos: 48466


## Cargar y validar datos

In [5]:
# Load the engineered feature table and run the basic quality checks on it.
print("\nüìä CARGA Y VALIDACI√ìN DE DATOS")
print("-" * 40)

# Only the read itself is guarded: a FileNotFoundError raised by any later
# step must not be reported as "feature file not found".
try:
    df = pd.read_csv(PROJECT_CONFIG['data_path'])
except FileNotFoundError:
    print("‚ùå Error: Archivo de features no encontrado")
    print("üí° Ejecuta primero el notebook 03_feature_engineering_master.ipynb")
    raise

# Basic validation against both target columns. The returned mapping is
# reused by later cells (MLflow logging and the final summary).
utils = ModelUtils()
data_quality = utils.validate_data_quality(
    df,
    PROJECT_CONFIG['target_regression'],
    PROJECT_CONFIG['target_classification'],
)

print(f"‚úÖ Dataset cargado: {df.shape}")
print(f"üìà Registros v√°lidos para regresi√≥n: {data_quality['regression_samples']}")
print(f"üìä Registros v√°lidos para clasificaci√≥n: {data_quality['classification_samples']}")
print(f"üéØ Features disponibles: {data_quality['total_features']}")
print(f"üìâ Completitud promedio: {data_quality['data_completeness']:.1%}")

# Warn, without stopping, when the quality score is below the 0.7 threshold.
if data_quality['quality_score'] < 0.7:
    print("‚ö†Ô∏è  Advertencia: Calidad de datos por debajo del umbral recomendado")


🔧 Configurando MLflow para tracking de experimentos...
✅ Experimento creado: Alzheimer_Multimodal_Monitoring_Phase4


## Inicializar pipelines de modelado

In [6]:
# Build one pipeline object per modelling family and record the run setup.
print("\nüîß INICIALIZACI√ìN DE PIPELINES")
print("-" * 40)

seed = PROJECT_CONFIG['random_state']

# NOTE(review): this run is closed when the `with` block exits, while the
# later cells start their runs with nested=True -- confirm whether those runs
# were meant to be children of this one.
with mlflow.start_run(run_name="master_pipeline_initialization"):
    for tag_name, tag_value in (
        ("phase", "model_development"),
        ("pipeline_type", "master_coordinator"),
    ):
        mlflow.set_tag(tag_name, tag_value)

    # Every pipeline receives the same random seed.
    regression_pipeline = RegressionPipeline(random_state=seed)
    classification_pipeline = ClassificationPipeline(random_state=seed)
    temporal_pipeline = TemporalModeling(random_state=seed)
    stratification_pipeline = RiskStratification(random_state=seed)
    ensemble_pipeline = EnsembleMethods(random_state=seed)

    # Project configuration goes in as params, data-quality values as metrics.
    mlflow.log_params(PROJECT_CONFIG)
    mlflow.log_metrics(data_quality)

    for pipeline_label in (
        "de Regresi√≥n",
        "de Clasificaci√≥n",
        "Temporal",
        "de Estratificaci√≥n",
        "de Ensemble",
    ):
        print(f"‚úÖ Pipeline {pipeline_label} inicializado")



📊 Realizando análisis preliminar del dataset...
🔍 INFORMACIÓN BÁSICA DEL DATASET:
  • Forma del dataset: (48466, 189)
  • Registros únicos: 48198
  • Memoria utilizada: 92.6 MB

🎯 VARIABLES OBJETIVO:
  • Continua: composite_risk_score
  • Categórica: risk_category

📈 DISTRIBUCIÓN - COMPOSITE_RISK_SCORE:
  • count: 48466.0000
  • mean: 0.3671
  • std: 0.2128
  • min: 0.0000
  • 25%: 0.1489
  • 50%: 0.3631
  • 75%: 0.5714
  • max: 0.9286

📊 DISTRIBUCIÓN - RISK_CATEGORY:
  • Low: 22,501 (46.4%)
  • Moderate: 22,345 (46.1%)
  • High: 3,620 (7.5%)


## 1. EJECUTAR MODELOS DE REGRESIÓN

In [None]:
# 1. Regression models: run the regression pipeline on the continuous target.
print("\nüéØ FASE 1: MODELOS DE REGRESI√ìN")
print("-" * 40)

with mlflow.start_run(run_name="regression_models_execution", nested=True):
    mlflow.set_tag("model_family", "regression")

    # Prepare the regression inputs from the full DataFrame.
    reg_target = PROJECT_CONFIG['target_regression']
    X_reg, y_reg = utils.prepare_regression_data(df, reg_target)

    # Run the whole regression pipeline; the result mapping is reused later.
    regression_results = regression_pipeline.run_full_pipeline(X_reg, y_reg)

    # Log the best model's metrics under a "best_regression_" prefix.
    best_model = regression_results['best_model']
    reg_best = regression_results['best_metrics']
    mlflow.log_metrics(
        {f'best_regression_{name}': reg_best[name] for name in ('r2', 'rmse', 'mae')}
    )

    print(f"üèÜ Mejor modelo de regresi√≥n: {best_model}")
    print(f"üìä R¬≤ Score: {reg_best['r2']:.4f}")
    print(f"üìä RMSE: {reg_best['rmse']:.4f}")


## 2. EJECUTAR MODELOS DE CLASIFICACIÓN

In [None]:
# 2. Classification models: run the classification pipeline on the categorical target.
print("\nüéØ FASE 2: MODELOS DE CLASIFICACI√ìN")
print("-" * 40)

with mlflow.start_run(run_name="classification_models_execution", nested=True):
    mlflow.set_tag("model_family", "classification")

    # Prepare the classification inputs from the full DataFrame.
    clf_target = PROJECT_CONFIG['target_classification']
    X_clf, y_clf = utils.prepare_classification_data(df, clf_target)

    # Run the whole classification pipeline; the result mapping is reused later.
    classification_results = classification_pipeline.run_full_pipeline(X_clf, y_clf)

    # Log the best model's metrics; each pair is
    # (suffix of the logged metric name, key in the pipeline's best_metrics).
    best_clf_model = classification_results['best_model']
    clf_best = classification_results['best_metrics']
    logged_to_source = (
        ('accuracy', 'accuracy'),
        ('f1', 'f1_macro'),
        ('precision', 'precision_macro'),
        ('recall', 'recall_macro'),
    )
    mlflow.log_metrics({
        f'best_classification_{suffix}': clf_best[source_key]
        for suffix, source_key in logged_to_source
    })

    print(f"üèÜ Mejor modelo de clasificaci√≥n: {best_clf_model}")
    print(f"üìä Accuracy: {clf_best['accuracy']:.4f}")
    print(f"üìä F1-Score: {clf_best['f1_macro']:.4f}")


## 3. EJECUTAR ANÁLISIS TEMPORAL

In [None]:
# 3. Temporal analysis: runs only when temporal-looking columns are present.
print("\nüéØ FASE 3: AN√ÅLISIS TEMPORAL")
print("-" * 40)

# Substrings that mark a column as temporal (matched case-insensitively).
TEMPORAL_KEYWORDS = ('time', 'date', 'sequence', 'visit')


def _looks_temporal(column_name):
    """Return True when the lower-cased column name contains a temporal keyword."""
    lowered = column_name.lower()
    return any(keyword in lowered for keyword in TEMPORAL_KEYWORDS)


with mlflow.start_run(run_name="temporal_analysis_execution", nested=True):
    mlflow.set_tag("model_family", "temporal")

    # Collect the candidate temporal columns, in DataFrame column order.
    temporal_features = [col for col in df.columns if _looks_temporal(col)]

    if not temporal_features:
        # Nothing to analyse: report it and skip the pipeline call.
        print("‚ö†Ô∏è  No se encontraron features temporales suficientes")
        print("üí° An√°lisis temporal omitido en esta iteraci√≥n")
    else:
        temporal_results = temporal_pipeline.run_temporal_analysis(df, temporal_features)

        # 'samples' is optional in the result mapping; 0 is logged when absent.
        mlflow.log_metrics({
            'temporal_features_count': len(temporal_features),
            'temporal_samples': temporal_results.get('samples', 0),
        })

        print(f"üìà Features temporales identificadas: {len(temporal_features)}")
        print("üìä An√°lisis temporal completado")


## 4. EJECUTAR ESTRATIFICACIÓN DE RIESGO

In [None]:
# 4. Risk stratification: run the stratification analysis on the regression target.
print("\nüéØ FASE 4: ESTRATIFICACI√ìN DE RIESGO")
print("-" * 40)

with mlflow.start_run(run_name="risk_stratification_execution", nested=True):
    mlflow.set_tag("model_family", "stratification")

    stratification_results = stratification_pipeline.run_stratification_analysis(
        df,
        PROJECT_CONFIG['target_regression'],
    )

    # Pull out the three reported values once; they are logged and printed below.
    n_phenotypes = stratification_results['n_phenotypes']
    silhouette = stratification_results['silhouette_score']
    n_risk_groups = stratification_results['n_risk_groups']

    mlflow.log_metrics({
        'phenotypes_identified': n_phenotypes,
        'stratification_quality': silhouette,
        'risk_groups': n_risk_groups,
    })

    print(f"üß¨ Fenotipos identificados: {n_phenotypes}")
    print(f"üìä Calidad de estratificaci√≥n: {silhouette:.3f}")
    print(f"üéØ Grupos de riesgo: {n_risk_groups}")


## 5. EJECUTAR MÉTODOS ENSEMBLE

In [None]:
# 5. Ensemble methods: combine the top regression and classification models.
print("\nüéØ FASE 5: M√âTODOS ENSEMBLE")
print("-" * 40)

with mlflow.start_run(run_name="ensemble_methods_execution", nested=True):
    mlflow.set_tag("model_family", "ensemble")

    # Use at most the first three entries of each pipeline's 'top_models'.
    base_models = {
        'regression': regression_results['top_models'][:3],
        'classification': classification_results['top_models'][:3],
    }

    # Regression ensemble, built from the inputs prepared in the regression cell.
    ensemble_reg_results = ensemble_pipeline.create_regression_ensemble(
        X_reg, y_reg, base_models['regression']
    )

    # Classification ensemble, built from the inputs prepared in the classification cell.
    ensemble_clf_results = ensemble_pipeline.create_classification_ensemble(
        X_clf, y_clf, base_models['classification']
    )

    # Log the ensemble scores and their reported improvements.
    mlflow.log_metrics({
        'ensemble_regression_r2': ensemble_reg_results['ensemble_score'],
        'ensemble_classification_f1': ensemble_clf_results['ensemble_score'],
        'ensemble_improvement_reg': ensemble_reg_results['improvement'],
        'ensemble_improvement_clf': ensemble_clf_results['improvement'],
    })

    print(f"üéØ Ensemble Regresi√≥n R¬≤: {ensemble_reg_results['ensemble_score']:.4f}")
    print(f"üéØ Ensemble Clasificaci√≥n F1: {ensemble_clf_results['ensemble_score']:.4f}")
    # The '+' format flag prints the real sign; a hard-coded "+" prefix would
    # render a negative improvement as "+-0.012".
    print(f"üìà Mejora Regresi√≥n: {ensemble_reg_results['improvement']:+.3f}")
    print(f"üìà Mejora Clasificaci√≥n: {ensemble_clf_results['improvement']:+.3f}")


## RESUMEN EJECUTIVO DE RESULTADOS

In [None]:
# Executive summary: gather the headline figures of every stage and print them.
print("\n" + "="*60)
print("üìä RESUMEN EJECUTIVO - DESARROLLO DE MODELOS")
print("="*60)

# Columns that are targets rather than features.
target_columns = {
    PROJECT_CONFIG['target_regression'],
    PROJECT_CONFIG['target_classification'],
}

# Summary values are pre-formatted strings or plain counts; this mapping is
# printed below and reused by the cell that saves the results.
final_results = {
    'Datos': {
        'Total_Registros': len(df),
        # Count of DataFrame columns that are not one of the two targets.
        'Features_Finales': sum(1 for col in df.columns if col not in target_columns),
        'Completitud_Datos': f"{data_quality['data_completeness']:.1%}",
        'Calidad_Score': f"{data_quality['quality_score']:.3f}"
    },
    'Modelos_Regresi√≥n': {
        'Mejor_Modelo': best_model,
        'R2_Score': f"{regression_results['best_metrics']['r2']:.4f}",
        'RMSE': f"{regression_results['best_metrics']['rmse']:.4f}",
        'Modelos_Evaluados': len(regression_results['all_results'])
    },
    'Modelos_Clasificaci√≥n': {
        'Mejor_Modelo': best_clf_model,
        'Accuracy': f"{classification_results['best_metrics']['accuracy']:.4f}",
        'F1_Score': f"{classification_results['best_metrics']['f1_macro']:.4f}",
        'Modelos_Evaluados': len(classification_results['all_results'])
    },
    'Estratificaci√≥n': {
        'Fenotipos_Identificados': stratification_results['n_phenotypes'],
        'Grupos_Riesgo': stratification_results['n_risk_groups'],
        'Calidad_Clustering': f"{stratification_results['silhouette_score']:.3f}"
    },
    'Ensemble': {
        # The '+' format flag keeps the true sign; a literal "+" prefix would
        # turn a negative improvement into "+-0.012".
        'Mejora_Regresi√≥n': f"{ensemble_reg_results['improvement']:+.3f}",
        'Mejora_Clasificaci√≥n': f"{ensemble_clf_results['improvement']:+.3f}",
        'Score_Final_R2': f"{ensemble_reg_results['ensemble_score']:.4f}",
        'Score_Final_F1': f"{ensemble_clf_results['ensemble_score']:.4f}"
    }
}

# Print each category, with underscores in names shown as spaces.
for category, metrics in final_results.items():
    print(f"\nüéØ {category.upper().replace('_', ' ')}:")
    for metric, value in metrics.items():
        print(f"   ‚Ä¢ {metric.replace('_', ' ')}: {value}")


## Guardar resultados y Generar reportes

In [None]:
# Persist the results: one JSON file with everything, one CSV with a per-pipeline summary.
print("\nüìÅ GUARDADO DE RESULTADOS Y REPORTES")
print("-" * 40)

# Imported here because this cell is the only place the notebook needs it.
import json

# Bundle every result object produced by the previous cells.
master_results = {
    'execution_timestamp': datetime.now().isoformat(),
    'project_config': PROJECT_CONFIG,
    'data_quality': data_quality,
    'regression_results': regression_results,
    'classification_results': classification_results,
    'stratification_results': stratification_results,
    'ensemble_results': {
        'regression': ensemble_reg_results,
        'classification': ensemble_clf_results
    },
    'final_summary': final_results
}

results_file = os.path.join(PROJECT_CONFIG['results_path'], 'master_results.json')

# Serialize completely BEFORE opening the file: if the conversion or the JSON
# encoding fails, no empty or truncated master_results.json is left behind.
# make_json_serializable is defined elsewhere; per the original comment it
# converts NumPy arrays to lists -- TODO confirm what else it handles.
serializable_results = utils.make_json_serializable(master_results)
json_payload = json.dumps(serializable_results, indent=2)
with open(results_file, 'w', encoding='utf-8') as f:
    f.write(json_payload)

print(f"‚úÖ Resultados guardados en: {results_file}")

# CSV report for later analysis: one row per pipeline.
# NOTE(review): 'Models_Evaluated' holds the phenotype count for the
# Stratification row and the number of base models for the ensemble rows.
results_df = pd.DataFrame({
    'Pipeline': ['Regression', 'Classification', 'Stratification', 'Ensemble_Reg', 'Ensemble_Clf'],
    'Best_Score': [
        regression_results['best_metrics']['r2'],
        classification_results['best_metrics']['f1_macro'],
        stratification_results['silhouette_score'],
        ensemble_reg_results['ensemble_score'],
        ensemble_clf_results['ensemble_score']
    ],
    'Models_Evaluated': [
        len(regression_results['all_results']),
        len(classification_results['all_results']),
        stratification_results['n_phenotypes'],
        len(base_models['regression']),
        len(base_models['classification'])
    ]
})

csv_file = os.path.join(PROJECT_CONFIG['results_path'], 'pipeline_summary.csv')
results_df.to_csv(csv_file, index=False)
print(f"‚úÖ Resumen CSV guardado en: {csv_file}")


## Estado final y Próximos pasos

In [None]:
# Final status report and pointers for the next phase.
RULE = "=" * 60
print("\n" + RULE)
print("üéâ FASE 4 COMPLETADA EXITOSAMENTE")
print(RULE)

# The temporal stage is reported as completed only when temporal columns were found.
if temporal_features:
    temporal_status = "Completado"
else:
    temporal_status = "Omitido (sin datos temporales)"

n_regression_models = len(regression_results['all_results'])
n_classification_models = len(classification_results['all_results'])

# NOTE(review): the ensemble, saved-results and MLflow entries are fixed text,
# not derived from the actual results.
completion_status = {
    '‚úÖ Modelos de Regresi√≥n': f"{n_regression_models} modelos evaluados",
    '‚úÖ Modelos de Clasificaci√≥n': f"{n_classification_models} modelos evaluados",
    '‚úÖ An√°lisis Temporal': temporal_status,
    '‚úÖ Estratificaci√≥n de Riesgo': f"{stratification_results['n_phenotypes']} fenotipos identificados",
    '‚úÖ M√©todos Ensemble': "Mejoras significativas obtenidas",
    '‚úÖ Resultados Guardados': "JSON y CSV generados",
    '‚úÖ Experimentos MLflow': "Todos registrados correctamente",
}

print("\nüìã ESTADO DE COMPLETITUD:")
for status, detail in completion_status.items():
    print(f"   {status}: {detail}")

print("\nüîÑ PR√ìXIMA FASE: Evaluaci√≥n y Optimizaci√≥n")
print(f"üìÇ Resultados disponibles en: {PROJECT_CONFIG['results_path']}")
print("üî¨ Experimentos MLflow: alzheimer_model_development_master")

print("\nüí° RECOMENDACIONES PARA FASE 5:")
for recommendation in (
    "Revisar m√©tricas de validaci√≥n cruzada",
    "Optimizar hiperpar√°metros de mejores modelos",
    "Evaluar explicabilidad de modelos",
    "Validar en conjunto de prueba independiente",
):
    print(f"   ‚Ä¢ {recommendation}")

print("\nüöÄ ¬°Listo para continuar con la Fase 5!")

---

__Abraham Tartalos__