# 04c - Classification Models Development

**Objetivo**: Desarrollar modelos de clasificación para predecir categorías de riesgo de Alzheimer (Low, Moderate, High)
 
**Target Variable**: `risk_category`
**Clases**: Low (46.4%), Moderate (46.1%), High (7.5%)
 
**Modelos a desarrollar**:
- Logistic Regression (baseline)
- Random Forest Classifier
- Gradient Boosting (XGBoost, LightGBM)
- Support Vector Machine
- Neural Network (MLP)

---

## Importar librerías

In [8]:
import sys
import os
sys.path.append('../src/modeling')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import mlflow
import mlflow.sklearn
import warnings
warnings.filterwarnings('ignore')

import model_utils
# Importar scripts personalizados
from classification_pipeline import ClassificationPipeline
from ensemble_methods import AlzheimerEnsemble

In [9]:
# Plot styling: select matplotlib's 'seaborn-v0_8' style sheet, then set
# seaborn's default colour palette to "husl".
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")


In [10]:
# Report that the imports above ran, and print the time of this execution.
run_timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
print("‚úÖ Librer√≠as y scripts importados correctamente")
print(f"üìÖ Fecha de ejecuci√≥n: {run_timestamp}")


‚úÖ Librer√≠as y scripts importados correctamente
üìÖ Fecha de ejecuci√≥n: 2025-06-22 18:18:38


## Cargar datos

In [13]:
# Load the processed feature table and report how the target classes are split.
features_path = '../data/processed/features/alzheimer_features_selected_20250621.csv'

try:
    df = pd.read_csv(features_path)
except FileNotFoundError:
    # Missing file: only a hint is printed, so `df` is left undefined.
    print("‚ùå Error: Archivo de features no encontrado")
    print("üí° Ejecuta primero el notebook 03_feature_engineering_master.ipynb")
else:
    print(f"üìä Dataset cargado: {df.shape}")

    # Check that the target variable is present before summarising it
    if 'risk_category' not in df.columns:
        print("‚ùå Error: Variable target 'risk_category' no encontrada")
    else:
        print(f"üéØ Distribuci√≥n de clases:")
        class_dist = df['risk_category'].value_counts()
        print(class_dist)
        # Same counts expressed as a percentage of all rows, one decimal place
        print(f"üìä Porcentajes:")
        print((class_dist / len(df) * 100).round(1))


üìä Dataset cargado: (48466, 186)
üéØ Distribuci√≥n de clases:
risk_category
Low         22501
Moderate    22345
High         3620
Name: count, dtype: int64
üìä Porcentajes:
risk_category
Low         46.4
Moderate    46.1
High         7.5
Name: count, dtype: float64


## Inicializar Pipeline de Clasificación

In [16]:
# Instantiate the project's classification pipeline and ensemble helper
# (both imported from ../src/modeling) with no constructor arguments.
classification_pipeline = ClassificationPipeline()
ensemble_methods = AlzheimerEnsemble()

## Ejecutar Pipeline

In [17]:
# Run the complete pipeline on the loaded frame, with `risk_category` as target.
# NOTE(review): the output recorded for this cell stops with
# "TypeError: Cannot convert [...] to numeric", and the values it lists are
# strings ('Female', 'S_122', date strings). `df` therefore appears to still
# contain non-numeric columns when it reaches the pipeline - confirm whether
# `run_pipeline` is expected to encode or drop them.
print("üöÄ Ejecutando pipeline completo de clasificaci√≥n...")
pipeline_results = classification_pipeline.run_pipeline(df, target_col='risk_category')

# Pull out the entries of the result dict that the following cells read
results = pipeline_results['results']
cv_results = pipeline_results['cv_results']
trained_models = pipeline_results['trained_models']
best_model_name = pipeline_results['best_model']

print(f"üèÜ Mejor modelo seleccionado: {best_model_name}")


üöÄ Ejecutando pipeline completo de clasificaci√≥n...
üöÄ Iniciando pipeline de clasificaci√≥n...


TypeError: Cannot convert [['Female' 'Female' 'Female' ... 'Male' 'Male' 'Male']
 [nan '2005-11-01 00:00:00' '2005-11-01 00:00:00' ... '2006-10-17'
  '2006-10-17' '2006-10-17']
 [nan 'S_122' 'S_122' ... 'S_065' 'S_065' 'S_065']
 ...
 ['Superior' 'B√°sica' 'B√°sica' ... 'B√°sica' 'B√°sica' 'B√°sica']
 [nan '2005-11-01 00:00:00' '2005-11-01 00:00:00' ... '2006-10-17'
  '2006-10-17' '2006-10-17']
 [nan 'F' 'F' ... 'M' 'M' 'M']] to numeric

## Recopilar métricas de rendimiento

In [None]:
# Keep one headline number per model: its weighted F1 score
performance_comparison = {
    model_name: model_metrics['f1_weighted']
    for model_name, model_metrics in results.items()
}

## Ensamblaje de modelos

In [None]:
# 1. Model ensembling, recorded as a single MLflow run
with mlflow.start_run(run_name="ensemble_classification"):
    for tag_name, tag_value in (
        ("model_family", "ensemble"),
        ("model_type", "classification"),
    ):
        mlflow.set_tag(tag_name, tag_value)

    print("üöÄ Creando modelos ensemble...")

    # Trained models handed to both ensembles (a missing key raises KeyError)
    base_model_names = ('logistic_regression', 'random_forest', 'gradient_boosting', 'svm')
    best_models = {
        model_name: trained_models[model_name] for model_name in base_model_names
    }

    # Rebuild the complete dataset by concatenating the train and test splits
    X_train, X_test, y_train, y_test = classification_pipeline.prepare_data(df)
    X_full, y_full = pd.concat([X_train, X_test]), pd.concat([y_train, y_test])

    # Both ensembles are built with the same number of CV folds
    ensemble_kwargs = {'cv_folds': 5}

    # Voting classifier
    voting_results = ensemble_methods.create_voting_ensemble(
        best_models, X_full, y_full, **ensemble_kwargs
    )

    # Stacking classifier
    stacking_results = ensemble_methods.create_stacking_classifier(
        best_models, X_full, y_full, **ensemble_kwargs
    )

    voting_scores = voting_results['cv_scores']
    stacking_scores = stacking_results['cv_scores']

    # Log the metrics first, then the fitted ensemble models
    mlflow.log_metrics({
        'voting_f1_weighted': voting_scores['f1_weighted'],
        'voting_accuracy': voting_scores['accuracy'],
        'stacking_f1_weighted': stacking_scores['f1_weighted'],
        'stacking_accuracy': stacking_scores['accuracy'],
    })

    for ensemble_result, artifact_name in (
        (voting_results, "voting_classifier"),
        (stacking_results, "stacking_classifier"),
    ):
        mlflow.sklearn.log_model(ensemble_result['model'], artifact_name)

    # Add the ensembles to the per-model comparison
    performance_comparison['Voting Ensemble'] = voting_scores['f1_weighted']
    performance_comparison['Stacking Ensemble'] = stacking_scores['f1_weighted']


## Análisis de Importancia de Características (Feature Importance)

In [None]:
# Feature-importance analysis (for tree-based models): only classifiers that
# expose `feature_importances_` contribute.
print("\nüîç AN√ÅLISIS DE IMPORTANCIA DE FEATURES")
print("=" * 50)

# Feature names: every column of df except the two excluded ones
exclude_cols = ['composite_risk_score', 'risk_category']
feature_cols = [col for col in df.columns if col not in exclude_cols]

# One column per contributing model; rows are indexed by feature name
combined_importance = pd.DataFrame()

for name, model in trained_models.items():
    classifier = model.named_steps['classifier']
    if not hasattr(classifier, 'feature_importances_'):
        continue
    importance = model_utils.get_feature_importance(classifier, feature_cols)
    combined_importance[name] = importance.set_index('feature')['importance']

has_importances = not combined_importance.empty
if has_importances:
    # Average across the contributing models and keep the 15 largest
    combined_importance['Mean_Importance'] = combined_importance.mean(axis=1)
    top_features = combined_importance.nlargest(15, 'Mean_Importance')

    # Text report
    print("üéØ Top 15 Features m√°s importantes:")
    print(top_features[['Mean_Importance']].round(4))

    # Horizontal bar chart of the same ranking
    plt.figure(figsize=(10, 8))
    mean_importance = top_features['Mean_Importance']
    mean_importance.plot(kind='barh', color='lightgreen')
    plt.title('Top 15 Features - Importancia Promedio')
    plt.xlabel('Importancia Promedio')
    plt.tight_layout()
    plt.show()

## Comparación Final de Modelos (agregando los ensembles)

In [None]:
separator = "=" * 60
print("\n" + separator)
print("üìä COMPARACI√ìN FINAL DE MODELOS")
print(separator)

# Comparison table: one row per model, highest weighted F1 first
comparison_rows = [
    {'Model': model_name, 'F1_Score_Weighted': weighted_f1}
    for model_name, weighted_f1 in performance_comparison.items()
]
comparison_df = pd.DataFrame(comparison_rows)
comparison_df = comparison_df.sort_values('F1_Score_Weighted', ascending=False)

print(comparison_df.to_string(index=False))

# Bar chart of the same ranking
plt.figure(figsize=(12, 6))
model_labels = comparison_df['Model']
f1_values = comparison_df['F1_Score_Weighted']
bars = plt.bar(model_labels, f1_values, color='lightcoral', alpha=0.8)
plt.title('Comparaci√≥n de Rendimiento - Modelos de Clasificaci√≥n')
plt.xlabel('Modelo')
plt.ylabel('F1-Score Weighted')
plt.xticks(rotation=45, ha='right')
plt.grid(axis='y', alpha=0.3)

# Write each score just above the top of its bar, centred horizontally
label_offset = 0.005
for rect, value in zip(bars, f1_values):
    x_center = rect.get_x() + rect.get_width()/2
    plt.text(x_center, rect.get_height() + label_offset,
             f'{value:.4f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

## Resumen final y Recomendaciones

In [None]:
# Final summary and recommendations.
# Reads `comparison_df`, `performance_comparison` and `df` from earlier cells;
# defines `best_model`, `best_score`, `lr_performance` and `imbalance_ratio`.
best_model = comparison_df.iloc[0]['Model']
best_score = comparison_df.iloc[0]['F1_Score_Weighted']

# Baseline = logistic regression. `lr_performance` was referenced here without
# ever being assigned, which raised NameError. The key is the one used to index
# `trained_models`; assumes `results` is keyed the same way - TODO confirm.
lr_performance = performance_comparison.get('logistic_regression')

# `imbalance_ratio` was also never assigned. It is computed as the largest
# class count divided by the smallest class count of the target column.
class_counts = df['risk_category'].value_counts()
imbalance_ratio = float(class_counts.max() / class_counts.min())

# Relative gain over the baseline; shown as "N/D" when the baseline score is
# missing or zero, because the percentage cannot be computed then.
if lr_performance:
    improvement_text = f"{((best_score - lr_performance) / lr_performance * 100):.1f}%"
else:
    improvement_text = "N/D"

print("\n" + "="*60)
print("üéØ RESUMEN FINAL - MODELOS DE CLASIFICACI√ìN")
print("="*60)
print(f"üèÜ Mejor modelo: {best_model}")
print(f"üìä F1-Score Weighted: {best_score:.4f}")
print(f"üìà Mejora sobre baseline: {improvement_text}")
print(f"‚öñÔ∏è Desbalanceamiento de clases: {imbalance_ratio:.2f}")
print(f"üéØ Total de modelos entrenados: {len(performance_comparison)}")

# Recommendation tier chosen from the best weighted F1 score
print("\nüí° RECOMENDACIONES:")
if best_score > 0.85:
    print("‚úÖ Excelente rendimiento alcanzado")
elif best_score > 0.75:
    print("‚ö†Ô∏è Buen rendimiento, considerar optimizaci√≥n en Fase 5")
else:
    print("‚ùå Rendimiento bajo, requiere optimizaci√≥n en Fase 5")

# Extra hint when the largest class is more than 3x the smallest
if imbalance_ratio > 3:
    print("‚öñÔ∏è Considerar t√©cnicas de balanceamiento adicionales")

print("\nüîÑ Listo para Fase 5: Evaluaci√≥n y Optimizaci√≥n")
print("üìÅ Todos los modelos guardados en MLflow")



## Guardado de Archivos Importantes

In [None]:
# Save the final metrics as a one-column CSV summary.
# Reads `best_model`, `best_score`, `performance_comparison`, `df`, `X_full`
# and `feature_cols` from earlier cells.

# The baseline score and imbalance ratio are computed in this cell so that it
# does not rely on names that are never assigned in the notebook.
# Baseline = logistic regression; assumes `performance_comparison` uses the
# same key as `trained_models` - TODO confirm.
lr_performance = performance_comparison.get('logistic_regression')
class_counts = df['risk_category'].value_counts()
imbalance_ratio = float(class_counts.max() / class_counts.min())

# Percentage gain over the baseline; None when the baseline is missing or zero
if lr_performance:
    improvement_percentage = (best_score - lr_performance) / lr_performance * 100
else:
    improvement_percentage = None

final_metrics = {
    'best_model': best_model,
    'best_f1_score': best_score,
    'baseline_f1_score': lr_performance,
    'improvement_percentage': improvement_percentage,
    'models_trained': len(performance_comparison),
    'imbalance_ratio': imbalance_ratio,
    # `X` was never defined; `X_full` (train + test rows from prepare_data) is
    # the closest existing name. NOTE(review): use X_train instead if only the
    # training split is meant.
    'training_samples': len(X_full),
    'features_used': len(feature_cols)
}

# Write to file, creating the report directory first so to_csv cannot fail on
# a missing folder.
summary_path = '../reports/model_results/classification_summary.csv'
os.makedirs(os.path.dirname(summary_path), exist_ok=True)
pd.Series(final_metrics).to_csv(summary_path)
print("üìÅ M√©tricas guardadas en: ../reports/model_results/classification_summary.csv")

---

__Abraham Tartalos__