# 10c - Modelos de Clasificación para Categorías de Riesgo

**Objetivo:** Desarrollar modelos de clasificación para predecir `risk_category` (Low/Moderate/High)
 
**Enfoque:** Desarrollo de modelos base con configuraciones estándar
**Evaluación y optimización:** Se realizará en la siguiente fase

---

## Importar librerías

In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report
import xgboost as xgb
import lightgbm as lgb

# MLflow
import mlflow
import mlflow.sklearn
import mlflow.xgboost
import mlflow.lightgbm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Utils
import joblib
import os
import json

In [None]:
# MLflow configuration: set the experiment that the runs started below are logged to
experiment_name = "alzheimer_classification_models"
mlflow.set_experiment(experiment_name)

# Echo the experiment name and the tracking URI currently in effect
print(f"üî¨ Experimento MLflow: {experiment_name}")
print(f"üìä Tracking URI: {mlflow.get_tracking_uri()}")

## Cargar datos

In [None]:
# Input and output locations, expressed relative to the working directory
DATA_PATH = "../data/processed/"
MODELS_PATH = "../models/classification/"
RESULTS_PATH = "../results/classification/"

# Both output folders must exist before models, figures or CSVs are written
for output_dir in (MODELS_PATH, RESULTS_PATH):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Load the processed feature table from DATA_PATH
print("üìÅ Cargando dataset...")
df = pd.read_csv(f"{DATA_PATH}alzheimer_features_final.csv")

print(f"üìä Dataset: {df.shape}")
print(f"üéØ Target: risk_category")

# %%
# Class distribution of the categorical target
target_col = 'risk_category'
target_counts = df[target_col].value_counts()

print("üéØ AN√ÅLISIS DEL TARGET CATEG√ìRICO")
print("="*40)
print(f"üìä Distribuci√≥n de {target_col}:")
for category, count in target_counts.items():
    # Percentage is relative to every row of df, i.e. len(df) is the denominator
    pct = (count / len(df)) * 100
    print(f"   {category}: {count:,} ({pct:.1f}%)")

print(f"\nüìã Clases disponibles: {df[target_col].unique()}")
print(f"üî¢ N√∫mero de clases: {df[target_col].nunique()}")


In [None]:
# Plot the class distribution: absolute counts (left) and percentage share (right)
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart of the per-class counts
target_counts.plot(kind='bar', ax=axes[0], color='skyblue', alpha=0.8)
axes[0].set_title('Distribuci√≥n de Categor√≠as de Riesgo')
axes[0].set_xlabel('Categor√≠a de Riesgo')
axes[0].set_ylabel('Frecuencia')
axes[0].tick_params(axis='x', rotation=45)

# Pie chart of the per-class share, normalised by the sum of the counts
target_pct = (target_counts / target_counts.sum()) * 100
target_pct.plot(kind='pie', ax=axes[1], autopct='%1.1f%%', startangle=90)
axes[1].set_title('Distribuci√≥n Porcentual')
axes[1].set_ylabel('')

# Save the figure to RESULTS_PATH before showing it
plt.tight_layout()
plt.savefig(f"{RESULTS_PATH}target_distribution.png", dpi=300, bbox_inches='tight')
plt.show()

## Preparación de datos

In [None]:
print("üîß PREPARACI√ìN DE DATOS")
print("="*30)

# Features and target: every column except the target and 'composite_risk_score'
# NOTE(review): presumably 'composite_risk_score' is excluded because the
# category is derived from it - confirm against the feature-engineering step.
feature_cols = [col for col in df.columns if col not in [target_col, 'composite_risk_score']]
X = df[feature_cols].copy()
y = df[target_col].copy()

# Drop rows whose target is missing
valid_mask = y.notna()
X = X[valid_mask]
y = y[valid_mask]

print(f"üìä Datos finales:")
print(f"   Samples: {X.shape[0]:,}")
print(f"   Features: {X.shape[1]:,}")
print(f"   Clases: {y.nunique()}")

# %%
# Encode the class labels as integers; `le` keeps the label <-> integer mapping
le = LabelEncoder()
y_encoded = le.fit_transform(y)

print(f"üî§ Encoding de clases:")
for i, class_name in enumerate(le.classes_):
    print(f"   {class_name} ‚Üí {i}")

# %%
# Manejo de valores faltantes
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy='median')
X_imputed = pd.DataFrame(
    imputer.fit_transform(X), 
    columns=X.columns, 
    index=X.index
)

print(f"‚úÖ Missing values imputados")

In [None]:
# Divisi√≥n estratificada de datos
X_train, X_test, y_train, y_test = train_test_split(
    X_imputed, y_encoded, 
    test_size=0.2, 
    random_state=42,
    stratify=y_encoded
)

X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train
)

print(f"üìä Divisi√≥n estratificada:")
print(f"   Train: {X_train.shape[0]:,}")
print(f"   Validation: {X_val.shape[0]:,}")
print(f"   Test: {X_test.shape[0]:,}")

# Verificar distribuci√≥n en cada split
for name, y_split in [('Train', y_train), ('Val', y_val), ('Test', y_test)]:
    unique, counts = np.unique(y_split, return_counts=True)
    dist = (counts / counts.sum()) * 100
    print(f"   {name}: {dist}")

In [None]:
# Feature scaling: the scaler is fitted on the training split only and the
# same fitted transform is then applied to train, validation and test.
scaler = StandardScaler()
scaler.fit(X_train)

# Wrap each transformed array back into a DataFrame so that column names and
# row indices of the unscaled frames are preserved.
X_train_scaled, X_val_scaled, X_test_scaled = (
    pd.DataFrame(scaler.transform(frame), columns=frame.columns, index=frame.index)
    for frame in (X_train, X_val, X_test)
)

print("‚öñÔ∏è Escalado completado")

## Definición de modelos de clasificación

In [None]:
# Registry of baseline classifiers.
# Each entry maps a model key to:
#   'model'       - the estimator instance, created here and not yet fitted
#   'scaled'      - True when the training loop should feed the standardized
#                   features (X_*_scaled) instead of the unscaled ones
#   'description' - human-readable label for the model
print("ü§ñ DEFINICI√ìN DE MODELOS")
print("="*30)

models = {
    'logistic_regression': {
        'model': LogisticRegression(random_state=42, max_iter=1000),
        'scaled': True,
        'description': 'Regresi√≥n Log√≠stica'
    },
    
    'random_forest': {
        'model': RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            random_state=42,
            n_jobs=-1
        ),
        'scaled': False,
        'description': 'Random Forest'
    },
    
    'extra_trees': {
        'model': ExtraTreesClassifier(
            n_estimators=100,
            max_depth=10,
            min_samples_split=5,
            random_state=42,
            n_jobs=-1
        ),
        'scaled': False,
        'description': 'Extra Trees'
    },
    
    'gradient_boosting': {
        'model': GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        ),
        'scaled': False,
        'description': 'Gradient Boosting'
    },
    
    'xgboost': {
        'model': xgb.XGBClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42,
            n_jobs=-1
        ),
        'scaled': False,
        'description': 'XGBoost'
    },
    
    'lightgbm': {
        'model': lgb.LGBMClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42,
            n_jobs=-1,
            verbose=-1
        ),
        'scaled': False,
        'description': 'LightGBM'
    },
    
    'svm': {
        'model': SVC(kernel='rbf', random_state=42, probability=True),
        'scaled': True,
        'description': 'Support Vector Machine'
    },
    
    'knn': {
        'model': KNeighborsClassifier(n_neighbors=5, weights='distance'),
        'scaled': True,
        'description': 'K-Nearest Neighbors'
    },
    
    'naive_bayes': {
        'model': GaussianNB(),
        'scaled': True,
        'description': 'Naive Bayes'
    },
    
    'decision_tree': {
        'model': DecisionTreeClassifier(
            max_depth=10,
            min_samples_split=5,
            random_state=42
        ),
        'scaled': False,
        'description': 'Decision Tree'
    },
    
    'mlp': {
        'model': MLPClassifier(
            hidden_layer_sizes=(100, 50),
            learning_rate_init=0.001,
            max_iter=500,
            random_state=42,
            early_stopping=True
        ),
        'scaled': True,
        'description': 'Multi-Layer Perceptron'
    }
}

print(f"üîß {len(models)} modelos definidos")

In [None]:
# Funci√≥n de evaluaci√≥n
def evaluate_classifier(model, X_train, X_val, y_train, y_val, model_name):
    """Fit a classifier and score it on the training and validation data.

    The estimator is fitted in place on ``(X_train, y_train)``. Accuracy and
    weighted-average precision / recall / F1 are computed on both splits, and
    a 5-fold stratified cross-validation accuracy is computed on the
    training data.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.
        model_name: Identifier of the model; not used inside this function.

    Returns:
        A tuple ``(metrics, y_train_pred, y_val_pred)``. ``metrics`` is a dict
        with ``train_*`` / ``val_*`` entries for accuracy, precision, recall
        and f1, plus ``cv_accuracy_mean`` and ``cv_accuracy_std``.
    """
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_val_pred = model.predict(X_val)

    # Score both splits with the same set of statistics
    per_split = {}
    for split, y_true, y_pred in (
        ('train', y_train, y_train_pred),
        ('val', y_val, y_val_pred),
    ):
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted'
        )
        per_split[split] = {
            'accuracy': accuracy_score(y_true, y_pred),
            'precision': precision,
            'recall': recall,
            'f1': f1,
        }

    # Stratified 5-fold accuracy on the training data
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_accuracy = cross_val_score(
        model, X_train, y_train, cv=folds, scoring='accuracy', n_jobs=-1
    )

    # Keys are emitted statistic by statistic, train before val, because the
    # insertion order determines the column order of the results table.
    metrics = {
        f'{split}_{stat}': per_split[split][stat]
        for stat in ('accuracy', 'precision', 'recall', 'f1')
        for split in ('train', 'val')
    }
    metrics['cv_accuracy_mean'] = fold_accuracy.mean()
    metrics['cv_accuracy_std'] = fold_accuracy.std()

    return metrics, y_train_pred, y_val_pred

## Entrenamiento de modelos

In [None]:
# Train every registered model inside its own MLflow run.
# A model that fails is skipped (best effort), but the failure is made visible:
# the full traceback is printed and the MLflow run is tagged as failed, so it
# cannot be mistaken for a successful run that simply logged no metrics.
import traceback

print("üöÄ ENTRENAMIENTO DE MODELOS")
print("="*30)

results = {}            # model name -> metrics dict from evaluate_classifier
all_predictions = {}    # model name -> train / validation predictions

for model_name, config in models.items():
    print(f"\nüîÑ {model_name}...")
    
    with mlflow.start_run(run_name=model_name):
        # Pick the standardized or the unscaled features, per the registry flag
        if config['scaled']:
            X_train_model = X_train_scaled
            X_val_model = X_val_scaled
        else:
            X_train_model = X_train
            X_val_model = X_val
            
        try:
            # Fit and evaluate (the estimator in the registry is fitted in place)
            metrics, y_train_pred, y_val_pred = evaluate_classifier(
                config['model'], X_train_model, X_val_model,
                y_train, y_val, model_name
            )
            
            # Keep metrics and predictions for the comparison cells
            results[model_name] = metrics
            all_predictions[model_name] = {
                'train_pred': y_train_pred,
                'val_pred': y_val_pred
            }
            
            # MLflow logging
            mlflow.log_param("model_type", model_name)
            mlflow.log_param("scaled_data", config['scaled'])
            mlflow.log_param("n_classes", len(le.classes_))
            mlflow.log_param("n_features", X_train_model.shape[1])
            
            for metric_name, value in metrics.items():
                mlflow.log_metric(metric_name, value)
            
            # Log the fitted model with the flavour matching its library
            if model_name == 'xgboost':
                mlflow.xgboost.log_model(config['model'], f"model_{model_name}")
            elif model_name == 'lightgbm':
                mlflow.lightgbm.log_model(config['model'], f"model_{model_name}")
            else:
                mlflow.sklearn.log_model(config['model'], f"model_{model_name}")
            
            # Local copy of the fitted model
            joblib.dump(config['model'], f"{MODELS_PATH}{model_name}_classifier.pkl")
            
            print(f"   ‚úÖ Accuracy: {metrics['val_accuracy']:.4f}")
            
        except Exception as e:
            # Deliberately broad: one failing model must not abort the sweep
            print(f"   ‚ùå Error: {str(e)}")
            traceback.print_exc()
            mlflow.set_tag("training_status", "failed")
            mlflow.set_tag("error", f"{type(e).__name__}: {e}"[:500])

print(f"\nüéØ Entrenamiento completado: {len(results)} modelos")

## Comparación de resultados

In [None]:
# Compare the trained models: one row per model, best validation accuracy first
print("üìä COMPARACI√ìN DE RESULTADOS")
print("="*30)

results_df = pd.DataFrame(results).T.sort_values('val_accuracy', ascending=False)

# Print the top 8 models of the ranking
print("üèÜ Ranking por Accuracy de validaci√≥n:")
print("-" * 60)
for i, (model, row) in enumerate(results_df.head(8).iterrows(), 1):
    print(f"{i}. {model:20s} | Acc: {row['val_accuracy']:.4f} | F1: {row['val_f1']:.4f}")

# Save the full comparison table
results_df.to_csv(f"{RESULTS_PATH}classification_models_comparison.csv")

# %%
# 2x2 comparison figure, restricted to the 8 best models by validation accuracy
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

models_top = results_df.index[:8]

# Top-left: train vs validation accuracy, as grouped bars
train_acc = results_df.loc[models_top, 'train_accuracy']
val_acc = results_df.loc[models_top, 'val_accuracy']

x = np.arange(len(models_top))
width = 0.35

axes[0,0].bar(x - width/2, train_acc, width, label='Train', alpha=0.8)
axes[0,0].bar(x + width/2, val_acc, width, label='Validation', alpha=0.8)
axes[0,0].set_title('Accuracy Comparison')
axes[0,0].set_xticks(x)
axes[0,0].set_xticklabels(models_top, rotation=45, ha='right')
axes[0,0].legend()
axes[0,0].grid(True, alpha=0.3)

# Top-right: train vs validation weighted F1, as grouped bars
train_f1 = results_df.loc[models_top, 'train_f1']
val_f1 = results_df.loc[models_top, 'val_f1']

axes[0,1].bar(x - width/2, train_f1, width, label='Train', alpha=0.8)
axes[0,1].bar(x + width/2, val_f1, width, label='Validation', alpha=0.8)
axes[0,1].set_title('F1 Score Comparison')
axes[0,1].set_xticks(x)
axes[0,1].set_xticklabels(models_top, rotation=45, ha='right')
axes[0,1].legend()
axes[0,1].grid(True, alpha=0.3)

# Bottom-left: train-minus-validation accuracy gap.
# Colour thresholds: red above 0.10, orange above 0.05, green otherwise.
overfit_score = results_df.loc[models_top, 'train_accuracy'] - results_df.loc[models_top, 'val_accuracy']
colors = ['red' if x > 0.1 else 'orange' if x > 0.05 else 'green' for x in overfit_score]

axes[1,0].bar(models_top, overfit_score, color=colors, alpha=0.7)
axes[1,0].set_title('Overfitting Analysis')
axes[1,0].set_ylabel('Train Acc - Val Acc')
axes[1,0].tick_params(axis='x', rotation=45)
axes[1,0].grid(True, alpha=0.3)

# Bottom-right: mean cross-validation accuracy with its standard deviation as error bars
cv_means = results_df.loc[models_top, 'cv_accuracy_mean']
cv_stds = results_df.loc[models_top, 'cv_accuracy_std']

axes[1,1].bar(models_top, cv_means, yerr=cv_stds, capsize=5, alpha=0.7)
axes[1,1].set_title('Cross-Validation Accuracy')
axes[1,1].tick_params(axis='x', rotation=45)
axes[1,1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig(f"{RESULTS_PATH}models_comparison.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Confusion matrix of the best model (first row of results_df, which is
# sorted by validation accuracy in descending order)
best_model_name = results_df.index[0]
best_predictions = all_predictions[best_model_name]

print(f"üèÜ MEJOR MODELO: {best_model_name}")

from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix on the validation split
cm = confusion_matrix(y_val, best_predictions['val_pred'])

# Axes are labelled with the original class names stored in the label encoder
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.title(f'Matriz de Confusi√≥n - {best_model_name}')
plt.xlabel('Predicci√≥n')
plt.ylabel('Real')
plt.savefig(f"{RESULTS_PATH}confusion_matrix_{best_model_name}.png", dpi=300, bbox_inches='tight')
plt.show()

## Feature importance (si disponible)

In [None]:
# Feature importance of the best model; skipped when the fitted estimator
# does not expose `feature_importances_`
if hasattr(models[best_model_name]['model'], 'feature_importances_'):
    print(f"üåü FEATURE IMPORTANCE - {best_model_name}")
    
    importances = models[best_model_name]['model'].feature_importances_
    # The scaled frame is built with X_train's columns, so both branches give the same names
    feature_names = X_train.columns if not models[best_model_name]['scaled'] else X_train_scaled.columns
    
    # One row per feature, most important first
    importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    # Top 15 features
    top_features = importance_df.head(15)
    
    plt.figure(figsize=(10, 8))
    sns.barplot(data=top_features, x='importance', y='feature')
    plt.title(f'Top 15 Features - {best_model_name}')
    plt.tight_layout()
    plt.savefig(f"{RESULTS_PATH}feature_importance_{best_model_name}.png", dpi=300, bbox_inches='tight')
    plt.show()
    
    # The CSV holds the full ranking, not only the plotted top 15
    importance_df.to_csv(f"{RESULTS_PATH}feature_importance_{best_model_name}.csv", index=False)


## Guardado de configuración y resultados

In [None]:
# Persist preprocessing objects, split configuration and per-model predictions
print("üíæ GUARDADO DE RESULTADOS")

# Save every fitted preprocessing object (imputer, scaler, label encoder) so
# that the same transformations can be re-applied to new or held-out data
joblib.dump(scaler, f"{MODELS_PATH}scaler_classification.pkl")
joblib.dump(le, f"{MODELS_PATH}label_encoder.pkl")
joblib.dump(imputer, f"{MODELS_PATH}imputer_classification.pkl")

# Split configuration: row indices of each split plus feature/target metadata
split_config = {
    'train_indices': X_train.index.tolist(),
    'val_indices': X_val.index.tolist(),
    'test_indices': X_test.index.tolist(),
    'feature_columns': X_train.columns.tolist(),
    'target_column': target_col,
    'classes': le.classes_.tolist(),
    'best_model': best_model_name
}

with open(f"{RESULTS_PATH}classification_split_config.json", 'w', encoding='utf-8') as f:
    json.dump(split_config, f, indent=2)

# Predictions.
# The train and validation arrays have different lengths, so they cannot be
# placed side by side as plain arrays. Each column is wrapped in a Series
# indexed by the rows of its own split; pandas then aligns the columns on the
# row index and leaves a cell empty where a row does not belong to that split.
# The nullable 'Int64' dtype keeps the encoded labels as integers despite the
# missing cells.
prediction_columns = {
    'y_train_true': pd.Series(y_train, index=X_train.index, dtype='Int64'),
    'y_val_true': pd.Series(y_val, index=X_val.index, dtype='Int64'),
}

for model_name, preds in all_predictions.items():
    prediction_columns[f'{model_name}_train_pred'] = pd.Series(
        preds['train_pred'], index=X_train.index, dtype='Int64'
    )
    prediction_columns[f'{model_name}_val_pred'] = pd.Series(
        preds['val_pred'], index=X_val.index, dtype='Int64'
    )

pred_df = pd.DataFrame(prediction_columns)
pred_df.index.name = 'sample_index'

pred_df.to_csv(f"{RESULTS_PATH}classification_predictions.csv")

print("‚úÖ Guardado completado")

## Resumen final

In [None]:
# Final summary: dataset size, best model and the artefacts written above
print("üéØ RESUMEN FINAL - CLASIFICACI√ìN")
print("="*40)

# Dataset figures come from X, i.e. all rows with a non-missing target (before splitting)
print(f"üìä Dataset:")
print(f"   ‚Ä¢ Samples: {len(X):,}")
print(f"   ‚Ä¢ Features: {X.shape[1]:,}")
print(f"   ‚Ä¢ Clases: {len(le.classes_)} ({', '.join(le.classes_)})")

# Metrics of the best model, read from the comparison table
print(f"\nü§ñ Modelos evaluados: {len(results)}")
print(f"üèÜ Mejor modelo: {best_model_name}")
best_metrics = results_df.loc[best_model_name]
print(f"   ‚Ä¢ Accuracy: {best_metrics['val_accuracy']:.4f}")
print(f"   ‚Ä¢ F1-Score: {best_metrics['val_f1']:.4f}")
print(f"   ‚Ä¢ CV Accuracy: {best_metrics['cv_accuracy_mean']:.4f} ¬± {best_metrics['cv_accuracy_std']:.4f}")

print(f"\nüíæ Archivos generados:")
print(f"   ‚Ä¢ {len(results)} modelos entrenados")
print(f"   ‚Ä¢ Comparaci√≥n: classification_models_comparison.csv")
print(f"   ‚Ä¢ Predicciones: classification_predictions.csv")
print(f"   ‚Ä¢ Configuraci√≥n: classification_split_config.json")

# NOTE(review): the message below names notebook "04c", while the title of
# this notebook says "10c" - confirm which identifier is current.
print(f"\n‚úÖ NOTEBOOK 04c_classification_models.ipynb COMPLETADO")

---

__Abraham Tartalos__