# Modelos de Ensemble
## Proyecto de Clasificación Multiclase

Este notebook implementa modelos de ensemble avanzados:
- **Bagging**: BaggingClassifier, Extra Trees
- **Boosting**: AdaBoost, Gradient Boosting, XGBoost, LightGBM
- **Stacking**: Combinación de múltiples modelos
- **Voting**: Soft Voting (Hard Voting no se implementa en este notebook)

Los modelos de ensemble combinan múltiples modelos para mejorar el rendimiento.

In [None]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import pickle
import time

# Modelos Ensemble
from sklearn.ensemble import (
    BaggingClassifier,
    AdaBoostClassifier,
    GradientBoostingClassifier,
    VotingClassifier,
    StackingClassifier,
    ExtraTreesClassifier
)
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Modelos base
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Métricas
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix
)
from sklearn.model_selection import cross_val_score

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation notices are hidden too.
warnings.filterwarnings('ignore')

# Configuration
# Matplotlib style sheet applied to all figures created afterwards.
plt.style.use('seaborn-v0_8-darkgrid')
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
# Seed NumPy's global random number generator.
np.random.seed(42)

## 1. Carga de Datos

In [None]:
# Load the preprocessed arrays, in the order: train features, test features,
# train labels, test labels.
# NOTE(review): the train features come from "X_train_selected" while the train
# labels come from "y_train_resampled" -- confirm both files have the same
# number of rows.
X_train, X_test, y_train, y_test = (
    np.load(array_path)
    for array_path in (
        '../data/processed/X_train_selected.npy',
        '../data/processed/X_test_selected.npy',
        '../data/processed/y_train_resampled.npy',
        '../data/processed/y_test.npy',
    )
)

# Load the label encoder.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# artifacts produced by this project.
with open('../models/label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

for split_name, split_array in (("X_train", X_train), ("X_test", X_test)):
    print(f"{split_name}: {split_array.shape}")
n_classes = len(label_encoder.classes_)
print(f"Número de clases: {n_classes}")

## 2. Función de Evaluación

In [None]:
def evaluate_ensemble(model, X_train, X_test, y_train, y_test, model_name):
    """
    Fit an ensemble model and report its performance on both data splits.

    The model is fitted in place on the training split (the fit is timed),
    predictions are produced for the training and test splits, and accuracy
    plus weighted precision/recall/F1 on the test split are printed together
    with a per-class classification report. Class names for the report are
    read from the module-level ``label_encoder``.

    Args:
        model: Unfitted estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name used in the printed output and stored in
            the returned results.

    Returns:
        tuple: ``(model, results, cm)`` where ``model`` is the fitted
        estimator, ``results`` is a dict with the keys ``model_name``,
        ``train_accuracy``, ``test_accuracy``, ``precision``, ``recall``,
        ``f1_score`` and ``training_time`` (all metrics as plain floats), and
        ``cm`` is the confusion matrix computed on the test split.
    """
    print(f"\n{'='*80}")
    print(f"Evaluando: {model_name}")
    print(f"{'='*80}")

    # Train (only the fit call is timed)
    start_time = time.time()
    model.fit(X_train, y_train)
    training_time = time.time() - start_time

    # Predictions
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Metrics; 'weighted' averages per-class scores by class support
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred_test, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)

    print(f"\n📊 Resultados:")
    print(f"  Tiempo de entrenamiento: {training_time:.2f}s")
    print(f"  Accuracy (Train): {train_acc:.4f}")
    print(f"  Accuracy (Test): {test_acc:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  F1-Score: {f1:.4f}")

    # Classification report.
    # Passing target_names without labels makes classification_report raise a
    # ValueError whenever a class is absent from both y_test and the
    # predictions, so the full label set is given explicitly. Names are cast
    # to str because the report measures their length.
    # Assumes the targets are the integer codes 0..n_classes-1 produced by
    # label_encoder -- confirm if a different encoding is used.
    class_names = [str(name) for name in label_encoder.classes_]
    class_labels = np.arange(len(class_names))
    print(f"\n📋 Classification Report:")
    print(classification_report(y_test, y_pred_test,
                                labels=class_labels,
                                target_names=class_names,
                                zero_division=0))

    # Plain floats keep the results serialisable and DataFrame-friendly
    results = {
        'model_name': model_name,
        'train_accuracy': float(train_acc),
        'test_accuracy': float(test_acc),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'training_time': float(training_time)
    }

    cm = confusion_matrix(y_test, y_pred_test)

    return model, results, cm

## 3. Modelos de Bagging

### 3.1 BaggingClassifier

In [None]:
# Bagging ensemble of 50 decision trees, each limited to depth 10.
bagging_tree = DecisionTreeClassifier(max_depth=10)
bagging_model = BaggingClassifier(
    estimator=bagging_tree, n_estimators=50, n_jobs=-1, random_state=42
)

# Fit and score; the fitted model replaces the unfitted reference.
bagging_outcome = evaluate_ensemble(
    bagging_model, X_train, X_test, y_train, y_test, "Bagging Classifier"
)
bagging_model, bagging_results, bagging_cm = bagging_outcome

### 3.2 Extra Trees

In [None]:
# Extra Trees hyperparameters, kept in one place and unpacked into the estimator.
et_params = {
    'n_estimators': 100,
    'max_depth': 15,
    'random_state': 42,
    'n_jobs': -1,
}
et_model = ExtraTreesClassifier(**et_params)

# Fit and score; the fitted model replaces the unfitted reference.
et_outcome = evaluate_ensemble(
    et_model, X_train, X_test, y_train, y_test, "Extra Trees"
)
et_model, et_results, et_cm = et_outcome

## 4. Modelos de Boosting

### 4.1 AdaBoost

In [None]:
# AdaBoost over shallow (depth-3) decision trees.
ada_weak_learner = DecisionTreeClassifier(max_depth=3)
ada_model = AdaBoostClassifier(
    estimator=ada_weak_learner, n_estimators=100, learning_rate=1.0, random_state=42
)

# Fit and score; the fitted model replaces the unfitted reference.
ada_outcome = evaluate_ensemble(
    ada_model, X_train, X_test, y_train, y_test, "AdaBoost"
)
ada_model, ada_results, ada_cm = ada_outcome

### 4.2 Gradient Boosting

In [None]:
# Gradient Boosting hyperparameters, unpacked into the estimator below.
gb_params = {
    'n_estimators': 100,
    'learning_rate': 0.1,
    'max_depth': 5,
    'random_state': 42,
}
gb_model = GradientBoostingClassifier(**gb_params)

# Fit and score; the fitted model replaces the unfitted reference.
gb_outcome = evaluate_ensemble(
    gb_model, X_train, X_test, y_train, y_test, "Gradient Boosting"
)
gb_model, gb_results, gb_cm = gb_outcome

### 4.3 XGBoost

In [None]:
# XGBoost configuration; 'mlogloss' is the evaluation metric handed to the library.
xgb_settings = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    eval_metric='mlogloss',
)
xgb_model = XGBClassifier(**xgb_settings)

# Fit and score; the fitted model replaces the unfitted reference.
xgb_outcome = evaluate_ensemble(
    xgb_model, X_train, X_test, y_train, y_test, "XGBoost"
)
xgb_model, xgb_results, xgb_cm = xgb_outcome

### 4.4 LightGBM

In [None]:
# LightGBM configuration; verbose=-1 is passed to keep the library's logging quiet.
lgbm_settings = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    n_jobs=-1,
    verbose=-1,
)
lgbm_model = LGBMClassifier(**lgbm_settings)

# Fit and score; the fitted model replaces the unfitted reference.
lgbm_outcome = evaluate_ensemble(
    lgbm_model, X_train, X_test, y_train, y_test, "LightGBM"
)
lgbm_model, lgbm_results, lgbm_cm = lgbm_outcome

## 5. Voting Classifier

In [None]:
# Voting Classifier (soft voting).
# The members are fresh, unfitted estimators that are trained when the ensemble
# is fitted, so no previously pickled base models are needed in this cell.
voting_model = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(max_iter=1000, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=50, random_state=42, eval_metric='mlogloss'))
    ],
    voting='soft',
    n_jobs=-1
)

# Fit and score; the fitted ensemble replaces the unfitted reference.
voting_model, voting_results, voting_cm = evaluate_ensemble(
    voting_model, X_train, X_test, y_train, y_test, "Voting Classifier (Soft)"
)

## 6. Stacking Classifier

In [None]:
# Stacking Classifier: three base estimators plus a logistic-regression
# final estimator that combines them.
stacking_members = [
    ('lr', LogisticRegression(max_iter=1000, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=50, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=50, random_state=42, eval_metric='mlogloss')),
]
stacking_meta = LogisticRegression(max_iter=1000)
stacking_model = StackingClassifier(
    estimators=stacking_members, final_estimator=stacking_meta, n_jobs=-1
)

# Fit and score; the fitted ensemble replaces the unfitted reference.
stacking_outcome = evaluate_ensemble(
    stacking_model, X_train, X_test, y_train, y_test, "Stacking Classifier"
)
stacking_model, stacking_results, stacking_cm = stacking_outcome

## 7. Comparación de Modelos Ensemble

In [None]:
# Collect the per-model metric dicts returned by evaluate_ensemble
ensemble_results = [
    bagging_results,
    et_results,
    ada_results,
    gb_results,
    xgb_results,
    lgbm_results,
    voting_results,
    stacking_results
]

# One row per model, best test accuracy first
ensemble_df = pd.DataFrame(ensemble_results)
ensemble_df = ensemble_df.sort_values('test_accuracy', ascending=False)

print("\n" + "="*100)
print("COMPARACIÓN DE MODELOS ENSEMBLE")
print("="*100)
print(ensemble_df.to_string(index=False))

# Save. to_csv does not create missing parent directories, so make sure the
# results directory exists before writing.
Path('../results').mkdir(parents=True, exist_ok=True)
ensemble_df.to_csv('../results/ensemble_models_comparison.csv', index=False)
print("\n✓ Resultados guardados")

In [None]:
# Comparison figure: a 2x2 grid with accuracy, F1, training time and a
# grouped view of all test metrics.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))
(acc_ax, f1_ax), (time_ax, metrics_ax) = axes


def _label_bars(ax, values, offset, template):
    """Write each bar's value just past its end on a horizontal bar chart."""
    for row, value in enumerate(values):
        ax.text(value + offset, row, template.format(value),
                va='center', fontweight='bold')


# Test accuracy (ascending sort, so the best model ends up at the top)
by_accuracy = ensemble_df.sort_values('test_accuracy')
palette = plt.cm.viridis(np.linspace(0, 1, len(by_accuracy)))
acc_ax.barh(by_accuracy['model_name'], by_accuracy['test_accuracy'],
            color=palette, edgecolor='black')
acc_ax.set_xlabel('Accuracy', fontsize=12)
acc_ax.set_title('Test Accuracy - Modelos Ensemble', fontsize=14, fontweight='bold')
acc_ax.set_xlim([0, 1])
_label_bars(acc_ax, by_accuracy['test_accuracy'], 0.01, '{:.4f}')

# F1-score
by_f1 = ensemble_df.sort_values('f1_score')
f1_ax.barh(by_f1['model_name'], by_f1['f1_score'],
           color='coral', edgecolor='black')
f1_ax.set_xlabel('F1-Score', fontsize=12)
f1_ax.set_title('F1-Score - Modelos Ensemble', fontsize=14, fontweight='bold')
f1_ax.set_xlim([0, 1])
_label_bars(f1_ax, by_f1['f1_score'], 0.01, '{:.4f}')

# Training time (x-axis left unbounded; labels are offset by 0.5 seconds)
by_time = ensemble_df.sort_values('training_time')
time_ax.barh(by_time['model_name'], by_time['training_time'],
             color='lightgreen', edgecolor='black')
time_ax.set_xlabel('Tiempo (segundos)', fontsize=12)
time_ax.set_title('Tiempo de Entrenamiento', fontsize=14, fontweight='bold')
_label_bars(time_ax, by_time['training_time'], 0.5, '{:.1f}s')

# All metrics: one group of four bars per model, in ensemble_df row order
metric_columns = ['test_accuracy', 'precision', 'recall', 'f1_score']
group_positions = np.arange(len(ensemble_df))
bar_width = 0.2

for slot, column in enumerate(metric_columns):
    legend_label = column.replace('_', ' ').title()
    metrics_ax.bar(group_positions + slot*bar_width, ensemble_df[column],
                   bar_width, label=legend_label)

metrics_ax.set_xlabel('Modelos', fontsize=12)
metrics_ax.set_ylabel('Score', fontsize=12)
metrics_ax.set_title('Todas las Métricas', fontsize=14, fontweight='bold')
# Centre each tick under its group of four bars
metrics_ax.set_xticks(group_positions + bar_width * 1.5)
metrics_ax.set_xticklabels(ensemble_df['model_name'], rotation=45, ha='right')
metrics_ax.legend()
metrics_ax.set_ylim([0, 1])
metrics_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/ensemble_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Mejor Modelo Ensemble

In [None]:
# The first row is the best model only because ensemble_df was sorted by
# test_accuracy in descending order when it was built.
top_row = ensemble_df.iloc[0]
best_ensemble_name = top_row['model_name']
best_ensemble_acc = top_row['test_accuracy']

print(f"\n🏆 MEJOR MODELO ENSEMBLE: {best_ensemble_name}")
print(f"   Test Accuracy: {best_ensemble_acc:.4f}")

# Short leaderboard: the three leading rows, restricted to the key columns
summary_columns = ['model_name', 'test_accuracy', 'f1_score', 'training_time']
top_three = ensemble_df[summary_columns].head(3)
print(f"\nTop 3 Modelos Ensemble:")
print(top_three.to_string(index=False))

## 9. Guardar Modelos Ensemble

In [None]:
# Persist every fitted ensemble, keyed by the short name used in its file name.
ensemble_models = dict(
    bagging=bagging_model,
    extra_trees=et_model,
    adaboost=ada_model,
    gradient_boosting=gb_model,
    xgboost=xgb_model,
    lightgbm=lgbm_model,
    voting=voting_model,
    stacking=stacking_model,
)

# One pickle file per model: ../models/<name>_ensemble.pkl
for name, model in ensemble_models.items():
    destination = f'../models/{name}_ensemble.pkl'
    with open(destination, 'wb') as model_file:
        pickle.dump(model, model_file)

print("✓ Modelos ensemble guardados")

## Conclusiones

Se han implementado y evaluado 8 modelos de ensemble:
- ✓ Bagging Classifier
- ✓ Extra Trees
- ✓ AdaBoost
- ✓ Gradient Boosting
- ✓ XGBoost
- ✓ LightGBM
- ✓ Voting Classifier
- ✓ Stacking Classifier

### Observaciones:
1. Los modelos de boosting (XGBoost, LightGBM, Gradient Boosting) generalmente superan a bagging
2. Stacking y Voting pueden combinar fortalezas de múltiples modelos
3. Hay un trade-off entre exactitud (accuracy) y tiempo de entrenamiento

### Próximo paso:
Optimización de hiperparámetros del mejor modelo