# Modelos Base de Clasificación
## Proyecto de Clasificación Multiclase

Este notebook implementa y evalúa varios modelos base de clasificación:
- Logistic Regression
- K-Nearest Neighbors (KNN)
- Support Vector Machine (SVM)
- Decision Tree
- Random Forest
- Naive Bayes (Gaussian)

Se evaluarán y compararán para establecer una línea base.

In [None]:
# Importar librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import pickle
import time

# Modelos
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Métricas
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    classification_report, confusion_matrix, roc_auc_score,
    roc_curve, auc
)
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import cross_val_score

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. solver
# convergence or deprecation notices) — confirm that is intended.
warnings.filterwarnings('ignore')

# Plot configuration: Matplotlib style sheet plus inline rendering (IPython magic)
plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

# Seed NumPy's global random generator for reproducibility
np.random.seed(42)

## 1. Carga de Datos Preprocesados

In [None]:
# Load the preprocessed feature matrices and label vectors stored as .npy files.
X_train = np.load('../data/processed/X_train_selected.npy')
X_test = np.load('../data/processed/X_test_selected.npy')
y_train = np.load('../data/processed/y_train_resampled.npy')
y_test = np.load('../data/processed/y_test.npy')

# Report each array's shape, in loading order.
for array_label, loaded_array in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{array_label}: {loaded_array.shape}")

# Restore the fitted label encoder; its classes_ attribute supplies the class names.
# NOTE: pickle can execute arbitrary code while loading — only open trusted files.
with open('../models/label_encoder.pkl', 'rb') as encoder_file:
    label_encoder = pickle.load(encoder_file)

print(
    f"\nNúmero de clases: {len(label_encoder.classes_)}",
    f"Clases: {label_encoder.classes_}",
    sep="\n",
)

## 2. Definición de Función de Evaluación

In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """
    Train a classification model, print its metrics, and return the results.

    The model is fitted on the training split, scored on both splits, and
    additionally cross-validated (5 folds, accuracy) on the training split
    only. A metric summary and a per-class classification report are printed.

    The classification report and the confusion matrix are computed over
    every class known to the module-level ``label_encoder``, so they keep one
    row/column per class even when a class is missing from ``y_test`` and is
    never predicted.

    Args:
        model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name used in the printed output and in ``results``.

    Returns:
        tuple: ``(model, results, cm, y_pred_test)`` where ``model`` is the
        fitted estimator, ``results`` is a dict of plain-float metrics
        (``train_accuracy``, ``test_accuracy``, ``precision``, ``recall``,
        ``f1_score``, ``cv_mean``, ``cv_std``, ``training_time``) plus
        ``model_name``, ``cm`` is the test-set confusion matrix, and
        ``y_pred_test`` holds the test-set predictions.
    """
    print(f"\n{'='*80}")
    print(f"Evaluando: {model_name}")
    print(f"{'='*80}")

    # Time only the fit call. perf_counter is monotonic, so the measured
    # interval cannot be distorted by system clock adjustments.
    start_time = time.perf_counter()
    model.fit(X_train, y_train)
    training_time = time.perf_counter() - start_time

    # Predictions on both splits
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Accuracy on both splits (the gap between them hints at overfitting)
    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Multiclass metrics (support-weighted average); zero_division=0 scores
    # undefined per-class values as 0.
    precision = precision_score(y_test, y_pred_test, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred_test, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred_test, average='weighted', zero_division=0)

    # Cross-validation on the training split only, so the test set stays untouched.
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')

    print(f"\n📊 Resultados:")
    print(f"  Tiempo de entrenamiento: {training_time:.2f}s")
    print(f"  Accuracy (Train): {train_accuracy:.4f}")
    print(f"  Accuracy (Test): {test_accuracy:.4f}")
    print(f"  Precision (Test): {precision:.4f}")
    print(f"  Recall (Test): {recall:.4f}")
    print(f"  F1-Score (Test): {f1:.4f}")
    print(f"  CV Accuracy (mean ± std): {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

    # Encoded label of every known class, in the same order as classes_.
    # NOTE(review): assumes y_train / y_test hold the codes produced by
    # label_encoder — confirm against the preprocessing step.
    class_labels = label_encoder.transform(label_encoder.classes_)

    # Classification report. Passing labels explicitly keeps it aligned with
    # target_names; without it the call raises when a class is absent from
    # both y_test and the predictions.
    print(f"\n📋 Classification Report:")
    print(classification_report(y_test, y_pred_test,
                                labels=class_labels,
                                target_names=label_encoder.classes_,
                                zero_division=0))

    # Confusion matrix with one row/column per known class, so it always
    # matches the class names used as tick labels when it is plotted.
    cm = confusion_matrix(y_test, y_pred_test, labels=class_labels)

    # Collect the metrics as plain floats so they serialize cleanly
    results = {
        'model_name': model_name,
        'train_accuracy': float(train_accuracy),
        'test_accuracy': float(test_accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'cv_mean': float(cv_scores.mean()),
        'cv_std': float(cv_scores.std()),
        'training_time': float(training_time)
    }

    return model, results, cm, y_pred_test

In [None]:
def plot_confusion_matrix(cm, model_name, classes):
    """
    Draw a confusion matrix as an annotated heatmap, save it, and show it.

    The figure is written to ``../results/confusion_matrix_<name>.png``,
    where ``<name>`` is ``model_name`` lower-cased with spaces replaced by
    underscores. The output directory is created when it does not exist.

    Args:
        cm: Confusion matrix of integer counts (cells are annotated with
            integer formatting).
        model_name: Model name shown in the title and used in the file name.
        classes: Class names used as tick labels on both axes.
    """
    # Create the output directory up front so savefig cannot fail with
    # FileNotFoundError on a fresh checkout.
    output_dir = Path('../results')
    output_dir.mkdir(parents=True, exist_ok=True)
    file_stem = model_name.replace(" ", "_").lower()

    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=classes, yticklabels=classes,
                cbar_kws={'label': 'Número de predicciones'})
    plt.title(f'Matriz de Confusión - {model_name}', fontsize=14, fontweight='bold')
    plt.ylabel('Clase Real', fontsize=12)
    plt.xlabel('Clase Predicha', fontsize=12)
    plt.tight_layout()
    plt.savefig(output_dir / f'confusion_matrix_{file_stem}.png', dpi=300, bbox_inches='tight')
    plt.show()

## 3. Entrenamiento de Modelos Base

### 3.1 Logistic Regression

In [None]:
# Baseline: logistic regression (up to 1000 solver iterations, fixed seed).
lr_model, lr_results, lr_cm, lr_pred = evaluate_model(
    model=LogisticRegression(max_iter=1000, random_state=42, n_jobs=-1),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Logistic Regression",
)
plot_confusion_matrix(
    cm=lr_cm,
    model_name="Logistic Regression",
    classes=label_encoder.classes_,
)

### 3.2 K-Nearest Neighbors

In [None]:
# Baseline: k-nearest neighbours with k=5.
knn_model, knn_results, knn_cm, knn_pred = evaluate_model(
    model=KNeighborsClassifier(n_neighbors=5, n_jobs=-1),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="K-Nearest Neighbors",
)
plot_confusion_matrix(
    cm=knn_cm,
    model_name="K-Nearest Neighbors",
    classes=label_encoder.classes_,
)

### 3.3 Support Vector Machine

In [None]:
# Baseline: support vector classifier with an RBF kernel and a fixed seed.
svm_model, svm_results, svm_cm, svm_pred = evaluate_model(
    model=SVC(kernel='rbf', random_state=42),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Support Vector Machine",
)
plot_confusion_matrix(
    cm=svm_cm,
    model_name="Support Vector Machine",
    classes=label_encoder.classes_,
)

### 3.4 Decision Tree

In [None]:
# Baseline: single decision tree, depth capped at 10, fixed seed.
dt_model, dt_results, dt_cm, dt_pred = evaluate_model(
    model=DecisionTreeClassifier(random_state=42, max_depth=10),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Decision Tree",
)
plot_confusion_matrix(
    cm=dt_cm,
    model_name="Decision Tree",
    classes=label_encoder.classes_,
)

### 3.5 Random Forest

In [None]:
# Baseline: random forest of 100 trees, depth capped at 15, fixed seed.
rf_model, rf_results, rf_cm, rf_pred = evaluate_model(
    model=RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Random Forest",
)
plot_confusion_matrix(
    cm=rf_cm,
    model_name="Random Forest",
    classes=label_encoder.classes_,
)

### 3.6 Naive Bayes

In [None]:
# Baseline: Gaussian naive Bayes with default settings.
nb_model, nb_results, nb_cm, nb_pred = evaluate_model(
    model=GaussianNB(),
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    model_name="Naive Bayes",
)
plot_confusion_matrix(
    cm=nb_cm,
    model_name="Naive Bayes",
    classes=label_encoder.classes_,
)

## 4. Comparación de Modelos

In [None]:
# Gather the per-model metric dictionaries into one table, best test accuracy first.
all_results = [lr_results, knn_results, svm_results, dt_results, rf_results, nb_results]
results_df = pd.DataFrame(all_results).sort_values('test_accuracy', ascending=False)

# Banner followed by the full table, without the index column.
print("\n" + "=" * 100, "COMPARACIÓN DE MODELOS BASE", "=" * 100, sep="\n")
print(results_df.to_string(index=False))

# Persist the comparison table as CSV.
results_df.to_csv('../results/base_models_comparison.csv', index=False)
print("\n✓ Resultados guardados en 'results/base_models_comparison.csv'")

In [None]:
# Comparison figure: three ranked horizontal bar charts plus one grouped bar chart.
fig, axes = plt.subplots(2, 2, figsize=(18, 12))

# One entry per ranked panel:
# (axis, column, bar colour, x label, title, fixed x-limits or None,
#  horizontal offset of the value label, value label template)
ranked_panels = [
    (axes[0, 0], 'test_accuracy', 'steelblue', 'Accuracy',
     'Test Accuracy por Modelo', [0, 1], 0.01, '{:.4f}'),
    (axes[0, 1], 'f1_score', 'coral', 'F1-Score',
     'F1-Score por Modelo', [0, 1], 0.01, '{:.4f}'),
    (axes[1, 0], 'training_time', 'lightgreen', 'Tiempo (segundos)',
     'Tiempo de Entrenamiento', None, 0.1, '{:.2f}s'),
]

for panel, column, colour, x_label, title, x_limits, offset, template in ranked_panels:
    # Sort ascending so the largest value ends up as the top bar.
    ranked = results_df.sort_values(column)
    panel.barh(ranked['model_name'], ranked[column], color=colour, edgecolor='black')
    panel.set_xlabel(x_label, fontsize=12)
    panel.set_title(title, fontsize=14, fontweight='bold')
    if x_limits is not None:
        panel.set_xlim(x_limits)
    # Write each value just to the right of its bar.
    for row, value in enumerate(ranked[column]):
        panel.text(value + offset, row, template.format(value), va='center')

# Grouped bars: one cluster per model, one bar per metric.
combined_ax = axes[1, 1]
metrics = ['test_accuracy', 'precision', 'recall', 'f1_score']
x = np.arange(len(results_df))
width = 0.2

for slot, metric in enumerate(metrics):
    legend_label = metric.replace('_', ' ').title()
    combined_ax.bar(x + slot*width, results_df[metric], width, label=legend_label)

combined_ax.set_xlabel('Modelos', fontsize=12)
combined_ax.set_ylabel('Score', fontsize=12)
combined_ax.set_title('Comparación de Métricas', fontsize=14, fontweight='bold')
# Centre each tick under its four-bar cluster.
combined_ax.set_xticks(x + width * 1.5)
combined_ax.set_xticklabels(results_df['model_name'], rotation=45, ha='right')
combined_ax.legend()
combined_ax.set_ylim([0, 1])
combined_ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../results/base_models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 5. Análisis del Mejor Modelo Base

In [None]:
# results_df is sorted by test accuracy in descending order, so its first row is the best model.
best_model_name, best_model_accuracy = results_df.iloc[0][['model_name', 'test_accuracy']]

print(
    f"\n🏆 MEJOR MODELO BASE: {best_model_name}",
    f"   Accuracy: {best_model_accuracy:.4f}",
    "\nTop 3 modelos:",
    sep="\n",
)
# Three best rows, restricted to the headline columns.
print(results_df.head(3)[['model_name', 'test_accuracy', 'f1_score']].to_string(index=False))

## 6. Guardar Modelos Entrenados

In [None]:
# Persist every fitted estimator as ../models/<key>_model.pkl.
models = dict(
    logistic_regression=lr_model,
    knn=knn_model,
    svm=svm_model,
    decision_tree=dt_model,
    random_forest=rf_model,
    naive_bayes=nb_model,
)

for model_key, fitted_model in models.items():
    with Path(f'../models/{model_key}_model.pkl').open('wb') as model_file:
        pickle.dump(fitted_model, model_file)

print("✓ Modelos guardados en 'models/'")

## Conclusiones

Se han entrenado y evaluado 6 modelos base de clasificación:
- ✓ Logistic Regression
- ✓ K-Nearest Neighbors
- ✓ Support Vector Machine
- ✓ Decision Tree
- ✓ Random Forest
- ✓ Naive Bayes

### Observaciones:
1. Los modelos ensemble (Random Forest) suelen superar a los modelos simples; esto debe confirmarse con la tabla comparativa de la sección 4, ya que depende de los datos
2. La comparación de métricas permite identificar el mejor modelo base
3. El análisis de matrices de confusión revela errores específicos por clase

### Próximo paso:
Implementar modelos de ensemble más avanzados (Bagging, Boosting, Stacking)