# 🤖 Modelado Baseline: Modelos Clásicos de ML
## Entrenamiento y evaluación de modelos de clasificación

### Objetivos:
1. Entrenar modelos clásicos (Naive Bayes, Logistic Regression, SVM, Random Forest)
2. Comparar TF-IDF vs Count Vectorizer
3. Evaluar métricas (F1, Accuracy, Precision, Recall)
4. Analizar overfitting
5. Seleccionar mejor modelo baseline


## 1. Importar librerías


In [None]:
# Notebook-wide dependencies: data handling plus path/warning utilities.
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import warnings
# Suppress every warning for the rest of the session. This keeps cell output
# short, but it also hides any library warnings raised while training.
warnings.filterwarnings('ignore')

# Add ../src (resolved against the current working directory) to the import
# path. This must run before the project imports below, which presumably
# live in that directory.
sys.path.append(str(Path('../src').resolve()))

# Project helpers for training, evaluation and loading vectorized data.
from models.train import train_model, save_model
from models.evaluate import evaluate_model, compare_models, print_classification_report
from features.vectorization import load_vectorized_data

print("‚úÖ Librer√≠as importadas")


## 2. Cargar datos vectorizados


In [None]:
# Load the TF-IDF features together with the train/test labels.
data_dir = Path('../data/processed')
X_train_tfidf, X_test_tfidf, y_train, y_test = load_vectorized_data(data_dir, prefix='tfidf')

# Load the Count Vectorizer features. Their labels are kept (instead of being
# discarded) so they can be checked against the TF-IDF labels below.
X_train_count, X_test_count, y_train_count, y_test_count = load_vectorized_data(data_dir, prefix='count')

# The Count models are later trained and evaluated with y_train / y_test from
# the TF-IDF load. Fail early if the two label sets differ; otherwise those
# models would silently be fitted against misaligned labels.
for split_name, tfidf_labels, count_labels in (
    ('train', y_train, y_train_count),
    ('test', y_test, y_test_count),
):
    if not np.array_equal(np.asarray(tfidf_labels), np.asarray(count_labels)):
        raise ValueError(
            f"Las etiquetas de {split_name} no coinciden entre los datos TF-IDF y Count"
        )

print(f"\n‚úÖ Datos cargados:")
print(f"   TF-IDF - Train: {X_train_tfidf.shape}, Test: {X_test_tfidf.shape}")
print(f"   Count  - Train: {X_train_count.shape}, Test: {X_test_count.shape}")
print(f"   Labels - Train: {len(y_train)}, Test: {len(y_test)}")


## 3. Entrenar modelos con TF-IDF


In [None]:
# Train and evaluate the four baseline classifiers on the TF-IDF features.
print("=" * 80, "ENTRENANDO MODELOS CON TF-IDF", "=" * 80, sep="\n")

models_tfidf = {}
results_tfidf = {}

# (console icon, display name, train_model key, hyperparameters), listed in
# the order the models are trained and reported.
tfidf_plan = [
    ('üîµ', 'Naive Bayes', 'naive_bayes', dict(alpha=1.0)),
    ('üü¢', 'Logistic Regression', 'logistic',
     dict(C=1.0, penalty='l2', class_weight='balanced')),
    ('üü°', 'SVM', 'svm',
     dict(C=1.0, kernel='linear', class_weight='balanced')),
    ('üî¥', 'Random Forest', 'random_forest',
     dict(n_estimators=100, max_depth=10, class_weight='balanced')),
]

for icon, label, algorithm, hyperparams in tfidf_plan:
    print(f"\n{icon} Entrenando {label}...")
    fitted = train_model(algorithm, X_train_tfidf, y_train, **hyperparams)
    models_tfidf[label] = fitted
    # Evaluate on both splits so train and test metrics can be compared later.
    results_tfidf[label] = evaluate_model(
        fitted, X_train_tfidf, X_test_tfidf, y_train, y_test
    )


## 4. Entrenar modelos con Count Vectorizer


In [None]:
# Train and evaluate the same four baseline classifiers on the Count features.
print("=" * 80, "ENTRENANDO MODELOS CON COUNT VECTORIZER", "=" * 80, sep="\n")

models_count = {}
results_count = {}

# Display name -> (console icon, train_model key, hyperparameters). Dict
# order is the training order.
count_plan = {
    'Naive Bayes': ('üîµ', 'naive_bayes', {'alpha': 1.0}),
    'Logistic Regression': (
        'üü¢', 'logistic',
        {'C': 1.0, 'penalty': 'l2', 'class_weight': 'balanced'},
    ),
    'SVM': (
        'üü°', 'svm',
        {'C': 1.0, 'kernel': 'linear', 'class_weight': 'balanced'},
    ),
    'Random Forest': (
        'üî¥', 'random_forest',
        {'n_estimators': 100, 'max_depth': 10, 'class_weight': 'balanced'},
    ),
}

for label, (icon, algorithm, hyperparams) in count_plan.items():
    print(f"\n{icon} Entrenando {label}...")
    models_count[label] = train_model(algorithm, X_train_count, y_train, **hyperparams)
    # Evaluate on both splits so train and test metrics can be compared later.
    results_count[label] = evaluate_model(
        models_count[label], X_train_count, X_test_count, y_train, y_test
    )


In [None]:
# Build one comparison table per vectorizer and print each under a banner.
print("=" * 80, "COMPARACI√ìN DE MODELOS - TF-IDF", "=" * 80, sep="\n")
comparison_tfidf = compare_models(results_tfidf)
print()
print(comparison_tfidf.to_string(index=False))

# The second banner is preceded by a blank line to separate the two tables.
print()
print("=" * 80, "COMPARACI√ìN DE MODELOS - COUNT VECTORIZER", "=" * 80, sep="\n")
comparison_count = compare_models(results_count)
print()
print(comparison_count.to_string(index=False))


## 6. Visualización de resultados


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# One colour and marker per vectorizer, shared by every panel. pivot() sorts
# its columns alphabetically ('Count' before 'TF-IDF'), so passing a bare
# colour list would paint the bars with the opposite colours to the scatter
# plot. The columns are therefore reordered explicitly before plotting.
vectorizer_colors = {'TF-IDF': '#3498db', 'Count': '#e74c3c'}
vectorizer_markers = {'TF-IDF': 'o', 'Count': 's'}
vectorizer_order = list(vectorizer_colors)

# Stack both comparison tables, tagging each row with its vectorizer.
comparison_all = pd.concat([
    comparison_tfidf.assign(Vectorizer='TF-IDF'),
    comparison_count.assign(Vectorizer='Count')
])


def _plot_metric_bars(ax, metric, title, ylabel, target=None):
    """Draw grouped bars of `metric` per model and vectorizer on `ax`.

    When `target` is given, a dashed horizontal reference line is drawn at
    that value before the legend is built, so the line appears in the legend.
    Returns the pivoted table that was plotted.
    """
    pivot = comparison_all.pivot(index='Modelo', columns='Vectorizer', values=metric)
    pivot = pivot[vectorizer_order]
    pivot.plot(kind='bar', ax=ax, color=[vectorizer_colors[v] for v in vectorizer_order])
    if target is not None:
        ax.axhline(y=target, color='r', linestyle='--', label='Objetivo (<5%)')
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Modelo')
    ax.legend(title='Vectorizador')
    ax.grid(axis='y', alpha=0.3)
    ax.tick_params(axis='x', rotation=45)
    return pivot


# F1-score per model
pivot_f1 = _plot_metric_bars(
    axes[0, 0], 'F1 (test)', 'F1-Score (Test) por Modelo y Vectorizador', 'F1-Score'
)

# Overfitting per model, with the 5% target as a reference line
pivot_overfitting = _plot_metric_bars(
    axes[0, 1], 'Overfitting (%)', 'Overfitting por Modelo y Vectorizador',
    'Overfitting (%)', target=5
)

# Accuracy per model
pivot_acc = _plot_metric_bars(
    axes[1, 0], 'Accuracy (test)', 'Accuracy (Test) por Modelo y Vectorizador', 'Accuracy'
)

# Scatter: F1 vs overfitting, one series per vectorizer
scatter_ax = axes[1, 1]
for vectorizer in vectorizer_order:
    subset = comparison_all[comparison_all['Vectorizer'] == vectorizer]
    scatter_ax.scatter(
        subset['Overfitting (%)'],
        subset['F1 (test)'],
        label=vectorizer, s=100, alpha=0.7,
        color=vectorizer_colors[vectorizer], marker=vectorizer_markers[vectorizer]
    )
scatter_ax.axvline(x=5, color='r', linestyle='--', alpha=0.5, label='Objetivo Overfitting')
scatter_ax.set_xlabel('Overfitting (%)')
scatter_ax.set_ylabel('F1-Score (Test)')
scatter_ax.set_title('F1-Score vs Overfitting', fontweight='bold', fontsize=12)
scatter_ax.legend()
scatter_ax.grid(alpha=0.3)

# Label every point (not only the TF-IDF ones) so the Count markers can be
# identified too. The text takes the vectorizer colour and is offset a few
# points so it does not sit on top of the marker.
for _idx, row in comparison_all.iterrows():
    scatter_ax.annotate(
        row['Modelo'],
        (row['Overfitting (%)'], row['F1 (test)']),
        xytext=(5, 5), textcoords='offset points',
        fontsize=8, alpha=0.7, color=vectorizer_colors[row['Vectorizer']]
    )

plt.tight_layout()
plt.show()


In [None]:
# Merge both result sets under unique "<model> (<vectorizer>)" keys. For each
# key, also record the fitted estimator and the vectorizer it was trained
# with, so the winner is found by direct lookup rather than by parsing the
# display name.
all_results = {}
trained_models = {}
for suffix, vectorizer_key, results_by_model, models_by_name in (
    ('TF-IDF', 'tfidf', results_tfidf, models_tfidf),
    ('Count', 'count', results_count, models_count),
):
    for name, results in results_by_model.items():
        combined_name = f"{name} ({suffix})"
        all_results[combined_name] = results
        trained_models[combined_name] = (models_by_name[name], vectorizer_key)

# Select the best model: highest test F1, preferring models whose overfitting
# is below 5%. The table is sorted explicitly so that "first row" really
# means "highest F1", whatever order compare_models returns. The stable sort
# keeps the incoming order among ties.
comparison_all = compare_models(all_results).sort_values(
    'F1 (test)', ascending=False, kind='stable'
)

print("="*80)
print("SELECCI√ìN DEL MEJOR MODELO")
print("="*80)

# Keep only the models with overfitting < 5%; if none qualifies, fall back to
# the full table and flag the choice as needing further tuning.
good_models = comparison_all[comparison_all['Overfitting (%)'] < 5.0]
meets_target = not good_models.empty
candidates = good_models if meets_target else comparison_all

best_model_name = candidates.iloc[0]['Modelo']
best_model_results = all_results[best_model_name]
best_model, vectorizer_type = trained_models[best_model_name]

if meets_target:
    print(f"\nüèÜ MEJOR MODELO (Overfitting < 5%): {best_model_name}")
else:
    print(f"\n‚ö†Ô∏è  MEJOR MODELO (Overfitting > 5%): {best_model_name}")
print(f"   F1-score (test): {best_model_results['test_f1']:.4f}")
print(f"   Overfitting: {best_model_results['diff_f1']:.2f}%")
if meets_target:
    print(f"   Accuracy: {best_model_results['test_accuracy']:.4f}")
else:
    print(f"   ‚ö†Ô∏è  Necesita optimizaci√≥n para reducir overfitting")

print(f"\nüìä Top 3 modelos:")
print(comparison_all.head(3).to_string(index=False))


## 8. Guardar mejor modelo baseline


In [None]:
# Persist the selected baseline model next to a small metadata record.
models_dir = Path('../models/baseline')
models_dir.mkdir(parents=True, exist_ok=True)
model_path = models_dir / 'best_baseline_model.pkl'

# Metadata handed to save_model together with the estimator.
model_info = dict(
    model_name=best_model_name,
    vectorizer_type=vectorizer_type,
    test_f1=best_model_results['test_f1'],
    test_accuracy=best_model_results['test_accuracy'],
    overfitting=best_model_results['diff_f1'],
    train_f1=best_model_results['train_f1'],
)

save_model(best_model, model_path, model_info)

# NOTE(review): the info file name printed below is assumed to match what
# save_model writes; confirm against its implementation.
info_path = models_dir / 'best_baseline_model_info.pkl'
print(
    "\n‚úÖ Modelo baseline guardado:",
    f"   {model_path}",
    f"   Informaci√≥n: {info_path}",
    sep="\n",
)


## 9. Resumen del modelado baseline


In [None]:
# Print a closing summary of what was trained, compared, selected and saved.
print("=" * 80)
print("RESUMEN DEL MODELADO BASELINE")
print("=" * 80)

# Numbered lists of the trained models and of the vectorizers that were tried.
summary_sections = (
    ("\n‚úÖ Modelos entrenados:",
     ("Naive Bayes", "Logistic Regression", "SVM", "Random Forest")),
    ("\n‚úÖ Vectorizadores probados:",
     ("TF-IDF", "Count Vectorizer")),
)
for heading, entries in summary_sections:
    print(heading)
    for position, entry in enumerate(entries, start=1):
        print(f"   {position}. {entry}")

# Headline metrics of the selected model.
test_f1 = best_model_results['test_f1']
f1_gap = best_model_results['diff_f1']
print("\nüèÜ Mejor modelo seleccionado:")
print("   " + str(best_model_name))
print("   F1-score (test): {:.4f}".format(test_f1))
print("   Overfitting: {:.2f}%".format(f1_gap))

print("\nüíæ Modelo guardado en:")
print("   ../models/baseline/best_baseline_model.pkl")

print("\n‚úÖ Modelado baseline completado")
print("   Pr√≥ximo paso: Optimizaci√≥n de hiperpar√°metros y reducci√≥n de overfitting")
