# XGBoost - Reducción de Overfitting

## Objetivo
Probar XGBoost como alternativa a SVM y Random Forest para reducir overfitting manteniendo F1-score > 0.55.

## Ventajas de XGBoost
- ✅✅✅ Regularización incorporada (reg_alpha L1, reg_lambda L2)
- ✅✅✅ Early stopping automático
- ✅✅✅ Mejor control de overfitting que RF
- ✅✅✅ Muy potente con datasets pequeños
- ✅✅✅ Subsampling y colsample_bytree reducen overfitting
- ✅ Menos propenso a F1=0 que SVM


## 1. Importación de librerías


In [None]:
import os
import pickle
import random

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Fix the global seeds of the stdlib and NumPy generators so that repeated
# runs of the notebook draw the same pseudo-random numbers.
_SEED = 42
random.seed(_SEED)
np.random.seed(_SEED)

print("‚úÖ Librer√≠as importadas")


## 2. Carga de datos


In [None]:
# Load the preprocessed comments and the pickled train/test label vectors.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only point this at artifacts produced by this project.
def _load_pickle(path):
    """Return the object stored in the pickle file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


df = pd.read_csv('../data/processed/youtoxic_english_1000_processed.csv')
y_train = _load_pickle('../data/processed/y_train.pkl')
y_test = _load_pickle('../data/processed/y_test.pkl')

# The first len(y_train) rows of the CSV are taken as the training texts and
# the following len(y_test) rows as the test texts (read_csv yields a default
# 0..n-1 index, so positional slicing selects the same rows as an index mask).
# NOTE(review): this presumes the labels were saved in that same row order —
# confirm against the preprocessing notebook.
n_train, n_test = len(y_train), len(y_test)
processed_text = df['Text_processed']
X_train_text = processed_text.iloc[:n_train].values
X_test_text = processed_text.iloc[n_train:n_train + n_test].values

print(f"‚úÖ Datos cargados: {len(X_train_text)} train, {len(X_test_text)} test")
print(f"Distribuci√≥n train: {np.bincount(y_train)}")
print(f"Distribuci√≥n test: {np.bincount(y_test)}")


## 3. Vectorización


In [None]:
# TF-IDF configuration: unigrams and bigrams, English stop words removed,
# vocabulary capped at 800 terms, terms kept only if they occur in at least
# 3 documents and in at most 85% of them, sublinear tf scaling, L2-normalised rows.
tfidf = TfidfVectorizer(
    norm='l2',
    sublinear_tf=True,
    stop_words='english',
    ngram_range=(1, 2),
    max_features=800,
    min_df=3,
    max_df=0.85,
)

# No data augmentation is applied. The vectorizer is fitted on the training
# texts only and then reused, unchanged, for the test texts.
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_train_tfidf_dense = X_train_tfidf.toarray()

X_test_tfidf = tfidf.transform(X_test_text)
X_test_tfidf_dense = X_test_tfidf.toarray()

# The dense copies are what the rest of the notebook feeds to XGBoost
# (the original author's note says the sparse input also works but is slower).
print(f"‚úÖ Vectorizaci√≥n: {X_train_tfidf.shape[1]} features")
print(f"   Train shape: {X_train_tfidf_dense.shape}")
print(f"   Test shape: {X_test_tfidf_dense.shape}")


## 4. Función de Evaluación


In [None]:
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Score an already-fitted classifier on a train and a test split.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_train, X_test: Feature matrices for each split.
        y_train, y_test: True labels for each split.

    Returns:
        dict with keys ``train_f1``, ``test_f1``, ``test_accuracy``,
        ``test_precision``, ``test_recall``, ``diff_f1`` (absolute train/test
        F1 gap expressed in percentage points) and ``confusion_matrix``
        (computed on the test split).
    """
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # zero_division=0 makes the score 0 instead of warning when a class is
    # never predicted.
    f1_train = f1_score(y_train, train_pred, zero_division=0)
    f1_test = f1_score(y_test, test_pred, zero_division=0)

    metrics = {'train_f1': f1_train, 'test_f1': f1_test}
    metrics['test_accuracy'] = accuracy_score(y_test, test_pred)
    for label, scorer in (('test_precision', precision_score),
                          ('test_recall', recall_score)):
        metrics[label] = scorer(y_test, test_pred, zero_division=0)
    metrics['diff_f1'] = abs(f1_train - f1_test) * 100
    metrics['confusion_matrix'] = confusion_matrix(y_test, test_pred)
    return metrics


## 5. Función Objetivo para Optuna


In [None]:
def _score_from_metrics(val_f1, diff_f1, val_recall):
    """Collapse validation metrics into the scalar that Optuna maximises.

    Hard rejections return fixed negative sentinels, checked in this order:
    F1 below 0.55 (-10), train/validation F1 gap above 6 points (-20),
    recall at or above 0.95 (-15). Otherwise the score is
    ``0.3 * F1 + bonus - gap_penalty - recall_penalty``.
    """
    if val_f1 < 0.55:
        return -10.0
    if diff_f1 > 6.0:
        return -20.0
    if val_recall >= 0.95:
        return -15.0

    # Reward gaps under 5 points linearly; penalise gaps between 5 and 6
    # points quadratically.
    overfitting_bonus = (5.0 - diff_f1) * 0.50 if diff_f1 < 5.0 else 0.0
    overfitting_penalty = ((diff_f1 - 5.0) ** 2) * 0.05 if diff_f1 > 5.0 else 0.0
    # Discourage recall above 0.80.
    recall_penalty = ((val_recall - 0.80) ** 2) * 0.40 if val_recall > 0.80 else 0.0

    return val_f1 * 0.3 + overfitting_bonus - overfitting_penalty - recall_penalty


def objective(trial):
    """Optuna objective for XGBoost that favours a small train/validation gap.

    Samples the tree, subsampling and L1/L2 regularisation hyperparameters
    from ``trial``, then scores them with stratified 5-fold cross-validation
    on the TRAINING split only. The held-out test split is deliberately not
    touched here: selecting hyperparameters on it would make every test
    metric reported afterwards optimistically biased.

    Args:
        trial: The ``optuna`` trial used to sample hyperparameters.

    Returns:
        float: Score to maximise (see ``_score_from_metrics``), computed from
        the fold-averaged F1, recall and train/validation F1 gap.
    """
    labels = np.asarray(y_train)
    # Negatives / positives ratio, used by XGBoost to re-weight the positive class.
    scale_pos_weight = int((labels == 0).sum()) / int((labels == 1).sum())

    params = {
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        'max_depth': trial.suggest_int('max_depth', 2, 8),  # caps tree depth
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),  # controls leaf creation
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),  # row subsampling
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),  # feature subsampling
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 1.0),  # L1 regularisation
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),  # L2 regularisation
        'scale_pos_weight': scale_pos_weight,  # class balance
        'random_state': 42,
        'n_jobs': -1
    }

    # Fixed seed: every trial is scored on the same folds, so scores are comparable.
    # The TF-IDF matrix was fitted on all training rows, so folds share its
    # vocabulary/IDF statistics; no test rows are involved.
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_results = []
    for fit_idx, val_idx in folds.split(X_train_tfidf_dense, labels):
        model = xgb.XGBClassifier(**params)
        model.fit(X_train_tfidf_dense[fit_idx], labels[fit_idx])
        # evaluate_model's 'test_*' keys describe the validation fold here.
        fold_results.append(evaluate_model(
            model,
            X_train_tfidf_dense[fit_idx], X_train_tfidf_dense[val_idx],
            labels[fit_idx], labels[val_idx],
        ))

    def fold_mean(key):
        return float(np.mean([fold[key] for fold in fold_results]))

    return _score_from_metrics(
        fold_mean('test_f1'), fold_mean('diff_f1'), fold_mean('test_recall')
    )

print("‚úÖ Funci√≥n objetivo definida (prioriza overfitting <5%)")


## 6. Optimización con Optuna


In [None]:
# Maximisation study with a seeded TPE sampler so the sequence of sampled
# hyperparameters is reproducible.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=42),
    direction='maximize',
)

# Banner describing the search setup, printed line by line.
heavy_rule = "=" * 80
banner_lines = (
    heavy_rule,
    "OPTIMIZACI√ìN XGBOOST - CONTROL DE OVERFITTING",
    heavy_rule,
    "‚úÖ SIN augmentaci√≥n (XGBoost maneja bien dataset peque√±o)",
    "‚úÖ Regularizaci√≥n incorporada (reg_alpha L1, reg_lambda L2)",
    "‚úÖ Control de profundidad (max_depth)",
    "‚úÖ Subsampling (subsample, colsample_bytree)",
    "‚úÖ Scale pos weight para balance de clases",
    "‚úÖ Penalizaci√≥n por overfitting >5%",
    "\nObjetivo: F1 > 0.55 Y overfitting < 5%",
    "Trials: 200",
    "-" * 80,
)
for banner_line in banner_lines:
    print(banner_line)

# Run the 200 trials announced in the banner.
study.optimize(objective, show_progress_bar=True, n_trials=200)

print("\n‚úÖ Optimizaci√≥n completada")


## 7. Evaluación del Mejor Modelo


In [None]:
# Refit XGBoost on the full training split with the best hyperparameters
# found by the study, then report its metrics on the test split.
best_params = study.best_params
n_negative = len(y_train[y_train==0])
n_positive = len(y_train[y_train==1])
scale_pos_weight = n_negative / n_positive

# Full parameter set: fixed settings first, then the tuned values, then the
# class weight and runtime options (same keys and order the objective uses).
tuned_keys = (
    'max_depth', 'learning_rate', 'n_estimators', 'min_child_weight',
    'subsample', 'colsample_bytree', 'reg_alpha', 'reg_lambda',
)
final_params = {'objective': 'binary:logistic', 'eval_metric': 'logloss'}
final_params.update((key, best_params[key]) for key in tuned_keys)
final_params.update(scale_pos_weight=scale_pos_weight, random_state=42, n_jobs=-1)

best_model = xgb.XGBClassifier(**final_params)
best_model.fit(X_train_tfidf_dense, y_train)
results = evaluate_model(best_model, X_train_tfidf_dense, X_test_tfidf_dense, y_train, y_test)

report_rule = "="*80
print(report_rule)
print("RESULTADOS FINALES - XGBOOST")
print(report_rule)
print(f"F1-score (test): {results['test_f1']:.4f}")
print(f"Accuracy (test): {results['test_accuracy']:.4f}")
print(f"Precision (test): {results['test_precision']:.4f}")
print(f"Recall (test): {results['test_recall']:.4f}")
print(f"Diferencia F1: {results['diff_f1']:.2f}%")
print(f"\nMatriz de confusi√≥n:")
print(results['confusion_matrix'])

# Verdict: both targets met, else F1 gap under 6 points, else gap still high.
f1_gap = results['diff_f1']
if f1_gap < 5.0 and results['test_f1'] > 0.55:
    verdict = "\n‚úÖ‚úÖ‚úÖ OBJETIVO CUMPLIDO: Overfitting < 5% Y F1 > 0.55"
elif f1_gap < 6.0:
    verdict = "\nüéØ MUY CERCA: Overfitting < 6%"
else:
    verdict = "\n‚ö†Ô∏è  Overfitting a√∫n alto"
print(verdict)

print(report_rule)


## 8. Validación Cruzada


In [None]:
# 5-fold stratified cross-validation of the selected configuration over the
# train and test rows stacked together.
# NOTE(review): the TF-IDF features were fitted on the training texts only and
# are reused as-is for every fold, so this is not an independent estimate —
# confirm whether a per-fold vectorizer is wanted.
X_all = np.concatenate([X_train_tfidf_dense, X_test_tfidf_dense], axis=0)
y_all = np.concatenate([y_train, y_test])

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(
    best_model,
    X_all,
    y_all,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
)

# Reported spread is two standard deviations of the fold scores.
print(f"F1-score (CV): {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")
print(f"Scores: {cv_scores}")


## 9. Guardar Modelo (si cumple objetivos)


In [None]:
# Persist the model, the fitted vectorizer and a metadata dict, but only when
# the test F1 gap is under 6 points and the test F1 is above 0.55.
if results['diff_f1'] < 6.0 and results['test_f1'] > 0.55:
    # Build the metadata before touching the disk: if a value is missing
    # (e.g. the cross-validation cell was skipped) nothing is half-written.
    model_info = {
        'model_type': 'XGBoost',
        'hyperparameters': final_params,
        'test_f1': results['test_f1'],
        'diff_f1': results['diff_f1'],
        'cv_f1_mean': cv_scores.mean(),
        'data_augmentation': False
    }

    # Make sure the output directory exists so the save cannot fail with
    # FileNotFoundError after the whole optimisation has already run.
    os.makedirs('../models', exist_ok=True)

    artifacts = (
        ('../models/xgboost_model.pkl', best_model),
        ('../models/xgboost_tfidf.pkl', tfidf),
        ('../models/xgboost_info.pkl', model_info),
    )
    for artifact_path, artifact in artifacts:
        with open(artifact_path, 'wb') as f:
            pickle.dump(artifact, f)

    print("‚úÖ Modelo XGBoost guardado")
else:
    print("‚ö†Ô∏è  Modelo no guardado (no cumple objetivos)")


## 10. Análisis de Feature Importance


In [None]:
# Rank the TF-IDF terms by the fitted model's feature importance and list the
# highest-scoring ones (at most 20).
feature_names = tfidf.get_feature_names_out()
importances = best_model.feature_importances_
indices = np.argsort(importances)[::-1][:20]  # descending order, top 20

print("Top 20 features m√°s importantes:")
print("-"*50)
# Iterate over the indices actually available: a vocabulary with fewer than
# 20 terms no longer raises IndexError.
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank:2d}. {feature_names[feature_idx]:30s} {importances[feature_idx]:.4f}")
