# Modelo de Clasificación Jerárquica con Aumento de Datos v4.0

Este notebook implementa un pipeline avanzado de clasificación jerárquica con características combinadas:

1.  **Carga y Preprocesamiento**: Usa `hate_speech_twitter` y realiza limpieza de texto (tokenización, stemming, etc.).
2.  **Generación de Características Dual**: Crea embeddings de BERT y vectores TF-IDF.
3.  **Aumento de Datos Sintéticos**: Utiliza **CTGAN** de la librería `sdv` para generar datos sintéticos y **balancear las sub-categorías** de discurso de odio en el conjunto de entrenamiento, mejorando la robustez del modelo.
4.  **Entrenamiento de Clasificador Principal (Nivel 1)**: Entrena y optimiza un **ensemble extendido de seis modelos**. Incluye modelos que usan solo embeddings, solo TF-IDF, y una **combinación de ambos** para una detección más robusta de `odio` vs. `no-odio`.
5.  **Entrenamiento de Clasificador de Sub-categorías (Nivel 2)**: Entrena un ensemble de tres modelos XGBoost, MLP y Regresión Logística para clasificar el **tipo de odio** (ej. sexismo, racismo), utilizando también **características combinadas de embeddings y TF-IDF**.
6.  **Evaluación Jerárquica**: Evalúa el rendimiento del pipeline completo en dos niveles, reportando la precisión tanto en la detección de odio como en la clasificación de su tipo.

## 1. Instalación y Configuración

In [None]:
#!pip install transformers torch datasets scikit-learn xgboost pandas seaborn matplotlib tqdm optuna nltk scipy sdv nltk ipywidgets

In [None]:
import pandas as pd
import numpy as np
import os
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
import time
import torch
from torch.utils.data import DataLoader
import xgboost as xgb
from tqdm.auto import tqdm
import re

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix, vstack

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
try:
    nltk.data.find('tokenizers/punkt')
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('punkt')
    nltk.download('stopwords')

BERT_MODEL_NAME = 'bert-base-uncased'
MAX_SAMPLES = 10000 # Aumentar para un mejor entrenamiento de sub-categorías
MAX_TOKEN_LENGTH = 128

# --- Configuración de Dispositivo (GPU o CPU) ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando dispositivo: {device}")

# --- Definición de Rutas Locales ---
job_id = f"hierarchical-job-{int(time.time())}"
BASE_DIR = "./datos_locales"
# PROCESSED_DIR = os.path.join(BASE_DIR, "processed", job_id)
MODEL_OUTPUT_DIR = os.path.join(BASE_DIR, "model_output", job_id)
# os.makedirs(PROCESSED_DIR, exist_ok=True)
os.makedirs(MODEL_OUTPUT_DIR, exist_ok=True)
# PROCESSED_DATA_PATH = os.path.join(PROCESSED_DIR, "processed_data_with_embeddings.csv")
print(f"\nID de trabajo: {job_id}")

Usando dispositivo: cuda

ID de trabajo: hierarchical-job-1750954743


## 2. Carga, Análisis y Preprocesamiento de Datos

In [None]:
print("Cargando dataset 'thefrankhsu/hate_speech_twitter'...")
dataset = load_dataset("thefrankhsu/hate_speech_twitter")
df = pd.DataFrame(dataset['train'])

# Renombrar columnas y manejar nulos en 'categories'
df = df.rename(columns={'tweet': 'text_raw', 'label': 'main_label', 'categories': 'sub_label_str'})
df['sub_label_str'] = df['sub_label_str'].fillna('not-hate')


if MAX_SAMPLES is not None:
    # Asegurarse de que el tamaño de la muestra no sea mayor que la población
    sample_size = min(MAX_SAMPLES, len(df))
    print(f"Tomando una muestra aleatoria de {sample_size} registros (de un total de {len(df)}).")
    df = df.sample(n=sample_size, random_state=42, replace=False).reset_index(drop=True)

print("Distribución de etiquetas principales:")
print(df['main_label'].value_counts())

print("\nDistribución de sub-etiquetas (solo para 'odio'):")
print(df[df['main_label'] == 1]['sub_label_str'].value_counts())

# Codificar sub-etiquetas
from sklearn.preprocessing import LabelEncoder
sub_label_encoder = LabelEncoder()
df['sub_label_encoded'] = sub_label_encoder.fit_transform(df['sub_label_str'])
sub_label_mapping = dict(zip(sub_label_encoder.classes_, sub_label_encoder.transform(sub_label_encoder.classes_)))
print("\nMapeo de Sub-etiquetas:", sub_label_mapping)

# Preprocesamiento de texto
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
def clean_text(text, apply_stemming=False):
    if pd.isna(text): return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|#','', text)
    tokens = word_tokenize(text)
    words = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]
    if apply_stemming: words = [stemmer.stem(word) for word in words]
    return " ".join(words)

tqdm.pandas(desc="Limpiando Texto para Embeddings")
df['text_cleaned'] = df['text_raw'].progress_apply(lambda x: clean_text(x, apply_stemming=False))
tqdm.pandas(desc="Aplicando Stemming para TF-IDF")
df['text_stemmed'] = df['text_cleaned'].progress_apply(lambda x: " ".join([stemmer.stem(word) for word in x.split()]))

## 3. Generación de Embeddings y División de Datos

In [None]:
print(f"Cargando modelo y tokenizador BERT: {BERT_MODEL_NAME}")
tokenizer_bert = AutoTokenizer.from_pretrained(BERT_MODEL_NAME)
model_bert = AutoModel.from_pretrained(BERT_MODEL_NAME).to(device)
model_bert.eval()

def get_bert_embeddings(batch_text):
    inputs = tokenizer_bert(batch_text, padding=True, truncation=True, max_length=MAX_TOKEN_LENGTH, return_tensors='pt')
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model_bert(**inputs)
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()

print("Generando embeddings...")
batch_size = 32
all_embeddings = np.vstack([get_bert_embeddings(df.iloc[i:i+batch_size]['text_cleaned'].tolist()) for i in tqdm(range(0, len(df), batch_size))])

embedding_cols = [f'dim_{i}' for i in range(all_embeddings.shape[1])]
df_embeddings = pd.DataFrame(all_embeddings, columns=embedding_cols, index=df.index)
df_processed = pd.concat([df, df_embeddings], axis=1)

print("\n--- Dividiendo Datos ---")
y_main = df_processed['main_label'].values
df_trainval, df_test = train_test_split(df_processed, test_size=0.2, random_state=42, stratify=y_main)
y_trainval_main = df_trainval['main_label'].values
df_train, df_val = train_test_split(df_trainval, test_size=0.25, random_state=42, stratify=y_trainval_main)

print(f"Tamaño Train: {len(df_train)}, Val: {len(df_val)}, Test: {len(df_test)}")

## 4. Aumento de Datos Sintéticos para Sub-categorías (CTGAN)
Nos enfocamos en el desbalance de las sub-categorías de 'odio'. Usaremos CTGAN para generar nuevos datos de entrenamiento para las clases minoritarias, basándonos en sus embeddings. **Importante**: CTGAN solo genera embeddings sintéticos; no puede generar texto. La parte de TF-IDF para estos datos se manejará más adelante.

In [None]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
from sklearn.preprocessing import LabelEncoder

print("--- Preparando datos para aumento ---")
# 1. Aislar los datos de entrenamiento que son 'odio'
df_train_hate = df_train[df_train['main_label'] == 1].copy()
features_to_augment = ['sub_label_str'] + embedding_cols
df_to_augment = df_train_hate[features_to_augment]

print("Distribución de sub-categorías ANTES del aumento:")
hate_counts = df_to_augment['sub_label_str'].value_counts()
print(hate_counts)

# Crear un nuevo encoder dedicado SOLO para las sub-categorías de odio.
sub_hate_only_encoder = LabelEncoder()
df_synthetic = pd.DataFrame() # Inicializar como dataframe vacío

if len(hate_counts) > 1 and not df_to_augment.empty:
    # Ajustar el nuevo encoder solo con las etiquetas de odio
    sub_hate_only_encoder.fit(df_to_augment['sub_label_str'])
    print("\nNuevo encoder para Nivel 2 creado. Clases:", sub_hate_only_encoder.classes_)

    # 2. Configurar metadatos
    metadata = SingleTableMetadata()
    metadata.detect_from_dataframe(data=df_to_augment)

    print("\nActualizando metadatos para tratar embeddings como numéricos continuos...")
    for col in embedding_cols:
        metadata.update_column(column_name=col, sdtype='numerical')
    metadata.update_column(column_name='sub_label_str', sdtype='categorical')

    # 3. Configurar y entrenar el sintetizador CTGAN
    use_gpu = torch.cuda.is_available()
    print(f"Usando GPU: {use_gpu}")
    synthesizer = CTGANSynthesizer(
        metadata,
        epochs=150,
        embedding_dim=64,
        verbose=False,
        cuda=use_gpu
    )

    print(f"\nEntrenando CTGAN para generar datos sintéticos... (Usando GPU: {use_gpu})")
    synthesizer.fit(df_to_augment)

    # 4. Determinar cuántas muestras generar
    max_class_size = hate_counts.max()
    num_to_generate = max_class_size * len(hate_counts) - hate_counts.sum()

    # 5. Generar y combinar datos
    if num_to_generate > 0:
        print(f"\nGenerando {num_to_generate} muestras sintéticas...")
        df_synthetic = synthesizer.sample(num_rows=num_to_generate)
        df_train_hate_balanced = pd.concat([df_to_augment, df_synthetic], ignore_index=True)
    else:
        df_train_hate_balanced = df_to_augment
else:
    print("\nSolo una sub-categoría presente o no hay datos de odio, no se requiere aumento.")
    df_train_hate_balanced = df_to_augment
    if not df_to_augment.empty:
        # Ajustar el encoder si solo hay una clase
        sub_hate_only_encoder.fit(df_to_augment['sub_label_str'])

print("\nDistribución de sub-categorías DESPUÉS del aumento:")
all_sub_labels = df_to_augment['sub_label_str'].unique()
print(df_train_hate_balanced['sub_label_str'].value_counts().reindex(all_sub_labels, fill_value=0))

# Preparar datos de entrenamiento para el clasificador de sub-categorías
if not df_train_hate_balanced.empty:
    X_train_sub_emb = df_train_hate_balanced[embedding_cols].values
    # Usar el NUEVO encoder para transformar las etiquetas
    y_train_sub = sub_hate_only_encoder.transform(df_train_hate_balanced['sub_label_str'])
else:
    # Crear arrays vacíos si no hay datos para evitar errores posteriores
    X_train_sub_emb = np.array([]).reshape(0, len(embedding_cols))
    y_train_sub = np.array([])

## 5. Entrenamiento del Clasificador Principal (Nivel 1) con Optuna y Ensemble Extendido

Aquí es donde integramos el pipeline de entrenamiento robusto. Entrenaremos y optimizaremos un **ensemble de seis modelos** (XGBoost y MLP usando solo embeddings, los mismos dos usando embeddings + TF-IDF, y dos Regresiones Logísticas usando cada tipo de característica por separado). Este proceso no utiliza los datos aumentados, solo el conjunto de entrenamiento original.

In [None]:
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import warnings

warnings.filterwarnings('ignore', category=UserWarning)
warnings.filterwarnings('ignore', category=FutureWarning)

print("--- Preparando datos para el entrenamiento del Clasificador Principal ---")

# Usar las variables correctas del split jerárquico
y_train = df_train['main_label'].values
y_val = df_val['main_label'].values
num_classes = len(np.unique(y_train)) # Será 2 en este caso

# 1. Escalar características de embeddings
scaler_L1_emb = StandardScaler()
X_train_emb = df_train[embedding_cols].values
X_train_emb_scaled = scaler_L1_emb.fit_transform(X_train_emb)
X_val_emb = df_val[embedding_cols].values
X_val_emb_scaled = scaler_L1_emb.transform(X_val_emb)

# 2. Vectorizar características de texto con TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X_train_text = df_train['text_stemmed'].values
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_val_text = df_val['text_stemmed'].values
X_val_tfidf = tfidf_vectorizer.transform(X_val_text)
print(f"TF-IDF: {X_train_tfidf.shape[1]} características generadas.")

# 3. Crear características combinadas (Embeddings + TF-IDF)
# Para XGBoost y modelos de Scikit-learn, usamos la matriz sparse combinada
X_train_combined = hstack([X_train_emb, X_train_tfidf]).tocsr()
X_val_combined = hstack([X_val_emb, X_val_tfidf]).tocsr()

# Para MLP, necesitamos una matriz densa. Combinamos embeddings escalados y TF-IDF denso.
X_train_combined_dense = np.hstack([X_train_emb_scaled, X_train_tfidf.toarray()])
X_val_combined_dense = np.hstack([X_val_emb_scaled, X_val_tfidf.toarray()])

# 4. Convertir datos a tensores para PyTorch
X_val_torch_emb = torch.tensor(X_val_emb_scaled, dtype=torch.float32).to(device)
X_val_torch_combined = torch.tensor(X_val_combined_dense, dtype=torch.float32).to(device)
y_val_torch = torch.tensor(y_val, dtype=torch.long).to(device)

print("\n✓ Datos escalados, vectorizados y tensores de PyTorch listos para el Nivel 1.")

In [None]:
# --- Clase genérica para MLP ---
class MLP(nn.Module):
    def __init__(self, input_size, hidden_layers, output_size, activation_fn, dropout_rate):
        super(MLP, self).__init__()
        layers = []
        current_size = input_size
        for hidden_size in hidden_layers:
            layers.append(nn.Linear(current_size, hidden_size))
            layers.append(activation_fn())
            layers.append(nn.Dropout(dropout_rate))
            current_size = hidden_size
        layers.append(nn.Linear(current_size, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

# --- Función genérica para entrenar y evaluar MLP en Optuna ---
def train_eval_mlp_objective(trial, X_train_data, y_train_data, X_val_tensor, y_val_tensor, input_dim):
    n_layers = trial.suggest_int('n_layers', 1, 3)
    hidden_layers = [trial.suggest_int(f'n_units_l{i}', 32, 256) for i in range(n_layers)]
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    optimizer_name = trial.suggest_categorical('optimizer', ['Adam', 'RMSprop'])
    lr = trial.suggest_float('lr', 1e-5, 1e-2, log=True)
    activation_fn = getattr(nn, trial.suggest_categorical('activation', ['ReLU', 'Tanh']))

    model = MLP(input_dim, hidden_layers, num_classes, activation_fn, dropout_rate).to(device)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    train_dataset = TensorDataset(torch.tensor(X_train_data, dtype=torch.float32), torch.tensor(y_train_data, dtype=torch.long))
    train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)

    for epoch in range(25):
        model.train()
        for data, target in train_loader:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            loss = criterion(model(data), target)
            loss.backward()
            optimizer.step()

    model.eval()
    with torch.no_grad():
        val_loss = criterion(model(X_val_tensor), y_val_tensor).item()

    trial.report(val_loss, epoch)
    if trial.should_prune(): raise optuna.exceptions.TrialPruned()
    return val_loss

# --- 1. Objetivo para MLP (Usa solo Embeddings) ---
def objective_mlp_embeddings(trial):
    return train_eval_mlp_objective(trial, X_train_emb_scaled, y_train, X_val_torch_emb, y_val_torch, X_train_emb_scaled.shape[1])

# --- 2. Objetivo para MLP (Usa Embeddings + TF-IDF) ---
def objective_mlp_combined(trial):
    return train_eval_mlp_objective(trial, X_train_combined_dense, y_train, X_val_torch_combined, y_val_torch, X_train_combined_dense.shape[1])

# --- 3. Objetivo para XGBoost (Usa solo Embeddings) ---
def objective_xgboost_embeddings(trial):
    params = {
        'objective': 'binary:logistic', 'eval_metric': 'logloss',
        'device': 'cuda' if device.type == 'cuda' else 'cpu',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
    }
    model = xgb.XGBClassifier(**params, early_stopping_rounds=10)
    model.fit(X_train_emb, y_train, eval_set=[(X_val_emb, y_val)], verbose=False)
    return log_loss(y_val, model.predict_proba(X_val_emb))

# --- 4. Objetivo para XGBoost (Usa Embeddings + TF-IDF) ---
def objective_xgboost_combined(trial):
    params = {
        'objective': 'binary:logistic', 'eval_metric': 'logloss',
        'device': 'cuda' if device.type == 'cuda' else 'cpu',
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
    }
    model = xgb.XGBClassifier(**params, early_stopping_rounds=10)
    model.fit(X_train_combined, y_train, eval_set=[(X_val_combined, y_val)], verbose=False)
    return log_loss(y_val, model.predict_proba(X_val_combined))

# --- 5. Objetivo para Regresión Logística (Usa solo Embeddings) ---
def objective_logistic_embeddings(trial):
    params = {'C': trial.suggest_float('C', 1e-4, 1e2, log=True), 'solver': 'liblinear', 'max_iter': 1000}
    model = LogisticRegression(**params, random_state=42)
    model.fit(X_train_emb_scaled, y_train)
    return log_loss(y_val, model.predict_proba(X_val_emb_scaled))

# --- 6. Objetivo para Regresión Logística (Usa solo TF-IDF) ---
def objective_logistic_tfidf(trial):
    params = {'C': trial.suggest_float('C', 1e-2, 1e2, log=True), 'solver': 'liblinear', 'max_iter': 1000}
    model = LogisticRegression(**params, random_state=42)
    model.fit(X_train_tfidf, y_train)
    return log_loss(y_val, model.predict_proba(X_val_tfidf))

print(f"Funciones objetivo de Optuna para el Nivel 1 definidas.")

In [None]:
models_config = {
    'XGBoost_Embeddings': {'objective_func': objective_xgboost_embeddings, 'n_trials': 25},
    'MLP_PyTorch_Embeddings': {'objective_func': objective_mlp_embeddings, 'n_trials': 30},
    'LogisticRegression_Embeddings': {'objective_func': objective_logistic_embeddings, 'n_trials': 20},
    'LogisticRegression_TFIDF': {'objective_func': objective_logistic_tfidf, 'n_trials': 20},
    'XGBoost_Combined': {'objective_func': objective_xgboost_combined, 'n_trials': 25},
    'MLP_PyTorch_Combined': {'objective_func': objective_mlp_combined, 'n_trials': 30}
}

model_results = {}

for model_name, config in models_config.items():
    print(f"\n--- Optimizando {model_name} (Nivel 1) ---")
    study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(config['objective_func'], n_trials=config['n_trials'], show_progress_bar=True)

    model_results[model_name] = {
        'best_params': study.best_params,
        'best_score': study.best_value
    }
    print(f"✓ {model_name} completado. Mejor LogLoss: {study.best_value:.4f}")

In [None]:
main_classifier_models = {}
print("--- Entrenando modelos finales con los mejores hiperparámetros ---\n")

# 1. XGBoost (Embeddings)
params = model_results['XGBoost_Embeddings']['best_params']
final_xgb_emb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                                  device='cuda' if device.type == 'cuda' else 'cpu', **params)
final_xgb_emb.fit(X_train_emb, y_train)
main_classifier_models['XGBoost_Embeddings'] = final_xgb_emb
print("✓ Modelo XGBoost (Embeddings) final entrenado.")

# 2. XGBoost (Combined)
params = model_results['XGBoost_Combined']['best_params']
final_xgb_comb = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss',
                                   device='cuda' if device.type == 'cuda' else 'cpu', **params)
final_xgb_comb.fit(X_train_combined, y_train)
main_classifier_models['XGBoost_Combined'] = final_xgb_comb
print("✓ Modelo XGBoost (Combined) final entrenado.")

# 3. Regresión Logística (Embeddings)
params = model_results['LogisticRegression_Embeddings']['best_params']
final_log_emb = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000, **params)
final_log_emb.fit(X_train_emb_scaled, y_train)
main_classifier_models['LogisticRegression_Embeddings'] = final_log_emb
print("✓ Modelo Regresión Logística (Embeddings) final entrenado.")

# 4. Regresión Logística (TF-IDF)
params = model_results['LogisticRegression_TFIDF']['best_params']
final_log_tfidf = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000, **params)
final_log_tfidf.fit(X_train_tfidf, y_train)
main_classifier_models['LogisticRegression_TFIDF'] = final_log_tfidf
print("✓ Modelo Regresión Logística (TF-IDF) final entrenado.")

# 5. MLP (Embeddings)
params = model_results['MLP_PyTorch_Embeddings']['best_params']
hidden_layers = [params[f'n_units_l{i}'] for i in range(params['n_layers'])]
final_mlp_emb = MLP(X_train_emb_scaled.shape[1], hidden_layers, num_classes,
                    getattr(nn, params['activation']), params['dropout_rate']).to(device)
optimizer = getattr(optim, params['optimizer'])(final_mlp_emb.parameters(), lr=params['lr'])
train_loader = DataLoader(TensorDataset(torch.tensor(X_train_emb_scaled, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)), batch_size=128, shuffle=True)
for epoch in tqdm(range(30), desc="Epochs MLP (Embeddings) final"):
    final_mlp_emb.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = nn.CrossEntropyLoss()(final_mlp_emb(data), target)
        loss.backward()
        optimizer.step()
main_classifier_models['MLP_PyTorch_Embeddings'] = final_mlp_emb.eval()
print("✓ Modelo MLP (Embeddings) final entrenado.")

# 6. MLP (Combined)
params = model_results['MLP_PyTorch_Combined']['best_params']
hidden_layers = [params[f'n_units_l{i}'] for i in range(params['n_layers'])]
final_mlp_comb = MLP(X_train_combined_dense.shape[1], hidden_layers, num_classes,
                     getattr(nn, params['activation']), params['dropout_rate']).to(device)
optimizer = getattr(optim, params['optimizer'])(final_mlp_comb.parameters(), lr=params['lr'])
train_loader = DataLoader(TensorDataset(torch.tensor(X_train_combined_dense, dtype=torch.float32), torch.tensor(y_train, dtype=torch.long)), batch_size=128, shuffle=True)
for epoch in tqdm(range(30), desc="Epochs MLP (Combined) final"):
    final_mlp_comb.train()
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        loss = nn.CrossEntropyLoss()(final_mlp_comb(data), target)
        loss.backward()
        optimizer.step()
main_classifier_models['MLP_PyTorch_Combined'] = final_mlp_comb.eval()
print("✓ Modelo MLP (Combined) final entrenado.")

print("\n✓ Todos los modelos finales han sido entrenados.")

In [None]:
print("--- Calculando pesos para el Ensemble del Clasificador Principal (Nivel 1) ---")
val_probas = {}

# Obtener predicciones de cada modelo en el set de validación
val_probas['XGBoost_Embeddings'] = main_classifier_models['XGBoost_Embeddings'].predict_proba(X_val_emb)
val_probas['XGBoost_Combined'] = main_classifier_models['XGBoost_Combined'].predict_proba(X_val_combined)
val_probas['LogisticRegression_Embeddings'] = main_classifier_models['LogisticRegression_Embeddings'].predict_proba(X_val_emb_scaled)
val_probas['LogisticRegression_TFIDF'] = main_classifier_models['LogisticRegression_TFIDF'].predict_proba(X_val_tfidf)
with torch.no_grad():
    # MLP Embeddings
    mlp_outputs_emb = main_classifier_models['MLP_PyTorch_Embeddings'](X_val_torch_emb)
    val_probas['MLP_PyTorch_Embeddings'] = torch.softmax(mlp_outputs_emb, dim=1).cpu().numpy()
    # MLP Combined
    mlp_outputs_comb = main_classifier_models['MLP_PyTorch_Combined'](X_val_torch_combined)
    val_probas['MLP_PyTorch_Combined'] = torch.softmax(mlp_outputs_comb, dim=1).cpu().numpy()

# Calcular métricas y pesos del ensemble (mayor peso a menor log_loss)
losses = {name: log_loss(y_val, proba) for name, proba in val_probas.items()}
scores = {name: 1.0 / (loss + 1e-9) for name, loss in losses.items()}
total_score = sum(scores.values())
ensemble_weights = {name: score / total_score for name, score in scores.items()}

print("\n--- Pesos del Ensemble de Nivel 1 Calculados ---")
for name, w in sorted(ensemble_weights.items(), key=lambda item: item[1], reverse=True):
    print(f"{name:<30} | Peso: {w:.3f} | LogLoss (Val): {losses[name]:.4f}")

# Evaluar el rendimiento del ensemble en el set de validación
ensemble_proba_val = np.zeros_like(val_probas['XGBoost_Embeddings'])
for name, proba in val_probas.items():
    ensemble_proba_val += proba * ensemble_weights[name]

ensemble_log_loss_val = log_loss(y_val, ensemble_proba_val)
print(f"\nLogLoss del Ensemble L1 en Validación: {ensemble_log_loss_val:.4f}")

## 6. Entrenamiento del Clasificador de Sub-categorías (Nivel 2) con Optuna y Ensemble

Ahora, aplicamos la misma metodología robusta al clasificador de Nivel 2. Este se entrenará **únicamente con los datos de 'odio' balanceados sintéticamente**. Crearemos un ensemble de tres modelos (XGBoost, MLP, Regresión Logística) usando **características combinadas de embeddings y TF-IDF**.

In [None]:
print("--- Preparando datos y definiendo objetivos para el Clasificador de Sub-categorías (Nivel 2) ---")

if X_train_sub_emb.shape[0] > 0:
    # 1. Preparar características TF-IDF para datos de Nivel 2
    # Datos reales de 'odio'
    real_hate_texts_train = df_train_hate['text_stemmed']
    X_train_sub_tfidf_real = tfidf_vectorizer.transform(real_hate_texts_train)
    # Datos sintéticos (vector de ceros, ya que no tienen texto)
    num_synthetic = len(df_synthetic)
    X_train_sub_tfidf_synthetic = csr_matrix((num_synthetic, X_train_sub_tfidf_real.shape[1]), dtype=np.float64)
    # Combinar TF-IDF de datos reales y sintéticos
    X_train_sub_tfidf = vstack([X_train_sub_tfidf_real, X_train_sub_tfidf_synthetic])

    # 2. Combinar Embeddings y TF-IDF para Nivel 2
    X_train_sub_combined = hstack([X_train_sub_emb, X_train_sub_tfidf]).tocsr()
    num_sub_classes = len(np.unique(y_train_sub))
    print(f"Datos combinados para Nivel 2 listos. Shape: {X_train_sub_combined.shape}, {num_sub_classes} sub-clases detectadas.")

    # 3. Dividir los datos COMBINADOS para HPO
    X_sub_train_comb, X_sub_val_comb, y_sub_train, y_sub_val = train_test_split(
        X_train_sub_combined, y_train_sub, test_size=0.25, random_state=42, stratify=y_train_sub
    )

    # 4. Escalar la parte de embeddings de los datos combinados para MLP y LogReg
    scaler_L2_emb = StandardScaler()
    # Extraer y escalar la parte de embeddings
    X_sub_train_emb_part = X_sub_train_comb[:, :X_train_sub_emb.shape[1]].toarray()
    X_sub_train_emb_part_scaled = scaler_L2_emb.fit_transform(X_sub_train_emb_part)
    X_sub_val_emb_part = X_sub_val_comb[:, :X_train_sub_emb.shape[1]].toarray()
    X_sub_val_emb_part_scaled = scaler_L2_emb.transform(X_sub_val_emb_part)
    # Re-combinar con la parte de TF-IDF (que no se escala)
    X_sub_train_scaled_comb_dense = np.hstack([X_sub_train_emb_part_scaled, X_sub_train_comb[:, X_train_sub_emb.shape[1]:].toarray()])
    X_sub_val_scaled_comb_dense = np.hstack([X_sub_val_emb_part_scaled, X_sub_val_comb[:, X_train_sub_emb.shape[1]:].toarray()])

    # 5. Convertir a tensores de PyTorch
    X_sub_val_torch = torch.tensor(X_sub_val_scaled_comb_dense, dtype=torch.float32).to(device)
    y_sub_val_torch = torch.tensor(y_sub_val, dtype=torch.long).to(device)

    # --- Funciones Objetivo para Optuna (Nivel 2, con datos combinados) ---
    def objective_xgboost_L2(trial):
        params = {'objective': 'multi:softprob', 'num_class': num_sub_classes, 'eval_metric': 'mlogloss', 'device': 'cuda',
                  'n_estimators': trial.suggest_int('n_estimators', 100, 800), 'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True), 'max_depth': trial.suggest_int('max_depth', 3, 8)}
        model = xgb.XGBClassifier(**params, early_stopping_rounds=10)
        model.fit(X_sub_train_comb, y_sub_train, eval_set=[(X_sub_val_comb, y_sub_val)], verbose=False)
        return log_loss(y_sub_val, model.predict_proba(X_sub_val_comb))

    def objective_logistic_L2(trial):
        params = {'C': trial.suggest_float('C', 1e-3, 1e2, log=True), 'solver': 'liblinear', 'max_iter': 1000, 'multi_class': 'ovr'}
        model = LogisticRegression(**params, random_state=42)
        # Se entrena con los datos densos (escalados en parte)
        model.fit(X_sub_train_scaled_comb_dense, y_sub_train)
        return log_loss(y_sub_val, model.predict_proba(X_sub_val_scaled_comb_dense))

    def objective_mlp_L2(trial):
        n_layers = trial.suggest_int('n_layers', 1, 2)
        hidden_layers = [trial.suggest_int(f'n_units_l{i}', 32, 128) for i in range(n_layers)]
        lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)

        model = MLP(X_sub_train_scaled_comb_dense.shape[1], hidden_layers, num_sub_classes, nn.ReLU, 0.3).to(device)
        optimizer = optim.Adam(model.parameters(), lr=lr)
        criterion = nn.CrossEntropyLoss()
        train_dataset = TensorDataset(torch.tensor(X_sub_train_scaled_comb_dense, dtype=torch.float32), torch.tensor(y_sub_train, dtype=torch.long))
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

        for epoch in range(20):
            for data, target in train_loader:
                data, target = data.to(device), target.to(device)
                optimizer.zero_grad()
                loss = criterion(model(data), target)
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            val_loss = criterion(model(X_sub_val_torch), y_sub_val_torch).item()
        return val_loss
    print("Funciones objetivo para Nivel 2 definidas.")
else:
    print("No hay datos para preparar el Nivel 2.")

In [None]:
if X_train_sub_emb.shape[0] > 0:
    # --- 1. Búsqueda de Hiperparámetros (HPO) para Nivel 2 ---
    models_config_L2 = {
        'XGBoost_L2': {'objective_func': objective_xgboost_L2, 'n_trials': 20},
        'MLP_PyTorch_L2': {'objective_func': objective_mlp_L2, 'n_trials': 25},
        'LogisticRegression_L2': {'objective_func': objective_logistic_L2, 'n_trials': 15}
    }
    model_results_L2 = {}
    for model_name, config in models_config_L2.items():
        print(f"\n--- Optimizando {model_name} (Nivel 2) ---")
        study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=42))
        study.optimize(config['objective_func'], n_trials=config['n_trials'], show_progress_bar=True)
        model_results_L2[model_name] = {'best_params': study.best_params}

    # --- 2. Entrenamiento de los modelos finales del ensemble de Nivel 2 ---
    print("\n--- Entrenando modelos finales del Ensemble (Nivel 2) ---")
    sub_classifier_models = {}

    # Preparar datos completos de entrenamiento L2 (escalados y densos para MLP/LogReg)
    X_train_sub_emb_part_full = X_train_sub_combined[:, :X_train_sub_emb.shape[1]].toarray()
    X_train_sub_emb_part_full_scaled = scaler_L2_emb.transform(X_train_sub_emb_part_full)
    X_train_sub_full_scaled_dense = np.hstack([X_train_sub_emb_part_full_scaled, X_train_sub_combined[:, X_train_sub_emb.shape[1]:].toarray()])

    # XGBoost L2
    params = model_results_L2['XGBoost_L2']['best_params']
    final_xgb_L2 = xgb.XGBClassifier(objective='multi:softprob', num_class=num_sub_classes, eval_metric='mlogloss',
                                     device='cuda' if device.type == 'cuda' else 'cpu', **params)
    final_xgb_L2.fit(X_train_sub_combined, y_train_sub) # Entrenar con sparse
    sub_classifier_models['XGBoost_L2'] = final_xgb_L2

    # Logistic Regression L2
    params = model_results_L2['LogisticRegression_L2']['best_params']
    final_log_L2 = LogisticRegression(solver='liblinear', random_state=42, max_iter=1000, **params)
    final_log_L2.fit(X_train_sub_full_scaled_dense, y_train_sub) # Entrenar con denso escalado
    sub_classifier_models['LogisticRegression_L2'] = final_log_L2

    # MLP L2
    params = model_results_L2['MLP_PyTorch_L2']['best_params']
    hidden_layers = [params[f'n_units_l{i}'] for i in range(params['n_layers'])]
    final_mlp_L2 = MLP(X_train_sub_full_scaled_dense.shape[1], hidden_layers, num_sub_classes, nn.ReLU, 0.3).to(device)
    optimizer = optim.Adam(final_mlp_L2.parameters(), lr=params['lr'])
    train_dataset_L2 = TensorDataset(torch.tensor(X_train_sub_full_scaled_dense, dtype=torch.float32), torch.tensor(y_train_sub, dtype=torch.long))
    train_loader_L2 = DataLoader(train_dataset_L2, batch_size=64, shuffle=True)
    for epoch in tqdm(range(30), desc="Epochs MLP L2 final"):
        for data, target in train_loader_L2:
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            loss = nn.CrossEntropyLoss()(final_mlp_L2(data), target)
            loss.backward()
            optimizer.step()
    sub_classifier_models['MLP_PyTorch_L2'] = final_mlp_L2.eval()
    print("✓ Todos los modelos del ensemble de Nivel 2 han sido entrenados.")

    # --- 3. Cálculo de pesos para el ensemble de Nivel 2 ---
    print("\n--- Calculando pesos para el Ensemble de Nivel 2 ---")
    val_probas_L2 = {}
    val_probas_L2['XGBoost_L2'] = sub_classifier_models['XGBoost_L2'].predict_proba(X_sub_val_comb)
    val_probas_L2['LogisticRegression_L2'] = sub_classifier_models['LogisticRegression_L2'].predict_proba(X_sub_val_scaled_comb_dense)
    with torch.no_grad():
        mlp_outputs = sub_classifier_models['MLP_PyTorch_L2'](X_sub_val_torch)
        val_probas_L2['MLP_PyTorch_L2'] = torch.softmax(mlp_outputs, dim=1).cpu().numpy()

    losses_L2 = {name: log_loss(y_sub_val, proba, labels=np.unique(y_train_sub)) for name, proba in val_probas_L2.items()}
    scores_L2 = {name: 1.0 / (loss + 1e-9) for name, loss in losses_L2.items()}
    total_score_L2 = sum(scores_L2.values())
    ensemble_weights_L2 = {name: score / total_score_L2 for name, score in scores_L2.items()}

    print("\n--- Pesos del Ensemble de Nivel 2 Calculados ---")
    for name, w in sorted(ensemble_weights_L2.items(), key=lambda item: item[1], reverse=True):
        print(f"{name:<25} | Peso: {w:.3f} | LogLoss (Val): {losses_L2[name]:.4f}")
else:
    print("No hay datos para entrenar el clasificador de Nivel 2.")
    sub_classifier_models = None
    ensemble_weights_L2 = None

## 7. Evaluación Final del Pipeline Jerárquico Robusto

Evaluamos el pipeline completo. Primero, usamos el **ensemble ponderado de Nivel 1** para la predicción de "Odio vs. No-Odio". Luego, para las predicciones de "odio", usamos el **ensemble ponderado de Nivel 2** para predecir la sub-categoría, ambos usando las configuraciones de características correspondientes.

In [None]:
print("--- Evaluación del pipeline jerárquico en el conjunto de prueba ---")

# 1. Preparar todas las características de prueba
X_test_emb_eval = df_test[embedding_cols].values
X_test_emb_scaled_eval = scaler_L1_emb.transform(X_test_emb_eval)
X_test_text_eval = df_test['text_stemmed'].values
X_test_tfidf_eval = tfidf_vectorizer.transform(X_test_text_eval)
X_test_combined_eval = hstack([X_test_emb_eval, X_test_tfidf_eval]).tocsr()
X_test_combined_dense_eval = np.hstack([X_test_emb_scaled_eval, X_test_tfidf_eval.toarray()])
X_test_torch_emb_eval = torch.tensor(X_test_emb_scaled_eval, dtype=torch.float32).to(device)
X_test_torch_combined_eval = torch.tensor(X_test_combined_dense_eval, dtype=torch.float32).to(device)
y_main_true = df_test['main_label'].values

# 2. Obtener predicciones del ENSEMBLE de Nivel 1
test_probas_L1 = {}
test_probas_L1['XGBoost_Embeddings'] = main_classifier_models['XGBoost_Embeddings'].predict_proba(X_test_emb_eval)
test_probas_L1['XGBoost_Combined'] = main_classifier_models['XGBoost_Combined'].predict_proba(X_test_combined_eval)
test_probas_L1['LogisticRegression_Embeddings'] = main_classifier_models['LogisticRegression_Embeddings'].predict_proba(X_test_emb_scaled_eval)
test_probas_L1['LogisticRegression_TFIDF'] = main_classifier_models['LogisticRegression_TFIDF'].predict_proba(X_test_tfidf_eval)
with torch.no_grad():
    test_probas_L1['MLP_PyTorch_Embeddings'] = torch.softmax(main_classifier_models['MLP_PyTorch_Embeddings'](X_test_torch_emb_eval), dim=1).cpu().numpy()
    test_probas_L1['MLP_PyTorch_Combined'] = torch.softmax(main_classifier_models['MLP_PyTorch_Combined'](X_test_torch_combined_eval), dim=1).cpu().numpy()

final_ensemble_proba_L1 = np.zeros_like(test_probas_L1['XGBoost_Embeddings'])
for name, proba in test_probas_L1.items():
    final_ensemble_proba_L1 += proba * ensemble_weights[name]
y_main_pred = np.argmax(final_ensemble_proba_L1, axis=1)

# 3. Evaluar Nivel 1
print("\n--- [Nivel 1] Rendimiento del Ensemble Principal (Prueba) ---")
print(classification_report(y_main_true, y_main_pred, target_names=['not-hate', 'hate']))

# 4. Obtener y evaluar predicciones del ENSEMBLE de Nivel 2
if sub_classifier_models is not None:
    df_test_true_hate = df_test[df_test['main_label'] == 1].copy()
    if not df_test_true_hate.empty:
        y_sub_true = sub_hate_only_encoder.transform(df_test_true_hate['sub_label_str'])

        # Preparar datos combinados para L2 en el conjunto de prueba
        X_test_hate_emb = df_test_true_hate[embedding_cols].values
        X_test_hate_tfidf = tfidf_vectorizer.transform(df_test_true_hate['text_stemmed'])
        X_test_hate_combined = hstack([X_test_hate_emb, X_test_hate_tfidf]).tocsr()

        # Preparar versión densa y escalada para MLP/LogReg
        X_test_hate_emb_scaled = scaler_L2_emb.transform(X_test_hate_emb)
        X_test_hate_combined_dense = np.hstack([X_test_hate_emb_scaled, X_test_hate_tfidf.toarray()])
        X_test_hate_torch = torch.tensor(X_test_hate_combined_dense, dtype=torch.float32).to(device)

        # Obtener y combinar probabilidades L2
        true_hate_probas_L2 = {}
        true_hate_probas_L2['XGBoost_L2'] = sub_classifier_models['XGBoost_L2'].predict_proba(X_test_hate_combined)
        true_hate_probas_L2['LogisticRegression_L2'] = sub_classifier_models['LogisticRegression_L2'].predict_proba(X_test_hate_combined_dense)
        with torch.no_grad():
            mlp_outputs_L2 = sub_classifier_models['MLP_PyTorch_L2'](X_test_hate_torch)
            true_hate_probas_L2['MLP_PyTorch_L2'] = torch.softmax(mlp_outputs_L2, dim=1).cpu().numpy()

        final_true_hate_proba_L2 = np.zeros_like(true_hate_probas_L2['XGBoost_L2'])
        for name, proba in true_hate_probas_L2.items():
            final_true_hate_proba_L2 += proba * ensemble_weights_L2[name]
        y_sub_pred_for_eval = np.argmax(final_true_hate_proba_L2, axis=1)

        print("\n--- [Nivel 2] Rendimiento del Ensemble de Sub-categorías (Prueba) ---")
        print(classification_report(y_sub_true, y_sub_pred_for_eval, target_names=sub_hate_only_encoder.classes_, zero_division=0))
else:
    print("\nEl clasificador de sub-categorías no fue entrenado.")

## 8. Guardado de Artefactos

Guardamos todos los componentes del pipeline jerárquico: los modelos de ambos ensembles, sus respectivos pesos, transformadores y codificadores.

In [None]:
print(f"--- Guardando artefactos en {MODEL_OUTPUT_DIR} ---")

# 1. Guardar modelos y pesos del ensemble de Nivel 1
with open(os.path.join(MODEL_OUTPUT_DIR, "main_classifier_models_L1.pkl"), 'wb') as f: pickle.dump(main_classifier_models, f)
with open(os.path.join(MODEL_OUTPUT_DIR, "ensemble_weights_L1.pkl"), 'wb') as f: pickle.dump(ensemble_weights, f)
print("✓ Modelos y pesos de Nivel 1 guardados.")

# 2. Guardar modelos y pesos del ensemble de Nivel 2
if sub_classifier_models is not None:
    with open(os.path.join(MODEL_OUTPUT_DIR, "sub_classifier_models_L2.pkl"), 'wb') as f: pickle.dump(sub_classifier_models, f)
    with open(os.path.join(MODEL_OUTPUT_DIR, "ensemble_weights_L2.pkl"), 'wb') as f: pickle.dump(ensemble_weights_L2, f)
    print("✓ Modelos y pesos de Nivel 2 guardados.")

# 3. Guardar transformadores y codificadores
with open(os.path.join(MODEL_OUTPUT_DIR, "scaler_L1_emb.pkl"), 'wb') as f: pickle.dump(scaler_L1_emb, f)
if 'scaler_L2_emb' in locals():
    with open(os.path.join(MODEL_OUTPUT_DIR, "scaler_L2_emb.pkl"), 'wb') as f: pickle.dump(scaler_L2_emb, f)
with open(os.path.join(MODEL_OUTPUT_DIR, "tfidf_vectorizer.pkl"), 'wb') as f: pickle.dump(tfidf_vectorizer, f)
if 'sub_hate_only_encoder' in locals():
    with open(os.path.join(MODEL_OUTPUT_DIR, "sub_hate_only_encoder.pkl"), 'wb') as f: pickle.dump(sub_hate_only_encoder, f)
print("✓ Scalers, TF-IDF Vectorizer y codificador de sub-etiquetas guardados.")

# 4. Guardar resultados de Optuna
with open(os.path.join(MODEL_OUTPUT_DIR, "optuna_results.pkl"), 'wb') as f:
    pickle.dump({'L1': model_results, 'L2': model_results_L2 if 'model_results_L2' in locals() else {}}, f)
print("✓ Resultados de Optuna guardados.")

print("\n🎉 Pipeline jerárquico robusto completado y todos los artefactos han sido guardados.")