In [None]:
# ============================
# 1. IMPORTS Y CONFIGURACIÓN
# ============================

import os
import random
import pickle
from pathlib import Path

import numpy as np
import pandas as pd

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup
)

import mlflow
import mlflow.pytorch

# Reproducibility: seed every random number generator the notebook uses.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Prefer the GPU when one is visible to PyTorch.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Dispositivo:", DEVICE)

# Project paths, relative to the parent of the working directory
# (adjust if your layout differs).
BASE_DIR = Path("..").resolve()
DATA_DIR = BASE_DIR.joinpath("data", "processed")
MODELS_DIR = BASE_DIR.joinpath("models")
MLRUNS_DIR = BASE_DIR.joinpath("mlruns")

# Output folders are created up front; the data folder is expected to exist already.
for _out_dir in (MODELS_DIR, MLRUNS_DIR):
    _out_dir.mkdir(exist_ok=True)

for _label, _path in (
    ("BASE_DIR :", BASE_DIR),
    ("DATA_DIR :", DATA_DIR),
    ("MODELS_DIR:", MODELS_DIR),
    ("MLRUNS_DIR:", MLRUNS_DIR),
):
    print(_label, _path)


In [None]:
# ==========================================
# 2. CARGA DEL DATASET PROCESADO (PICKLE)
# ==========================================

pkl_path = DATA_DIR / "youtube_all_versions.pkl"

# NOTE(review): pickle.load can execute code embedded in the file; only load
# pickles produced by this project's own preprocessing step.
with pkl_path.open("rb") as f:
    data = pickle.load(f)

if not isinstance(data, pd.DataFrame):
    # Otherwise the pickle is treated as a mapping holding several dataset
    # versions and one of them is selected by key.
    # ADJUST "text_clean" to the key actually used during preprocessing
    # (for example it could be 'Text_clean', 'text_processed', ...).
    print("Claves disponibles en el pickle:", list(data.keys()))
    df = data["text_clean"].copy()
else:
    # The pickle already is the DataFrame.
    df = data.copy()

print("Dimensiones del DataFrame:", df.shape)
print("Columnas:", df.columns.tolist())

# Both the text column and the target column must be present.
for required_col in ("Text", "IsHate"):
    assert required_col in df.columns, f"No se encontró la columna '{required_col}' en el DataFrame."

# Quick look at the first rows
df.head()


In [None]:
# =======================================================
# 3. SELECCIÓN DE COLUMNAS Y SPLIT TRAIN / VAL / TEST
# =======================================================

TEXT_COL = "Text"      # exact name of the text column
TARGET_COL = "IsHate"  # 0 = normal, 1 = hate

# Keep only the two columns the model needs and drop rows missing either one.
df_model = df.loc[:, [TEXT_COL, TARGET_COL]].dropna().copy()

print("Distribución de clases (IsHate):")
class_ratios = df_model[TARGET_COL].value_counts(normalize=True).rename("ratio")
print(class_ratios)

X = df_model[TEXT_COL].values
y = df_model[TARGET_COL].values.astype(int)

# First cut: hold out 20% as the test set, stratified on the label.
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=SEED,
)

# Second cut: 20% of the remainder becomes validation
# (roughly 64/16/20 train/val/test overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=0.2,
    stratify=y_temp,
    random_state=SEED,
)

print("Tamaños:")
for _split_name, _split_texts in (("Train", X_train), ("Val  ", X_val), ("Test ", X_test)):
    print(f"  {_split_name}:", len(_split_texts))


In [None]:
# ===================================================
# 4. DATA AUGMENTATION LIGERO (OPCIONAL)
# ===================================================
# Aquí se hace un ejemplo muy simple: duplicar aleatoriamente algunos ejemplos
# de la clase minoritaria con pequeñas perturbaciones (ej. shuffle de palabras).
# Puedes sustituir esto por back-translation u otra técnica más sofisticada.

def simple_word_shuffle(text, max_swaps=2):
    """Return `text` with a few randomly chosen word pairs swapped.

    The text is split on whitespace. Texts with fewer than four words are
    returned unchanged. Otherwise between 1 and `max_swaps` swaps (inclusive)
    are applied and the words are re-joined with single spaces.

    Randomness comes from the global `random` module state.
    """
    tokens = text.split()

    # Too short to perturb: hand back the original string untouched.
    if len(tokens) <= 3:
        return text

    positions = range(len(tokens))
    swaps_left = random.randint(1, max_swaps)
    while swaps_left > 0:
        a, b = random.sample(positions, 2)
        tokens[a], tokens[b] = tokens[b], tokens[a]
        swaps_left -= 1

    return " ".join(tokens)

def augment_minority(X, y, factor=0.3):
    """Oversample the least frequent class with word-shuffled copies.

    Args:
        X: iterable of texts.
        y: iterable of labels aligned with `X`.
        factor: number of synthetic examples to add, expressed as a fraction
            of the minority-class count (truncated to an integer).

    Returns:
        A `(X_aug, y_aug)` pair of NumPy arrays: the inputs followed by the new
        examples. When the truncated number of new examples is zero, the
        inputs are returned as arrays with nothing appended.

    Class counts are printed before augmentation and, when examples were
    added, after it. Sampling uses the global `random` module state.
    """
    texts = np.array(list(X))
    labels = np.array(list(y))

    classes, freqs = np.unique(labels, return_counts=True)
    counts_before = dict(zip(classes, freqs))
    print("Recuentos antes de augment:", counts_before)

    # Least frequent class; ties resolve to the first class in sorted order.
    minority_class = min(counts_before, key=counts_before.get)

    n_new = int(counts_before[minority_class] * factor)
    if n_new == 0:
        return texts, labels

    minority_indices = np.where(labels == minority_class)[0]

    # Draw a minority example (with replacement) and perturb it, n_new times.
    new_texts = [
        simple_word_shuffle(texts[random.choice(minority_indices)])
        for _ in range(n_new)
    ]
    new_labels = [minority_class] * n_new

    X_aug = np.concatenate([texts, np.array(new_texts)])
    y_aug = np.concatenate([labels, np.array(new_labels)])

    classes_after, freqs_after = np.unique(y_aug, return_counts=True)
    print("Recuentos después de augment:", dict(zip(classes_after, freqs_after)))

    return X_aug, y_aug

# Augment the training split only; validation and test keep their original examples.
X_train_aug, y_train_aug = augment_minority(
    X=X_train,
    y=y_train,
    factor=0.3,
)


In [None]:
# ==============================================
# 5. TOKENIZER Y DATASETS PARA TRANSFORMERS
# ==============================================

# The tokenizer must come from the same checkpoint as the model being
# fine-tuned. The model cell of this notebook loads
# "bert-base-multilingual-cased"; tokenizing with a different checkpoint
# (previously "bert-base-uncased") yields token ids from another vocabulary,
# which index unrelated rows of the model's embedding matrix.
MODEL_NAME = "bert-base-multilingual-cased"
MAX_LENGTH = 128  # raise this if your texts are longer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class HateDataset(Dataset):
    """Map-style dataset that tokenizes one text per item, on access.

    Args:
        texts: iterable of texts (each is converted with `str` when accessed).
        labels: iterable of labels aligned with `texts` (converted with `int`).
        tokenizer: callable invoked with the text plus the keyword arguments
            `add_special_tokens`, `max_length`, `truncation`, `padding` and
            `return_tensors`; must return a mapping of tensors.
        max_length: length every sequence is truncated/padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            str(self.texts[idx]),
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_tensors="pt"
        )

        # Drop the leading dimension of each returned tensor so that items
        # can be stacked into a batch by the DataLoader.
        item = {}
        for field, tensor in encoding.items():
            item[field] = tensor.squeeze(0)

        item["labels"] = torch.tensor(int(self.labels[idx]), dtype=torch.long)
        return item

# One dataset per split; only the training split uses the augmented data.
train_dataset, val_dataset, test_dataset = (
    HateDataset(split_texts, split_labels, tokenizer, MAX_LENGTH)
    for split_texts, split_labels in (
        (X_train_aug, y_train_aug),
        (X_val, y_val),
        (X_test, y_test),
    )
)

BATCH_SIZE = 16

# Only the training loader shuffles; evaluation order stays fixed.
train_loader, val_loader, test_loader = (
    DataLoader(split_dataset, batch_size=BATCH_SIZE, shuffle=do_shuffle)
    for split_dataset, do_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

tuple(len(split_dataset) for split_dataset in (train_dataset, val_dataset, test_dataset))


In [None]:
# =======================================
# 6. CÁLCULO DE CLASS WEIGHTS (OPCIONAL)
# =======================================

from sklearn.utils.class_weight import compute_class_weight

# "balanced" class weights computed on the (augmented) training labels.
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.array([0, 1]),
    y=y_train_aug,
)

# Create the weight tensor directly on the training device as float32,
# ready to be passed to the loss function.
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32, device=DEVICE)
print("Class weights:", class_weights_tensor)


In [None]:
# ==========================================
# 7. MODELO MULTILINGUAL BERT + OPTIMIZADOR + SCHEDULER
# ==========================================

from transformers import AutoModelForSequenceClassification

MULTILINGUAL_MODEL_NAME = "bert-base-multilingual-cased"
num_labels = 2

# Dropout is raised to 0.3 on both hidden states and attention probabilities.
model = AutoModelForSequenceClassification.from_pretrained(
    MULTILINGUAL_MODEL_NAME,
    num_labels=num_labels,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3
)
model.to(DEVICE)

# The tokenizer and the model must share one vocabulary: token ids produced by
# a tokenizer from another checkpoint index unrelated rows of this model's
# embedding matrix. If the tokenizer built earlier does not match, reload it
# from the model's checkpoint and rebind it on the datasets. HateDataset
# tokenizes in __getitem__, so the existing DataLoaders pick up the change.
if tokenizer.vocab_size != model.config.vocab_size:
    print(
        f"Aviso: el tokenizer ({tokenizer.name_or_path}) no corresponde a "
        f"{MULTILINGUAL_MODEL_NAME}; se recarga el tokenizer del modelo."
    )
    tokenizer = AutoTokenizer.from_pretrained(MULTILINGUAL_MODEL_NAME)
    for _dataset in (train_dataset, val_dataset, test_dataset):
        _dataset.tokenizer = tokenizer

# Freeze encoder layers 0-7. Parameter names look like
# "bert.encoder.layer.<index>....", so the layer index is the 4th dotted field.
# Embeddings, layers 8+ and the classification head remain trainable.
for name, param in model.named_parameters():
    if name.startswith("bert.encoder.layer.") and int(name.split(".")[3]) < 8:
        param.requires_grad = False

# Class-weighted loss to compensate for the label imbalance.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)

EPOCHS = 4
LR = 1e-5
WARMUP_RATIO = 0.06

# Only parameters that are still trainable are handed to the optimizer.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=LR,
    weight_decay=0.05
)

# The scheduler is stepped once per batch, so the horizon is batches * epochs.
total_steps = len(train_loader) * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps
)

print("Total steps:", total_steps, "Warmup steps:", warmup_steps)


In [None]:
# ===================================
# 8. FUNCIONES TRAIN / EVAL LOOP
# ===================================

def train_one_epoch(model, data_loader, optimizer, scheduler, criterion, device):
    """Run one optimisation pass over `data_loader`.

    For every batch: forward pass, loss via `criterion` on the logits,
    backward pass, gradient-norm clipping at 1.0, then one optimizer step and
    one scheduler step.

    Args:
        model: module whose output exposes `.logits`.
        data_loader: yields dicts with "input_ids", "attention_mask", "labels".
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        criterion: loss callable taking `(logits, labels)`.
        device: device the batch tensors are moved to.

    Returns:
        `(avg_loss, f1, precision, recall)`. The loss is averaged per sample
        over `len(data_loader.dataset)`. The metrics use `average="binary"`
        and are computed from predictions made during the pass, with the
        model in training mode.
    """
    model.train()
    running_loss = 0.0
    epoch_preds, epoch_labels = [], []

    for batch in data_loader:
        input_ids, attention_mask, labels = (
            batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
        )

        optimizer.zero_grad()

        # labels=None: the loss is computed below with the external criterion.
        logits = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=None
        ).logits

        loss = criterion(logits, labels)
        loss.backward()

        # Gradient clipping for stability
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        # Weight the batch loss by batch size so the final average is per sample.
        running_loss += loss.item() * input_ids.size(0)

        epoch_preds.extend(torch.argmax(logits, dim=1).detach().cpu().numpy())
        epoch_labels.extend(labels.detach().cpu().numpy())

    avg_loss = running_loss / len(data_loader.dataset)
    f1, precision, recall = (
        metric(epoch_labels, epoch_preds, average="binary")
        for metric in (f1_score, precision_score, recall_score)
    )

    return avg_loss, f1, precision, recall


def eval_one_epoch(model, data_loader, criterion, device):
    """Evaluate `model` on `data_loader` without updating any weights.

    The model is switched to eval mode and the pass runs under
    `torch.no_grad()`.

    Args:
        model: module whose output exposes `.logits`.
        data_loader: yields dicts with "input_ids", "attention_mask", "labels".
        criterion: loss callable taking `(logits, labels)`.
        device: device the batch tensors are moved to.

    Returns:
        `(avg_loss, f1, precision, recall, all_labels, all_preds)`. The loss is
        averaged per sample over `len(data_loader.dataset)`; the metrics use
        `average="binary"`; the last two items are lists holding the true and
        predicted label of every sample, in loader order.
    """
    model.eval()
    running_loss = 0.0
    collected_preds, collected_labels = [], []

    with torch.no_grad():
        for batch in data_loader:
            input_ids, attention_mask, labels = (
                batch[key].to(device) for key in ("input_ids", "attention_mask", "labels")
            )

            # labels=None: the loss is computed below with the external criterion.
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=None
            ).logits

            batch_loss = criterion(logits, labels)
            # Weight by batch size so the final average is per sample.
            running_loss += batch_loss.item() * input_ids.size(0)

            collected_preds.extend(torch.argmax(logits, dim=1).detach().cpu().numpy())
            collected_labels.extend(labels.detach().cpu().numpy())

    avg_loss = running_loss / len(data_loader.dataset)
    f1, precision, recall = (
        metric(collected_labels, collected_preds, average="binary")
        for metric in (f1_score, precision_score, recall_score)
    )

    return avg_loss, f1, precision, recall, collected_labels, collected_preds


In [None]:
# ==============================
# 9. CONFIGURACIÓN DE MLFLOW
# ==============================

# Store MLflow runs in MLRUNS_DIR (a sibling of the notebooks folder).
# Path.as_uri() builds a well-formed "file:///..." URI: it emits forward
# slashes for Windows drive paths and percent-encodes special characters,
# which a hand-built f"file:{path}" string does not. MLRUNS_DIR is absolute
# (it derives from a resolved path), as as_uri() requires.
mlflow.set_tracking_uri(MLRUNS_DIR.as_uri())
mlflow.set_experiment("bert_hate_detection_youtube")

print("MLflow tracking URI:", mlflow.get_tracking_uri())


In [None]:
# ======================================================
# 10. ENTRENAMIENTO CON MLFLOW + EARLY STOPPING
#     (control overfitting <~5 puntos F1 train-val)
# ======================================================

PATIENCE = 2  # epochs without improvement before stopping
# Start below any achievable F1 so the first epoch always yields a checkpoint.
BEST_VAL_F1 = float("-inf")
PATIENCE_COUNTER = 0

best_model_state = None

with mlflow.start_run(run_name="bert_base_youtube_hate") as run:
    # Log the checkpoint that was actually loaded into `model`.
    mlflow.log_param("model_name", MULTILINGUAL_MODEL_NAME)
    mlflow.log_param("max_length", MAX_LENGTH)
    mlflow.log_param("batch_size", BATCH_SIZE)
    mlflow.log_param("epochs", EPOCHS)
    mlflow.log_param("learning_rate", LR)
    mlflow.log_param("warmup_ratio", WARMUP_RATIO)
    mlflow.log_param("seed", SEED)

    for epoch in range(1, EPOCHS + 1):
        print(f"\n===== Epoch {epoch}/{EPOCHS} =====")

        train_loss, train_f1, train_prec, train_rec = train_one_epoch(
            model, train_loader, optimizer, scheduler, criterion, DEVICE
        )
        val_loss, val_f1, val_prec, val_rec, _, _ = eval_one_epoch(
            model, val_loader, criterion, DEVICE
        )

        print(f"Train  - Loss: {train_loss:.4f} | F1: {train_f1:.4f} | P: {train_prec:.4f} | R: {train_rec:.4f}")
        print(f"Val    - Loss: {val_loss:.4f} | F1: {val_f1:.4f} | P: {val_prec:.4f} | R: {val_rec:.4f}")

        # Train/val F1 gap as a rough overfitting indicator
        f1_diff = train_f1 - val_f1
        print(f"Diff F1 (train - val): {f1_diff:.4f}")

        # Per-epoch metrics in MLflow
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("train_f1", train_f1, step=epoch)
        mlflow.log_metric("train_precision", train_prec, step=epoch)
        mlflow.log_metric("train_recall", train_rec, step=epoch)

        mlflow.log_metric("val_loss", val_loss, step=epoch)
        mlflow.log_metric("val_f1", val_f1, step=epoch)
        mlflow.log_metric("val_precision", val_prec, step=epoch)
        mlflow.log_metric("val_recall", val_rec, step=epoch)
        mlflow.log_metric("f1_diff_train_val", f1_diff, step=epoch)

        # Early stopping on validation F1
        if val_f1 > BEST_VAL_F1:
            BEST_VAL_F1 = val_f1
            PATIENCE_COUNTER = 0
            # state_dict() returns references to the live parameter tensors,
            # which later optimizer steps overwrite in place. Take a real
            # snapshot (on CPU, to avoid holding a second copy on the GPU) so
            # that restoring the best weights below is not a no-op.
            best_model_state = {
                key: tensor.detach().to("cpu", copy=True)
                for key, tensor in model.state_dict().items()
            }
            print(">>> Nuevo mejor modelo (F1 val).")
        else:
            PATIENCE_COUNTER += 1
            print(f"Sin mejora en F1 val. Patience {PATIENCE_COUNTER}/{PATIENCE}")
            if PATIENCE_COUNTER >= PATIENCE:
                print(">>> Early stopping activado.")
                break

    # Restore the best weights
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    # Final validation metrics of the restored model
    val_loss, val_f1, val_prec, val_rec, _, _ = eval_one_epoch(
        model, val_loader, criterion, DEVICE
    )
    print("\nResultados finales en VALIDACIÓN:")
    print(f"Loss: {val_loss:.4f} | F1: {val_f1:.4f} | P: {val_prec:.4f} | R: {val_rec:.4f}")
    mlflow.log_metric("final_val_loss", val_loss)
    mlflow.log_metric("final_val_f1", val_f1)
    mlflow.log_metric("final_val_precision", val_prec)
    mlflow.log_metric("final_val_recall", val_rec)

    # Final train F1 for the overfitting gap. This must be a pure evaluation:
    # calling train_one_epoch here would run another epoch of optimizer steps
    # and change the weights just restored, so the logged and saved model
    # would no longer be the best checkpoint.
    train_loss_final, train_f1_final, *_ = eval_one_epoch(
        model, train_loader, criterion, DEVICE
    )
    f1_diff_final = train_f1_final - val_f1
    print(f"F1 train final: {train_f1_final:.4f} | Diff F1 final (train - val): {f1_diff_final:.4f}")
    mlflow.log_metric("final_train_f1", train_f1_final)
    mlflow.log_metric("final_f1_diff_train_val", f1_diff_final)

    # Log the model in MLflow
    mlflow.pytorch.log_model(model, artifact_path="model")

print("Entrenamiento completado.")


In [None]:
# ===============================
# 11. EVALUACIÓN EN CONJUNTO TEST
# ===============================

# Single evaluation pass over the held-out test split.
_test_results = eval_one_epoch(model, test_loader, criterion, DEVICE)
test_loss, test_f1, test_prec, test_rec, y_true_test, y_pred_test = _test_results

print("Resultados en TEST:")
for _metric_label, _metric_value in (
    ("Loss", test_loss),
    ("F1  ", test_f1),
    ("P   ", test_prec),
    ("R   ", test_rec),
):
    print(f"{_metric_label}: {_metric_value:.4f}")

# Per-class breakdown
print("\nClassification report (TEST):")
test_report = classification_report(y_true_test, y_pred_test, digits=4)
print(test_report)


In [None]:
# =============================================
# 12. GUARDADO DEL MODELO Y TOKENIZER (../models)
# =============================================

final_model_dir = MODELS_DIR / "bert_youtube_hate"
final_model_dir.mkdir(exist_ok=True, parents=True)

print("Guardando modelo en:", final_model_dir)

# Save model and tokenizer in Hugging Face format
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)

# Also record the columns used and the basic training parameters.
# "model_name" stores the checkpoint that was loaded into `model`
# (MULTILINGUAL_MODEL_NAME), so the saved metadata describes the weights
# written above.
config_dict = {
    "text_column": TEXT_COL,
    "target_column": TARGET_COL,
    "model_name": MULTILINGUAL_MODEL_NAME,
    "max_length": MAX_LENGTH,
    "batch_size": BATCH_SIZE,
    "seed": SEED
}

with open(final_model_dir / "config_training.pkl", "wb") as f:
    pickle.dump(config_dict, f)

print("Modelo y configuración guardados correctamente.")


In [None]:
# =======================
# 13. RESUMEN FINAL
# =======================

# Final summary of the run. Several lines were previously written as
# print(f("...")), which calls the name `f` (the file handle left over from an
# earlier `with open(...) as f`) instead of formatting an f-string, and raised
# a TypeError. They are plain f-strings now, including the test-size line that
# had been commented out because of the same typo.
print("===============================================")
print("RESUMEN FINAL - MODELO BERT DETECCIÓN DE ODIO")
print("===============================================\n")

# Report the checkpoint that was loaded into `model`.
print(f"- Modelo base       : {MULTILINGUAL_MODEL_NAME}")
print(f"- Columna de texto  : {TEXT_COL}")
print(f"- Columna objetivo  : {TARGET_COL}")
print(f"- Tamaño Train (aug): {len(train_dataset)}")
print(f"- Tamaño Val        : {len(val_dataset)}")
print(f"- Tamaño Test      : {len(test_dataset)}")

print("\nMétricas finales:")
print(f"- Val F1            : {val_f1:.4f}")
print(f"- Val Precision     : {val_prec:.4f}")
print(f"- Val Recall       : {val_rec:.4f}")
print(f"- Test F1           : {test_f1:.4f}")
print(f"- Test Precision    : {test_prec:.4f}")
print(f"- Test Recall       : {test_rec:.4f}")

print("\nControl de overfitting (aprox. diff F1 train-val final):")
print(f"- F1 train final    : {train_f1_final:.4f}")
print(f"- F1 val final      : {val_f1:.4f}")
print(f"- Diff F1           : {f1_diff_final:.4f} (objetivo < 0.05)")

print("\nArtefactos guardados:")
print(f"- MLflow runs       : {MLRUNS_DIR}")
print(f"- Modelo HF         : {final_model_dir}")
print("- Configuración     : config_training.pkl")

print("\nPipeline completado.")
