# Librerías

In [None]:
import numpy as np
from datasets import load_dataset
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed,
)

# Seed the random number generators through transformers' helper for reproducible runs
set_seed(seed=42)

# Dataset usado

In [None]:
# Load the dataset from the Hugging Face Hub:
# https://huggingface.co/datasets/mteb/SpanishSentimentClassification
dataset_id = "mteb/SpanishSentimentClassification"
ds = load_dataset(dataset_id)
ds

In [None]:
# Examples: show 10 rows of the shuffled (seed 100) train and test splits
for split_name in ("train", "test"):
    sample = ds[split_name].shuffle(seed=100).select(range(10))
    display(sample.to_pandas())

In [None]:
from collections import Counter

# Label frequencies in the train and test splits
train_counts = Counter(ds["train"]["label"])
test_counts = Counter(ds["test"]["label"])

# Print the count of label 0 and label 1 for each split
for title, counts in (("TRAIN:", train_counts), ("\nTEST:", test_counts)):
    print(title)
    print("  #0:", counts[0])
    print("  #1:", counts[1])

# Preprocesado

In [None]:
# Checkpoint name; the tokenizer is downloaded from it
model_name = "BSC-LT/mRoBERTa"

# Maximum sequence length
max_length = 256

tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess(batch):
    """Tokenize the "text" entries of a batch, truncated to max_length.

    Returns the tokenizer output: the tokenized text mapped to vocabulary IDs.
    """
    texts = batch["text"]
    return tokenizer(texts, max_length=max_length, truncation=True)

# Columns we keep; everything else is dropped below
kept_columns = ("input_ids", "attention_mask", "labels")

# Tokenize the whole dataset, then rename 'label' to 'labels' so the
# Trainer recognizes it
encoded = ds.map(preprocess, batched=True).rename_column("label", "labels")
extra_columns = [
    name for name in encoded["train"].column_names if name not in kept_columns
]
encoded = encoded.remove_columns(extra_columns)

# input_ids: token ids
# attention_mask: attention mask
# labels: class labels
encoded


In [None]:
# Inspect the first training example: token ids, decoded text, attention
# mask and label. Fetch row 0 once instead of indexing into whole columns,
# so a single element does not require reading an entire column each time.
first_example = encoded["train"][0]
print(first_example["input_ids"])
print(tokenizer.decode(first_example["input_ids"]))
print(first_example["attention_mask"])
print(f"Label: {first_example['labels']}")

# El modelo

In [None]:
# Class id <-> class name mappings; the reverse map and the class count are
# derived from id2label so the three stay in sync
id2label = {0: "NEG", 1: "POS"}
label2id = {name: idx for idx, name in id2label.items()}
num_labels = len(id2label)

# Download the pretrained model and give it a two-class classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    id2label=id2label,
    label2id=label2id,
    num_labels=num_labels,
)

# Entrenamiento

In [None]:
# Evaluation metrics: accuracy, macro-F1, and precision/recall/F1 for the POS class (label 1)
acc = evaluate.load("accuracy")
f1_macro = evaluate.load("f1")
prec = evaluate.load("precision")
rec = evaluate.load("recall")

# Function handed to the Trainer to compute the metrics
def compute_metrics(eval_pred, pos_label=1):
    """Compute classification metrics from an evaluation prediction.

    Args:
        eval_pred: a (logits, labels) pair.
        pos_label: label id treated as the positive class for the binary
            precision/recall/F1. Defaults to 1, the id mapped to "POS" in
            this notebook's id2label.

    Returns:
        A dict holding the output of the accuracy metric plus the keys
        "f1_macro", "precision_pos", "recall_pos" and "f1_pos".
    """
    logits, labels = eval_pred
    # Predicted class = index of the highest logit
    preds = np.argmax(logits, axis=-1)

    out = {}
    out.update(acc.compute(predictions=preds, references=labels))
    # macro-F1 (robust under class imbalance)
    out["f1_macro"] = f1_macro.compute(
        predictions=preds, references=labels, average="macro"
    )["f1"]

    # Binary precision/recall/F1 for the positive class. These previously
    # used pos_label=0, which reported NEG-class scores under "*_pos" names.
    binary_kwargs = dict(
        predictions=preds,
        references=labels,
        average="binary",
        pos_label=pos_label,
    )
    out["precision_pos"] = prec.compute(**binary_kwargs)["precision"]
    out["recall_pos"] = rec.compute(**binary_kwargs)["recall"]
    # The "f1" metric object is reused here with binary averaging
    out["f1_pos"] = f1_macro.compute(**binary_kwargs)["f1"]

    return out

In [None]:
from transformers import EarlyStoppingCallback

# Data collator that applies dynamic padding
data_collator = DataCollatorWithPadding(tokenizer)

# Training parameters
args = TrainingArguments(
    output_dir="mroberta-spanish-sentiment",
    report_to="none",
    # Optimization
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Evaluate, checkpoint and log on an epoch schedule
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=1,
    save_total_limit=2,
    # Best-model selection: macro-F1 is used because the dataset is imbalanced
    load_best_model_at_end=True,
    metric_for_best_model="f1_macro",
    greater_is_better=True,
)

# Build the Trainer, which trains the model and evaluates it every epoch.
# Early stopping is attached as a callback with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=model,
    args=args,
    processing_class=tokenizer,
    data_collator=data_collator,
    train_dataset=encoded["train"],
    eval_dataset=encoded["validation"],
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)


In [None]:
# Run the training loop
trainer.train()

In [None]:
from pprint import pprint

# Evaluate the model on the test split and pretty-print the metrics
test_metrics = trainer.evaluate(eval_dataset=encoded["test"])
pprint(test_metrics)


In [None]:
# Use the model to make predictions
from transformers import pipeline

# pipeline is the usual tool for end-to-end prediction. It runs on the
# device the trained model already sits on; a hard-coded device=0 fails
# on machines without a CUDA GPU.
clf = pipeline(
    "text-classification",
    model=trainer.model,
    tokenizer=tokenizer,
    device=trainer.model.device,
)

# Classify the first 8 examples of the shuffled (seed 100) test split and
# print the gold label next to the pipeline prediction
raw_test = ds["test"].shuffle(seed=100)
for i, example in enumerate(raw_test.select(range(8)), start=1):
    text = example["text"]
    gold = example["label"]
    pred = clf(text)[0]
    print(f"\n[{i}] gold={gold} text: {text[:180]}")
    pprint(pred)
