# XLM-Roberta-Base Model

## Load the Dataset and Preprocess


In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np
from sklearn.metrics import f1_score


# Load your dataset
ds = load_dataset("higopires/RePro-categories-multilabel")

def filter_inadequada(example):
    return example["INADEQUADA"] == 0

ds = ds.filter(filter_inadequada)


# Preprocessing
model_name = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(sample):
    tokenized = tokenizer(sample["review_text"], truncation=True, padding=True, max_length=512)
    labels = []
    for i in range(len(sample["review_text"])):
        label_row = [
            float(sample["ENTREGA"][i]),
            float(sample["OUTROS"][i]),
            float(sample["PRODUTO"][i]),
            float(sample["CONDICOESDERECEBIMENTO"][i]),
            float(sample["ANUNCIO"][i])
        ]
        labels.append(label_row)
    tokenized["labels"] = labels
    return tokenized

tokenized_dataset = ds.map(preprocess_function, batched=True)


  from .autonotebook import tqdm as notebook_tqdm


## Load the Model

In [2]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=5, problem_type="multi_label_classification", ignore_mismatched_sizes=True
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Setup


In [3]:
training_args = TrainingArguments(
    output_dir="./resultsTransformer",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    save_total_limit=2,
    
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    f1 = f1_score(labels, predictions, average='micro', zero_division=0)
    return {'f  1': float(f1)}

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


## Fine Tune

In [4]:
trainer.train()


Epoch,Training Loss,Validation Loss,F 1
1,No log,0.150031,0.910858
2,0.272400,0.129391,0.930369
3,0.130800,0.126143,0.934733
4,0.094600,0.131631,0.934868
5,0.064100,0.133617,0.936477


TrainOutput(global_step=2400, training_loss=0.12532694975535075, metrics={'train_runtime': 522.0436, 'train_samples_per_second': 73.5, 'train_steps_per_second': 4.597, 'total_flos': 8830923212552880.0, 'train_loss': 0.12532694975535075, 'epoch': 5.0})

## Evaluation

In [5]:
y_pred = trainer.predict(tokenized_dataset["test"])
print(y_pred.metrics)

{'test_loss': 0.12738604843616486, 'test_f  1': 0.935546875, 'test_runtime': 2.1668, 'test_samples_per_second': 445.828, 'test_steps_per_second': 28.153}


## Save the Model

In [6]:
trainer.save_model("xlm_finetuned")

In [8]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained("xlm_finetuned")
tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

# Your custom review text
text = "Vendo um produto que não funciona e não consigo o reembolso. O atendimento foi péssimo e não me ajudaram em nada. no meu anuncio está tudo certo, mas não me entregaram o produto. Não recomendo a ninguém."

# Tokenize and prepare input
inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

# Get model predictions
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).cpu().numpy()[0]

# Print results
categories = ["ENTREGA", "OUTROS", "PRODUTO", "CONDICOESDERECEBIMENTO", "ANUNCIO"]
for cat, prob in zip(categories, probs):
    print(f"{cat}: {prob:.2f}")

# Binary predictions (0 or 1 for each label)
preds = (probs > 0.5).astype(int)
print("Predicted labels:", dict(zip(categories, preds)))

ENTREGA: 0.57
OUTROS: 1.00
PRODUTO: 0.09
CONDICOESDERECEBIMENTO: 1.00
ANUNCIO: 0.92
Predicted labels: {'ENTREGA': np.int64(1), 'OUTROS': np.int64(1), 'PRODUTO': np.int64(0), 'CONDICOESDERECEBIMENTO': np.int64(1), 'ANUNCIO': np.int64(1)}
