# BERT Large Portuguese Cased Model

## Load the Dataset and Preprocess


In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import torch
import numpy as np
from sklearn.metrics import f1_score


# Load the RePro multi-label categories dataset by its repository id.
DATASET_ID = "higopires/RePro-categories-multilabel"
ds = load_dataset(DATASET_ID)

def filter_inadequada(example):
    """Return True when the example's ``INADEQUADA`` value equals 0.

    Intended as the predicate passed to ``ds.filter``: rows for which this
    returns False are dropped.
    """
    inadequada_flag = example["INADEQUADA"]
    keep_example = inadequada_flag == 0
    return keep_example

# Keep only the examples for which filter_inadequada returns True
# (INADEQUADA == 0); `ds` is rebound to the filtered result.
ds = ds.filter(filter_inadequada)


# Preprocessing: the tokenizer is loaded from the same checkpoint name that is
# later passed to the model loader.
model_name = "neuralmind/bert-large-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
)

# Label columns, in the order they are packed into each "labels" vector.
LABEL_COLUMNS = ("ENTREGA", "OUTROS", "PRODUTO", "CONDICOESDERECEBIMENTO", "ANUNCIO")


def preprocess_function(sample):
    """Tokenize a batch of reviews and attach one float label vector per review.

    Args:
        sample: Batch mapping column name -> list of values (the function is
            applied with ``batched=True``). Must contain ``"review_text"``
            and every column named in ``LABEL_COLUMNS``.

    Returns:
        The tokenizer output with an extra ``"labels"`` entry: one list of
        ``len(LABEL_COLUMNS)`` floats per review, ordered as ``LABEL_COLUMNS``.
    """
    # Only truncate here. Padding is deliberately left to the
    # DataCollatorWithPadding used at training time, which pads each training
    # batch to its own longest sequence; padding inside a batched `map` would
    # instead pad every row to the longest review in the whole map batch.
    tokenized = tokenizer(sample["review_text"], truncation=True, max_length=512)

    # Transpose the per-column lists into per-example rows. Values are cast to
    # float, as in the original implementation (the model is configured for
    # multi-label classification).
    label_columns = [sample[column] for column in LABEL_COLUMNS]
    tokenized["labels"] = [
        [float(value) for value in row] for row in zip(*label_columns)
    ]
    return tokenized

# Maximum number of rows kept from each split after tokenization.
SUBSET_SIZES = {"train": 4000, "validation": 400, "test": 400}

# Tokenize every split in batches.
tokenized_dataset = ds.map(preprocess_function, batched=True)

for split_name, max_rows in SUBSET_SIZES.items():
    split = tokenized_dataset[split_name]
    # Cap at the split length so that a split with fewer rows than requested
    # is used whole instead of failing on out-of-range indices.
    tokenized_dataset[split_name] = split.select(range(min(max_rows, len(split))))


  from .autonotebook import tqdm as notebook_tqdm


## Load the Model

In [2]:
# Sequence-classification model with five outputs, configured for
# multi-label classification.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Training Setup


In [None]:
training_args = TrainingArguments(
    # Checkpoint location and retention limit.
    output_dir="./resultsTransformer",
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,
    # Evaluate and save once per epoch; reload the best checkpoint at the end.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Collator that pads the examples of each batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score from raw logits.

    Args:
        eval_pred: Pair ``(logits, labels)``; both are unpacked and passed on
            as array-likes.

    Returns:
        A dict with the single key ``"f1"`` holding the score as a Python
        float.
    """
    logits, labels = eval_pred
    # Independent sigmoid per label, thresholded at 0.5, gives 0/1 predictions.
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    f1 = f1_score(labels, predictions, average='micro', zero_division=0)
    # The key must be exactly "f1" (no embedded spaces): the recorded test
    # output reports this metric as "test_f1".
    return {'f1': float(f1)}

# Assemble the Trainer from the model, arguments, data splits, collator and
# metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    # `tokenizer=` is deprecated on Trainer and emits a warning;
    # `processing_class=` is its replacement (requires a transformers release
    # that provides it — TODO confirm the pinned version).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


## Fine-Tune

In [4]:
# Run fine-tuning with the configured Trainer; the call's return value is the
# cell's displayed output.
trainer.train()


Epoch,Training Loss,Validation Loss


: 

## Evaluation

In [20]:
# Run prediction on the test split and print the metrics attached to the
# prediction output.
test_split = tokenized_dataset["test"]
y_pred = trainer.predict(test_split)
print(y_pred.metrics)

{'test_loss': 0.1295471042394638, 'test_f1': 0.930460333006856, 'test_runtime': 2.1103, 'test_samples_per_second': 457.745, 'test_steps_per_second': 28.905}


## Save the Model

In [None]:
# Save the fine-tuned model to a local directory.
FINETUNED_MODEL_DIR = "bert_large_portuguese_cased_finetuned"
trainer.save_model(FINETUNED_MODEL_DIR)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Reload the fine-tuned classifier from its saved directory; the tokenizer is
# loaded from the base checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("neuralmind/bert-large-portuguese-cased")
model = AutoModelForSequenceClassification.from_pretrained("bert_large_portuguese_cased_finetuned")

# Label names paired positionally with the model's output probabilities.
categories = ["ENTREGA", "OUTROS", "PRODUTO", "CONDICOESDERECEBIMENTO", "ANUNCIO"]

# Example review text to classify.
text = "Sigam o meu canal no youtube, e me sigam no instagram @higopires"

# Encode the text as PyTorch tensors, truncating to at most 512 tokens.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=512,
)

# Forward pass without gradient tracking; a sigmoid maps each logit to an
# independent per-label probability. Index 0 selects the single input text.
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.sigmoid(logits).cpu().numpy()[0]

# Report the probability of each label.
for cat, prob in zip(categories, probs):
    print(f"{cat}: {prob:.2f}")

# Threshold the probabilities at 0.5 to obtain a 0/1 decision per label.
preds = (probs > 0.5).astype(int)
print("Predicted labels:", {cat: pred for cat, pred in zip(categories, preds)})

ENTREGA: 0.02
OUTROS: 0.87
PRODUTO: 0.66
CONDICOESDERECEBIMENTO: 0.07
ANUNCIO: 0.08
Predicted labels: {'ENTREGA': np.int64(0), 'OUTROS': np.int64(1), 'PRODUTO': np.int64(1), 'CONDICOESDERECEBIMENTO': np.int64(0), 'ANUNCIO': np.int64(0)}
