In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
import pandas as pd
import os
from models import columns,vectorize_data
from resultsAnalyse import drawConfusionMatrix
import torch
from sklearn.metrics import f1_score
def filter_inadequada(example):
    return example["INADEQUADA"] == 0

ds = load_dataset("higopires/RePro-categories-multilabel")
print(len(ds["train"]))
#remove INADEQUADA examples
ds = ds.filter(filter_inadequada)
#remove only keep the firs 1000 examples
#ds["train"] = ds["train"].filter(lambda example, idx: idx < 20, with_indices=True)
#ds["test"] = ds["test"].filter(lambda example, idx: idx < 5, with_indices=True)
#ds["validation"] = ds["validation"].filter(lambda example, idx: idx < 5, with_indices=True)
print(len(ds["train"]))

8002
7674


In [2]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    # Convert logits to probabilities and then to binary predictions
    predictions = (torch.sigmoid(torch.tensor(logits)) > 0.5).int().numpy()
    #print(f"DEBUG: Original logits shape: {logits.shape}")
    #print(f"DEBUG: Original labels shape: {labels.shape}")
    # Calculate sample-wise F1 score
    f1 = f1_score(labels, predictions, average='micro', zero_division=0)
    
    return {'f1_micro': float(f1)}
#model_name = "neuralmind/bert-large-portuguese-cased"
model_name = "neuralmind/bert-large-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def preprocess_function(sample):
    # Tokenize text
    tokenized = tokenizer(sample["review_text"], truncation=True, padding=True, max_length=512, return_tensors="pt")
    
    # Convert binary label columns to a list (e.g., [1, 0, 1, 0, 0, 0])
    labels = []
    for i in range(len(sample["review_text"])):
        label_row = [
            float(sample["ENTREGA"][i]),
            float(sample["OUTROS"][i]),
            float(sample["PRODUTO"][i]),
            float(sample["CONDICOESDERECEBIMENTO"][i]),
            float(sample["ANUNCIO"][i])
        ]
        labels.append(label_row)
    
    tokenized["labels"] = torch.tensor(labels, dtype=torch.float)
    return tokenized
#check if output layer has 5 outputs
#print(model.classifier.out_features)
#model.classifier.out_features = 5  # Explicitly ensure final layer has 5 outputs

Add the LORA

In [3]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5,problem_type="multi_label_classification",ignore_mismatched_sizes=True)
#print(model.classifier)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from peft import LoraConfig, TaskType
from peft import get_peft_model
peft_config = LoraConfig(task_type=TaskType.SEQ_CLS #for sequence classification
                         , inference_mode=False #for inference mode
                         , r=64, lora_alpha=32, lora_dropout=0.1) #for dropout

In [5]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
#unfreeze the classifier layer
for param in model.base_model.model.classifier.parameters():
   param.requires_grad = True  # Make this layer trainable
model.print_trainable_parameters()
device = "cuda" if torch.cuda.is_available() else "cpu"

trainable params: 6,296,581 || all params: 340,698,122 || trainable%: 1.8481
trainable params: 6,301,706 || all params: 340,698,122 || trainable%: 1.8496


In [6]:
model.to(device)
tokenized_dataset = ds.map(preprocess_function, batched=True)

In [7]:
training_args = TrainingArguments(
    output_dir="./resultsTransformer",
    learning_rate=1e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1_micro",

)
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,

    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)


  trainer = Trainer(
No label_names provided for model class `PeftModelForSequenceClassification`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Micro
1,0.5402,0.390997,0.654088
2,0.3535,0.284333,0.806185
3,0.2869,0.255257,0.820751
4,0.2616,0.233096,0.839559
5,0.2403,0.218723,0.8529
6,0.2297,0.205758,0.859551
7,0.2197,0.197811,0.86899
8,0.2114,0.191978,0.87513
9,0.2048,0.188551,0.881426
10,0.2042,0.187515,0.879973


TrainOutput(global_step=9600, training_loss=0.2677590525150299, metrics={'train_runtime': 9646.4059, 'train_samples_per_second': 7.955, 'train_steps_per_second': 0.995, 'total_flos': 6.898756224996792e+16, 'train_loss': 0.2677590525150299, 'epoch': 10.0})

In [9]:
y_pred = trainer.predict(tokenized_dataset["test"])
y_pred.metrics

{'test_loss': 0.20323899388313293,
 'test_f1_micro': 0.8645690834473324,
 'test_runtime': 43.9664,
 'test_samples_per_second': 21.971,
 'test_steps_per_second': 2.752}

In [10]:
trainer.save_model("lora_Portuguese_model")

In [11]:
del model  # Remove Python reference
torch.cuda.empty_cache()  # Clear GPU memory