In [1]:
import functools
import os

import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from sklearn.metrics import f1_score
from transformers import AutoModel
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
from transformers import set_seed

from models import columns,vectorize_data
from resultsAnalyse import drawConfusionMatrix
def filter_inadequada(example):
    """Tell whether an example is free of the INADEQUADA flag.

    Written as a row predicate for ``ds.filter``: the result is truthy only
    when the example's ``INADEQUADA`` value equals 0.
    """
    inadequada_flag = example["INADEQUADA"]
    return inadequada_flag == 0

# Load the RePro multilabel dataset and report the size of its train split.
ds = load_dataset("higopires/RePro-categories-multilabel")
print(len(ds["train"]))
# Keep only the examples whose INADEQUADA flag is 0 (see filter_inadequada).
# `ds` is rebound here, so every later cell works on the filtered data.
ds = ds.filter(filter_inadequada)
# Train split size after filtering, for comparison with the count above.
print(len(ds["train"]))

8002
7674


In [2]:


# Inspect the column schema of the (already filtered) train split.
train_features = ds["train"].features
print(train_features)


{'review_text': Value(dtype='string', id=None), 'ENTREGA': Value(dtype='int64', id=None), 'OUTROS': Value(dtype='int64', id=None), 'PRODUTO': Value(dtype='int64', id=None), 'CONDICOESDERECEBIMENTO': Value(dtype='int64', id=None), 'INADEQUADA': Value(dtype='int64', id=None), 'ANUNCIO': Value(dtype='int64', id=None)}


In [15]:
def compute_metrics(eval_pred):
    """Compute the micro-averaged F1 score for multilabel predictions.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.
            Assumes both are array-likes with one row per example and one
            column per label -- TODO confirm against the caller.

    Returns:
        A dict with the single key ``'f1'`` holding the score as a Python
        float.
    """
    raw_scores, references = eval_pred

    # Each label is scored independently: squash with a sigmoid and mark the
    # label as predicted when its probability is strictly above 0.5.
    probabilities = torch.sigmoid(torch.tensor(raw_scores))
    binary_predictions = probabilities.gt(0.5).int().numpy()

    # Micro average: pools the decisions of all labels and examples.
    # zero_division=0 makes the score 0 instead of warning when undefined.
    micro_f1 = f1_score(
        references,
        binary_predictions,
        average='micro',
        zero_division=0,
    )

    return {'f1': float(micro_f1)}

# Label columns, in the order they are packed into each "labels" vector.
LABEL_COLUMNS = ("ENTREGA", "OUTROS", "PRODUTO", "CONDICOESDERECEBIMENTO", "ANUNCIO")


@functools.lru_cache(maxsize=None)
def _load_tokenizer(model_name):
    """Load the tokenizer for ``model_name`` once and reuse it afterwards."""
    return AutoTokenizer.from_pretrained(model_name)


def preprocess_function(sample, model_name="distilbert-base-uncased", tokenizer=None):
    """Tokenize a batch of reviews and attach multilabel float targets.

    Meant to be used with ``ds.map(preprocess_function, batched=True)``.

    Args:
        sample: Batch mapping column names to equally long lists. Must hold
            ``"review_text"`` and every column named in ``LABEL_COLUMNS``.
        model_name: Checkpoint whose tokenizer is loaded when ``tokenizer``
            is not supplied. Defaults to the checkpoint used before.
        tokenizer: Optional ready-made tokenizer callable. When given, no
            tokenizer is loaded.

    Returns:
        The tokenizer output for the batch with an extra ``"labels"`` entry:
        one list of floats per example, ordered as ``LABEL_COLUMNS``
        (e.g. ``[1.0, 0.0, 1.0, 0.0, 0.0]``).
    """
    if tokenizer is None:
        # Cached: previously the tokenizer was re-loaded for every batch
        # that map() passed in.
        tokenizer = _load_tokenizer(model_name)

    # NOTE(review): padding=True pads to the longest text of this map() batch.
    # The Trainer cell also builds a DataCollatorWithPadding, so padding here
    # looks redundant -- kept to leave the stored columns unchanged; consider
    # dropping it in favour of the collator's per-batch padding.
    tokenized = tokenizer(sample["review_text"], truncation=True, padding=True)

    # Transpose the per-column label lists into one float row per example.
    label_columns = [sample[name] for name in LABEL_COLUMNS]
    tokenized["labels"] = [
        [float(value) for value in row] for row in zip(*label_columns)
    ]
    return tokenized

# Seed the RNGs *before* the model is built. The classification head is
# newly initialised by from_pretrained (see the warning it prints), and that
# happens here, earlier than the Trainer is constructed, so without an
# explicit seed every run starts from a different head.
SEED = 42
set_seed(SEED)

# NOTE(review): the label names are Portuguese, so the reviews are presumably
# Portuguese too; an English uncased checkpoint may be a poor fit -- confirm.
model_name = "distilbert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,  # one output per label column packed by preprocess_function
    problem_type="multi_label_classification",
    ignore_mismatched_sizes=True,
)

# Tokenize every split and attach the float label vectors.
tokenized_dataset = ds.map(preprocess_function, batched=True)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([6]) in the checkpoint and torch.Size([5]) in the model instantiated
- classifier.weight: found shape torch.Size([6, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Sanity check of the sample-averaged F1 on a two-example multilabel toy case:
# the first row matches exactly (F1 = 1), the second has one extra predicted
# label (precision 1/2, recall 1 -> F1 = 2/3), so the mean is 5/6.
from sklearn.metrics import f1_score
small_true = np.array([[0, 1, 0], [1, 0, 0]])
small_pred = np.array([[0, 1, 0], [1, 0, 1]])
samples_f1 = f1_score(small_true, small_pred, average='samples')
print(samples_f1)

0.8333333333333333


In [17]:
# Spot-check the label vector of the third training example; fetching the row
# first avoids pulling the whole "labels" column just to read one entry.
third_example = tokenized_dataset["train"][2]
print(third_example["labels"])

[1.0, 0.0, 1.0, 0.0, 0.0]


In [None]:
# Fine-tuning configuration: evaluate and checkpoint once per epoch so the
# best checkpoint can be restored when training ends.
# NOTE(review): no metric_for_best_model is set, so "best" presumably falls
# back to the library default rather than the F1 reported by
# compute_metrics -- confirm this is intended.
training_args = TrainingArguments(
    output_dir="./resultsTransformer",
    num_train_epochs=10,
    learning_rate=1e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# The collator pads each training/evaluation batch with this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

#trainer.evaluate()

In [19]:
# Run the fine-tuning loop configured above. The call is the last expression
# of the cell, so its return value is shown as the cell output.
trainer.train()

  0%|          | 0/4800 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.17419873178005219, 'eval_f1': 0.9060000000000001, 'eval_runtime': 6.5045, 'eval_samples_per_second': 146.361, 'eval_steps_per_second': 9.224, 'epoch': 1.0}
{'loss': 0.2512, 'grad_norm': 1.5126404762268066, 'learning_rate': 8.958333333333334e-06, 'epoch': 1.04}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.15865370631217957, 'eval_f1': 0.912729658792651, 'eval_runtime': 6.4446, 'eval_samples_per_second': 147.722, 'eval_steps_per_second': 9.31, 'epoch': 2.0}
{'loss': 0.1376, 'grad_norm': 2.5648069381713867, 'learning_rate': 7.916666666666667e-06, 'epoch': 2.08}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.15751013159751892, 'eval_f1': 0.9146622734761121, 'eval_runtime': 6.461, 'eval_samples_per_second': 147.345, 'eval_steps_per_second': 9.286, 'epoch': 3.0}
{'loss': 0.1094, 'grad_norm': 2.1984260082244873, 'learning_rate': 6.875e-06, 'epoch': 3.12}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.15651944279670715, 'eval_f1': 0.91351529102269, 'eval_runtime': 6.2883, 'eval_samples_per_second': 151.393, 'eval_steps_per_second': 9.542, 'epoch': 4.0}
{'loss': 0.0896, 'grad_norm': 1.04468834400177, 'learning_rate': 5.833333333333334e-06, 'epoch': 4.17}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.16033464670181274, 'eval_f1': 0.9148306478132193, 'eval_runtime': 6.3337, 'eval_samples_per_second': 150.306, 'eval_steps_per_second': 9.473, 'epoch': 5.0}
{'loss': 0.0771, 'grad_norm': 2.3940703868865967, 'learning_rate': 4.791666666666668e-06, 'epoch': 5.21}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.16528066992759705, 'eval_f1': 0.9111617312072893, 'eval_runtime': 6.782, 'eval_samples_per_second': 140.371, 'eval_steps_per_second': 8.847, 'epoch': 6.0}
{'loss': 0.0648, 'grad_norm': 6.081669330596924, 'learning_rate': 3.7500000000000005e-06, 'epoch': 6.25}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.16705229878425598, 'eval_f1': 0.9168865435356202, 'eval_runtime': 5.7987, 'eval_samples_per_second': 164.175, 'eval_steps_per_second': 10.347, 'epoch': 7.0}
{'loss': 0.0541, 'grad_norm': 1.2114629745483398, 'learning_rate': 2.7083333333333334e-06, 'epoch': 7.29}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.17322543263435364, 'eval_f1': 0.9174852652259331, 'eval_runtime': 5.843, 'eval_samples_per_second': 162.93, 'eval_steps_per_second': 10.269, 'epoch': 8.0}
{'loss': 0.0498, 'grad_norm': 2.4577016830444336, 'learning_rate': 1.6666666666666667e-06, 'epoch': 8.33}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.17443686723709106, 'eval_f1': 0.9160405096373735, 'eval_runtime': 6.0623, 'eval_samples_per_second': 157.035, 'eval_steps_per_second': 9.897, 'epoch': 9.0}
{'loss': 0.0431, 'grad_norm': 0.282556414604187, 'learning_rate': 6.25e-07, 'epoch': 9.38}


  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 0.17563661932945251, 'eval_f1': 0.9159306509649984, 'eval_runtime': 5.9368, 'eval_samples_per_second': 160.355, 'eval_steps_per_second': 10.106, 'epoch': 10.0}
{'train_runtime': 2390.7056, 'train_samples_per_second': 32.099, 'train_steps_per_second': 2.008, 'train_loss': 0.09397848129272461, 'epoch': 10.0}


TrainOutput(global_step=4800, training_loss=0.09397848129272461, metrics={'train_runtime': 2390.7056, 'train_samples_per_second': 32.099, 'train_steps_per_second': 2.008, 'total_flos': 1.016534686824e+16, 'train_loss': 0.09397848129272461, 'epoch': 10.0})

In [8]:
# Run the trainer on the test split. Despite its name, `y_pred` holds the
# whole prediction result object, not just the predicted labels.
y_pred = trainer.predict(tokenized_dataset["test"])
# Display the metrics attached to that result (loss, F1, runtime figures).
y_pred.metrics

  0%|          | 0/63 [00:00<?, ?it/s]

{'test_loss': 0.15109142661094666,
 'test_f1': 0.8948350964530181,
 'test_runtime': 6.6208,
 'test_samples_per_second': 152.095,
 'test_steps_per_second': 9.515}

In [9]:
# Save the trainer's current model under the relative path below.
# NOTE(review): with load_best_model_at_end=True this is presumably the
# restored best checkpoint rather than the last-epoch weights -- confirm.
trainer.save_model("finetunedTransformer_1")