In [1]:
# change this if running locally
!pip install datasets
!pip install --force-reinstall --no-deps git+https://github.com/huggingface/transformers
!pip install scikit-multilearn

Collecting git+https://github.com/huggingface/transformers
  Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-wupc4pj4
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers /tmp/pip-req-build-wupc4pj4
  Resolved https://github.com/huggingface/transformers to commit 24c91f095fec4d90fa6901ef17146b4f4c21d0a3
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: transformers
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-4.48.0.dev0-py3-none-any.whl size=10328608 sha256=094381c19b76ffc8de55cc3ffa180e3e5e06550e94b0d1b3ed25c3b87ea658dd
  Stored in directory: /tmp/pip-ephem-wheel-cache-khc52hzs/wheels/c0/14/d6/6c9a5582d2ac191ec0a483be151a4495fe1eb2a6706ca49f1b
Successfully built transformer

In [2]:
import pandas as pd
import numpy as np
import nltk
import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from typing import Union
from skmultilearn.model_selection import iterative_train_test_split

MODEL_NAME = "ModernBERT-large"
MODEL_PATH = "answerdotai/" + MODEL_NAME

In [3]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# change this if running locally
file = '/content/ptbr.csv'
data = pd.read_csv(file)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226 entries, 0 to 2225
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2226 non-null   object
 1   text      2226 non-null   object
 2   Anger     2226 non-null   int64 
 3   Disgust   2226 non-null   int64 
 4   Fear      2226 non-null   int64 
 5   Joy       2226 non-null   int64 
 6   Sadness   2226 non-null   int64 
 7   Surprise  2226 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 139.2+ KB


In [5]:
data.head()

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,ptbr_train_track_b_00001,Essa é a democracia e liberdade que a e seu p...,1,0,0,0,0,0
1,ptbr_train_track_b_00002,fiz isso! vou ficar prestando atenção se o che...,0,0,0,0,0,0
2,ptbr_train_track_b_00003,anao meu irmao ta em casa kk lixo,1,0,0,0,1,0
3,ptbr_train_track_b_00004,"Mas... a pandemia vem pra ficar, meu chapa. At...",0,0,2,0,0,0
4,ptbr_train_track_b_00005,Não entendi o post. Está enaltecendo a invasão...,2,1,0,0,0,0


### Pré-processamento dos dados
1. Case folding
2. Remover stop words
3. Remover acentuação e pontuação


In [6]:
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords[:20]

['a',
 'à',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'às',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele']

In [7]:
data["clean_text"] = data["text"].copy()
# Case folding
data["clean_text"] = data["clean_text"].str.lower().str.strip()
# Remover stopwords
data["clean_text"] = data["clean_text"].replace({r"\b" + stopword + r"\b": "" for stopword in stopwords}, regex=True)
# Remover pontuação
data["clean_text"] = data["clean_text"].str.replace(r"[\.!?\\\-.,]", "", regex=True)
data["clean_text"] = data["clean_text"].str.replace(r"\s+", " ", regex=True)


In [8]:
data["text"], data["clean_text"]

(0       Essa é a democracia e liberdade que a  e seu p...
 1       fiz isso! vou ficar prestando atenção se o che...
 2                       anao meu irmao ta em casa kk lixo
 3       Mas... a pandemia vem pra ficar, meu chapa. At...
 4       Não entendi o post. Está enaltecendo a invasão...
                               ...                        
 2221                                 Vc merece muito mais
 2222    Também , maioria sem nada para fazer , aí acab...
 2223    acho que chegou a hora de esquecer os erros do...
 2224                           mas é óbvio oq vc esperava
 2225    vamos ajudar na votação?! comentem a foto pref...
 Name: text, Length: 2226, dtype: object,
 0                  democracia liberdade presidente pregam
 1         fiz vou ficar prestando atenção cheiro continua
 2                              anao irmao ta casa kk lixo
 3        pandemia vem pra ficar chapa todos tomarem va...
 4        entendi post enaltecendo invasão polônia alem...
              

### Adicionar emoção neutra

Para casos que não tem nenhuma emoção na linha, adicionamos uma nova emoção: neutro

In [9]:
BASE_EMOTIONS = ["Anger", "Disgust", "Fear", "Joy", "Sadness", "Surprise"]

data["Neutral"] = 0
no_emotions_mask = data[BASE_EMOTIONS].sum(axis=1) == 0
data.loc[no_emotions_mask, "Neutral"] = 1

In [10]:
EMOTIONS = BASE_EMOTIONS + ["Neutral"]

### Iterative train test split
Uma vez que o dataset é desbalanceado, precisamos garantir que os dados de treino e teste tenham proporções similares de cada classe. Entretanto, já que o nosso problema é multi classe, utilizar o *train_test_split* do scikit-learn não funciona, uma vez que ele não lida bem com problemas multi classe, pois nesse tipo de problema há muitas combinações de classe possíveis. Sendo assim, utilizamos a função *iterative_train_test_split* que se propõe a resolver esse problema

In [11]:
def concat_X_y(X: np.array, y: np.array, columns: list[str]) -> pd.DataFrame:
    concatted_np = np.concatenate((X, y), axis=1)
    concatted = pd.DataFrame(concatted_np, columns=columns)
    return concatted


def train_test_val_split(
        data: pd.DataFrame,
        feature_label: str,
        targets_labels: list[str]
    ) -> Union[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X = data[feature_label].to_numpy().reshape(-1, 1)
    y = data[targets_labels].to_numpy()

    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X,
        y,
        test_size=0.1,
    )
    X_train, y_train, X_val, y_val = iterative_train_test_split(
        X_train,
        y_train,
        test_size=0.1,
    )
    columns = [feature_label] + targets_labels

    train = concat_X_y(X_train, y_train, columns)
    test = concat_X_y(X_test, y_test, columns)
    val = concat_X_y(X_val, y_val, columns)

    return train, test, val

train, test, val = train_test_val_split(data, "clean_text", EMOTIONS)


### Preparação para a estrutura do Huggingface

In [12]:
def create_dataset_dict(train: pd.DataFrame, val: pd.DataFrame, test: pd.DataFrame) -> DatasetDict:
    train_dataset = Dataset.from_pandas(train)
    val_dataset = Dataset.from_pandas(val)
    test_dataset = Dataset.from_pandas(test)

    return DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })

dataset = create_dataset_dict(train, val, test)
print(dataset)
dataset["train"][0]


DatasetDict({
    train: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral'],
        num_rows: 1802
    })
    validation: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral'],
        num_rows: 200
    })
    test: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral'],
        num_rows: 224
    })
})


{'clean_text': 'fiz vou ficar prestando atenção cheiro continua',
 'Anger': 0,
 'Disgust': 0,
 'Fear': 0,
 'Joy': 0,
 'Sadness': 0,
 'Surprise': 0,
 'Neutral': 1}

In [13]:
labels = dataset['train'].column_names
labels.remove('clean_text')
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
print(labels)
print(id2label)
print(label2id)

['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral']
{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Joy', 4: 'Sadness', 5: 'Surprise', 6: 'Neutral'}
{'Anger': 0, 'Disgust': 1, 'Fear': 2, 'Joy': 3, 'Sadness': 4, 'Surprise': 5, 'Neutral': 6}


In [14]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando o dispositivo: {device}")

Usando o dispositivo: cuda


In [15]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_data(example):
  text = example["clean_text"]
  example_labels = {k: example[k] for k in example.keys() if k in labels}
  labels_dict = [0. for i in range(len(labels))]
  for label, value in example_labels.items():
       label_id = label2id[label]
       labels_dict[label_id] = float(value)
  encoding = tokenizer(text, truncation=True, max_length=128)
  encoding["labels"] = labels_dict

  return encoding
tokenized_dataset = dataset.map(tokenize_data, remove_columns=dataset['train'].column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/1802 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

In [16]:
example = tokenized_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'attention_mask', 'labels'])


In [17]:
tokenizer.decode(example['input_ids'])

'[CLS]fiz vou ficar prestando atenção cheiro continua[SEP]'

In [18]:
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

### Finetuning do modelo

In [19]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,
                                    problem_type="multi_label_classification",
                                    num_labels=len(labels),
                                    id2label=id2label,
                                    label2id=label2id)
model.to(device)

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ModernBertForSequenceClassification(
  (model): ModernBertModel(
    (embeddings): ModernBertEmbeddings(
      (tok_embeddings): Embedding(50368, 1024, padding_idx=50283)
      (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (drop): Dropout(p=0.0, inplace=False)
    )
    (layers): ModuleList(
      (0): ModernBertEncoderLayer(
        (attn_norm): Identity()
        (attn): ModernBertAttention(
          (Wqkv): Linear(in_features=1024, out_features=3072, bias=False)
          (rotary_emb): ModernBertRotaryEmbedding()
          (Wo): Linear(in_features=1024, out_features=1024, bias=False)
          (out_drop): Identity()
        )
        (mlp_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (mlp): ModernBertMLP(
          (Wi): Linear(in_features=1024, out_features=5248, bias=False)
          (act): GELUActivation()
          (drop): Dropout(p=0.0, inplace=False)
          (Wo): Linear(in_features=2624, out_features=1024, bias=False)
        

In [20]:
batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    MODEL_NAME + "-multisentiment-portuguese",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    label_names=["labels"],
    #push_to_hub=True,
)



In [21]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction

def sigmoid(x):
   return 1/(1 + np.exp(-x))

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    predictions = sigmoid(predictions)
    y_pred = (predictions > 0.5).astype(int).reshape(-1)
    y_true = labels.astype(int).reshape(-1)

    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average="macro")
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")

    metrics = {
        'f1': f1_macro_average,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    }
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions,
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds,
        labels=p.label_ids)
    return result

In [22]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],  # Subsample para treino rápido
    eval_dataset=tokenized_dataset["validation"],    # Subsample para validação
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

  trainer = Trainer(


In [24]:
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mkrusovitor[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,No log,0.475929,0.279579,0.766429,0.271458,0.288588
2,No log,0.485855,0.277466,0.724286,0.269648,0.298309
3,0.481800,0.463578,0.226562,0.828571,0.207143,0.25
4,0.481800,0.438224,0.285201,0.822143,0.294786,0.286701
5,0.442300,0.486002,0.28148,0.776429,0.273435,0.290358
6,0.442300,0.537801,0.278093,0.810714,0.284098,0.28076
7,0.017000,0.540196,0.281328,0.825,0.298918,0.282577
8,0.017000,0.633636,0.278283,0.815714,0.28533,0.281022
9,-1.277800,0.570661,0.27885,0.820714,0.289122,0.281284
10,-1.277800,0.661639,0.294166,0.813571,0.295429,0.296579


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=2260, training_loss=-0.2916578917376763, metrics={'train_runtime': 1636.5349, 'train_samples_per_second': 11.011, 'train_steps_per_second': 1.381, 'total_flos': 2288510917323288.0, 'train_loss': -0.2916578917376763, 'epoch': 10.0})

In [25]:
trainer.save_model("model")