In [2]:
# change this if running locally
!pip install datasets
# !pip install --force-reinstall --no-deps git+https://github.com/huggingface/transformers
!pip install scikit-multilearn
!pip install triton



In [3]:
import pandas as pd
import numpy as np
import nltk
import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification
from typing import Union
from skmultilearn.model_selection import iterative_train_test_split, IterativeStratification

# Hugging Face Hub checkpoint identifiers that can be fine-tuned.
BERT = "bert-large-uncased"
# Identifier as it appears in the checkpoint-load log recorded in this notebook.
BERTAPORU = "pablocosta/bertabaporu-large-uncased"
ALBERT = "albert-large-v2"

# Checkpoint that the tokenizer and the model are loaded from.
MODEL_PATH = ALBERT
# Short name used to label the training output directory. It is derived from
# MODEL_PATH so the saved artifacts are always named after the checkpoint that
# was actually trained.
MODEL_NAME = MODEL_PATH.split("/")[-1]

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the NLTK stop-word corpus used by the pre-processing step below.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# change this if running locally
# Location of the training CSV, relative to the notebook's working directory.
file = 'public_data/train/track_b/ptbr.csv'
# file = '/ptbr.csv'
data = pd.read_csv(file)
# Summarise columns, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226 entries, 0 to 2225
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2226 non-null   object
 1   text      2226 non-null   object
 2   Anger     2226 non-null   int64 
 3   Disgust   2226 non-null   int64 
 4   Fear      2226 non-null   int64 
 5   Joy       2226 non-null   int64 
 6   Sadness   2226 non-null   int64 
 7   Surprise  2226 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 139.3+ KB


In [6]:
# Preview the first rows of the raw data (id, text and one column per emotion).
data.head()

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,ptbr_train_track_b_00001,Essa é a democracia e liberdade que a e seu p...,1,0,0,0,0,0
1,ptbr_train_track_b_00002,fiz isso! vou ficar prestando atenção se o che...,0,0,0,0,0,0
2,ptbr_train_track_b_00003,anao meu irmao ta em casa kk lixo,1,0,0,0,1,0
3,ptbr_train_track_b_00004,"Mas... a pandemia vem pra ficar, meu chapa. At...",0,0,2,0,0,0
4,ptbr_train_track_b_00005,Não entendi o post. Está enaltecendo a invasão...,2,1,0,0,0,0


### Pré-processamento dos dados
1. Case folding
2. Remover stop words
3. Remover pontuação (a acentuação é mantida em `clean_text`; nas saídas abaixo ela só desaparece na tokenização)


In [7]:
# Portuguese stop-word list from NLTK; show the first 20 entries as a sanity check.
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords[:20]

['a',
 'à',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'às',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele']

In [8]:
# One whole-word regex per stop word, each mapped to the empty string.
stopword_patterns = {rf"\b{word}\b": "" for word in stopwords}

# Build `clean_text` from the raw `text` column in a single chain:
#   1. case folding
#   2. stop-word removal
#   3. punctuation removal
#   4. collapse the whitespace left behind and trim both ends
data["clean_text"] = (
    data["text"]
    .str.lower()
    .replace(stopword_patterns, regex=True)
    .str.replace(r"[\.!?\\\-.,]", "", regex=True)
    .str.replace(r"\s+", " ", regex=True)
    .str.strip()
)

In [9]:
# Show the raw and the cleaned text one after the other to check the pre-processing.
data["text"], data["clean_text"]

(0       Essa é a democracia e liberdade que a  e seu p...
 1       fiz isso! vou ficar prestando atenção se o che...
 2                       anao meu irmao ta em casa kk lixo
 3       Mas... a pandemia vem pra ficar, meu chapa. At...
 4       Não entendi o post. Está enaltecendo a invasão...
                               ...                        
 2221                                 Vc merece muito mais
 2222    Também , maioria sem nada para fazer , aí acab...
 2223    acho que chegou a hora de esquecer os erros do...
 2224                           mas é óbvio oq vc esperava
 2225    vamos ajudar na votação?! comentem a foto pref...
 Name: text, Length: 2226, dtype: object,
 0                  democracia liberdade presidente pregam
 1         fiz vou ficar prestando atenção cheiro continua
 2                              anao irmao ta casa kk lixo
 3       pandemia vem pra ficar chapa todos tomarem vac...
 4       entendi post enaltecendo invasão polônia alemã...
              

### Adicionar emoção neutra

Para as linhas que não têm nenhuma emoção anotada, adicionamos uma nova emoção: neutro (coluna `Neutral`). Essa coluna só é criada quando `base_emotions = False`.

In [10]:
BASE_EMOTIONS = ["Anger", "Disgust", "Fear", "Joy", "Sadness", "Surprise"]
# True  -> train on the six annotated emotions only.
# False -> add a "Neutral" label that is 1 for rows with no annotated emotion.
base_emotions = True

if base_emotions:
    EMOTIONS = BASE_EMOTIONS
else:
    # Rows whose emotion columns all sum to zero carry no emotion at all.
    no_emotions_mask = data[BASE_EMOTIONS].sum(axis=1) == 0
    data["Neutral"] = 0
    data.loc[no_emotions_mask, "Neutral"] = 1
    EMOTIONS = BASE_EMOTIONS + ["Neutral"]


In [11]:
# Column-wise sum of every emotion column, to gauge how imbalanced the labels are.
data[EMOTIONS].sum()

Anger       1024
Disgust       96
Fear         148
Joy          902
Sadness      468
Surprise     209
dtype: int64

### Iterative train test split
Uma vez que o dataset é desbalanceado, precisamos garantir que os dados de treino, validação e teste tenham proporções similares de cada classe. Entretanto, como o nosso problema é multirrótulo (*multi-label*: um mesmo texto pode ter várias emoções ao mesmo tempo), utilizar o *train_test_split* do scikit-learn não funciona bem, pois nesse tipo de problema há muitas combinações de rótulos possíveis. Sendo assim, utilizamos a função *iterative_train_test_split*, do scikit-multilearn, que se propõe a resolver esse problema.

In [12]:
def concat_X_y(X: np.ndarray, y: np.ndarray, columns: list[str]) -> pd.DataFrame:
    """Join a feature matrix and a target matrix into one labelled DataFrame.

    Args:
        X: 2-D array of features, one row per sample.
        y: 2-D array of targets with the same number of rows as ``X``.
        columns: Column names for the result, features first and targets after.

    Returns:
        A DataFrame whose columns are those of ``X`` followed by those of ``y``.
        Columns holding only numbers get a numeric dtype.
    """
    # Stacking text with numbers produces an object array, so every column
    # would otherwise come out with dtype `object`.
    stacked = np.concatenate((X, y), axis=1)
    # infer_objects() gives the numeric target columns their proper dtype back.
    return pd.DataFrame(stacked, columns=columns).infer_objects()


def train_test_val_split(
        data: pd.DataFrame,
        feature_label: str,
        targets_labels: list[str],
        seed: Union[int, None] = None,
        test_size: float = 0.2,
        val_fraction: float = 0.5,
    ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Split ``data`` into train, test and validation frames with iterative stratification.

    The data is first split into a training part and a held-out part, and the
    held-out part is then split again into test and validation.

    Args:
        data: Frame holding the feature column and every target column.
        feature_label: Name of the single feature column.
        targets_labels: Names of the target columns.
        seed: When given, seeds NumPy's *global* random generator before
            splitting (a side effect visible to the rest of the session).
        test_size: Fraction of ``data`` held out from training (shared by the
            test and validation frames).
        val_fraction: Fraction of the held-out rows assigned to validation;
            the remainder becomes the test frame.

    Returns:
        ``(train, test, val)`` -- note the order -- each with the columns
        ``[feature_label] + targets_labels``.
    """
    if seed is not None:
        np.random.seed(seed)

    # The splitter works on 2-D arrays, hence the reshape of the single feature column.
    X = data[feature_label].to_numpy().reshape(-1, 1)
    y = data[targets_labels].to_numpy()

    # First pass: training rows vs. held-out rows.
    X_train, y_train, X_holdout, y_holdout = iterative_train_test_split(
        X,
        y,
        test_size=test_size,
    )
    # Second pass: divide the held-out rows between test and validation.
    X_test, y_test, X_val, y_val = iterative_train_test_split(
        X_holdout,
        y_holdout,
        test_size=val_fraction,
    )

    columns = [feature_label] + targets_labels
    train = concat_X_y(X_train, y_train, columns)
    test = concat_X_y(X_test, y_test, columns)
    val = concat_X_y(X_val, y_val, columns)

    return train, test, val

# Split the cleaned text and its emotion labels; the fixed seed keeps the split repeatable.
train, test, val = train_test_val_split(
    data,
    feature_label="clean_text",
    targets_labels=EMOTIONS,
    seed=42,
)


In [13]:
# Print (rows, columns) for train, test and validation, then preview the validation split.
for split_frame in (train, test, val):
    print(split_frame.shape)
val.head()

(1780, 7)
(223, 7)
(223, 7)


Unnamed: 0,clean_text,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,democracia liberdade presidente pregam,1,0,0,0,0,0
1,pro pessoal fandom canon querem kkkk pena,0,0,0,0,1,0
2,porque toda semana precisam eleger inimigo pra...,2,1,0,0,0,0
3,bati cotovelo porta chorar baba ranho dores ca...,2,0,0,0,1,0
4,tao linda parece,0,0,0,1,0,0


### Preparação para a estrutura do Huggingface

In [14]:
def create_dataset_dict(train: pd.DataFrame, val: pd.DataFrame, test: pd.DataFrame) -> DatasetDict:
    """Bundle the three pandas splits into a single ``DatasetDict``.

    Args:
        train: Training split.
        val: Validation split.
        test: Test split.

    Returns:
        A ``DatasetDict`` with the keys ``'train'``, ``'validation'`` and
        ``'test'``, each built from the matching frame via ``Dataset.from_pandas``.
    """
    frames_by_split = {
        'train': train,
        'validation': val,
        'test': test,
    }
    return DatasetDict({
        split_name: Dataset.from_pandas(frame)
        for split_name, frame in frames_by_split.items()
    })

# Wrap the three splits in a DatasetDict, then show its structure and the first training row.
dataset = create_dataset_dict(train, val, test)
print(dataset)
dataset["train"][0]


DatasetDict({
    train: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise'],
        num_rows: 1780
    })
    validation: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise'],
        num_rows: 223
    })
    test: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise'],
        num_rows: 223
    })
})


{'clean_text': 'fiz vou ficar prestando atenção cheiro continua',
 'Anger': 0,
 'Disgust': 0,
 'Fear': 0,
 'Joy': 0,
 'Sadness': 0,
 'Surprise': 0}

In [15]:
# Every column except the text column is a label.
labels = dataset['train'].column_names
labels.remove('clean_text')

# Two-way lookup between label names and their positional ids.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

for mapping in (labels, id2label, label2id):
    print(mapping)

['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise']
{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Joy', 4: 'Sadness', 5: 'Surprise'}
{'Anger': 0, 'Disgust': 1, 'Fear': 2, 'Joy': 3, 'Sadness': 4, 'Surprise': 5}


In [16]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Usando o dispositivo: {device}")

Usando o dispositivo: cpu


In [17]:
# Load the tokenizer that belongs to the checkpoint selected by MODEL_PATH.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
def tokenize_data(example):
    """Tokenize one example and attach its multi-hot target vector.

    Reads the module-level ``tokenizer``, ``labels`` and ``label2id``.

    Args:
        example: Mapping with the text under ``"clean_text"`` and one numeric
            entry per label name.

    Returns:
        The tokenizer output for the text (truncated to 128 tokens) with an
        extra ``"labels"`` entry: a list of floats ordered by ``label2id``,
        1.0 where the label is present and 0.0 otherwise.
    """
    encoding = tokenizer(example["clean_text"], truncation=True, max_length=128)

    # The annotations are intensities (0, 1, 2, ...), but the model is set up
    # for multi-label classification, whose binary cross-entropy loss needs
    # targets in [0, 1]. Keep only presence/absence of each emotion.
    targets = [0.0] * len(labels)
    for name in example.keys():
        if name in labels:
            targets[label2id[name]] = float(example[name] > 0)

    encoding["labels"] = targets
    return encoding
# Tokenize every split; the original columns are dropped so that only what
# tokenize_data returns is kept.
tokenized_dataset = dataset.map(tokenize_data, remove_columns=dataset['train'].column_names)

Map: 100%|██████████| 1780/1780 [00:00<00:00, 5136.90 examples/s]
Map: 100%|██████████| 223/223 [00:00<00:00, 5239.71 examples/s]
Map: 100%|██████████| 223/223 [00:00<00:00, 4774.62 examples/s]


In [18]:
# Inspect which fields one tokenized training example carries.
example = tokenized_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [19]:
# Map the token ids back to text to see what the model actually receives.
tokenizer.decode(example['input_ids'])

'[CLS] fiz vou ficar prestando atencao cheiro continua [SEP]'

In [20]:
# Return the listed columns as torch tensors when the dataset is indexed.
tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

### Finetuning do modelo

In [None]:
# Load the checkpoint with a sequence-classification head that has one output
# per label, configured as a multi-label problem; the id/label maps are stored
# in the model config.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH,
                                    problem_type="multi_label_classification",
                                    num_labels=len(labels),
                                    id2label=id2label,
                                    label2id=label2id)
# Move the weights to the selected device (the cell also displays the model).
model.to(device)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pablocosta/bertabaporu-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(64000, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1

In [35]:
# Number of model parameters, in millions.
model.num_parameters()/10**6

369.42951

In [34]:

# dtype of the first parameter tensor, used as a proxy for the model's precision.
print(next(model.parameters()).dtype)

torch.float32


In [32]:
# Rough weight size: 4 bytes per parameter (assumes float32 weights), in units of 10**9 bytes.
print("Gbytes: ", 4*model.num_parameters()/(10**9))

Gbytes:  1.47771804


In [None]:
# Fine-tuning hyper-parameters.
batch_size = 8
# Key of the compute_metrics result used to choose the best checkpoint.
metric_name = "f1"

run_output_dir = MODEL_NAME + "-multisentiment-portuguese"

args = TrainingArguments(
    output_dir=run_output_dir,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    # NOTE(review): newer transformers releases call this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # Optimisation settings.
    learning_rate=1e-4,
    weight_decay=0.01,
    optim="adafactor",
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    label_names=["labels"],
    torch_compile=True,
    # push_to_hub=True,
)

In [49]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, precision_score, recall_score
from transformers import EvalPrediction

def sigmoid(x):
    """Apply the logistic function ``1 / (1 + exp(-x))`` element-wise.

    Args:
        x: A number or NumPy array of logits.

    Returns:
        Values in the same shape as ``x``.
    """
    negative_exponential = np.exp(-x)
    return 1 / (1 + negative_exponential)

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Score multi-label predictions against the ground truth.

    Both arrays are flattened, so every (sample, label) pair counts as one
    binary decision; F1, precision and recall are macro-averaged over the
    classes present in those flattened arrays.

    Args:
        predictions: Array of raw logits, one column per label.
        labels: Array of ground-truth values with the same shape. Any value
            greater than zero counts as "label present".
        threshold: Probability above which a label is predicted as present.

    Returns:
        Dict with the keys ``'f1'``, ``'accuracy'``, ``'precision'`` and ``'recall'``.
    """
    probabilities = sigmoid(predictions)
    # Honour the caller's threshold instead of a fixed 0.5.
    y_pred = (probabilities > threshold).astype(int).reshape(-1)
    # Predictions are binary, so the ground truth is reduced to
    # presence/absence too; intensities above 1 would otherwise show up as
    # extra classes that can never be predicted.
    y_true = (np.asarray(labels) > 0).astype(int).reshape(-1)

    f1_macro_average = f1_score(y_true=y_true, y_pred=y_pred, average="macro")
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="macro")
    recall = recall_score(y_true, y_pred, average="macro")

    metrics = {
        'f1': f1_macro_average,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall
    }
    return metrics

def compute_metrics(p: EvalPrediction):
    """Compute the multi-label metrics for one evaluation pass.

    Args:
        p: Evaluation output; ``p.predictions`` is either the logits or a tuple
            whose first element is the logits, and ``p.label_ids`` holds the
            ground-truth labels.

    Returns:
        The metrics dict produced by ``multi_label_metrics``.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        # Only the first element of the tuple is scored.
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [50]:
from transformers import DataCollatorWithPadding

# Batch collator built around the tokenizer. Tokenization above does not pad,
# so padding is presumably applied per batch here -- confirm in the
# DataCollatorWithPadding documentation.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Assemble the Trainer from the model, the training arguments, the tokenized
# splits, the metric function and the batch collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],  # full training split (no subsampling is applied here)
    eval_dataset=tokenized_dataset["validation"],    # full validation split (no subsampling is applied here)
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

In [None]:
# Run fine-tuning with the Trainer configured above.
trainer.train()

In [None]:
# Run the trainer's model on the held-out test split.
result = trainer.predict(tokenized_dataset["test"])

In [None]:
# Metrics computed for the test-split predictions.
result.metrics