In [284]:
import pandas as pd
import numpy as np
import nltk
import torch
import datasets
from datasets import Dataset, DatasetDict
from transformers import AutoModel, AutoTokenizer
from typing import Union
from skmultilearn.model_selection import iterative_train_test_split

MODEL_NAME = "neuralmind/bert-base-portuguese-cased"

In [285]:
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /home/victor/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [286]:
file = 'public_data/train/track_a/ptbr.csv'
data = pd.read_csv(file)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2226 entries, 0 to 2225
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        2226 non-null   object
 1   text      2226 non-null   object
 2   Anger     2226 non-null   int64 
 3   Disgust   2226 non-null   int64 
 4   Fear      2226 non-null   int64 
 5   Joy       2226 non-null   int64 
 6   Sadness   2226 non-null   int64 
 7   Surprise  2226 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 139.3+ KB


In [287]:
data.head()

Unnamed: 0,id,text,Anger,Disgust,Fear,Joy,Sadness,Surprise
0,ptbr_train_track_a_00001,"minha vó me disse que era frango e eu comi, ti...",0,0,0,0,1,0
1,ptbr_train_track_a_00002,Está e a nossa deputada Benedita linda guerrei...,0,0,0,1,0,0
2,ptbr_train_track_a_00003,só falta as roupas kkkkkkkkkkk,0,0,0,1,0,0
3,ptbr_train_track_a_00004,Eu tmb. Comecei a sair de casa agora (fui pela...,0,0,0,0,1,0
4,ptbr_train_track_a_00005,Peço a Deus que nossos dirigentes tenham realm...,0,0,0,0,0,0


### Pré-processamento dos dados
1. Case folding
2. Remover stop words
3. Remover acentuação e pontuação


In [288]:
stopwords = nltk.corpus.stopwords.words('portuguese')
stopwords[:20]

['a',
 'à',
 'ao',
 'aos',
 'aquela',
 'aquelas',
 'aquele',
 'aqueles',
 'aquilo',
 'as',
 'às',
 'até',
 'com',
 'como',
 'da',
 'das',
 'de',
 'dela',
 'delas',
 'dele']

In [289]:
data["clean_text"] = data["text"].copy()
# Case folding
data["clean_text"] = data["clean_text"].str.lower()
# Remover stopwords
data["clean_text"] = data["clean_text"].replace({r"\b" + stopword + r"\b": "" for stopword in stopwords}, regex=True)
# Remover acentuação e pontuação
data["clean_text"] = data["clean_text"].str.replace(r"[\.!?\\\-.,]", "", regex=True)
data["clean_text"] = data["clean_text"].str.replace(r"\s+", " ", regex=True)


In [290]:
data["text"], data["clean_text"]

(0       minha vó me disse que era frango e eu comi, ti...
 1       Está e a nossa deputada Benedita linda guerrei...
 2                          só falta as roupas kkkkkkkkkkk
 3       Eu tmb. Comecei a sair de casa agora (fui pela...
 4       Peço a Deus que nossos dirigentes tenham realm...
                               ...                        
 2221              Eu acho que o CAP vai surpreender hein.
 2222    23:59 - Lula sabia de toda a corrupção no seu ...
 2223    O Brasil precisa URGENTE de pessoas sérias e c...
 2224    Sera que só eu acho que ta passando da hora de...
 2225                                     falta só 2 porra
 Name: text, Length: 2226, dtype: object,
 0         vó disse frango comi gosto frango mto inocente 
 1        deputada benedita linda guerreira parabéns ju...
 2                                falta roupas kkkkkkkkkkk
 3        tmb comecei sair casa agora ( primeira vez ci...
 4       peço deus dirigentes realmente iluminação toma...
              

### Adicionar emoção neutra

Para casos que não tem nenhuma emoção na linha, adicionamos uma nova emoção: neutro

In [291]:
BASE_EMOTIONS = ["Anger", "Disgust", "Fear", "Joy", "Sadness", "Surprise"]

data["Neutral"] = 0
no_emotions_mask = data[BASE_EMOTIONS].sum(axis=1) == 0
data.loc[no_emotions_mask, "Neutral"] = 1

In [292]:
EMOTIONS = BASE_EMOTIONS + ["Neutral"]

### Iterative train test split
Uma vez que o dataset é desbalanceado, precisamos garantir que os dados de treino e teste tenham proporções similares de cada classe. Entretanto, já que o nosso problema é multi classe, utilizar o *train_test_split* do scikit-learn não funciona, uma vez que ele não lida bem com problemas multi classe, pois nesse tipo de problema há muitas combinações de classe possíveis. Sendo assim, utilizamos a função *iterative_train_test_split* que se propõe a resolver esse problema

In [293]:
def concat_X_y(X: np.array, y: np.array, columns: list[str]) -> pd.DataFrame:
    concatted_np = np.concatenate((X, y), axis=1)
    concatted = pd.DataFrame(concatted_np, columns=columns)
    return concatted


def train_test_val_split(
        data: pd.DataFrame,
        feature_label: str,
        targets_labels: list[str]
    ) -> Union[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    X = data[feature_label].to_numpy().reshape(-1, 1)
    y = data[targets_labels].to_numpy()
    
    X_train, y_train, X_test, y_test = iterative_train_test_split(
        X,
        y,
        test_size=0.1,
    )
    X_train, y_train, X_val, y_val = iterative_train_test_split(
        X_train,
        y_train,
        test_size=0.1,
    )
    columns = [feature_label] + targets_labels

    train = concat_X_y(X_train, y_train, columns)
    test = concat_X_y(X_test, y_test, columns)
    val = concat_X_y(X_val, y_val, columns)

    return train, test, val

train, test, val = train_test_val_split(data, "clean_text", EMOTIONS)


### Preparação para a estrutura do Huggingface

In [294]:
def create_dataset_dict(train: pd.DataFrame, val: pd.DataFrame, test: pd.DataFrame) -> DatasetDict:
    train_dataset = Dataset.from_pandas(train)
    val_dataset = Dataset.from_pandas(val)
    test_dataset = Dataset.from_pandas(test)

    return DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })

dataset = create_dataset_dict(train, val, test)
print(dataset)
dataset["train"][0]


DatasetDict({
    train: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral'],
        num_rows: 1803
    })
    validation: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral'],
        num_rows: 201
    })
    test: Dataset({
        features: ['clean_text', 'Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral'],
        num_rows: 222
    })
})


{'clean_text': ' deputada benedita linda guerreira parabéns juntos',
 'Anger': 0,
 'Disgust': 0,
 'Fear': 0,
 'Joy': 1,
 'Sadness': 0,
 'Surprise': 0,
 'Neutral': 0}

In [295]:
labels = dataset['train'].column_names
labels.remove('clean_text')
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
print(labels)
print(id2label)
print(label2id)

['Anger', 'Disgust', 'Fear', 'Joy', 'Sadness', 'Surprise', 'Neutral']
{0: 'Anger', 1: 'Disgust', 2: 'Fear', 3: 'Joy', 4: 'Sadness', 5: 'Surprise', 6: 'Neutral'}
{'Anger': 0, 'Disgust': 1, 'Fear': 2, 'Joy': 3, 'Sadness': 4, 'Surprise': 5, 'Neutral': 6}


In [296]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Usando o dispositivo: {device}")    

Usando o dispositivo: cpu


In [297]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
def tokenize_data(examples):
  # take a batch of texts
  text = examples["clean_text"]
  # encode them
  encoding = tokenizer(text, padding="max_length", truncation=True, max_length=128)
  # add labels
  labels_batch = {k: examples[k] for k in examples.keys() if k in labels}
  # create numpy array of shape (batch_size, num_labels)
  labels_matrix = np.zeros((len(text), len(labels)))
  # fill numpy array
  for idx, label in enumerate(labels):
    labels_matrix[:, idx] = labels_batch[label]

  encoding["labels"] = labels_matrix.tolist()
  
  return encoding
tokenized_dataset = dataset.map(tokenize_data, batched=True, remove_columns=dataset['train'].column_names)


Map: 100%|██████████| 1803/1803 [00:00<00:00, 13443.45 examples/s]
Map: 100%|██████████| 201/201 [00:00<00:00, 10201.91 examples/s]
Map: 100%|██████████| 222/222 [00:00<00:00, 11060.32 examples/s]


In [298]:
example = tokenized_dataset['train'][0]
print(example.keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [299]:
tokenizer.decode(example['input_ids'])

'[CLS] deputada benedita linda guerreira parabens juntos [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]'

In [300]:
example['labels']

[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]

In [301]:

tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "token_type_ids", "labels"])

### Finetuning do modelo

In [302]:
model = AutoModel.from_pretrained("bert-base-uncased", 
                                    problem_type="multi_label_classification", 
                                    num_labels=len(labels),
                                    id2label=id2label,
                                    label2id=label2id)
model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [303]:
batch_size = 8
metric_name = "f1"

args = TrainingArguments(
    f"bert-finetuned-sem_eval-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    label_names=labels,
    #push_to_hub=True,
)



In [304]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': 0.5,
               'roc_auc': 0.5,
               'accuracy': 0.5}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [311]:
tokenized_dataset["train"][0]['input_ids']

tensor([  101,  2139, 18780,  8447,  3841,  2098,  6590,  8507, 24613,  7895,
        11498, 10609,  2015, 12022, 13122,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [305]:
tokenized_dataset["train"][0]['labels']

tensor([0., 0., 0., 1., 0., 0., 0.])

In [312]:
outputs = model(input_ids=tokenized_dataset['train']['input_ids'][0].unsqueeze(0))
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.1797,  0.2918,  0.4437,  ..., -0.3215,  0.6285, -0.5523],
         [-0.1547, -0.2740,  0.2547,  ..., -0.5709,  0.4332,  0.1200],
         [-0.9757,  0.3277,  0.5104,  ..., -0.3233, -0.0021,  0.4701],
         ...,
         [-0.0134, -0.1094,  0.9175,  ..., -0.8142,  0.1922, -0.8266],
         [-0.0871,  0.3117,  0.9132,  ..., -0.7265,  0.0421, -1.0621],
         [ 0.2999,  0.3538,  0.9837,  ..., -0.6457, -0.1859, -1.1811]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-5.0431e-01, -3.8209e-01, -6.6837e-01,  4.6732e-01,  2.6598e-01,
         -1.4578e-01,  2.5774e-01,  3.6207e-01, -1.0864e-01, -9.9904e-01,
          5.9572e-01,  3.5002e-01,  9.4429e-01,  4.0216e-01,  6.3162e-01,
          2.4129e-01,  6.2522e-01, -3.5713e-01,  2.5146e-01,  4.4992e-01,
          5.7732e-01,  9.9963e-01,  2.9795e-01,  3.9139e-01,  4.2761e-01,
          2.7042e-01, -6.3709e-02,  7.7303e-01,  8.3687e-01,  6.303

In [307]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"].shuffle(seed=42).select(range(1000)),  # Subsample para treino rápido
    eval_dataset=tokenized_dataset["validation"].shuffle(seed=42).select(range(100)),    # Subsample para validação
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [308]:
tokenized_dataset['train']['input_ids'][0]

tensor([  101,  2139, 18780,  8447,  3841,  2098,  6590,  8507, 24613,  7895,
        11498, 10609,  2015, 12022, 13122,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0])

In [310]:
trainer.train()

ValueError: The model did not return a loss from the inputs, only the following keys: last_hidden_state,pooler_output. For reference, the inputs it received are input_ids,token_type_ids,attention_mask.