In [1]:
import sys; sys.path.append("../")

# [BERT](https://huggingface.co/docs/transformers/model_doc/bert)

`float16` -> baisse la précision pour accélérer les calculs<br>
`sdpa` -> Scaled Dot Product Attention [documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)

In [2]:
from transformers import BertModel
from torch import float16

bert_model_name : str = "bert-base-uncased"

# model = BertModel.from_pretrained(bert_model_name, 
#     torch_dtype=float16, attn_implementation="sdpa"
# )
#First time launching : 2m10.5s
# model.save_pretrained("../models/2025-03-04-bert-base-uncased", from_pt = True)
model = BertModel.from_pretrained("../models/2025-03-04-bert-base-uncased",
            torch_dtype = float16, attn_implementation = "sdpa")
# second launch : 2.0s

In [3]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

# [Fine-tuning un classifieur multi-label](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb#scrollTo=4wxY3x-ZZz8h)

The dataset consists of tweets, labeled with one or more emotions

In [138]:
custom_bert_model_name = "../models/2025-03-04-bert-base-uncased"
model_name = "bert-base-uncased" # FIXME

In [139]:
from datasets import load_dataset
dataset = load_dataset("sem_eval_2018_task_1", "subtask5.english")
# 10s dont forget to press y + 'Enter'

In [140]:
dataset

DatasetDict({
    train: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 6838
    })
    test: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 3259
    })
    validation: Dataset({
        features: ['ID', 'Tweet', 'anger', 'anticipation', 'disgust', 'fear', 'joy', 'love', 'optimism', 'pessimism', 'sadness', 'surprise', 'trust'],
        num_rows: 886
    })
})

In [141]:
dataset["train"].to_pandas()

Unnamed: 0,ID,Tweet,anger,anticipation,disgust,fear,joy,love,optimism,pessimism,sadness,surprise,trust
0,2017-En-21441,“Worry is a down payment on a problem you may ...,False,True,False,False,False,False,True,False,False,False,True
1,2017-En-31535,Whatever you decide to do make sure it makes y...,False,False,False,False,True,True,True,False,False,False,False
2,2017-En-21068,@Max_Kellerman it also helps that the majorit...,True,False,True,False,True,False,True,False,False,False,False
3,2017-En-31436,Accept the challenges so that you can literall...,False,False,False,False,True,False,True,False,False,False,False
4,2017-En-22195,My roommate: it's okay that we can't spell bec...,True,False,True,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6833,2017-En-21383,@nicky57672 Hi! We are working towards your hi...,False,False,False,False,False,False,False,False,False,False,False
6834,2017-En-41441,@andreamitchell said @berniesanders not only d...,False,True,False,False,False,False,False,False,False,True,False
6835,2017-En-10886,@isthataspider @dhodgs i will fight this guy! ...,True,False,True,False,False,False,False,True,False,False,False
6836,2017-En-40662,i wonder how a guy can broke his penis while h...,False,False,False,False,False,False,False,False,False,True,False


In [142]:
labels = [label for label in dataset['train'].features.keys() if label not in ['ID', 'Tweet']]
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}

## Tokenize

utilise un tokeniser par défaut (`AutoTokenizer` from `transformer`)

In [143]:
from transformers import AutoTokenizer
import numpy as np 

tokenizer = AutoTokenizer.from_pretrained(model_name)

In [144]:
from transformers.tokenization_utils_base import BatchEncoding
from datasets.formatting.formatting import LazyBatch

def preprocess_data(batch_of_rows : LazyBatch) -> BatchEncoding:
    # Takes in a batch of rows (as a : LazyBatch ~ dataframe ish) 
    
    # collect the text and tokenize it 
    text = batch_of_rows["Tweet"]
    encoding : BatchEncoding = tokenizer(
        text, padding = "max_length", truncation = True, max_length = 128 
    )
    # Create a mattrix collecting all the metadata (emotions associated to the 
    # tweet)
    labels_matrix = np.zeros((len(text), len(labels)))
    for label in label2id:
        labels_matrix[:,label2id[label]] = batch_of_rows[label]

    # Associate the metadata to the encodings
    encoding["labels"] = labels_matrix.tolist()
    return encoding

Le format `BatchEncoding` permet d'utiliser un modèle notamment grace au label `input_ids` et formatte les entrées selon ce qui est attendu (`[CLS] / [SEP] / ...`)

In [145]:
encoded_dataset = dataset.map(preprocess_data,
    batched = True, remove_columns = dataset["train"].column_names
)
# Comme on utilise pytorch on met nos données au format torch
encoded_dataset.set_format("torch")

In [146]:
print(encoded_dataset['train'][0].keys())

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


In [147]:
print("Original input :\n",dataset['train'][0]["Tweet"])
print('- '*50)
print("Input tokenized :\n",encoded_dataset['train'][0]["input_ids"])
print('- '*50)
print("Input decoded :\n",tokenizer.decode(encoded_dataset['train'][0]["input_ids"]))

Original input :
 “Worry is a down payment on a problem you may never have'.  Joyce Meyer.  #motivation #leadership #worry
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
Input tokenized :
 tensor([  101,  1523,  4737,  2003,  1037,  2091,  7909,  2006,  1037,  3291,
         2017,  2089,  2196,  2031,  1005,  1012, 11830, 11527,  1012,  1001,
        14354,  1001,  4105,  1001,  4737,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0

In [148]:
from torch import Tensor

def get_labels(example_labels : Tensor) -> list[str]:
    return [id2label[idx] 
            for idx,label in enumerate(example_labels) if label == 1]

print(encoded_dataset["train"][0]["labels"]) # est une matrice !!!
print("=> ",get_labels(encoded_dataset["train"][0]["labels"]))

tensor([0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1.])
=>  ['anticipation', 'optimism', 'trust']


## Classifieur

On load un classifieur (BERT + couche linéaire initialement aléatoire) que l'on va entraîner.

In [149]:
from transformers import AutoModelForSequenceClassification
encoder_classifier = AutoModelForSequenceClassification.from_pretrained(
        custom_bert_model_name,
        problem_type = "multi_label_classification", num_labels = len(labels),
        id2label = id2label, label2id = label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/2025-03-04-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


L'alerte : 
```
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ../models/2025-03-04-bert-base-uncased/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
```
indique bien que nous n'avons récupéré QUE le modèle de plongement ("bert"), mais que la couche linéaire elle n'existait pas et a donc été intialisée aléatoirement. Nous sommes donc vivement invités à entraîner *au moins* le classifieur (`['classifier.bias', 'classifier.weight']`)

## Entraînement

Avec l'API huggingface `Trainer`. On doit alors créer 2 objets : 
- `TrainingArguments`
- `Trainer`

In [150]:
from transformers import TrainingArguments, Trainer

batch_size = 8
metric_name = "f1"

training_args = TrainingArguments(
    output_dir = "../models/2025-03-04-classifieur_entraine",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

### Définition de métrique d'entraînement

In [151]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
from torch.nn import Sigmoid

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/ 
def multi_label_metrics(results_matrix, labels : Tensor, threshold : float = 0.5
                        ) -> dict:
    '''Taking a results matrix (batch_size x num_labels), the function (with a 
    threshold) associates labels to the results => y_pred
    From this y_pred matrix, evaluate the f1_micro, roc_auc and accuracy metrics
    '''
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = Sigmoid()
    probs = sigmoid(Tensor(results_matrix))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    return {'f1': f1_micro_average,
             'roc_auc': roc_auc,
             'accuracy': accuracy}

def compute_metrics(model_output: EvalPrediction):
    if isinstance(model_output.predictions,tuple):
        results_matrix = model_output.predictions[0]
    else:
        results_matrix = model_output.predictions

    metrics = multi_label_metrics(results_matrix=results_matrix, 
        labels=model_output.label_ids)
    return metrics

#### Exemple de `forward pass`

In [152]:
fp_ex_input_ids = encoded_dataset['train']['input_ids'][0]
fp_ex_input_labels = encoded_dataset['train']['labels'][0]
print(f"fp_ex_input_ids : ({fp_ex_input_ids.type()})\n",fp_ex_input_ids)
print('-  ' * 20)
print(f"fp_ex_input_ids.unsqueeze(0) : ({fp_ex_input_ids.unsqueeze(0).type()})\n",fp_ex_input_ids.unsqueeze(0))
print('- ' * 30)
print(f"fp_ex_input_labels : ({fp_ex_input_labels.type()})\n",fp_ex_input_labels)
print('-  ' * 20)
print(f"fp_ex_input_labels.unsqueeze(0) : ({fp_ex_input_labels.unsqueeze(0).type()})\n",fp_ex_input_labels.unsqueeze(0))
print('- ' * 30)

# NOTE Je sais pas bien ce que ce à quoi ce "unsqueeze" sert
from time import time

t1 = time()
outputs = encoder_classifier(
    input_ids=fp_ex_input_ids.unsqueeze(0), 
    labels=fp_ex_input_labels.unsqueeze(0))
print(f"{time() - t1:.2f} s — outputs : {type(outputs)}\n",outputs)

del fp_ex_input_ids, fp_ex_input_labels, outputs, t1

fp_ex_input_ids : (torch.LongTensor)
 tensor([  101,  1523,  4737,  2003,  1037,  2091,  7909,  2006,  1037,  3291,
         2017,  2089,  2196,  2031,  1005,  1012, 11830, 11527,  1012,  1001,
        14354,  1001,  4105,  1001,  4737,   102,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     

In [155]:
trainer = Trainer(encoder_classifier, training_args,
                  train_dataset = encoded_dataset["train"].select(range(0,20)),
                  eval_dataset = encoded_dataset["validation"].select(range(0,10)),
                  tokenizer = tokenizer,
                  compute_metrics = compute_metrics)

  trainer = Trainer(encodeur_classifier, training_args,


In [156]:
t1 = time()
trainer.train()
print(f"{time()-t1:2f} s to train")
del t1

Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,No log,0.710008,0.311688,0.538523,0.0
2,No log,0.672419,0.327869,0.569556,0.0
3,No log,0.633361,0.32,0.572499,0.0
4,No log,0.609141,0.266667,0.541734,0.0
5,No log,0.601324,0.272727,0.547352,0.0


22.884125 s to train
