In [1]:
# Import libraries
import os
import pandas as pd
import torch
from transformers import AutoTokenizer,AutoModel
from datasets import load_dataset
from torch.utils.data import TensorDataset, DataLoader
import torch.nn as nn
from torch import optim
from sentence_transformers import SentenceTransformer
import numpy as np
from transformers import RobertaModel,RobertaConfig, RobertaTokenizer
from transformers import Trainer
from datasets import Dataset
from transformers import TrainingArguments
import transformers


device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using mps device


In [2]:
# Show which transformers version this kernel is running.
import transformers
print(transformers.__version__)

4.53.0


In [3]:
# Read the dataset

import pandas as pd

base_path = "hf://datasets/jingjietan/essays-big5/data/"

# One parquet file per split, all located under base_path.
splits = {
    'train': f"{base_path}train-00000-of-00001.parquet",
    'validation': f"{base_path}validation-00000-of-00001.parquet",
    'test': f"{base_path}test-00000-of-00001.parquet"
}

# Read the three splits in a fixed order and bind each one to its own frame.
df_train, df_val, df_test = (
    pd.read_parquet(splits[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Preview the first rows of the training split.
df_train.head()


Unnamed: 0,O,C,E,A,N,ptype,text,__index_level_0__
0,1,0,0,1,1,19,it is wednesday. I can't wait until friday bec...,774
1,1,1,1,0,1,29,"wow, I want to go talk to the socialist organi...",178
2,1,0,1,1,0,22,"I wish polygamy was still legal. Well, not pol...",1881
3,1,0,1,0,0,20,"Well, lets see . . . I guess the foremost thin...",1563
4,0,1,0,1,1,11,College? I wonder how it will be? I just ...,1594


In [19]:
# Labels: pack the five trait columns into one list of ints per row.
trait_columns = ["O", "C", "E", "A", "N"]
# astype(int) guarantees every element of "labels" is an integer before the
# rows are converted to Python lists.
df_train["labels"] = df_train[trait_columns].astype(int).values.tolist()
df_val["labels"] = df_val[trait_columns].astype(int).values.tolist()
# Create the train and validation datasets.
# The column selection must be a list (double brackets): df['text', 'labels']
# is a lookup of a single column keyed by the tuple ('text', 'labels') and
# raises KeyError.
dataset_train = Dataset.from_pandas(df_train[["text", "labels"]])
dataset_val = Dataset.from_pandas(df_val[["text", "labels"]])

In [20]:
# Tokenizer: sequences are capped at 128 tokens because longer inputs are too heavy on CPU.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize(example, max_length=128):
    """Tokenize the "text" field with the module-level tokenizer.

    Args:
        example: Mapping with a "text" entry. It is passed straight to the
            tokenizer, so with ``map(..., batched=True)`` this is a batch.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer's output for ``example["text"]``.
    """
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )
# Tokenize both datasets
dataset_train = dataset_train.map(tokenize, batched=True)
dataset_val   = dataset_val.map(tokenize, batched=True)

Map:   0%|          | 0/1578 [00:00<?, ? examples/s]

Map:   0%|          | 0/395 [00:00<?, ? examples/s]

In [21]:
# Display the training dataset (its features and row count).
dataset_train

Dataset({
    features: ['labels', 'text', 'input_ids', 'attention_mask'],
    num_rows: 1578
})

In [22]:
# Drop the "text" column from both datasets
dataset_train, dataset_val = (
    split.remove_columns("text") for split in (dataset_train, dataset_val)
)

In [23]:
# Display the training dataset to check which columns it holds.
dataset_train

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 1578
})

In [24]:
# Transfer learning: RoBERTa backbone with a linear multi-label head
# (5 outputs by default, one per label).

class CustomRobertaClassifier(nn.Module):
    """RoBERTa backbone followed by dropout and a linear multi-label head.

    Args:
        dropout_rate: Dropout probability applied to the pooled output.
        num_labels: Number of output logits (one per label). Defaults to 5.
        model_name: Checkpoint identifier passed to
            ``RobertaModel.from_pretrained``. Defaults to "roberta-base".
    """

    def __init__(self, dropout_rate=0.2, num_labels=5, model_name="roberta-base"):
        super().__init__()
        self.backbone = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.backbone.config.hidden_size, num_labels)
        # The loss module holds no per-call state, so build it once here
        # instead of on every forward pass.
        self.loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the backbone and the classification head.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.
            labels: Optional targets. They are cast to float before the loss
                is computed (assumed to have the same shape as the logits —
                TODO confirm against the data collator).

        Returns:
            dict with "loss" (None when ``labels`` is None) and "logits".
        """
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # NOTE(review): pooler_output comes from the backbone's pooling layer;
        # confirm the chosen checkpoint provides trained weights for it.
        pooled_output = outputs.pooler_output
        logits = self.classifier(self.dropout(pooled_output))

        loss = None
        if labels is not None:
            loss = self.loss_fn(logits, labels.float())

        return {"loss": loss, "logits": logits}
    
# Initialize the model and move it to the selected device
model = CustomRobertaClassifier()
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
#Preparation des metrics 
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    """Compute per-trait and averaged accuracy / F1 from logits and labels.

    Args:
        eval_pred: Pair ``(logits, labels)``. Both are indexed as 2-D arrays
            with one column per trait, in O, C, E, A, N order.

    Returns:
        dict with "<trait>_acc" and "<trait>_f1" for each trait, plus
        "avg_accuracy" and "avg_f1" (means over the five traits).
    """
    logits, labels = eval_pred
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # sigmoid(x) > 0.5 exactly when x > 0, so threshold the logits directly.
    # This avoids the overflow warning np.exp(-x) emits for large negative x.
    preds = (logits > 0).astype(int)

    traits = ("O", "C", "E", "A", "N")
    results = {}
    for i, trait in enumerate(traits):
        results[f"{trait}_acc"] = accuracy_score(labels[:, i], preds[:, i])
        # zero_division=0 keeps F1 at 0 when a trait has no positive
        # predictions or labels, without emitting a warning.
        results[f"{trait}_f1"] = f1_score(labels[:, i], preds[:, i], zero_division=0)

    # Global average
    results["avg_accuracy"] = np.mean([results[f"{t}_acc"] for t in traits])
    results["avg_f1"] = np.mean([results[f"{t}_f1"] for t in traits])

    return results

In [26]:
from transformers import TrainingArguments

# Training configuration: evaluate, save and log once per epoch, and reload
# the best checkpoint at the end of training.
training_args = TrainingArguments(
    output_dir="./roberta_ocean_classifier",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    # Must name a key returned by compute_metrics. That function returns
    # "avg_f1" (and "<trait>_f1"), not "f1", so "f1" could never be found
    # when the best checkpoint is selected.
    metric_for_best_model="avg_f1",
    greater_is_better=True,
    save_total_limit=1,
    report_to="none"
)

In [27]:
# Print the first training example to inspect its fields.
print(dataset_train[0])

{'labels': 1, 'input_ids': [0, 405, 16, 18862, 46836, 4, 38, 64, 75, 2067, 454, 6664, 21746, 142, 38, 524, 164, 184, 7, 192, 1518, 261, 4, 38, 2649, 123, 98, 203, 4, 38, 64, 75, 2067, 7, 192, 123, 4, 80, 55, 360, 4, 42, 34, 57, 10, 182, 251, 80, 688, 4, 86, 3974, 182, 5764, 259, 4, 38, 33, 10, 319, 9, 481, 86, 15, 127, 1420, 77, 38, 524, 45, 11, 1380, 4, 1380, 4, 16797, 1380, 4, 16797, 16, 1531, 98, 444, 4, 24, 269, 3168, 162, 6, 8, 8546, 4, 7670, 858, 428, 4218, 16, 6269, 4, 7285, 80, 2345, 9, 8265, 162, 600, 4, 141, 524, 38, 164, 7, 2145, 70, 9, 167, 1110, 4, 38, 399, 75, 190, 2073, 2600, 24, 142, 38, 399, 75, 1346, 24, 4, 53, 38, 197, 33, 44949, 281, 1780, 28648, 26, 14, 24, 21, 2679, 4, 37, 21, 2758, 162, 59, 141, 51, 847, 103, 233, 9, 10, 4758, 18, 2900, 66, 11, 41, 9280, 4, 14, 16, 7735, 4, 5, 2129, 4758, 4, 28648, 16, 7735, 350, 4, 38, 460, 5170, 114, 37, 3829, 162, 4, 37, 64, 28, 98, 1266, 77, 97, 82, 32, 198, 53, 98, 2579, 77, 24, 16, 95, 5, 80, 9, 201, 4, 38, 222, 619, 1256, 

In [28]:
# Build the Trainer from the model, the training arguments, the train and
# validation datasets and the metric function, then start training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
)
trainer.train()



Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 17.79 GB, other allocations: 344.72 MB, max allowed: 18.13 GB). Tried to allocate 24.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).