### Installation des bibliothèques 

pip install numpy  
pip install pandas 
pip install scikit-learn
pip install torch
pip install transformers
pip install datasets

In [32]:
pip install transformers[torch]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.33.0-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.33.0-py3-none-any.whl (315 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m315.1/315.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.33.0
Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch

# Toy corpus: three sentences, each tagged with a 1-based class id
data = {
    'text': ['Phrase exemple 1', 'Phrase exemple 2', 'Phrase exemple 3'],
    'label': [1, 2, 3]
}

df = pd.DataFrame(data)

# Shift the labels so that they start at 0
df['label'] = df['label'] - 1

# Split the data into train and test subsets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the DataFrames into Dataset objects.
# preserve_index=False: the split leaves each frame with a shuffled, non-range index,
# which from_pandas would otherwise store as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples, max_length=512):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping whose ``'text'`` entry holds the raw sentences.
        max_length: Fixed length every sequence is padded or truncated to.
            Defaults to 512, the value that used to be hard-coded; a smaller
            value can be supplied via ``Dataset.map(..., fn_kwargs={...})``
            when the texts are short.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=max_length)

# Tokenize each split batch-wise, then rename the target column to "labels"
# so that the columns are compatible with the model.
train_dataset = train_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")
test_dataset = test_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")

# Expose only the model inputs and the targets, as torch tensors.
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

# Seed torch before building the model: the classification head is newly initialised
# rather than read from the checkpoint, so without a seed it differs on every run.
torch.manual_seed(42)

# Load the pre-trained BERT model and add a classification head.
# Labels are 0-based class indices, so the head needs max(label) + 1 outputs;
# counting distinct values would undercount whenever a class id is absent from df.
num_labels = int(df['label'].max()) + 1
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

import dataclasses

# Define the training arguments.
# The per-epoch evaluation option is called `eval_strategy` in recent transformers
# releases and `evaluation_strategy` in older ones. Passing a name the installed
# release does not declare raises a TypeError, so use whichever field exists.
_training_fields = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_fields else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir='./results',          # Directory where the results are saved
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',            # Directory for the logs
    logging_steps=10,                # Logging frequency
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of every epoch
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)


Map: 100%|██████████| 2/2 [00:00<00:00, 386.54 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 206.95 examples/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.687422
2,No log,1.815349
3,No log,1.901266


{'eval_loss': 1.9012664556503296, 'eval_runtime': 0.4499, 'eval_samples_per_second': 2.223, 'eval_steps_per_second': 2.223, 'epoch': 3.0}


### Autre exemple avec des étiquettes one-hot


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import numpy as np

# Toy corpus: three sentences, one class each
data = {
    'text': ['Phrase exemple 1', 'Phrase exemple 2', 'Phrase exemple 3'],
    'label': [[1, 0, 0], [0, 1, 0], [0, 0, 1]]  # One-hot encoded labels
}

df = pd.DataFrame(data)

# Split the data into train and test subsets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert the DataFrames into Dataset objects.
# preserve_index=False: the split leaves each frame with a shuffled, non-range index,
# which from_pandas would otherwise store as an extra `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)


  from .autonotebook import tqdm as notebook_tqdm
2024-08-13 10:03:08.982535: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-13 10:03:09.010662: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 10:03:09.046710: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 10:03:09.057851: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-13 10:03:09.0

### Tokenization

In [2]:
# Fetch the BERT tokenizer of the checkpoint named below
checkpoint_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(checkpoint_name)

def tokenize_function(examples, max_length=128):
    """Tokenize a batch of examples with the notebook-level ``tokenizer``.

    Args:
        examples: Batch mapping whose ``'text'`` entry holds the raw sentences.
        max_length: Fixed length every sequence is padded or truncated to.
            Defaults to 128, the value that used to be hard-coded; another
            value can be supplied via ``Dataset.map(..., fn_kwargs={...})``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=max_length)

# Run the tokenizer over both splits, batch-wise (train first, then test)
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)


Map: 100%|██████████| 2/2 [00:00<00:00, 328.28 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 211.01 examples/s]


### Préparer les étiquettes

In [3]:
def convert_labels_to_indices(labels):
    """Map each label vector to the position of its largest entry.

    Args:
        labels: Iterable of label vectors, e.g. one-hot rows such as ``[0, 1, 0]``.

    Returns:
        A list holding one ``np.argmax`` result per vector, in input order.
    """
    return list(map(np.argmax, labels))

def add_label_indices(examples):
    """Build the 'labels' column of a batch: class indices taken from its one-hot 'label' column."""
    return {'labels': convert_labels_to_indices(examples['label'])}

# Turn the one-hot labels into class indices
train_dataset = train_dataset.map(add_label_indices, batched=True)
test_dataset = test_dataset.map(add_label_indices, batched=True)

# Expose only the columns compatible with the model, as torch tensors
for split_dataset in (train_dataset, test_dataset):
    split_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map: 100%|██████████| 2/2 [00:00<00:00, 388.58 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 240.32 examples/s]


### Model

In [4]:
# Seed torch before building the model: the classification head is newly initialised
# rather than read from the checkpoint, so without a seed it differs on every run.
torch.manual_seed(42)

# Load the pre-trained BERT model and add a classification head.
# Every label is a one-hot vector, so its length is the number of classes; counting the
# distinct argmax values would undercount whenever a class never occurs in df.
num_labels = len(df['label'].iloc[0])
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)

import dataclasses

# Define the training arguments.
# The per-epoch evaluation option is called `eval_strategy` in recent transformers
# releases and `evaluation_strategy` in older ones. Passing a name the installed
# release does not declare raises a TypeError, so use whichever field exists.
_training_fields = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_fields else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir='./results',          # Directory where the results are saved
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    logging_dir='./logs',            # Directory for the logs
    logging_steps=10,                # Logging frequency
    **{_eval_strategy_key: "epoch"},  # Evaluate at the end of every epoch
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print(results)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,1.514023
2,No log,1.633128
3,No log,1.724015


{'eval_loss': 1.7240149974822998, 'eval_runtime': 0.1232, 'eval_samples_per_second': 8.117, 'eval_steps_per_second': 8.117, 'epoch': 3.0}
