In [5]:
import pandas as pd
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch

# Chargement des données
train_data = pd.read_csv("..//Projet TALN/train.csv")
test_data = pd.read_csv("..//Projet TALN/test.csv")

# Conversion des étiquettes textuelles en valeurs numériques
label_to_index = {"Entrée": 0, "Plat principal": 1, "Dessert": 2}  # Adapté à votre cas d'usage
train_data['type'] = train_data['type'].map(label_to_index)
test_data['type'] = test_data['type'].map(label_to_index)

train_texts = train_data['titre'].tolist()
train_labels = train_data['type'].tolist()
test_texts = test_data['titre'].tolist()
test_labels = test_data['type'].tolist()

# Tokenisation
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

class RecipesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)  # Correction ici pour le type
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

train_dataset = RecipesDataset(train_encodings, train_labels)
test_dataset = RecipesDataset(test_encodings, test_labels)

# Entraînement du modèle
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(set(train_labels)))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# Évaluation
def get_predictions(model, dataset):
    outputs = trainer.predict(dataset)
    logits = outputs.predictions
    predictions = np.argmax(logits, axis=-1)
    return predictions

predictions = get_predictions(model, test_dataset)
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy: {accuracy}')


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss
50,1.1036,1.093652
100,1.0508,1.044361
150,0.898,0.852012
200,0.7131,0.728173
250,0.7422,0.660225
300,0.721,0.594118
350,0.6655,0.572801
400,0.5763,0.664053
450,0.6686,0.56403
500,0.6954,0.621049


KeyboardInterrupt: 

In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Chargement des données
train_data = pd.read_csv("../data/train.csv")
test_data = pd.read_csv("../data/test.csv")

# Conversion des étiquettes textuelles en valeurs numériques
label_to_index = {"Entrée": 0, "Plat principal": 1, "Dessert": 2}  # Adapter selon vos étiquettes
train_data['type'] = train_data['type'].map(label_to_index)
test_data['type'] = test_data['type'].map(label_to_index)

train_texts = train_data['titre'].tolist()
train_labels = train_data['type'].tolist()
test_texts = test_data['titre'].tolist()
test_labels = test_data['type'].tolist()

# Tokenisation
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

class RecipesDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

train_dataset = RecipesDataset(train_encodings, train_labels)
test_dataset = RecipesDataset(test_encodings, test_labels)

# Configuration de l'entraînement
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(set(train_labels)))

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    do_train=True,
    do_eval=True,
    no_cuda=False,
    load_best_model_at_end=True,
    evaluation_strategy="steps",
    eval_steps=50,
    save_steps=50,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

trainer.train()

# Fonction pour obtenir les prédictions
def get_predictions(model, dataset):
    model.eval()
    dataloader = DataLoader(dataset, batch_size=8)
    predictions = []
    for batch in dataloader:
        inputs = {k: v.to(model.device) for k, v in batch.items() if k != 'labels'}
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits
        predictions.extend(logits.argmax(-1).cpu().numpy())
    return np.array(predictions)

# Obtention des prédictions
predictions = get_predictions(model, test_dataset)

# Calcul des métriques d'évaluation en utilisant test_labels et predictions
accuracy = accuracy_score(test_labels, predictions)
precision = precision_score(test_labels, predictions, average='weighted')
recall = recall_score(test_labels, predictions, average='weighted')
f1 = f1_score(test_labels, predictions, average='weighted')
conf_matrix = confusion_matrix(test_labels, predictions)
class_report = classification_report(test_labels, predictions)

# Affichage des métriques d'évaluation
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mchabbahdjamel[0m ([33mtaln2[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011167917600000186, max=1.0…

Step,Training Loss,Validation Loss
50,No log,1.082119
100,No log,1.004417
150,No log,0.804573
200,No log,0.718004
250,No log,0.658452
300,No log,0.622111
350,No log,0.604764
400,No log,0.557866
450,No log,0.517586
500,0.774700,0.530926


Checkpoint destination directory ./results/checkpoint-4650 already exists and is non-empty. Saving will proceed but saved results may be invalid.


Evaluation Metrics:
Accuracy: 0.829971181556196
Precision: 0.832166351368279
Recall: 0.829971181556196
F1 Score: 0.830103987904655
Confusion Matrix:
[[247  69  21]
 [109 515  20]
 [  4  13 390]]
Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.73      0.71       337
           1       0.86      0.80      0.83       644
           2       0.90      0.96      0.93       407

    accuracy                           0.83      1388
   macro avg       0.82      0.83      0.82      1388
weighted avg       0.83      0.83      0.83      1388



In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

fig = ConfusionMatrixDisplay.from_predictions(test_labels, predictions)
fig.ax_.set_title("Matrice de confusion TF IDF")

print(fig.confusion_matrix)