In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments
import torch

# Step 1: Load the data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

# Step 2: Preprocess the data
# Two columns are read from each CSV in this script: 'titre' (the text that is
# tokenized) and 'type' (the class label: Plat principal, Entrée, Dessert).
# The labels must first be converted to integer class indices.
label_dict = {"Plat principal": 0, "Entrée": 1, "Dessert": 2}
for split_name, frame in (("train", train_data), ("test", test_data)):
    # An explicit per-value lookup is used instead of Series.replace: replacing
    # strings by ints relies on pandas' implicit downcasting, which is
    # deprecated and emits a FutureWarning. Values that are not keys of
    # label_dict are passed through unchanged, exactly as replace() did.
    encoded = frame['type'].map(lambda value: label_dict.get(value, value))

    # Fail early, with a readable message, on anything that is not a known
    # class index (unknown label strings, NaN from empty cells, ...), rather
    # than much later inside the training loop.
    unknown = encoded[~encoded.isin(list(label_dict.values()))].unique()
    if len(unknown) > 0:
        raise ValueError(
            f"Unknown labels in the 'type' column of the {split_name} data: "
            f"{list(unknown)}"
        )
    frame['type'] = encoded.astype("int64")

# Tokenization
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

def tokenize_function(examples, max_length=None):
    """Tokenize the 'titre' texts of a batch with the module-level tokenizer.

    Args:
        examples: Mapping with a 'titre' entry holding the texts to tokenize.
        max_length: Optional length every sequence is padded/truncated to.
            With the default ``None`` the choice is left to the tokenizer
            (per the tokenizer documentation, the model's maximum input
            length), which is the previous behavior. Passing a smaller value
            avoids padding short titles to the full model length.

    Returns:
        The encodings produced by the tokenizer call.
    """
    return tokenizer(
        examples['titre'],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# tokenize_function only reads the 'titre' entry, so pass just that column
# instead of converting every column of each frame to Python lists.
# Missing titles are replaced by empty strings and everything is cast to str,
# because the tokenizer only accepts string inputs (a NaN cell would make it
# raise).
train_encodings = tokenize_function(
    {'titre': train_data['titre'].fillna("").astype(str).tolist()}
)
test_encodings = tokenize_function(
    {'titre': test_data['titre'].fillna("").astype(str).tolist()}
)

# Convertir en Dataset de PyTorch
class RecipesDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with class labels.

    ``encodings`` is a mapping from field name to a per-example sequence of
    values; ``labels`` is a sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and integer labels into PyTorch datasets.
train_dataset = RecipesDataset(train_encodings, train_data['type'].tolist())
test_dataset = RecipesDataset(test_encodings, test_data['type'].tolist())

# Step 3: Train the model
# num_labels=3 matches the three classes of label_dict.
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    # Disable external experiment trackers. Without this, an installed wandb
    # package is picked up automatically and training blocks on an
    # interactive "paste an API key" prompt.
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

# Étape 4 : Évaluation du modèle
def get_predictions(model, dataset):
    """Return the predicted class index for every example in ``dataset``.

    Args:
        model: Model to run inference with. When it is the model held by the
            module-level ``trainer``, that trainer is reused; otherwise a
            temporary Trainer is built around ``model`` with the same
            training arguments.
        dataset: Dataset to predict on (previously ignored in favor of the
            global ``test_dataset``).

    Returns:
        The argmax over the last axis of the raw predictions, i.e. one class
        index per example.
    """
    if model is trainer.model:
        runner = trainer
    else:
        runner = Trainer(model=model, args=trainer.args)
    outputs = runner.predict(dataset)
    return outputs.predictions.argmax(-1)

# Compare the predicted class indices with the encoded test labels and report
# the resulting accuracy.
true_labels = test_data['type'].tolist()
predictions = get_predictions(model, test_dataset)
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy}')


  train_data['type'] = train_data['type'].replace(label_dict)
  test_data['type'] = test_data['type'].replace(label_dict)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit: