In [20]:
import pandas as pd
df = pd.read_csv('training_data.csv')
df.head()



Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [21]:
df.columns
df.dtypes

id             int64
sentence      object
difficulty    object
dtype: object

In [2]:
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(df['sentence'], df['difficulty'], test_size=0.2)


In [3]:
from transformers import CamembertTokenizer

tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
train_encodings = tokenizer(train_texts.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(val_texts.tolist(), truncation=True, padding=True)


  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
import torch

# Tenseurs pour les textes d'entraînement
train_input_ids = torch.tensor(train_encodings['input_ids'])
train_attention_mask = torch.tensor(train_encodings['attention_mask'])

train_labels = torch.tensor(pd.get_dummies(train_labels).values.tolist())


In [5]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

# Créer un DataLoader
batch_size = 16  # Vous pouvez choisir une taille de lot appropriée en fonction de votre configuration et de la mémoire disponible.
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, sampler=RandomSampler(train_dataset))


In [7]:
from transformers import CamembertForSequenceClassification

# Charger le modèle CamemBERT
  # Remplacez 3 par le nombre de catégories dans votre tâche de classification
num_labels = 6
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=num_labels)

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-11): 12 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=Tru

In [9]:
from transformers import AdamW, get_linear_schedule_with_warmup
from torch.nn import CrossEntropyLoss
import torch

# Configuration de l'entraînement
optimizer = AdamW(model.parameters(), lr=5e-5)

epochs = 3  # Définissez le nombre d'époques ici
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0, 
                                            num_training_steps=total_steps)
loss_fn = CrossEntropyLoss()

# Boucle d'entraînement
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_dataloader:
        batch = tuple(t.to(device) for t in batch)  # Assurez-vous que tout est transféré sur le bon périphérique
        b_input_ids, b_input_mask, b_labels = batch

        # Convertir les étiquettes booléennes en entiers
        b_labels = b_labels.long()

        optimizer.zero_grad()
        outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {avg_train_loss}")

    print(b_input_ids.shape)  # Doit être quelque chose comme (16, seq_length)
    print(b_labels.shape)    # Doit être (16,)






ValueError: Expected input batch_size (16) to match target batch_size (96).

In [None]:
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# Évaluation
def evaluate_model(model, validation_dataloader):
    model.eval()
    predictions , true_labels = [], []

    with torch.no_grad():
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch

            outputs = model(b_input_ids, attention_mask=b_input_mask)
            logits = outputs.logits
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            predictions.append(logits)
            true_labels.append(label_ids)
    
    # Calcul des métriques
    predictions = np.concatenate(predictions, axis=0)
    true_labels = np.concatenate(true_labels, axis=0)
    preds_flat = np.argmax(predictions, axis=1).flatten()
    labels_flat = true_labels.flatten()
    accuracy = accuracy_score(labels_flat, preds_flat)
    recall = recall_score(labels_flat, preds_flat, average='macro')
    precision = precision_score(labels_flat, preds_flat, average='macro')
    f1 = f1_score(labels_flat, preds_flat, average='macro')
    return accuracy, recall, precision, f1

accuracy, recall, precision, f1 = evaluate_model(model, validation_dataloader)
print(f"Accuracy: {accuracy}\nRecall: {recall}\nPrecision: {precision}\nF1 Score: {f1}")
