In [2]:
#!pip3 install tokenizer
#!pip3 install sentencepiece
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers import CamembertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from tqdm import trange
import nltk
import tokenizer as tokenizer_2
import re
from nltk.tokenize import word_tokenize
import string

epochs = 10
MAX_LEN = 128
batch_size = 8
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the dataset, I selected only 5000 sample because of memory limitation
df = pd.read_csv('training_data_cleaned_length.csv').reset_index(drop=True)
df.head()

# Mapping des valeurs de la colonne "difficulty"
difficulty_mapping = {
    'A1': 0,
    'A2': 1,
    'B1': 2,
    'B2': 3,
    'C1': 4,
    'C2': 5
}

# Utiliser la fonction map pour encoder les valeurs
df['difficulty_encoded'] = df['difficulty'].map(difficulty_mapping)

unique_labels = df['difficulty_encoded'].unique()
print(unique_labels)

# Creates list of texts and labels
text = df['sentence'].to_list()
length = df['length']
labels = df['difficulty_encoded'].to_list()  # Utilisez les labels encodés

# Utilisez le tokenizer Camembert
tokenizer = CamembertTokenizer.from_pretrained("camembert-base", do_lower_case=True)


# Utilisez le tokenizer pour convertir les phrases en tokens
input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True, truncation=True) for sent in text]

# Créez des masques d'attention
attention_masks = []
# Créez un masque de 1 pour chaque token suivi de 0 pour le padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)

    # Convertissez les listes en tenseurs PyTorch
input_ids = torch.tensor(input_ids)
input_lgth = torch.tensor(length)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

# Créez un DataLoader pour gérer les lots de données
dataset = TensorDataset(input_ids, input_lgth, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, sampler=RandomSampler(dataset))

# Vous pouvez maintenant utiliser dataloader pour l'entraînement de votre modèle.
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_lgth, validation_lgth, train_labels, validation_labels, train_masks, validation_masks = train_test_split(input_ids, input_lgth, labels, attention_masks,
                                                            random_state=42, test_size=0.2)


# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs).to(device)
validation_inputs = torch.tensor(validation_inputs).to(device)
train_lgth = torch.tensor(train_lgth).to(device)
validation_lgth = torch.tensor(validation_lgth).to(device)
train_labels = torch.tensor(train_labels).to(device)
validation_labels = torch.tensor(validation_labels).to(device)
train_masks = torch.tensor(train_masks).to(device)
validation_masks = torch.tensor(validation_masks).to(device)


# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop,
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_lgth, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_lgth, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)
model.to(device)


param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]
from transformers import AdamW
from sklearn.metrics import accuracy_score

# Define the optimizer and set the learning rate
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)


# Store our loss and accuracy for plotting if we want to visualize training evolution per epochs after the training process
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
    # Tracking variables for training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the model
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_lgth, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        # Get loss value
        loss = outputs.loss
        # Add it to train loss list
        train_loss_set.append(loss.item())
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
# Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # Tracking variables for validation
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Validation of the model
    model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_lgth, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs =  model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            logits = outputs.logits

        # Move logits and labels to CPU if GPU is used
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)

        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))







[4 0 2 1 3 5]


sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  train_inputs = torch.tensor(train_inputs).to(device)
  validation_inputs = torch.tensor(validation_inputs).to(device)
  train_lgth = torch.tensor(train_lgth).to(device)
  validation_lgth = torch.tensor(validation_lgth).to(device)
  train_labels = torch.tensor(train_labels).to(device)
  validation_labels = torch.tensor(validation_labels).to(device)
  train_masks = torch.tensor(train_masks).to(device)
  validation_masks = torch.tensor(validation_masks).to(device)


model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

Train loss: 1.5818481507400672


Epoch:  10%|█         | 1/10 [01:35<14:20, 95.58s/it]

Validation Accuracy: 0.44479166666666664
Train loss: 1.2820983706663052


Epoch:  20%|██        | 2/10 [03:12<12:51, 96.47s/it]

Validation Accuracy: 0.478125
Train loss: 1.1358435655633607


Epoch:  30%|███       | 3/10 [04:49<11:14, 96.42s/it]

Validation Accuracy: 0.5364583333333334
Train loss: 0.9868515074873964


Epoch:  40%|████      | 4/10 [06:25<09:37, 96.30s/it]

Validation Accuracy: 0.5416666666666666
Train loss: 0.8404595196867982


Epoch:  50%|█████     | 5/10 [08:01<08:01, 96.28s/it]

Validation Accuracy: 0.5552083333333333
Train loss: 0.6763753799411157


Epoch:  60%|██████    | 6/10 [09:37<06:25, 96.31s/it]

Validation Accuracy: 0.565625
Train loss: 0.5099224116032323


Epoch:  70%|███████   | 7/10 [11:13<04:48, 96.27s/it]

Validation Accuracy: 0.5739583333333333
Train loss: 0.40602117984866104


Epoch:  80%|████████  | 8/10 [12:50<03:12, 96.22s/it]

Validation Accuracy: 0.5520833333333334
Train loss: 0.2923111803053568


Epoch:  90%|█████████ | 9/10 [14:26<01:36, 96.21s/it]

Validation Accuracy: 0.55
Train loss: 0.2115765274541142


Epoch: 100%|██████████| 10/10 [16:02<00:00, 96.24s/it]

Validation Accuracy: 0.5635416666666667





In [3]:
# Charger le nouveau jeu de données
# Remplacez 'new_data.csv' par le chemin de votre fichier de nouvelles phrases
new_df = pd.read_csv('unlabelled_test_data_length.csv')
new_texts = new_df['sentence'].tolist()  # Assurez-vous que la colonne contient les phrases
new_length = new_df['length']
# Préparer les données pour le modèle
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
new_input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True, truncation=True) for sent in new_texts]
new_attention_masks = [[float(i > 0) for i in seq] for seq in new_input_ids]

# Convertir en tenseurs
new_input_ids = torch.tensor(new_input_ids)
new_input_lgth = torch.tensor(new_length)
new_attention_masks = torch.tensor(new_attention_masks)

# Créer un DataLoader
prediction_data = TensorDataset(new_input_ids, new_input_lgth, new_attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Prédiction
model.eval()
predictions = []

for batch in prediction_dataloader:
    # Ajouter batch à GPU
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_input_lgth, b_input_mask = batch

    with torch.no_grad():
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)

    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    predictions.append(logits)

# Convertir les prédictions en étiquettes de difficulté
predicted_labels = [np.argmax(p, axis=1).flatten() for p in predictions]
predicted_labels = np.concatenate(predicted_labels)

# Créer un DataFrame pour le CSV
output_df = pd.DataFrame({
    'id': new_df.index,  # ou une autre colonne d'identification si disponible
    'difficulty': [list(difficulty_mapping.keys())[list(difficulty_mapping.values()).index(label)] for label in predicted_labels]
})

# Enregistrer en CSV
output_df.to_csv('predicted_difficulties5.csv', index=False)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


# Nouvelle section