In [8]:
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers import CamembertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertTokenizer
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from tqdm import trange

In [9]:
epochs = 5
MAX_LEN = 128
batch_size = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [10]:
# Load the dataset, I selected only 5000 sample because of memory limitation
df = pd.read_csv('training_data_cleaned.csv').reset_index(drop=True)
df.head()

Unnamed: 0,id,sentence,difficulty,cleaned_text
0,0,Les coûts kilométriques réels peuvent diverger...,C1,cots kilomtriques rels peuvent diverger sensib...
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,bleu cest couleur prfre naime vert
2,2,Le test de niveau en français est sur le site ...,A1,test niveau franais site internet lcole
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce mari aussi boston
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,coles commerce couloirs places financires arri...


In [11]:
# Mapping des valeurs de la colonne "difficulty"
difficulty_mapping = {
    'A1': 0,
    'A2': 1,
    'B1': 2,
    'B2': 3,
    'C1': 4,
    'C2': 5
}


In [12]:
# Utiliser la fonction map pour encoder les valeurs
df['difficulty_encoded'] = df['difficulty'].map(difficulty_mapping)


In [13]:
unique_labels = df['difficulty_encoded'].unique()
print(unique_labels)


[4 0 2 1 3 5]


In [2]:
tokenizer = CamembertTokenizer.from_pretrained('camembert-base',do_lower_case=True)

NameError: name 'CamembertTokenizer' is not defined

In [None]:
# Creates list of texts and labels
text = df['sentence'].to_list()
labels = df['difficulty_encoded'].to_list()  # Utilisez les labels encodés


In [None]:
# Utilisez le tokenizer Camembert
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)


In [None]:
# Utilisez le tokenizer pour convertir les phrases en tokens
input_ids = [tokenizer.encode(sent, add_special_tokens=True, max_length=MAX_LEN, pad_to_max_length=True, truncation=True) for sent in text]


In [None]:
# Créez des masques d'attention
attention_masks = []
# Créez un masque de 1 pour chaque token suivi de 0 pour le padding
for seq in input_ids:
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)


In [None]:
# Convertissez les listes en tenseurs PyTorch
input_ids = torch.tensor(input_ids)
attention_masks = torch.tensor(attention_masks)
labels = torch.tensor(labels)

# Créez un DataLoader pour gérer les lots de données
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, sampler=RandomSampler(dataset))

In [None]:
# Vous pouvez maintenant utiliser dataloader pour l'entraînement de votre modèle.
# Use train_test_split to split our data into train and validation sets for training
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(input_ids, labels, attention_masks,
                                                            random_state=42, test_size=0.2)


In [None]:
# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs).to(device)
validation_inputs = torch.tensor(validation_inputs).to(device)
train_labels = torch.tensor(train_labels).to(device)
validation_labels = torch.tensor(validation_labels).to(device)
train_masks = torch.tensor(train_masks).to(device)
validation_masks = torch.tensor(validation_masks).to(device)


In [None]:
# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)


In [None]:
# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top. 
model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)
model.to(device)


In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]


In [None]:
from transformers import AdamW
from sklearn.metrics import accuracy_score

# Define the optimizer and set the learning rate
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return accuracy_score(labels_flat, pred_flat)


In [None]:
# Store our loss and accuracy for plotting if we want to visualize training evolution per epochs after the training process
train_loss_set = []

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):  
    # Tracking variables for training
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
  
    # Train the model
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()
        # Forward pass
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
        # Get loss value
        loss = outputs.loss
        # Add it to train loss list
        train_loss_set.append(loss.item())    
        # Backward pass
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
# Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

    print("Train loss: {}".format(tr_loss/nb_tr_steps))


In [15]:
# Tracking variables for validation
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Validation of the model
    model.eval()
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        # Add batch to device CPU or GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        # Telling the model not to compute or store gradients, saving memory and speeding up validation
        with torch.no_grad():
            # Forward pass, calculate logit predictions
            outputs =  model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            logits = outputs.logits
    
        # Move logits and labels to CPU if GPU is used
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()

        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
        eval_accuracy += tmp_eval_accuracy
        nb_eval_steps += 1

    print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

#52% accuracy on validation set

IndentationError: unexpected indent (3014841711.py, line 2)

In [None]:
#hello, bonne sieste 