In [1]:
import pandas as pd
import numpy as np
#we read the training data
df = pd.read_csv('../training/training_data.csv')

In [2]:
# Drop rows with missing 'sentence' or 'difficulty' in training data
df = df.dropna(subset=['sentence', 'difficulty'])
# remove duplicates
df = df.drop_duplicates(subset=['sentence'])
#add a column with the number of words in the text ('sentence' column)
df['n_words'] = df['sentence'].apply(lambda x: len(x.split()))
#add a column with the average length of the words in the text ('sentence' column)
df['avg_word_length'] = df['sentence'].apply(lambda x: np.mean([len(w) for w in x.split()]))
df

Unnamed: 0,id,sentence,difficulty,n_words,avg_word_length
0,0,Les coûts kilométriques réels peuvent diverger...,C1,38,5.736842
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,12,4.250000
2,2,Le test de niveau en français est sur le site ...,A1,13,4.153846
3,3,Est-ce que ton mari est aussi de Boston?,A1,8,4.125000
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,34,5.176471
...,...,...,...,...,...
4795,4795,"C'est pourquoi, il décida de remplacer les hab...",B2,26,5.384615
4796,4796,Il avait une de ces pâleurs splendides qui don...,C1,21,4.666667
4797,4797,"Et le premier samedi de chaque mois, venez ren...",A2,14,4.785714
4798,4798,Les coûts liés à la journalisation n'étant pas...,C2,32,6.093750


In [3]:
# we first want to understand the features of our dataset, so we will see how many sentences are available per level of difficulty
print(df.groupby('difficulty').size())

difficulty
A1    813
A2    795
B1    795
B2    792
C1    798
C2    807
dtype: int64


# Preprocessing
## 1. Lemmatization

In [8]:
X = df['sentence']
# Lemmatization # we use fr_core_news_md because we are working with French text
import spacy
nlp = spacy.load('fr_core_news_md')
# Assuming X is a pandas Series
def lemmatize_text(doc):
    return ' '.join([token.lemma_ for token in doc])

# Use spaCy's pipe method for more efficient batch processing
lemmatized_texts = [lemmatize_text(doc) for doc in nlp.pipe(X)]

# Convert list back to pandas Series if necessary
X_lemmatized = pd.Series(lemmatized_texts)

Separate the data into train and test:

In [4]:
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder
y = df['difficulty'].values

# Define the order of your labels
labels_ordered = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Manually fit the encoder to the ordered labels
encoder.fit(labels_ordered)

# Encode your actual labels
y_encoded = encoder.transform(y)

# Output the encoding to verify
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Label mapping:", label_mapping)

X_train, X_test, y_train, y_test = train_test_split(X_lemmatized, y_encoded, test_size=0.2, random_state=42)


Label mapping: {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}


Create tokenizer and model

In [5]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification, FlaubertTokenizer, FlaubertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
# 1) tokenization > used to encode the sentences
# we could do the tokenization either with Camembert or Flaubert
# 2) padding > used to make all the sentences of the same length
# 3) attention masks > to give the same weight to all the words, regardless of their length
num_classes = df['difficulty'].nunique()
chosen_tokenizer = 'camembert'

if chosen_tokenizer == 'camembert':
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_classes)
elif chosen_tokenizer == 'flaubert':
    tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=num_classes)

tokenized = X_train.apply((lambda x_: tokenizer.encode(x_, add_special_tokens=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)
# now we load the data into a torch dataloader 
# respecting the input expected by the BERT model

input_ids = torch.tensor(padded)
#create the attention mask copying with sourceTensor.clone()
attention_mask_tensor = torch.tensor(attention_mask)
labels = torch.tensor(y_train)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
dataset = TensorDataset(input_ids, attention_mask_tensor, labels)
dataloader = DataLoader(dataset, batch_size=32)
from transformers import  get_linear_schedule_with_warmup
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader)*epochs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# training loop
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
epochs = 1
for epoch in range(1, epochs+1):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch[0]))})
    torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')
    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total/len(dataloader)
    tqdm.write(f'Training loss: {loss_train_avg}')

                                                                                


Epoch 1
Training loss: 1.6941598991552989


Predicting Levels
We can now use our model to predict the level of a text. To do this, we need to correctly encode our text in the same way as our data was encoded during training.

In [9]:
def predict_text(text, device):
    encoded_text = tokenizer.encode_plus(
        text,
        max_length=128,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output[0], dim=1)
    return prediction[0].item()

y_pred = []
for text in tqdm(X_test):
    y_pred.append(predict_text(text, device))
    
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))

  0%|          | 0/960 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 960/960 [01:28<00:00, 10.83it/s]

              precision    recall  f1-score   support

          A1       0.42      0.94      0.58       166
          A2       0.29      0.15      0.19       158
          B1       0.42      0.10      0.17       166
          B2       0.44      0.12      0.19       153
          C1       0.29      0.11      0.16       152
          C2       0.37      0.83      0.51       165

    accuracy                           0.38       960
   macro avg       0.37      0.37      0.30       960
weighted avg       0.37      0.38      0.30       960




