# Read the training data

In [11]:
import pandas as pd
import numpy as np
#we read the training data
df = pd.read_csv('../training/training_data.csv')

In [43]:
# Drop rows with missing 'sentence' or 'difficulty' in training data
df = df.dropna(subset=['sentence', 'difficulty'])
# remove duplicates
df = df.drop_duplicates(subset=['sentence'])
df.reset_index(drop=True, inplace=True)
#add a column with the number of words in the text ('sentence' column)
df['n_words'] = df['sentence'].apply(lambda x: len(x.split()))
#add a column with the average length of the words in the text ('sentence' column)
df['avg_word_length'] = df['sentence'].apply(lambda x: np.mean([len(w) for w in x.split()]))
df

Unnamed: 0,id,sentence,difficulty,n_words,avg_word_length
0,0,Les coûts kilométriques réels peuvent diverger...,C1,38,5.736842
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,12,4.250000
2,2,Le test de niveau en français est sur le site ...,A1,13,4.153846
3,3,Est-ce que ton mari est aussi de Boston?,A1,8,4.125000
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,34,5.176471
...,...,...,...,...,...
4795,4795,"C'est pourquoi, il décida de remplacer les hab...",B2,26,5.384615
4796,4796,Il avait une de ces pâleurs splendides qui don...,C1,21,4.666667
4797,4797,"Et le premier samedi de chaque mois, venez ren...",A2,14,4.785714
4798,4798,Les coûts liés à la journalisation n'étant pas...,C2,32,6.093750


In [44]:
# we first want to understand the features of our dataset, so we will see how many sentences are available per level of difficulty
print(df.groupby('difficulty').size())

difficulty
A1    813
A2    795
B1    795
B2    792
C1    798
C2    807
dtype: int64


# Preprocessing of the target variable
## 1. Label encoding of the target variable


In [45]:
from sklearn.preprocessing import LabelEncoder
y = df['difficulty'].values

# Define the order of your labels
labels_ordered = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# Initialize the LabelEncoder
encoder = LabelEncoder()

# Manually fit the encoder to the ordered labels
encoder.fit(labels_ordered)

# Encode your actual labels
y_encoded = encoder.transform(y)

# Output the encoding to verify
label_mapping = dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))
print("Label mapping:", label_mapping)
print(y, y_encoded)

Label mapping: {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
['C1' 'A1' 'A1' ... 'A2' 'C2' 'C2'] [4 0 0 ... 1 5 5]


# Preprocessing of the features
## 1. Tokenization:
Tokenization is the first step. It consists in breaking down each sentence into individual words or subwords. For french we can use a CamemBERT tokenizer or a FlauBert tokenizer.


NB: FlauBERT: Uses </w> to denote the end of a word. This is typical of tokenizers that use subword segmentation like SentencePiece or BPE (Byte Pair Encoding) that FlauBERT employs. This method helps in handling unknown words better by breaking down words into more frequently occurring subwords.
CamemBERT: Uses a special character (▁, an underscore) to denote the beginning of a new word. This is common in tokenizers that are designed to handle languages where whitespace alone isn't a reliable separator of words.
## 2. Attention Masking, Padding and Truncation

In [46]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification, FlaubertTokenizer, FlaubertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
# 1) tokenization > used to encode the sentences
# we could do the tokenization either with Camembert or Flaubert
num_classes = df['difficulty'].nunique()
chosen_tokenizer = 'camembert'

if chosen_tokenizer == 'camembert':
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_classes)
elif chosen_tokenizer == 'flaubert':
    tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=num_classes)

tokens = tokenizer.tokenize("Ceci est une phrase française.")
print(tokens)

# now we can proceed with the tokenization of the sentences
#the dataclass handles the tokenization of sentences. it also uses the tokenizer to convert the tokens into 
# input ids (sequences of ints that uniquely identify each token in the vocabulary)
# and attention masks (sequences of 1s and 0s that indicate which tokens should be attended to and which should not)
# the dataclass also handles padding and truncation of the sentences (most ml models require consistent size)
# it also handles labels when available so that it can be used for training and validation
class CEFRDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len, labels=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }
        # Add labels to your dataset only if they are provided, i.e., during training and validation.
        if self.labels is not None:
            label = self.labels[idx]
            item['labels'] = torch.tensor(label, dtype=torch.long)
        return item


dataset = CEFRDataset(df['sentence'], tokenizer, max_len=128, labels=y_encoded)



Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


['▁Ceci', '▁est', '▁une', '▁phrase', '▁française', '.']


## Train the model
### We do k-fold cross validation to find the best hyperparameters for the model

In [61]:
from torch.utils.data import DataLoader
#split the dataset into training and validation datasets (in the future we will do a k-fold cross validation)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
# Define number of folds
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using {device} device")
model.to(device)  # Move model to GPU if available

num_epochs = 5


n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

import torch
import tqdm
from transformers import AdamW, get_scheduler

# Define your learning rate and other hyperparameters
learning_rates = [1e-5, 5e-5, 1e-4]  # Example learning rates to test

# Initialize results container for hyperparameter tuning
hyperparam_results = {}

for lr in learning_rates:
    fold_results = []
    for fold, (train_idx, val_idx) in enumerate(kf.split(df)):
        print(f"Training with lr={lr}, fold {fold+1}/{n_splits}")

        # Split data according to current fold
        train_sentences = df.iloc[train_idx]['sentence'].reset_index(drop=True)
        train_labels = y_encoded[train_idx]
        val_sentences = df.iloc[val_idx]['sentence'].reset_index(drop=True)
        val_labels = y_encoded[val_idx]

        # Create datasets
        train_dataset = CEFRDataset(train_sentences, tokenizer, max_len=128, labels=train_labels)
        val_dataset = CEFRDataset(val_sentences, tokenizer, max_len=128, labels=val_labels)

        # Create data loaders
        train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

        # Initialize model and move it to the device
        model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(np.unique(y_encoded)))
        model.to(device)

        # Initialize optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=lr)
        num_training_steps = num_epochs * len(train_loader)
        lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

        # Training loop
        model.train()
        for epoch in range(num_epochs):
            for batch in tqdm.tqdm(train_loader, desc=f"Epoch {epoch + 1} Training"):
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()

        # Validation loop
        total_eval_accuracy = 0
        model.eval()
        for batch in tqdm.tqdm(val_loader, desc="Validating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1)
            accuracy = (predictions == batch['labels']).float().mean()
            total_eval_accuracy += accuracy.item()

        avg_val_accuracy = total_eval_accuracy / len(val_loader)
        fold_results.append(avg_val_accuracy)
        print(f"Fold {fold+1} validation accuracy: {avg_val_accuracy}")

    # Average accuracy across folds for current learning rate
    average_accuracy = sum(fold_results) / len(fold_results)
    hyperparam_results[lr] = average_accuracy
    print(f"Average validation accuracy with lr={lr}: {average_accuracy}")

# Identify best performing hyperparameters
best_lr = max(hyperparam_results, key=hyperparam_results.get)
print(f"Best learning rate: {best_lr} with an average validation accuracy of {hyperparam_results[best_lr]}")

# hyperparameter tuning


Using cpu device
Training with lr=1e-05, fold 1/5


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 240/240 [09:49<00:00,  2.46s/it]
Epoch 2 Training: 100%|██████████| 240/240 [09:44<00:00,  2.44s/it]
Epoch 3 Training: 100%|██████████| 240/240 [09:59<00:00,  2.50s/it]
Epoch 4 Training: 100%|██████████| 240/240 [07:28<00:00,  1.87s/it]
Epoch 5 Training: 100%|██████████| 240/240 [06:54<00:00,  1.73s/it]
Validating: 100%|██████████| 60/60 [00:25<00:00,  2.36it/s]


Fold 1 validation accuracy: 0.5489583333333333
Training with lr=1e-05, fold 2/5


Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1 Training: 100%|██████████| 240/240 [06:50<00:00,  1.71s/it]
Epoch 2 Training: 100%|██████████| 240/240 [06:49<00:00,  1.71s/it]
Epoch 3 Training:  40%|███▉      | 95/240 [07:48<04:40,  1.93s/it]  

In [52]:
from sklearn.metrics import classification_report, confusion_matrix
#first maybe we need to train the model with the best hyperparameters on a 20% validation set
# Split data into training and validation sets
train_sentences, val_sentences, train_labels, val_labels = train_test_split(
    df['sentence'], y_encoded, test_size=0.1, random_state=100)

train_dataset = CEFRDataset(train_sentences, tokenizer, max_len=128, labels=train_labels)
val_dataset = CEFRDataset(val_sentences, tokenizer, max_len=128, labels=val_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=len(np.unique(y_encoded)))
model.to(device)
optimizer = AdamW(model.parameters(), lr=best_lr)

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

model.eval()
true_labels = []
predicted_labels = []
for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    true_labels.extend(batch['labels'].cpu().numpy())
    predicted_labels.extend(predictions.cpu().numpy())

print(classification_report(true_labels, predicted_labels))
print(confusion_matrix(true_labels, predicted_labels))

              precision    recall  f1-score   support

           0       0.83      0.71      0.77        91
           1       0.48      0.61      0.54        72
           2       0.63      0.52      0.57        92
           3       0.44      0.59      0.50        70
           4       0.48      0.42      0.45        71
           5       0.67      0.63      0.65        84

    accuracy                           0.59       480
   macro avg       0.59      0.58      0.58       480
weighted avg       0.60      0.59      0.59       480

[[65 24  2  0  0  0]
 [ 9 44 17  2  0  0]
 [ 4 23 48 15  0  2]
 [ 0  0  7 41 15  7]
 [ 0  0  0 24 30 17]
 [ 0  0  2 12 17 53]]


In [None]:
#todo : train on the full dataset and save the model!!!
full_dataset = CEFRDataset(df['sentence'], tokenizer, max_len=128, labels=y_encoded)
full_loader = DataLoader(full_dataset, batch_size=16, shuffle=True)
optimizer = AdamW(model.parameters(), lr=best_lr)
num_training_steps = num_epochs * len(full_loader)
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

model.train()
for epoch in range(num_epochs):
    for batch in tqdm.tqdm(full_loader, desc=f"Training Epoch {epoch+1}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

In [53]:
model_path = "/Users/vaienti/Library/CloudStorage/OneDrive-epfl.ch/2024_courses/data_science_and_machine_learning/models/"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)


('/Users/vaienti/Library/CloudStorage/OneDrive-epfl.ch/2024_courses/data_science_and_machine_learning/models/tokenizer_config.json',
 '/Users/vaienti/Library/CloudStorage/OneDrive-epfl.ch/2024_courses/data_science_and_machine_learning/models/special_tokens_map.json',
 '/Users/vaienti/Library/CloudStorage/OneDrive-epfl.ch/2024_courses/data_science_and_machine_learning/models/sentencepiece.bpe.model',
 '/Users/vaienti/Library/CloudStorage/OneDrive-epfl.ch/2024_courses/data_science_and_machine_learning/models/added_tokens.json')

In [57]:
import torch
from torch.utils.data import DataLoader

# Assuming the model and tokenizer are already loaded and set up

# Load the test dataset
test = pd.read_csv('../test/unlabelled_test_data.csv')
test_sentences = test['sentence'].reset_index(drop=True)
test_dataset = CEFRDataset(test_sentences, tokenizer, max_len=128)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Run inference
predictions = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1)
        
        predictions.extend(pred.cpu().numpy())

# Decode predictions
decoded_predictions = encoder.inverse_transform(predictions)

# Add decoded predictions to the DataFrame
test['predicted_labels'] = decoded_predictions
import copy
test_results = copy.deepcopy(test)
test_results = test_results.drop(columns=['sentence'])
test_results = test_results.rename(columns={'predicted_labels': 'difficulty'})
test_results.to_csv('../kaggle_submissions/test_with_predictions.csv', index=False)




Inference complete. Results saved to 'test_with_predictions.csv'.


## 1. Lemmatization of the sentences

Create tokenizer and model

Separate the data into train and test:

In [5]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification, FlaubertTokenizer, FlaubertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch
# 1) tokenization > used to encode the sentences
# we could do the tokenization either with Camembert or Flaubert
# 2) padding > used to make all the sentences of the same length
# 3) attention masks > to give the same weight to all the words, regardless of their length
num_classes = df['difficulty'].nunique()
chosen_tokenizer = 'camembert'

if chosen_tokenizer == 'camembert':
    tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
    model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=num_classes)
elif chosen_tokenizer == 'flaubert':
    tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
    model = FlaubertForSequenceClassification.from_pretrained('flaubert/flaubert_base_cased', num_labels=num_classes)

tokenized = X_train.apply((lambda x_: tokenizer.encode(x_, add_special_tokens=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)
padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

attention_mask = np.where(padded != 0, 1, 0)
# now we load the data into a torch dataloader 
# respecting the input expected by the BERT model

input_ids = torch.tensor(padded)
#create the attention mask copying with sourceTensor.clone()
attention_mask_tensor = torch.tensor(attention_mask)
labels = torch.tensor(y_train)
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
dataset = TensorDataset(input_ids, attention_mask_tensor, labels)
dataloader = DataLoader(dataset, batch_size=32)
from transformers import  get_linear_schedule_with_warmup
from torch.optim import AdamW

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(dataloader)*epochs)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# training loop
from tqdm import tqdm
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
epochs = 1
for epoch in range(1, epochs+1):
    model.train()
    loss_train_total = 0
    progress_bar = tqdm(dataloader, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss_train_total += loss.item()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item() / len(batch[0]))})
    torch.save(model.state_dict(), f'BERT_ft_epoch{epoch}.model')
    tqdm.write(f'\nEpoch {epoch}')
    loss_train_avg = loss_train_total/len(dataloader)
    tqdm.write(f'Training loss: {loss_train_avg}')

                                                                                


Epoch 1
Training loss: 1.6941598991552989


Predicting Levels
We can now use our model to predict the level of a text. To do this, we need to correctly encode our text in the same way as our data was encoded during training.

In [9]:
def predict_text(text, device):
    encoded_text = tokenizer.encode_plus(
        text,
        max_length=128,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)
    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output[0], dim=1)
    return prediction[0].item()

y_pred = []
for text in tqdm(X_test):
    y_pred.append(predict_text(text, device))
    
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=['A1', 'A2', 'B1', 'B2', 'C1', 'C2']))

  0%|          | 0/960 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 960/960 [01:28<00:00, 10.83it/s]

              precision    recall  f1-score   support

          A1       0.42      0.94      0.58       166
          A2       0.29      0.15      0.19       158
          B1       0.42      0.10      0.17       166
          B2       0.44      0.12      0.19       153
          C1       0.29      0.11      0.16       152
          C2       0.37      0.83      0.51       165

    accuracy                           0.38       960
   macro avg       0.37      0.37      0.30       960
weighted avg       0.37      0.38      0.30       960




