In [None]:
!pip3 install tokenizer
!pip3 install sentencepiece
import pandas as pd
import torch
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
from transformers import AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import CamembertForSequenceClassification, CamembertTokenizer
from tqdm import trange
import nltk
import tokenizer
import re
from nltk.tokenize import word_tokenize
import string
from sklearn.metrics import accuracy_score

# Maximum sequence length in tokens: longer sentences are truncated,
# shorter ones are padded up to this length.
MAX_LEN = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the labelled training set (the whole file is used; no sub-sampling is done here).
df = pd.read_csv('training_data.csv').reset_index(drop=True)
df.head()

# Map each CEFR difficulty level to an integer class index.
difficulty_mapping = {
    'A1': 0,
    'A2': 1,
    'B1': 2,
    'B2': 3,
    'C1': 4,
    'C2': 5
}

# Encode the "difficulty" column with the mapping above.
df['difficulty_encoded'] = df['difficulty'].map(difficulty_mapping)

# `map` yields NaN for any value missing from the mapping, which would silently
# turn the labels into floats and break the classification loss. Fail early.
unmapped_difficulties = df.loc[df['difficulty_encoded'].isna(), 'difficulty'].unique()
if len(unmapped_difficulties) > 0:
    raise ValueError(
        f"Unknown difficulty labels in training data: {list(unmapped_difficulties)}"
    )
df['difficulty_encoded'] = df['difficulty_encoded'].astype(int)

unique_labels = df['difficulty_encoded'].unique()
print(unique_labels)

# Build the parallel lists of sentences and encoded labels.
text = df['sentence'].to_list()
labels = df['difficulty_encoded'].to_list()

# Initialise the tokenizer and the classification model (one output per difficulty level).
tokenizer = CamembertTokenizer.from_pretrained("camembert-base", do_lower_case=True)
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base", num_labels=len(difficulty_mapping)
)
model.to(device)

# Default batch size used for the data loaders below.
batch_size = 16

# Tokenise the sentences. The attention mask is taken from the tokenizer rather
# than derived with `token_id > 0`: CamemBERT's padding token id is not 0, so
# that test would mark every padding position as a real token.
encoded_inputs = tokenizer(
    text,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Tensors consumed by the data loaders.
input_ids = encoded_inputs['input_ids']
attention_masks = encoded_inputs['attention_mask']
labels2 = torch.tensor(labels)

# Split into training and validation sets. train_test_split returns a
# (train, validation) pair per input array, in the order the arrays are passed.
train_inputs, validation_inputs, train_labels, validation_labels, train_masks, validation_masks = train_test_split(
    input_ids, labels2, attention_masks, random_state=42, test_size=0.2
)

# Training loader: shuffled every epoch.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# Validation loader: fixed order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

# Train the model, then evaluate it on the validation set after each epoch
def train_and_evaluate(model, learning_rate, batch_size, num_epochs, device, df, train_dataloader, validation_dataloader, eps):
    """Fine-tune ``model`` and return its validation accuracy after the last epoch.

    Args:
        model: Sequence-classification model; it is trained in place.
        learning_rate: Learning rate passed to the optimizer.
        batch_size: Unused here (the batch size is determined by the data
            loaders that are passed in); kept for interface compatibility.
        num_epochs: Number of passes over ``train_dataloader``.
        device: Device the batches are moved to before the forward pass.
        df: Unused; kept for interface compatibility.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        validation_dataloader: Yields batches in the same layout, used for scoring.
        eps: Epsilon passed to the optimizer.

    Returns:
        Fraction of validation examples classified correctly after the final
        epoch, or ``0.0`` if no epoch ran or the validation loader is empty.
    """
    # NOTE(review): transformers' AdamW is deprecated upstream in favour of
    # torch.optim.AdamW; consider switching together with the file's imports.
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)

    # Defined up front so the function still returns a value when num_epochs is 0.
    eval_accuracy = 0.0

    for _ in trange(num_epochs, desc="Epoch"):
        # --- Training pass ---
        model.train()
        for batch in train_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            optimizer.zero_grad()
            # Passing `labels` makes the model compute the loss itself.
            outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask, labels=b_labels)
            outputs.loss.backward()
            optimizer.step()

        # --- Validation pass ---
        # Count correct predictions over the whole set instead of averaging
        # per-batch accuracies, which over-weights a smaller final batch.
        correct_predictions = 0
        total_examples = 0
        model.eval()
        for batch in validation_dataloader:
            b_input_ids, b_input_mask, b_labels = (t.to(device) for t in batch)
            with torch.no_grad():
                logits = model(input_ids=b_input_ids, attention_mask=b_input_mask).logits

            predicted_classes = logits.argmax(dim=1).view(-1)
            correct_predictions += (predicted_classes == b_labels.view(-1)).sum().item()
            total_examples += b_labels.numel()

        # Guard against an empty validation loader.
        eval_accuracy = correct_predictions / total_examples if total_examples else 0.0

    return eval_accuracy

# Hyperparameter grid search: every combination below is trained from the same
# pretrained checkpoint and scored on the validation set.
learning_rates = [1e-5, 2e-5, 5e-5]
batch_sizes = [8, 16, 32]
epochs = [3, 5, 7]
eps_values = [1e-8, 1e-7, 1e-6]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# `df` is already loaded (and label-encoded) earlier in this cell, so it is not
# re-read here; reloading it would drop the encoded label column.

best_accuracy = 0
best_model_path = "best_model.pt"

for lr in learning_rates:
    for batch in batch_sizes:
        # Rebuild the loaders so the batch size under test is actually applied;
        # separate names keep the default loaders defined above untouched.
        grid_train_dataloader = DataLoader(
            train_data, sampler=RandomSampler(train_data), batch_size=batch
        )
        grid_validation_dataloader = DataLoader(
            validation_data, sampler=SequentialSampler(validation_data), batch_size=batch
        )
        for epoch in epochs:
            for eps in eps_values:
                # Start every configuration from fresh pretrained weights.
                # Reusing one model instance would keep fine-tuning the weights
                # left by the previous configuration, making scores incomparable.
                model = CamembertForSequenceClassification.from_pretrained("camembert-base", num_labels=6)
                model.to(device)

                accuracy = train_and_evaluate(
                    model, lr, batch, epoch, device, df,
                    grid_train_dataloader, grid_validation_dataloader, eps,
                )
                print(f"LR: {lr}, Batch: {batch}, Epoch: {epoch}, Eps: {eps}, Accuracy: {accuracy}")

                # Keep only the weights of the best configuration seen so far.
                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    torch.save(model.state_dict(), best_model_path)
                    print(f"Meilleur modèle sauvegardé avec accuracy: {accuracy}")


In [None]:
# Load the best model. Training saves a state dict with torch.save to
# `best_model_path`, so the architecture is rebuilt from the base checkpoint and
# the fine-tuned weights are loaded into it (from_pretrained expects a model
# directory or hub id, not a state-dict file).
model = CamembertForSequenceClassification.from_pretrained(
    'camembert-base', num_labels=len(difficulty_mapping)
)
model.load_state_dict(torch.load(best_model_path, map_location=device))
model.to(device)

# Load the unlabelled data set; the 'sentence' column holds the texts to classify.
new_df = pd.read_csv('unlabelled_test_data.csv')
new_texts = new_df['sentence'].tolist()

# Tokenise the sentences. The attention mask comes from the tokenizer rather
# than from `token_id > 0`: CamemBERT's padding token id is not 0, so that test
# would mark padding positions as real tokens.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)
new_encoded_inputs = tokenizer(
    new_texts,
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
new_input_ids = new_encoded_inputs['input_ids']
new_attention_masks = new_encoded_inputs['attention_mask']

# Build a sequential loader so predictions stay aligned with the rows of new_df.
prediction_data = TensorDataset(new_input_ids, new_attention_masks)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

# Run inference and collect the raw logits of each batch.
model.eval()
predictions = []

for batch in prediction_dataloader:
    b_input_ids, b_input_mask = (t.to(device) for t in batch)

    with torch.no_grad():
        outputs = model(input_ids=b_input_ids, attention_mask=b_input_mask)

    predictions.append(outputs.logits.detach().cpu().numpy())

# Turn the logits into class indices (np.concatenate rejects an empty list).
if predictions:
    predicted_labels = np.concatenate([np.argmax(p, axis=1).flatten() for p in predictions])
else:
    predicted_labels = np.array([], dtype=int)

# Invert the label mapping once instead of searching two rebuilt lists per prediction.
index_to_difficulty = {index: level for level, index in difficulty_mapping.items()}

# Build the submission frame; use the file's own 'id' column when it has one,
# otherwise fall back to the row index.
output_df = pd.DataFrame({
    'id': new_df['id'] if 'id' in new_df.columns else new_df.index,
    'difficulty': [index_to_difficulty[int(label)] for label in predicted_labels]
})

# Write the predictions to CSV.
output_df.to_csv('predicted_difficulties.csv', index=False)
