In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%%capture
!pip install transformers[torch]
!pip install datasets
!pip install pyarrow
!pip install evaluate

In [3]:
import pandas as pd

In [4]:
my_data = pd.read_csv("/content/drive/MyDrive/Data Science and Machine Learning/Project_Data_Science/training_data.csv")

In [5]:
my_data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [6]:
my_data.shape

(4800, 3)

In [7]:
from transformers import AutoModel, AutoTokenizer
from transformers import CamembertTokenizer
from typing import Dict

In [8]:
%%capture
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

In [9]:
def process_data(row) -> Dict:
    # Clean the text
    text = row['sentence']
    text = str(text)
    text = ' '.join(text.split())

    # Get tokens
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=256)

    # Convert difficulty labels to integers
    difficulty_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
    label = difficulty_mapping.get(row['difficulty'], 0)

    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [10]:
#test
print(process_data({'sentence': "J'aime les films d'actions",'difficulty': 'B2'}))

{'input_ids': [5, 121, 11, 660, 19, 1379, 18, 11, 6379, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [11]:
# Store the encodings into an array to generate dataset
processed_data = []

for i in range(len(my_data[:4800])):
        processed_data.append(process_data(my_data.iloc[i]))

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(new_df,test_size=0.2,random_state=2022)

In [14]:
import pyarrow as pa
from datasets import Dataset

In [15]:
train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

In [16]:
print(type(train_hg))

<class 'datasets.arrow_dataset.Dataset'>


In [17]:
from transformers import AutoModelForSequenceClassification

In [18]:
%%capture
model1 = AutoModelForSequenceClassification.from_pretrained('camembert-base',num_labels=6)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [19]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [20]:
def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [2]:
from transformers import TrainingArguments, Trainer

In [1]:
import accelerate


('4.35.2', '0.24.1')

In [4]:
training_args = TrainingArguments(output_dir="/content/drive/MyDrive/Data Science and Machine Learning/Project_Data_Science/result_V3", evaluation_strategy="epoch")

trainer = Trainer(model=model,args=training_args,train_dataset=train_hg,eval_dataset=valid_hg,tokenizer=tokenizer,compute_metrics=compute_metrics)

NameError: name 'model' is not defined

In [23]:
trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,1.064689,0.538542,0.526041,0.538542,0.520443
2,1.268500,1.036839,0.569792,0.57691,0.569792,0.569426
3,0.871400,1.145427,0.569792,0.574783,0.569792,0.567941


TrainOutput(global_step=1440, training_loss=0.9392418967352973, metrics={'train_runtime': 578.089, 'train_samples_per_second': 19.928, 'train_steps_per_second': 2.491, 'total_flos': 1515574107832320.0, 'train_loss': 0.9392418967352973, 'epoch': 3.0})

In [24]:
trainer.evaluate()

{'eval_loss': 1.1454271078109741,
 'eval_accuracy': 0.5697916666666667,
 'eval_precision': 0.5747831578482807,
 'eval_recall': 0.5697916666666667,
 'eval_f1': 0.5679411129233926,
 'eval_runtime': 13.5758,
 'eval_samples_per_second': 70.714,
 'eval_steps_per_second': 8.839,
 'epoch': 3.0}

In [25]:
model1.save_pretrained('/content/drive/MyDrive/Data Science and Machine Learning/Project_Data_Science/train_model&test_bert_V3/')

### Time for predictions


In [27]:
%%capture
!pip install kaggle
!mkdir ~/.kaggle
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json

In [28]:
! kaggle competitions download -c detecting-french-texts-difficulty-level-2023
from zipfile import ZipFile
with ZipFile('detecting-french-texts-difficulty-level-2023.zip','r') as zip:
  zip.extractall(path="")

Downloading detecting-french-texts-difficulty-level-2023.zip to /content
100% 303k/303k [00:00<00:00, 477kB/s]
100% 303k/303k [00:00<00:00, 476kB/s]


In [29]:
import torch

In [30]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
new_model = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/Data Science and Machine Learning/Project_Data_Science/train_model&test_bert_V3/').to(device)

In [31]:
unlabelled_data = pd.read_csv('unlabelled_test_data.csv', index_col = 'id')

In [32]:
new_tokenizer = AutoTokenizer.from_pretrained('camembert-base')

In [47]:
def get_prediction(text):
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
    encoding = {k: v.to(trainer.model.device) for k,v in encoding.items()}

    outputs = new_model(**encoding)

    logits = outputs.logits
    #device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    softmax = torch.nn.Softmax(dim=-1)
    probs = softmax(logits.squeeze().cpu())
    probs = probs.detach().numpy()

    predicted_class = np.argmax(probs, axis=-1)

    # Map the predicted class to your specific difficulty levels
    difficulty_mapping = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}
    predicted_difficulty = difficulty_mapping[predicted_class.item()]

    # Création du résultat sous la forme souhaitée
    result = {
        'predicted_difficulty': predicted_difficulty,
        'probability': probs[predicted_class].item()
    }

    return result

In [48]:
from tqdm import tqdm

In [49]:
# Initialiser une liste vide pour stocker les prédictions
predicted_difficulties = []

# Loop à travers chaque ligne dans le test_dataset et obtenir les prédictions
for index, row in tqdm(unlabelled_data.iterrows(), total=len(unlabelled_data), desc="Making Predictions"):
    sentence = row['sentence']
    prediction_result = get_prediction(sentence)

    # Supposons que votre prediction_result ressemble à {'predicted_difficulty': 'A1', 'probability': 0.95}
    predicted_difficulty = prediction_result['predicted_difficulty']
    predicted_difficulties.append(predicted_difficulty)

# Ajouter une nouvelle colonne 'difficulty' à votre test_dataset
unlabelled_data['difficulty'] = predicted_difficulties

Making Predictions: 100%|██████████| 1200/1200 [00:33<00:00, 35.92it/s]


In [50]:
predictions = unlabelled_data[['difficulty']]

In [51]:
predictions.head()

Unnamed: 0_level_0,difficulty
id,Unnamed: 1_level_1
0,C2
1,A2
2,B1
3,A2
4,C2


In [52]:
predictions.to_csv('submission.csv')

In [54]:
#! kaggle competitions submit -c detecting-french-texts-difficulty-level-2023 -f submission.csv -m "Sample submission"

100% 8.30k/8.30k [00:02<00:00, 3.86kB/s]
Successfully submitted to Detecting the difficulty level of French texts

## Test accuracy score: 0.574