In [8]:
%%capture
# Installing the 'datasets' package for convenient access to various datasets
!pip install datasets

# Installing the 'transformers' package with optional torch support
!pip install transformers[torch]

# Installing the 'googletrans' package with version 4.0.0-rc1 for translation capabilities
!pip install googletrans==4.0.0-rc1

In [9]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyarrow as pa
import torch.nn as nn
import string

# Importing transformers for natural language processing tasks
from transformers import AutoModelForSequenceClassification
from transformers import AutoModel, AutoTokenizer
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from transformers import CamembertModel, CamembertTokenizer
from typing import Dict
from googletrans import Translator

# Reading the four CSV files from the remote repository in a single pass.
# The URLs are listed in the same order as the names on the left-hand side:
# sample submission, labelled training data, unlabelled test data, supplementary data.
sample, train_dataset, test_dataset, supp = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/sample_submission.csv",
        "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/training_data.csv",
        "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/unlabelled_test_data.csv",
        "https://raw.githubusercontent.com/Bratuz/Jaeger/master/detecting-french-texts-difficulty-level-2023/Data/translation.csv",
    )
)

In [10]:
# Removing the 'id' column from the training data (modifies train_dataset in place)
train_dataset.drop(columns='id', inplace=True)

In [11]:
# Appending the supplementary rows below the training rows and renumbering the index from 0
train_dataset = pd.concat((train_dataset, supp), axis=0, ignore_index=True)

In [12]:
# Loading the tokenizer that belongs to the 'camembert-base' pre-trained checkpoint
tokenizer_checkpoint = 'camembert-base'
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

In [13]:
def process_data(row):
    """Turn one dataset row into tokenizer encodings plus a numeric label.

    Args:
        row: Mapping-like row providing a 'sentence' and a 'difficulty' entry.

    Returns:
        The output of the global `tokenizer` for the whitespace-normalised
        sentence (padded/truncated to 256 tokens), extended with two entries:
        'label' (0-5 for 'A1'..'C2') and 'text' (the normalised sentence).
        A difficulty value outside the known levels is reported with a printed
        message and the label falls back to 0.
    """
    # Force the sentence to a string and collapse every run of whitespace to one space
    text = ' '.join(str(row['sentence']).split())

    # Fixed-length encoding: pad or truncate to exactly 256 tokens
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=256)

    # Difficulty levels in ascending order; the position in the tuple is the class id
    difficulty_mapping = {
        level: class_id
        for class_id, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'))
    }

    label = difficulty_mapping.get(row['difficulty'])
    if label is None:
        # Unknown level: report it and fall back to the default label
        print(f"Label {row['difficulty']} not found in difficulty_mapping. Row: {row}")
        label = 0

    # Attach the label and the cleaned text to the encodings
    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [14]:
# Running 'process_data' on every row of the training dataset, in positional order,
# and collecting the resulting encodings in a list
processed_data = [
    process_data(train_dataset.iloc[position])
    for position in range(len(train_dataset))
]

In [15]:
# Gathering the processed rows into a single DataFrame
new_df = pd.DataFrame(data=processed_data)

# Holding out 20% of the rows for validation; the fixed random_state makes the split repeatable
valid_fraction, split_seed = 0.2, 50
train_df, valid_df = train_test_split(new_df, test_size=valid_fraction, random_state=split_seed)

In [16]:
# Creating Hugging Face Datasets from the pandas DataFrames for training and validation.
# Dataset.from_pandas is the public constructor for this conversion; preserve_index=False
# keeps the shuffled pandas index left by train_test_split from becoming an extra column.
train_hg = Dataset.from_pandas(train_df, preserve_index=False)
valid_hg = Dataset.from_pandas(valid_df, preserve_index=False)

In [17]:
# Loading 'camembert-base' with a sequence-classification head that has one output per difficulty level
num_difficulty_levels = 6
model = AutoModelForSequenceClassification.from_pretrained('camembert-base', num_labels=num_difficulty_levels)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Importing the necessary metrics from scikit-learn
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Defining a function to compute accuracy, precision, recall, and f1-score for Hugging Face Trainer evaluation
def compute_metrics(p):
    """Score a batch of predictions for the Hugging Face Trainer.

    Args:
        p: Prediction object exposing `label_ids` (true classes) and
            `predictions` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1';
        precision, recall and f1 are weighted averages over the classes.
    """
    true_classes = p.label_ids
    predicted_classes = p.predictions.argmax(-1)

    scores = {"accuracy": accuracy_score(true_classes, predicted_classes)}

    # The helper returns (precision, recall, f1, support); zip stops after the three names
    weighted = precision_recall_fscore_support(true_classes, predicted_classes, average='weighted')
    scores.update(zip(("precision", "recall", "f1"), weighted))

    return scores


In [21]:
# Training configuration: outputs under ./results, 11 epochs, one evaluation pass per epoch
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=11,
    evaluation_strategy="epoch",
)

# Wiring the model, arguments, train/validation datasets, tokenizer and metric function into a Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_hg,
    eval_dataset=valid_hg,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [22]:
# Fine-tuning the model: runs the Trainer with the arguments and datasets it was constructed with.
# The call's return value (a TrainOutput) is displayed as the cell result.
trainer.train()

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2851,1.030635,0.571354,0.564446,0.571354,0.554653
2,0.7508,0.901428,0.675521,0.691327,0.675521,0.672709
3,0.4216,0.970136,0.733333,0.753629,0.733333,0.731464
4,0.2932,1.012619,0.771354,0.778362,0.771354,0.770495
5,0.1942,1.060784,0.805208,0.812959,0.805208,0.806128
6,0.0943,0.928868,0.835417,0.838502,0.835417,0.835053
7,0.0749,1.02255,0.838542,0.843332,0.838542,0.838558
8,0.0466,1.164083,0.832292,0.840546,0.832292,0.831875
9,0.021,1.099475,0.847396,0.852032,0.847396,0.847776
10,0.0163,1.084778,0.851042,0.854903,0.851042,0.851392


TrainOutput(global_step=10560, training_loss=0.27431910476844873, metrics={'train_runtime': 4224.7805, 'train_samples_per_second': 19.996, 'train_steps_per_second': 2.5, 'total_flos': 1.111421012410368e+16, 'train_loss': 0.27431910476844873, 'epoch': 11.0})

In [23]:
# Evaluating the trained model on the validation dataset (valid_hg, the Trainer's eval_dataset).
# The returned dict reports the loss and the compute_metrics values under 'eval_'-prefixed keys.
trainer.evaluate()

{'eval_loss': 1.1453781127929688,
 'eval_accuracy': 0.84375,
 'eval_precision': 0.8488914529164528,
 'eval_recall': 0.84375,
 'eval_f1': 0.8438586887696303,
 'eval_runtime': 26.943,
 'eval_samples_per_second': 71.262,
 'eval_steps_per_second': 8.908,
 'epoch': 11.0}

In [24]:
# Persisting the fine-tuned model under './model/' so it can be reloaded with from_pretrained
model.save_pretrained(save_directory='./model/')

In [25]:
# Importing the torch module for PyTorch functionality
import torch

# Picking the GPU when CUDA is available, otherwise falling back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Reloading the saved model from './model/' and then placing it on the chosen device
new_model = AutoModelForSequenceClassification.from_pretrained('./model/')
new_model = new_model.to(device)

In [26]:
# Loading a separate tokenizer instance for inference from the 'camembert-base' checkpoint
inference_checkpoint = 'camembert-base'
new_tokenizer = AutoTokenizer.from_pretrained(inference_checkpoint)

In [27]:
def get_prediction(text):
    """Predict the difficulty level of a single sentence with `new_model`.

    Args:
        text: The sentence to classify (one string; the first — and only —
            item of the resulting batch is scored).

    Returns:
        dict with two entries:
            'predicted_difficulty': the winning level, one of 'A1'..'C2'.
            'probability': softmax probability of that level as a Python float.
    """
    # Tokenizing the input text (padded/truncated to 256 tokens) and moving the tensors to the model's device
    encoding = new_tokenizer(text, return_tensors="pt", padding="max_length", truncation=True, max_length=256)
    encoding = {k: v.to(new_model.device) for k, v in encoding.items()}

    # Inference only: disable gradient tracking so no autograd graph is built
    # (and kept in memory) for every prediction.
    with torch.no_grad():
        logits = new_model(**encoding).logits

    # Take the single item of the batch explicitly, then turn its logits into
    # class probabilities on the CPU.
    probs = torch.softmax(logits[0].cpu(), dim=-1).numpy()

    # Determining the predicted class based on the highest probability
    predicted_class = int(np.argmax(probs, axis=-1))

    # Mapping the predicted class to specific difficulty levels
    difficulty_mapping = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}

    # Creating the result in the desired format
    return {
        'predicted_difficulty': difficulty_mapping[predicted_class],
        'probability': float(probs[predicted_class]),
    }

In [28]:
# Importing the tqdm module for displaying progress bars
from tqdm import tqdm

In [29]:
# Predicting a difficulty level for every sentence of the test set, with a progress bar;
# only the 'predicted_difficulty' entry of each prediction result is kept
predicted_difficulties = [
    get_prediction(sentence)['predicted_difficulty']
    for sentence in tqdm(test_dataset['sentence'], total=len(test_dataset), desc="Making Predictions")
]

# Storing the predicted levels in a new 'difficulty' column of test_dataset
test_dataset['difficulty'] = predicted_difficulties

Making Predictions: 100%|██████████| 1200/1200 [00:21<00:00, 57.11it/s]


In [30]:
# Removing the raw 'sentence' text so it is not part of the frame written to the predictions file
test_dataset = test_dataset.drop(columns='sentence')

In [31]:
# Writing the predictions to 'predictions_test.csv' without the pandas index column
test_dataset.to_csv(path_or_buf="predictions_test.csv", index=False)

In [32]:
# Downloading the predictions_test.csv file written by the previous cell.
# NOTE(review): google.colab is presumably only importable inside Google Colab,
# so this cell is expected to fail in other Jupyter environments — confirm.
from google.colab import files
files.download("predictions_test.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>