In [None]:
%%capture
!pip install transformers[torch]
!pip install datasets
!pip install pyarrow
!pip install evaluate
!pip install --upgrade -q wandb

#!python -m spacy download fr_core_news_sm

In [None]:
%%capture
import inspect
import warnings
from typing import Dict

import numpy as np
import pandas as pd
import pyarrow as pa
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoModel, AutoTokenizer
from transformers import AutoModelForSequenceClassification
from transformers import FlaubertModel, FlaubertTokenizer#CamembertTokenizer
from transformers import TrainingArguments, Trainer

In [None]:
# Load the training data from the Kaggle input dataset.
# Later cells read its 'sentence' and 'difficulty' columns.
my_data = pd.read_csv("/kaggle/input/textes/training_data.csv")

In [None]:
# Preview the first rows to check the columns were parsed as expected.
my_data.head()

In [None]:
# (rows, columns) of the loaded training data.
my_data.shape

In [None]:

# Number of words in each sentence (whitespace-separated tokens).
# Computed with the vectorised pandas string accessor instead of a Python-level
# row loop; `.str.split()` with no pattern splits on runs of whitespace, like
# `str.split()` does.
new_df = (
    my_data[['sentence', 'difficulty']]
    .assign(num_words=my_data['sentence'].str.split().str.len())
    .reset_index(drop=True)  # fresh 0..n-1 index, as the row-by-row build produced
)


In [None]:
# Mean sentence length (in words) for every difficulty level
words_per_level = new_df.groupby('difficulty')['num_words']
avg_words_by_difficulty = words_per_level.mean()

# Show the per-level averages
print(avg_words_by_difficulty)


In [None]:
%%capture
# Load the pretrained tokenizer that matches the 'camembert-base' checkpoint
# fine-tuned below; %%capture hides the download output.
# NOTE(review): do_lower_case is presumably ignored by this tokenizer class — confirm.
tokenizer = AutoTokenizer.from_pretrained('camembert-base',do_lower_case=False)


In [None]:
## Code to strip the numbers from each sentence. We did not use it because it did not improve the results.


# import re
#from typing import Dict
#
#def remove_numbers(text):
#    # Use regular expression to remove numbers
#    text_without_numbers = re.sub(r'\d+', '', text)
#    return text_without_numbers

In [None]:
## Code to remove the proper names from each sentence. We did not use it because it did not improve the results.


#import spacy
#
## Load the spaCy French language model
#nlp = spacy.load('fr_core_news_sm')
#
#def remove_proper_names(text):
#    doc = nlp(text)
#    # Remove entities (proper names)
#    text_without_entities = ' '.join([token.text if not token.ent_type_ else '' for token in doc])
#    return text_without_entities

In [None]:
# Integer class id for each difficulty level; built once rather than on every call.
_DIFFICULTY_TO_LABEL = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}


def process_data(row) -> Dict:
    """Tokenize one sentence and attach its integer difficulty label.

    Args:
        row: Mapping-like object (a dict or a DataFrame row) providing a
            'sentence' string and a 'difficulty' level ('A1' ... 'C2').

    Returns:
        The tokenizer output for the sentence, extended with two entries:
        'label' (int class id, 0-5) and 'text' (the original sentence).

    Warns:
        UserWarning: if 'difficulty' is not one of the known levels. The label
            still falls back to 0 (the id of 'A1'), but the fallback is now
            reported instead of silently mislabelling the row.
    """
    text = row['sentence']

    # Truncate to at most 512 tokens; no padding is requested here.
    encodings = tokenizer(text, truncation=True, max_length=512)

    # Convert the difficulty level to its integer class id.
    difficulty = row['difficulty']
    label = _DIFFICULTY_TO_LABEL.get(difficulty)
    if label is None:
        warnings.warn(
            f"Unknown difficulty {difficulty!r}; falling back to label 0 ('A1').",
            stacklevel=2,
        )
        label = 0

    encodings['label'] = label
    encodings['text'] = text

    return encodings


In [None]:
# Smoke test: encode one hand-written example and print the resulting encoding
sample_row = {
    'sentence': "En fait, je trouve que l'éducation est une bonne chose mais il ne faut pas .",
    'difficulty': 'B1',
}
print(process_data(sample_row))

In [None]:
# Encode the first 4800 rows (or every row, if there are fewer) and collect the
# encodings in a list that the dataset is built from.
n_rows = len(my_data[:4800])
processed_data = [process_data(my_data.iloc[pos]) for pos in range(n_rows)]

In [None]:
from sklearn.model_selection import train_test_split
#from transformers import DataCollatorWithPadding
#import evaluate

In [None]:
# One row per encoded sentence. Kept under its own name so the word-count frame
# `new_df` built earlier is not overwritten and the statistics cell stays re-runnable.
encoded_df = pd.DataFrame(processed_data)

# Hold out 15% of the rows for validation; the fixed seed makes the split reproducible.
train_df, valid_df = train_test_split(encoded_df, test_size=0.15, random_state=42)

In [None]:
# Wrap both splits as Hugging Face datasets through the documented
# `Dataset.from_pandas` constructor rather than passing a raw pyarrow table
# to `Dataset(...)` directly.
train_hg = Dataset.from_pandas(train_df)
valid_hg = Dataset.from_pandas(valid_df)

In [None]:
%%capture
# Load the pretrained 'camembert-base' checkpoint for sequence classification.
# num_labels=6 gives one output class per difficulty level (A1 ... C2) used in process_data.
model1 = AutoModelForSequenceClassification.from_pretrained("camembert-base",num_labels=6)


In [None]:
## We tried dropout to prevent overfitting, but we did not notice any visible improvement in the results, so we did not use it.

#from torch.nn import Dropout
## we are adding a dropout layer to prevent overfitting
#model1.dropout = Dropout(0.3)

In [None]:

def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: Object exposing `predictions` (per-class scores, one row per
            example) and `label_ids` (the true integer class ids).

    Returns:
        Dict with 'accuracy' and the class-weighted 'precision', 'recall'
        and 'f1' scores.
    """
    predictions, labels = p.predictions, p.label_ids
    # The predicted class is the index of the highest score in each row.
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # (the value the default also uses) without emitting a warning each epoch.
    precision = precision_score(labels, predictions, average='weighted', zero_division=0)
    recall = recall_score(labels, predictions, average='weighted', zero_division=0)
    f1 = f1_score(labels, predictions, average='weighted', zero_division=0)
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
# All training parameters are left at their defaults except the number of epochs, which is set to 10.
# We tried changing the learning rate and other parameters, but the best results came from the default values.
# The commented-out parameters below are the ones we experimented with; none of them improved the results.

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (TrainingArguments) and `tokenizer` to `processing_class` (Trainer). The
# install cell does not pin a version, so use whichever name the installed
# release accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = (
    "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"
)

training_args = TrainingArguments(
    output_dir="/kaggle/working/results",
    num_train_epochs=10,
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs
    # the two strategies to match.
    save_strategy="epoch",
    load_best_model_at_end=True,

    #learning_rate=3e-5, #5e-5 by default
    #per_device_train_batch_size=16,
    #per_device_eval_batch_size=16,
    #fp16=True,
    #fp16_opt_level="O2",
    #weight_decay=0.01,
    #metric_for_best_model="accuracy",
    #warmup_steps=200,
    #gradient_accumulation_steps = 4

    **{_eval_strategy_key: "epoch"},
    )

# The tokenizer is handed to the Trainer because process_data does not pad the encodings itself.
trainer = Trainer(model=model1,
                  args=training_args,
                  train_dataset=train_hg,
                  eval_dataset=valid_hg,
                  compute_metrics=compute_metrics,
                  **{_tokenizer_key: tokenizer})

In [None]:
# Log in to Weights & Biases before training starts.
from kaggle_secrets import UserSecretsClient
import wandb

user_secrets = UserSecretsClient()

# The API key is read from the Kaggle secret named "wandb_api",
# so it never appears in the notebook itself.
wandb_api = user_secrets.get_secret("wandb_api")

wandb.login(key=wandb_api)

In [None]:
# Fine-tune the model; evaluation and checkpointing run once per epoch (see training_args).
trainer.train()

In [None]:
# Report the compute_metrics scores on the validation split.
trainer.evaluate()

In [None]:
# Save the fine-tuned model and, next to it, the tokenizer files, so the
# directory can be reloaded on its own without fetching the tokenizer again.
model1.save_pretrained('/kaggle/working/model_camembert_V4/')
tokenizer.save_pretrained('/kaggle/working/model_camembert_V4/')