In [1]:
#%%capture
#!pip install transformers[torch]
#!pip install datasets
#!pip install pyarrow
#!pip install evaluate

In [2]:
import pandas as pd

In [3]:
my_data = pd.read_csv("training_data.csv")

In [4]:
my_data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [5]:
my_data.shape

(4800, 3)

In [6]:
from transformers import AutoModel, AutoTokenizer
from transformers import CamembertTokenizer
from typing import Dict

In [7]:
%%capture
tokenizer = AutoTokenizer.from_pretrained('camembert-base')

In [8]:
def process_data(row) -> Dict:
    # Clean the text
    text = row['sentence']
    text = str(text)
    text = ' '.join(text.split())

    # Get tokens
    encodings = tokenizer(text, padding="max_length", truncation=True, max_length=512)

    # Convert difficulty labels to integers
    difficulty_mapping = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
    label = difficulty_mapping.get(row['difficulty'], 0)

    encodings['label'] = label
    encodings['text'] = text

    return encodings

In [9]:
#test
print(process_data({'sentence': "Est-ce que ton mari est aussi de Boston?", 'difficulty': 'A1'}))

{'input_ids': [5, 1196, 26, 291, 27, 415, 1946, 30, 99, 8, 19171, 197, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [10]:
# Store the encodings into an array to generate dataset
processed_data = []

for i in range(len(my_data[:4800])):
        processed_data.append(process_data(my_data.iloc[i]))

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
new_df = pd.DataFrame(processed_data)

train_df, valid_df = train_test_split(new_df,test_size=0.2,random_state=2022)

In [13]:
import pyarrow as pa
from datasets import Dataset

In [14]:
train_hg = Dataset(pa.Table.from_pandas(train_df))
valid_hg = Dataset(pa.Table.from_pandas(valid_df))

In [15]:
print(type(train_hg))

<class 'datasets.arrow_dataset.Dataset'>


In [16]:
from transformers import AutoModelForSequenceClassification

In [23]:
%%capture
model = AutoModelForSequenceClassification.from_pretrained('camembert-base',num_labels=6)

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [25]:

def compute_metrics(p):
    predictions, labels = p.predictions, p.label_ids
    predictions = np.argmax(predictions, axis=1)

    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [26]:
from transformers import TrainingArguments, Trainer
import accelerate

In [27]:
training_args = TrainingArguments(output_dir="./result", evaluation_strategy="epoch")

trainer = Trainer(model=model,args=training_args,train_dataset=train_hg,eval_dataset=valid_hg,tokenizer=tokenizer,compute_metrics=compute_metrics)

In [28]:
trainer.train()

  0%|          | 0/1440 [00:00<?, ?it/s]

You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

{'eval_loss': 1.926348328590393,
 'eval_accuracy': 0.55625,
 'eval_precision': 0.5885482243240117,
 'eval_recall': 0.55625,
 'eval_f1': 0.5548746000654246,
 'eval_runtime': 28.6108,
 'eval_samples_per_second': 33.554,
 'eval_steps_per_second': 4.194,
 'epoch': 3.0}

In [None]:
model1.save_pretrained('/content/drive/MyDrive/Data Science and Machine Learning/Project_Data_Science/train_model_bert_V2/')