### Télécharge le modèle pré-entrainé de l'article

In [7]:
from transformers import CamembertTokenizer, CamembertForMaskedLM
import torch
# Charger le modèle CamemBERT et son tokenizer
model_name = "camembert-base"  # Version 4GB
model = CamembertForMaskedLM.from_pretrained(model_name)
tokenizer = CamembertTokenizer.from_pretrained(model_name)

# Spécifiez le chemin du fichier à sauvegarder
save_path = "camembert_article_4GB.pt"

# Sauvegarder le modèle complet (poids + configuration)
torch.save(model.state_dict(), save_path)

print(f"Modèle sauvegardé dans le fichier : {save_path}")

Modèle sauvegardé dans le fichier : camembert_article_4GB.pt


### Lance l'entrainement sur un Dataset

In [1]:
from Camembert_4GB_model.pos_trainer import train_pos
from transformers import CamembertTokenizer, CamembertForMaskedLM
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
model_name = "camembert-base"  # Version 4GB
tokenizer = CamembertTokenizer.from_pretrained(model_name)

def extract_labels(conllu_file):
    labels = set()
    with open(conllu_file, "r", encoding="utf-8") as f:
        for line in f:
            if not line.startswith("#") and line.strip():  # Ignorer les commentaires et les lignes vides
                parts = line.split("\t")
                if len(parts) > 3:  # La 4e colonne est le label POS
                    labels.add(parts[3])
    return {label: idx for idx, label in enumerate(sorted(labels))}

data_train_path = "fr_gsd-ud-train.conllu"
data_dev_path = "fr_gsd-ud-dev.conllu"
label2id = extract_labels(data_train_path)

num_labels = len(label2id)

model = train_pos(
    pretrained_path="camembert_article_4GB.pt",
    train_path= data_train_path,
    dev_path= data_dev_path,
    tokenizer=tokenizer,
    label2id=label2id,
    num_labels=num_labels,
    device="cuda",
    lr = 3e-5
    out_model_path = "camembert_pos_gsd_article_best.pt"
    )

Epoch 1: 100%|██████████| 904/904 [08:32<00:00,  1.76it/s]


[Epoch 1] train loss=0.9708
[Epoch 1] dev acc=91.07%
New best dev acc=91.07% (epoch=1)


Epoch 2: 100%|██████████| 904/904 [08:32<00:00,  1.76it/s]


[Epoch 2] train loss=0.2517
[Epoch 2] dev acc=95.63%
New best dev acc=95.63% (epoch=2)


Epoch 3: 100%|██████████| 904/904 [08:32<00:00,  1.76it/s]


[Epoch 3] train loss=0.1432
[Epoch 3] dev acc=95.94%
New best dev acc=95.94% (epoch=3)


Epoch 4: 100%|██████████| 904/904 [08:31<00:00,  1.77it/s]


[Epoch 4] train loss=0.0991
[Epoch 4] dev acc=97.22%
New best dev acc=97.22% (epoch=4)


Epoch 5: 100%|██████████| 904/904 [08:32<00:00,  1.76it/s]


[Epoch 5] train loss=0.0778
[Epoch 5] dev acc=97.03%


Epoch 6: 100%|██████████| 904/904 [08:33<00:00,  1.76it/s]


[Epoch 6] train loss=0.0642
[Epoch 6] dev acc=96.46%


Epoch 7: 100%|██████████| 904/904 [08:33<00:00,  1.76it/s]


[Epoch 7] train loss=0.0565
[Epoch 7] dev acc=97.60%
New best dev acc=97.60% (epoch=7)


Epoch 8: 100%|██████████| 904/904 [08:33<00:00,  1.76it/s]


[Epoch 8] train loss=0.0419
[Epoch 8] dev acc=97.78%
New best dev acc=97.78% (epoch=8)


Epoch 9: 100%|██████████| 904/904 [08:33<00:00,  1.76it/s]


[Epoch 9] train loss=0.0417
[Epoch 9] dev acc=97.77%


Epoch 10: 100%|██████████| 904/904 [08:32<00:00,  1.77it/s]


[Epoch 10] train loss=0.0327
[Epoch 10] dev acc=97.77%


In [4]:
# Enregistrer le modèle entraîné
import torch
print(f"Type du modèle : {type(model)}")

output_model_path = "camembert_pos_gsd_article_best.pt"
torch.save(model.state_dict(), output_model_path)
print(f"Modèle sauvegardé sous : {output_model_path}")

Type du modèle : <class 'float'>


AttributeError: 'float' object has no attribute 'state_dict'

### Lance les tests sur tout les Dataset

In [None]:
from fine_tuning.pos_test import test_pos
from fine_tuning.ner_test import test_ner
from fine_tuning.nli_test import test_nli
from fine_tuning.parsing_test import test_parsing

from transformers import CamembertTokenizer, CamembertForMaskedLM

model_name = "camembert-base"  # Version 4GB
tokenizer = CamembertTokenizer.from_pretrained(model_name)

In [None]:
label2id = {
    "ADJ": 0,
    "ADP": 1,
    "ADV": 2,
    "AUX": 3,
    "CCONJ": 4,
    "DET": 5,
    "INTJ": 6,
    "NOUN": 7,
    "NUM": 8,
    "PART": 9,
    "PRON": 10,
    "PROPN": 11,
    "PUNCT": 12,
    "SCONJ": 13,
    "SYM": 14,
    "VERB": 15,
    "X": 16,
    "_": 17
}
id2label = {
    0: "ADJ",
    1: "ADP",
    2: "ADV",
    3: "AUX",
    4: "CCONJ",
    5: "DET",
    6: "INTJ",
    7: "NOUN",
    8: "NUM",
    9: "PART",
    10: "PRON",
    11: "PROPN",
    12: "PUNCT",
    13: "SCONJ",
    14: "SYM",
    15: "VERB",
    16: "X",
    17: "_"
}

model_pretrain = "camembert_article2_4GB.pt"

print("Dataset Rhapsodie :")
acc_pos_spoken = test_pos("camembert_pos_rhapsodie_article_best.pt",
               model_pretrain,
               "fr_rhapsodie-ud-test.conllu",
               tokenizer,
               label2id,
               id2label,
                device="cuda")

print("\nDataset Partut :")
acc_pos_ParTUT = test_pos("camembert_pos_partut_article_best.pt",
               model_pretrain,
               "fr_partut-ud-test.conllu",
               tokenizer,
               label2id,
               id2label,
                device="cuda")

print("\nDataset Sequoia :")
acc_pos_sequoia = test_pos("camembert_pos_sequoia_article_best.pt",
               model_pretrain,
               "fr_sequoia-ud-test.conllu",
               tokenizer,
               label2id,
               id2label,
                device="cuda")

print("\nDataset GSD :")
acc_pos_GSD = test_pos("camembert_pos_gsd_article_best.pt",
               model_pretrain,
               "fr_gsd-ud-test.conllu",
               tokenizer,
               label2id,
               id2label,
                device="cuda")