In [None]:
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification
import numpy as np
from seqeval.metrics import classification_report, accuracy_score


In [None]:
# Charger les données depuis un dossier
def load_conll_data(path):
    return load_dataset("conll2003", data_files={"train": path + "/train.txt", "validation": path + "/valid.txt"}, delimiter="\t")

# Tokeniser les données
def tokenize_and_align_labels(examples, tokenizer, label_to_id):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label_to_id[label[word_idx]])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Paramètres
model_checkpoint = "camembert-base"
label_list = ["O", "B-ABR"]
label_to_id = {label: i for i, label in enumerate(label_list)}
id_to_label = {i: label for label, i in label_to_id.items()}

# Charger tokenizer + modèle
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint, num_labels=len(label_list))

# Charger les données
dataset = load_dataset("text", data_files={"train": "data/train.txt", "validation": "data/valid.txt"})
def process_data(example):
    tokens = []
    labels = []
    for line in example['text'].splitlines():
        if line.strip() == "":
            continue
        token, label = line.split()
        tokens.append(token)
        labels.append(label)
    return {"tokens": tokens, "ner_tags": labels}

dataset = dataset.map(process_data)

# Tokenisation + alignement des labels
tokenized_datasets = dataset.map(lambda x: tokenize_and_align_labels(x, tokenizer, label_to_id), batched=True)

# Entraînement
training_args = TrainingArguments(
    output_dir="./ner_abreviations_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=lambda p: {"accuracy": accuracy_score(p.label_ids, p.predictions)}
)

trainer.train()
trainer.save_model("ner_abreviations_model")


In [None]:
from transformers import pipeline

ner_pipe = pipeline("ner", model="ner_abreviations_model", tokenizer="ner_abreviations_model", aggregation_strategy="simple")

mot = "sy.aeps"
resultat = ner_pipe(mot)
print(resultat)


##Detection des abreviations


In [2]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [3]:
!pip install transformers datasets seqeval


Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: seqeval
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
  Created wheel for seqeval: filename=seqeval-1.2.2-py3-none-any.whl size=16162 sha256=ca64502af3faf6f6f9d45af9610df96e39cbe5bcbc769cb4257416ce47fd0bc5
  Stored in directory: /root/.cache/pip/wheels/bc/92/f0/243288f899c2eacdfa8c5f9aede4c71a9bad0ee26a01dc5ead
Successfully built seqeval
Installing collected packages: seqeval
Successfully installed seqeval-1.2.2


In [10]:
!pip install --upgrade transformers


Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m38.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.1
    Uninstalling transformers-4.51.1:
      Successfully uninstalled transformers-4.51.1
Successfully installed transformers-4.51.3


In [1]:
!pip install --upgrade transformers datasets seqeval


Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-win_amd64.whl.metadata (3.9 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4

In [1]:
pip install tf-keras

Collecting tf-keras
  Using cached tf_keras-2.19.0-py3-none-any.whl.metadata (1.8 kB)
Collecting tensorflow<2.20,>=2.19 (from tf-keras)
  Downloading tensorflow-2.19.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Collecting absl-py>=1.0.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Downloading absl_py-2.2.2-py3-none-any.whl.metadata (2.6 kB)
Collecting astunparse>=1.6.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached flatbuffers-25.2.10-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cached google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow<2.20,>=2.19->tf-keras)
  Using cac

In [4]:
pip install accelerate -U


Collecting accelerate
  Downloading accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Downloading accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0
Note: you may need to restart the kernel to use updated packages.


In [10]:
pip install --upgrade accelerate


Note: you may need to restart the kernel to use updated packages.


In [2]:
import os
os.environ["USE_TF"] = "0"
from datasets import load_dataset, Dataset
import numpy as np
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
# Charger le fichier train.txt (et valid.txt s'il existe)
def lire_fichier_conll(nom_fichier):
    tokens = []
    labels = []
    avec = open(nom_fichier, 'r', encoding='utf-8')
    lignes = avec.readlines()
    avec.close()

    sequence_tokens = []
    sequence_labels = []

    for ligne in lignes:
        ligne = ligne.strip()
        if ligne == "":
            if sequence_tokens:
                tokens.append(sequence_tokens)
                labels.append(sequence_labels)
                sequence_tokens = []
                sequence_labels = []
        else:
            parties = ligne.split()
            if len(parties) == 2:
                mot, etiquette = parties
                sequence_tokens.append(mot)
                sequence_labels.append(etiquette)
    if sequence_tokens:
        tokens.append(sequence_tokens)
        labels.append(sequence_labels)

    return Dataset.from_dict({"tokens": tokens, "ner_tags": labels})


# Création du dataset
train_dataset = lire_fichier_conll(r"C:\Users\Ons\Downloads\train_final.txt")
valid_dataset = lire_fichier_conll(r"C:\Users\Ons\Downloads\valid.txt")


label_list = ["O", "B-ABR"]
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}


# Convertir les étiquettes en ID
def encode_labels(example):
    example["labels"] = [label_to_id[tag] for tag in example["ner_tags"]]
    return example

train_dataset = train_dataset.map(encode_labels)
valid_dataset = valid_dataset.map(encode_labels)

# Tokenizer et modèle
checkpoint = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForTokenClassification.from_pretrained(checkpoint, num_labels=len(label_list))

def tokenize_and_align_labels(example):
    tokenized = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized.word_ids()
    previous_word_idx = None
    labels = []

    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["labels"][word_idx])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized["labels"] = labels
    return tokenized

train_tokenized = train_dataset.map(tokenize_and_align_labels)
valid_tokenized = valid_dataset.map(tokenize_and_align_labels)

# Entraînement
args = TrainingArguments(
    output_dir="ner_abreviation_model",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=1,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)


data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("ner_abreviation_model")
print("✅ Modèle entraîné et sauvegardé dans ner_abreviation_model/")


Map: 100%|██████████████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 1944.37 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 1459.13 examples/s]
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████████████████████████████████████████████████████████████| 139/139 [00:00<00:00, 1540.28 examples/s]
Map: 100%|████████████████████████████████████████████████████████████████████| 17/17 [00:00<00:00, 1368.43 examples/s]
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encodin

Epoch,Training Loss,Validation Loss
1,0.6743,0.673582
2,0.6411,0.654608
3,0.6191,0.647269


✅ Modèle entraîné et sauvegardé dans ner_abreviation_model/


In [5]:
from transformers import pipeline

# 📦 Charger ton modèle entraîné
model_path = "ner_abreviation_model"
ner_pipeline = pipeline(
    "ner",
    model=model_path,
    tokenizer=model_path,
    aggregation_strategy="simple"  # pour regrouper les prédictions sur plusieurs tokens si besoin
)

# ✏️ Liste des mots à tester (tu peux la modifier !)
mots_ocr = ["KCH", "D", "TSK", "PO", "pharmacie", "test", "consultation", "R", "IR", "XYZ"]

print("🔍 Résultat de détection des abréviations :\n")

for mot in mots_ocr:
    resultat = ner_pipeline(mot)
    
    if resultat and resultat[0]["entity_group"] == "B-ABR":
        print(f"🟢 '{mot}' est reconnu comme une ABRÉVIATION médicale ✅")
    else:
        print(f"🔴 '{mot}' n'est PAS détecté comme abréviation ❌")


🔍 Résultat de détection des abréviations :

🔴 'KCH' n'est PAS détecté comme abréviation ❌
🔴 'D' n'est PAS détecté comme abréviation ❌
🔴 'TSK' n'est PAS détecté comme abréviation ❌
🔴 'PO' n'est PAS détecté comme abréviation ❌
🔴 'pharmacie' n'est PAS détecté comme abréviation ❌
🔴 'test' n'est PAS détecté comme abréviation ❌
🔴 'consultation' n'est PAS détecté comme abréviation ❌
🔴 'R' n'est PAS détecté comme abréviation ❌
🔴 'IR' n'est PAS détecté comme abréviation ❌
🔴 'XYZ' n'est PAS détecté comme abréviation ❌


In [6]:
mots_ocr = ["KCH", "TSK", "D", "PO", "R"]

for mot in mots_ocr:
    phrase = f"Le patient présente une {mot}."
    inputs = tokenizer(phrase, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    predictions = torch.argmax(outputs.logits, dim=2)
    labels = [model.config.id2label[p.item()] for p in predictions[0]]

    print(f"🧪 {phrase}")
    print(f"🔍 Labels prédits : {labels}")


NameError: name 'torch' is not defined

In [1]:

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import torch
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
# ✅ 1. Charger et préparer les données CoNLL (train_final_corrige.txt)
def lire_conll(fichier):
    tokens, tags = [], []
    with open(fichier, encoding="utf-8") as f:
        lignes = f.readlines()

    sequence_tokens, sequence_tags = [], []
    for ligne in lignes:
        ligne = ligne.strip()
        if not ligne:
            if sequence_tokens:
                tokens.append(sequence_tokens)
                tags.append(sequence_tags)
                sequence_tokens, sequence_tags = [], []
        else:
            mot, tag = ligne.split()
            sequence_tokens.append(mot)
            sequence_tags.append(tag)
    if sequence_tokens:
        tokens.append(sequence_tokens)
        tags.append(sequence_tags)

    return Dataset.from_dict({"tokens": tokens, "ner_tags": tags})

# Charger le dataset
dataset = lire_conll(r"C:\Users\Ons\Downloads\train_final_balanced_cleaned.txt")

# Labels utilisés : O, B-ABR
label_list = ["O", "B-ABR"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# Convertir les tags en ID
def encode_tags(example):
    example["labels"] = [label2id[tag] for tag in example["ner_tags"]]
    return example

encoded_dataset = dataset.map(encode_tags)

# Tokenizer & modèle
model_name = "camembert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label_list), id2label=id2label, label2id=label2id)

# Aligner tokens et labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()

    labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        elif word_idx != previous_word_idx:
            labels.append(example["labels"][word_idx])
        else:
            labels.append(-100)
        previous_word_idx = word_idx

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

encoded_dataset = encoded_dataset.map(tokenize_and_align_labels)

# Entraînement
args = TrainingArguments(
    output_dir="ner_model",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",      # ✅ afficher à chaque "step"
    logging_steps=5,               # ✅ toutes les 5 itérations
    save_total_limit=1,
    evaluation_strategy="no",
    report_to="none",              # (évite les erreurs liés à wandb ou TensorBoard)
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=encoded_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("ner_model")


Map: 100%|██████████| 410/410 [00:00<00:00, 2631.34 examples/s]
Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 410/410 [00:00<00:00, 1025.97 examples/s]
You're using a CamembertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
5,0.6847
10,0.6036
15,0.5869
20,0.4678
25,0.4292
30,0.3063
35,0.2907
40,0.2685
45,0.2373
50,0.2253


In [5]:
from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

# Charger le modèle local correctement
model = AutoModelForTokenClassification.from_pretrained("./ner_model")
tokenizer = AutoTokenizer.from_pretrained("./ner_model")

# Créer le pipeline avec le modèle local
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")


In [7]:
print(model.config.id2label)


{0: 'O', 1: 'B-ABR'}


In [9]:
def tester_mot(mot):
    mot = mot.upper().strip()
    phrase = f"Le patient a présenté le terme {mot} lors de la consultation."
    prediction = nlp(phrase)
    print(f"🧪 Phrase : {phrase}")
    print(f"🔍 Prédiction : {prediction}")

    for entity in prediction:
        if entity["word"].replace("Ġ", "").upper() == mot and entity["entity_group"] == "ABR" and entity["score"] > 0.7:
            print(f"✅ '{mot}' est reconnu comme une abréviation.")
            return
    print(f"❌ '{mot}' n'est PAS reconnu comme une abréviation.")



In [11]:
tester_mot("INES")
tester_mot("VN")

🧪 Phrase : Le patient a présenté le terme INES lors de la consultation.
🔍 Prédiction : [{'entity_group': 'ABR', 'score': np.float32(0.5209276), 'word': 'a', 'start': 11, 'end': 12}, {'entity_group': 'ABR', 'score': np.float32(0.54116267), 'word': 'le', 'start': 22, 'end': 24}, {'entity_group': 'ABR', 'score': np.float32(0.6981754), 'word': '', 'start': 31, 'end': 32}, {'entity_group': 'ABR', 'score': np.float32(0.5561478), 'word': 'INES', 'start': 31, 'end': 35}, {'entity_group': 'ABR', 'score': np.float32(0.7196773), 'word': '.', 'start': 59, 'end': 60}]
❌ 'INES' n'est PAS reconnu comme une abréviation.
🧪 Phrase : Le patient a présenté le terme VN lors de la consultation.
🔍 Prédiction : [{'entity_group': 'ABR', 'score': np.float32(0.53486574), 'word': 'a', 'start': 11, 'end': 12}, {'entity_group': 'ABR', 'score': np.float32(0.51076424), 'word': 'présenté', 'start': 13, 'end': 21}, {'entity_group': 'ABR', 'score': np.float32(0.56872135), 'word': 'le', 'start': 22, 'end': 24}, {'entity_

In [13]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Charger modèle entraîné en local
model = AutoModelForTokenClassification.from_pretrained("./ner_model")
tokenizer = AutoTokenizer.from_pretrained("./ner_model")
nlp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Testeur avec phrase contextuelle
def tester_mot(mot):
    mot = mot.upper().strip()
    phrase = f"Le patient a présenté le terme {mot} lors de la consultation."
    prediction = nlp(phrase)
    print(f"🧪 Phrase : {phrase}")
    print(f"🔍 Prédiction : {prediction}")

    if any(entity['entity_group'] == "B-ABR" for entity in prediction):
        print(f"✅ '{mot}' est reconnu comme une abréviation.")
    else:
        print(f"❌ '{mot}' n'est PAS reconnu comme une abréviation.")

# Test avec quelques mots
for mot in ["VN", "02/02/2022", "V", "KE", "E", "CPY"]:
    tester_mot(mot)


🧪 Phrase : Le patient a présenté le terme VN lors de la consultation.
🔍 Prédiction : [{'entity_group': 'ABR', 'score': np.float32(0.53486574), 'word': 'a', 'start': 11, 'end': 12}, {'entity_group': 'ABR', 'score': np.float32(0.51076424), 'word': 'présenté', 'start': 13, 'end': 21}, {'entity_group': 'ABR', 'score': np.float32(0.56872135), 'word': 'le', 'start': 22, 'end': 24}, {'entity_group': 'ABR', 'score': np.float32(0.8244967), 'word': 'V', 'start': 31, 'end': 32}, {'entity_group': 'ABR', 'score': np.float32(0.81648326), 'word': 'N', 'start': 32, 'end': 33}, {'entity_group': 'ABR', 'score': np.float32(0.50217056), 'word': 'la', 'start': 42, 'end': 44}, {'entity_group': 'ABR', 'score': np.float32(0.75313234), 'word': '.', 'start': 57, 'end': 58}]
❌ 'VN' n'est PAS reconnu comme une abréviation.
🧪 Phrase : Le patient a présenté le terme 02/02/2022 lors de la consultation.
🔍 Prédiction : [{'entity_group': 'ABR', 'score': np.float32(0.60314786), 'word': '.', 'start': 65, 'end': 66}]
❌ '0

In [7]:
!pip install evaluate


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
Installing collected packages: evaluate
Successfully installed evaluate-0.4.3


In [3]:
pip install -U tokenizers


Collecting tokenizers
  Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Using cached tokenizers-0.21.1-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.2
    Uninstalling tokenizers-0.15.2:
      Successfully uninstalled tokenizers-0.15.2
Successfully installed tokenizers-0.21.1
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
transformers 4.36.2 requires tokenizers<0.19,>=0.14, but you have tokenizers 0.21.1 which is incompatible.


In [5]:
pip install tokenizers==0.18.1


Note: you may need to restart the kernel to use updated packages.


ERROR: Ignored the following yanked versions: 0.12.0, 0.20.4
ERROR: Could not find a version that satisfies the requirement tokenizers==0.18.1 (from versions: 0.0.2, 0.0.3, 0.0.4, 0.0.5, 0.0.6, 0.0.7, 0.0.8, 0.0.9, 0.0.10, 0.0.11, 0.0.12, 0.0.13, 0.1.0, 0.1.1, 0.2.0, 0.2.1, 0.3.0, 0.4.0, 0.4.1, 0.4.2, 0.5.0, 0.5.1, 0.5.2, 0.6.0, 0.7.0rc1, 0.7.0rc2, 0.7.0rc3, 0.7.0rc4, 0.7.0rc5, 0.7.0rc6, 0.7.0rc7, 0.7.0, 0.8.0.dev0, 0.8.0.dev1, 0.8.0.dev2, 0.8.0rc1, 0.8.0rc2, 0.8.0rc3, 0.8.0rc4, 0.8.0, 0.8.1rc1, 0.8.1rc2, 0.8.1, 0.9.0.dev0, 0.9.0.dev1, 0.9.0.dev2, 0.9.0.dev3, 0.9.0.dev4, 0.9.0rc1, 0.9.0rc2, 0.9.0, 0.9.1, 0.9.2, 0.9.3, 0.9.4, 0.10.0rc1, 0.10.0, 0.10.1rc1, 0.10.1, 0.10.2, 0.10.3, 0.11.0, 0.11.1, 0.11.2, 0.11.3, 0.11.4, 0.11.5, 0.11.6, 0.12.1, 0.13.0, 0.13.1, 0.13.2, 0.13.3, 0.14.0, 0.14.1, 0.15.0, 0.15.1, 0.15.2, 0.19.0, 0.19.1, 0.20.0rc1, 0.20.0, 0.20.1rc1, 0.20.1, 0.20.2, 0.20.3rc0, 0.20.3, 0.20.4rc0, 0.21.0rc0, 0.21.0, 0.21.1rc0, 0.21.1)
ERROR: No matching distribution found for token

In [3]:
pip install --upgrade evaluate datasets huggingface_hub


Collecting datasets
  Using cached datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Using cached datasets-3.5.0-py3-none-any.whl (491 kB)
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.2.1
    Uninstalling datasets-2.2.1:
      Successfully uninstalled datasets-2.2.1
Successfully installed datasets-3.5.0
Note: you may need to restart the kernel to use updated packages.




In [3]:
pip install transformers==4.36.2


Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.2)
  Using cached tokenizers-0.15.2-cp310-none-win_amd64.whl.metadata (6.8 kB)
Using cached tokenizers-0.15.2-cp310-none-win_amd64.whl (2.2 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
Successfully installed tokenizers-0.15.2
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.


In [15]:
!pip install tokenizers==0.21.0



Collecting tokenizers==0.21.0
  Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl.metadata (6.9 kB)
Using cached tokenizers-0.21.0-cp39-abi3-win_amd64.whl (2.4 MB)
Installing collected packages: tokenizers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
Successfully installed tokenizers-0.21.0


In [4]:
pip install transformers==4.36.2


Collecting transformers==4.36.2
  Using cached transformers-4.36.2-py3-none-any.whl.metadata (126 kB)
Collecting tokenizers<0.19,>=0.14 (from transformers==4.36.2)
  Using cached tokenizers-0.15.2-cp310-none-win_amd64.whl.metadata (6.8 kB)
Using cached transformers-4.36.2-py3-none-any.whl (8.2 MB)
Using cached tokenizers-0.15.2-cp310-none-win_amd64.whl (2.2 MB)
Installing collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.21.1
    Uninstalling tokenizers-0.21.1:
      Successfully uninstalled tokenizers-0.21.1
  Attempting uninstall: transformers
    Found existing installation: transformers 4.51.3
    Uninstalling transformers-4.51.3:
      Successfully uninstalled transformers-4.51.3
Successfully installed tokenizers-0.15.2 transformers-4.36.2
Note: you may need to restart the kernel to use updated packages.


  You can safely remove it manually.


In [17]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [5]:
from datasets import Dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np

# Chargement CoNLL
def lire_conll(fichier):
    tokens, tags = [], []
    with open(fichier, encoding="utf-8") as f:
        lignes = f.readlines()

    sequence_tokens, sequence_tags = [], []
    for ligne in lignes:
        ligne = ligne.strip()
        if not ligne:
            if sequence_tokens:
                tokens.append(sequence_tokens)
                tags.append(sequence_tags)
                sequence_tokens, sequence_tags = [], []
        else:
            mot, tag = ligne.rsplit(maxsplit=1)
            sequence_tokens.append(mot)
            sequence_tags.append(tag)
    if sequence_tokens:
        tokens.append(sequence_tokens)
        tags.append(sequence_tags)

    return Dataset.from_dict({"tokens": tokens, "ner_tags": tags})

# Charger train + valid
train_dataset = lire_conll(r"C:\Users\Ons\Downloads\train_final (1).txt")
valid_dataset = lire_conll(r"C:\Users\Ons\Downloads\valid_final.txt")

# Labels
label_list = ["O", "B-ABR"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

# Encoder les labels
def encode_tags(example):
    example["labels"] = [label2id[tag] for tag in example["ner_tags"]]
    return example

train_dataset = train_dataset.map(encode_tags)
valid_dataset = valid_dataset.map(encode_tags)

# Tokenizer et modèle (multilingue)
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Aligner tokens/labels
def tokenize_and_align_labels(example):
    tokenized_inputs = tokenizer(example["tokens"], truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()

    labels = []
    for word_idx in word_ids:
        if word_idx is None:
            labels.append(-100)
        else:
            labels.append(example["labels"][word_idx])
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

train_dataset = train_dataset.map(tokenize_and_align_labels)
valid_dataset = valid_dataset.map(tokenize_and_align_labels)

# Fonction de métriques
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Entraînement
args = TrainingArguments(
    output_dir="ner_model",
    num_train_epochs=5,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=1,
    report_to="none",
)

data_collator = DataCollatorForTokenClassification(tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
trainer.save_model("ner_model")


Map: 100%|██████████| 1/1 [00:00<00:00, 198.69 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 141.01 examples/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1/1 [00:00<00:00, 24.38 examples/s]
Map: 100%|██████████| 1/1 [00:00<00:00, 95.67 examples/s]
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.521564,1.0,0.125,0.222222,0.730769
2,No log,0.569524,0.470588,1.0,0.64,0.653846
3,No log,0.483981,0.615385,1.0,0.761905,0.807692
4,No log,0.416183,0.8,1.0,0.888889,0.923077
5,No log,0.398312,0.8,1.0,0.888889,0.923077


In [15]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# Charger ton modèle fine-tuné
model_path = "ner_model"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Pipeline NER
ner_model = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# ✨ Liste de mots jamais vus dans le train/valid
test_words = ["OLFA", "D80", "12/04/2024", "01-01", "2023", "05.06.22", "labo", "chedli"]

print("🔍 Prédictions sur mots jamais vus :")
for word in test_words:
    prediction = ner_model(word)
    if prediction:
        label = prediction[0]["entity_group"]
        score = round(prediction[0]["score"], 4)
        print(f"{word:15} ➜ {label} (score: {score})")
    else:
        print(f"{word:15} ➜ O (aucune prédiction)")



🔍 Prédictions sur mots jamais vus :
OLFA            ➜ O (aucune prédiction)
D80             ➜ ABR (score: 0.5303000211715698)
12/04/2024      ➜ O (aucune prédiction)
01-01           ➜ O (aucune prédiction)
2023            ➜ ABR (score: 0.5394999980926514)
05.06.22        ➜ O (aucune prédiction)
labo            ➜ ABR (score: 0.5327000021934509)
chedli          ➜ O (aucune prédiction)


In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")

# Exemple : un mot réel de ton dataset annoté comme abréviation
mot = "Cs+NEu"
tag = "B-ABR"

# Tokenisation
encoded = tokenizer(mot, is_split_into_words=False, return_offsets_mapping=True)
tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
word_ids = encoded.word_ids()

# Aligner le tag manuellement
aligned_tags = []
prev_word_id = None
for idx, word_id in enumerate(word_ids):
    if word_id is None:
        aligned_tags.append("-100")
    elif word_id != prev_word_id:
        aligned_tags.append(tag)
    else:
        aligned_tags.append("-100")
    prev_word_id = word_id

# Résultat visuel
for t, l in zip(tokens, aligned_tags):
    print(f"{t:15} ➜ {l}")


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


<s>             ➜ -100
▁C              ➜ B-ABR
s               ➜ -100
+               ➜ -100
NE              ➜ -100
u               ➜ -100
</s>            ➜ -100


In [22]:
pip install transformers

Note: you may need to restart the kernel to use updated packages.


In [1]:
from transformers import AutoTokenizer

# Charger le tokenizer CamemBERT
tokenizer = AutoTokenizer.from_pretrained("camembert-base")

# Liste d'abréviations à tester
abbreviations = ["CPY", "KE", "D48", "RTS", "XYT", "NEC"]

# Analyser comment chaque mot est tokenisé
tokenization_debug = []

for word in abbreviations:
    encoded = tokenizer(word, return_offsets_mapping=True)
    tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
    word_ids = encoded.word_ids()

    tokenization_debug.append({
        "original": word,
        "tokens": tokens,
        "word_ids": word_ids
    })

tokenization_debug


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


[{'original': 'CPY',
  'tokens': ['<s>', '▁CP', 'Y', '</s>'],
  'word_ids': [None, 0, 0, None]},
 {'original': 'KE',
  'tokens': ['<s>', '▁K', 'E', '</s>'],
  'word_ids': [None, 0, 0, None]},
 {'original': 'D48',
  'tokens': ['<s>', '▁D', '48', '</s>'],
  'word_ids': [None, 0, 0, None]},
 {'original': 'RTS',
  'tokens': ['<s>', '▁R', 'TS', '</s>'],
  'word_ids': [None, 0, 0, None]},
 {'original': 'XYT',
  'tokens': ['<s>', '▁X', 'Y', 'T', '</s>'],
  'word_ids': [None, 0, 0, 0, None]},
 {'original': 'NEC',
  'tokens': ['<s>', '▁N', 'EC', '</s>'],
  'word_ids': [None, 0, 0, None]}]

In [1]:
import os
print(os.getcwd())


C:\Users\Ons\Downloads


In [3]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# ➤ Chemin du dossier où le modèle a été sauvegardé
model_path = "C:/Users/Ons/Downloads/ner_model"

# Recharger le tokenizer et le modèle
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Créer la pipeline NER
ner = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")



  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [35]:
for mot in ["Z25*2", "A14", "R40", "ECO", "08/10/1971","labo","Med aziz","Meddeb","Bilel ssouissi","Ben Ramdhane SAAD","M.E.D"]:
    prediction = ner(mot)
    if prediction:
        label = prediction[0]['entity_group']
        score = round(prediction[0]['score'], 4)
        print(f"{mot:12} ➜ {label} (score: {score})")
    else:
        print(f"{mot:12} ➜ O (aucune prédiction)")


Z25*2        ➜ ABR (score: 0.5884000062942505)
A14          ➜ ABR (score: 0.6735000014305115)
R40          ➜ ABR (score: 0.7150999903678894)
ECO          ➜ ABR (score: 0.5270000100135803)
08/10/1971   ➜ O (aucune prédiction)
labo         ➜ ABR (score: 0.5327000021934509)
Med aziz     ➜ O (aucune prédiction)
Meddeb       ➜ O (aucune prédiction)
Bilel ssouissi ➜ O (aucune prédiction)
Ben Ramdhane SAAD ➜ O (aucune prédiction)
M.E.D        ➜ ABR (score: 0.7124000191688538)
