In [None]:
# Mount Google Drive at /content/drive so the corpus files stored under
# /content/drive/MyDrive can be opened by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**1-Introduction:**

Natural Language Processing (**NLP**) is one of the hottest areas of artificial intelligence. It is the discipline of building machines that can manipulate human language — or data that resembles human language — in the way that it is written, spoken, and organized. NLP can be divided into two overlapping subfields: natural language understanding (NLU), which focuses on semantic analysis or determining the intended meaning of text, and natural language generation (NLG), which focuses on text generation by a machine. NLP is used for a wide variety of language-related tasks, such as sentiment analysis, machine translation, named entity recognition, spam detection, grammatical error correction, text generation, and question answering.

**2-Problem Statement**

Problem statement: the mission of this project is to develop a model based on Natural Language Processing (NLP) that is able to extract the following information from text: full name, birthdate, address, and CIN. To do this, we use NLP techniques, in particular Named Entity Recognition (NER).

Named entity recognition (NER) aims to classify the entities in a piece of text into predefined categories such as personal names, organizations, locations, and quantities. The input to such a model is generally text, and the output is the various named entities.

**2- Preprocessing:**

In this step we load the data from the data files and process it into dictionaries where words are the keys and their corresponding tags are the values.

In [None]:
# Locations of the annotated corpora on the mounted Drive.
# NOTE(review): `test_file_path` points at Validation.txt while
# `validation_file_path` points at test.txt -- confirm this is intended.
train_file_path = '/content/drive/MyDrive/Training.txt'
test_file_path = '/content/drive/MyDrive/Validation.txt'
validation_file_path = '/content/drive/MyDrive/test.txt'

# Read each corpus into a list of raw lines (line endings are kept).
with open(train_file_path, 'r', encoding='utf-8') as file:
    train_data = list(file)
with open(test_file_path, 'r', encoding='utf-8') as file:
    test_data = list(file)
with open(validation_file_path, 'r', encoding='utf-8') as file:
    validation_data = list(file)

# Show the first 26 raw lines of every corpus as a sanity check.
for title, lines in (
    ("Training Data:", train_data),
    ("\nTesting Data:", test_data),
    ("\nValidation Data:", validation_data),
):
    print(title)
    print(lines[:26])

# Word -> tag lookup for the training corpus. Blank lines are skipped and each
# remaining line is split at its last space into (word, tag). A word that
# occurs several times keeps only the tag of its last occurrence.
word_tag_dict = {
    token.strip(): label.strip()
    for token, label in (row.rsplit(' ', 1) for row in train_data if row.strip())
}
print(word_tag_dict)

# Same lookup for the test corpus.
word_tag_dict_test = {
    token.strip(): label.strip()
    for token, label in (row.rsplit(' ', 1) for row in test_data if row.strip())
}
print(word_tag_dict_test)

Training Data:
['Mr. O\n', 'FATIMA B-PERSON\n', 'ZAHRAE I-PERSON\n', 'BELHADI I-PERSON\n', '\n', 'Démeurant O\n', 'à O\n', ': O\n', '1 B-LOC\n', 'Av I-LOC\n', 'Med I-LOC\n', 'VI I-LOC\n', 'Résidence I-LOC\n', 'Ali I-LOC\n', 'Entrée I-LOC\n', 'D I-LOC\n', '2è I-LOC\n', 'étage I-LOC\n', 'Appt I-LOC\n', '5 I-LOC\n', '\n', 'CIN O\n', 'RO194214 B-cin\n', '\n', 'Né O\n', 'le O\n']

Testing Data:
['Mme O\n', 'ZAKARIA B-PERSON\n', 'EL I-PERSON\n', 'METTAIY I-PERSON\n', '\n', 'Démeurant O\n', 'à O\n', ': O\n', 'Rue B-LOC\n', 'N I-LOC\n', '21 I-LOC\n', 'Quartier I-LOC\n', 'Al I-LOC\n', 'Wifak I-LOC\n', 'Fes I-LOC\n', '\n', 'CIN O\n', 'TU114877 B-cin\n', '\n', 'Né O\n', 'le O\n', '12.06.1966 B-date\n', 'à O\n', 'Zawyat O\n', '\n', 'Mme O\n']

Validation Data:
['Mme O\n', 'YASSINE B-PERSON\n', 'ELKORCHI I-PERSON\n', '\n', 'Démeurant O\n', 'à O\n', ': O\n', 'Quartier B-LOC\n', 'Industriel I-LOC\n', 'CP I-LOC\n', '80000 I-LOC\n', 'Agadir I-LOC\n', '\n', 'Carte O\n', 'de O\n', 'séjour O\n', 'N° O\n',

**generate a structured CSV file:**

This step processes a dataset of tagged words to generate a structured CSV file containing sentences, words, and their respective tags.

In [None]:
import csv

# Rebuild sentences from the token-per-line training corpus and export them as
# a CSV with one row per token (sentence number, word, tag).
#
# Every token keeps the tag written on its own line. Looking the tag up in a
# word -> tag dictionary instead would give all occurrences of a word the tag
# of its last occurrence (e.g. a name tagged I-PERSON in one sentence and
# B-PERSON in another), which corrupts the labels.

tags_set = set()      # every distinct tag met in the corpus
l = []                # unused placeholder kept from the original cell
sentences = []        # one list of (word, tag) pairs per sentence
current_phrase = []   # (word, tag) pairs of the sentence being assembled

for line in train_data:
    if not line.strip():  # blank lines carry no token
        continue

    # The tag is the last whitespace-separated field; the word is everything
    # before it. Splitting on any whitespace also tolerates tabs and repeated
    # or trailing spaces.
    fields = line.rsplit(None, 1)
    if len(fields) != 2:
        raise ValueError(f"Expected '<word> <tag>' but got: {line!r}")
    word, tag = fields[0].strip(), fields[1]

    # 'Mr.' / 'Mme' opens a new sentence: store the one in progress first.
    if word in ('Mr.', 'Mme') and current_phrase:
        sentences.append(current_phrase)
        current_phrase = []
    current_phrase.append((word, tag))
    tags_set.add(tag)

# Store the last sentence, which is not followed by another 'Mr.' / 'Mme'.
if current_phrase:
    sentences.append(current_phrase)

# Space-joined text of every sentence, and the tag inventory in a
# deterministic (sorted) order.
listphrase = [" ".join(word for word, _ in sentence) for sentence in sentences]
entity_types = sorted(tags_set)

# Write the CSV as UTF-8 so accented tokens are stored the same way on every
# platform.
with open('sentences_tags.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['sentence', 'word', 'tag']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Sentence numbers start at 1.
    for sentence_count, sentence in enumerate(sentences, start=1):
        for word, tag in sentence:
            writer.writerow({'sentence': sentence_count, 'word': word, 'tag': tag})

print("Fichier CSV généré : sentences_tags.csv")


Fichier CSV généré : sentences_tags.csv


In [None]:
import csv

# Rebuild sentences from the token-per-line test corpus and export them as a
# CSV with one row per token (sentence number, word, tag).
#
# Every token keeps the tag written on its own line. Looking the tag up in a
# word -> tag dictionary instead would give all occurrences of a word the tag
# of its last occurrence, which corrupts the labels.

tags_set = set()      # every distinct tag met in the corpus
l = []                # unused placeholder kept from the original cell
sentences = []        # one list of (word, tag) pairs per sentence
current_phrase = []   # (word, tag) pairs of the sentence being assembled

for line in test_data:
    if not line.strip():  # blank lines carry no token
        continue

    # The tag is the last whitespace-separated field; the word is everything
    # before it. Splitting on any whitespace also tolerates tabs and repeated
    # or trailing spaces.
    fields = line.rsplit(None, 1)
    if len(fields) != 2:
        raise ValueError(f"Expected '<word> <tag>' but got: {line!r}")
    word, tag = fields[0].strip(), fields[1]

    # 'Mr.' / 'Mme' opens a new sentence: store the one in progress first.
    if word in ('Mr.', 'Mme') and current_phrase:
        sentences.append(current_phrase)
        current_phrase = []
    current_phrase.append((word, tag))
    tags_set.add(tag)

# Store the last sentence, which is not followed by another 'Mr.' / 'Mme'.
if current_phrase:
    sentences.append(current_phrase)

# Space-joined text of every sentence, and the tag inventory in a
# deterministic (sorted) order.
listphrase = [" ".join(word for word, _ in sentence) for sentence in sentences]
entity_types = sorted(tags_set)

# Write the CSV as UTF-8 so accented tokens are stored the same way on every
# platform.
with open('sentencestest_tags.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = ['sentence', 'word', 'tag']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Sentence numbers start at 1.
    for sentence_count, sentence in enumerate(sentences, start=1):
        for word, tag in sentence:
            writer.writerow({'sentence': sentence_count, 'word': word, 'tag': tag})

print("Fichier CSV généré : sentencestest_tags.csv")

Fichier CSV généré : sentencestest_tags.csv


In [None]:
# Install Simple Transformers, the package that provides the NERModel and
# NERArgs classes imported by the training cells below.
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.70.1-py3-none-any.whl.metadata (42 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.4/42.4 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers)
  Downloading tensorboardX-2.6.2.2-py2.py3-none-any.whl.metadata (5.8 kB)
Collecting streamlit (from simpletransformers)
 

**Read CSV files**

In [None]:
import pandas as pd

# Load the two token-level CSV files produced above.
dftest = pd.read_csv('/content/sentencestest_tags.csv')
dftrain = pd.read_csv('/content/sentences_tags.csv')

# Replace the CSV header with the column names used in the rest of the
# notebook.
for frame in (dftest, dftrain):
    frame.columns = ['sentence_id', 'words', 'tag']

# NOTE: a notebook cell renders only its last expression, so only the
# training preview is displayed.
dftest.head()
dftrain.head()

Unnamed: 0,sentence_id,words,tag
0,1,Mr.,O
1,1,FATIMA,B-PERSON
2,1,ZAHRAE,I-PERSON
3,1,BELHADI,B-PERSON
4,1,Démeurant,O


The table represents a dataset where each row corresponds to a word in a sentence, along with its tag for named entity recognition (NER).

**sentence_id:** Represents the unique identifier for the sentence that the word belongs to. Multiple rows with the same sentence_id indicate that the words are part of the same sentence.

**words:** Contains individual words or tokens from the sentences.

**tag:** The BIO tag associated with each word, used for NER.

O: Indicates that the word is not part of any named entity.

B-PERSON: Denotes the beginning of a person's name.

I-PERSON: Indicates a continuation of a person's name

In [None]:



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  # also used by the training cell below

# Separate the features (sentence id + token) from the labels (tag).
X_train = dftrain[['sentence_id', 'words']]
Y_train = dftrain['tag']

X_test = dftest[['sentence_id', 'words']]
Y_test = dftest['tag']

# Hold out 20% of the *sentences* of dftrain for validation.
# Each row is a single token, so splitting the rows directly would scatter the
# tokens of one sentence over both subsets. Splitting the sentence ids keeps
# every sentence whole and in its original token order.
sentence_ids = dftrain['sentence_id'].unique()
train_ids, valid_ids = train_test_split(
    sentence_ids, test_size=0.2, random_state=42
)
is_train_row = dftrain['sentence_id'].isin(train_ids)

X_train_split, X_valid_split = X_train[is_train_row], X_train[~is_train_row]
Y_train_split, Y_valid_split = Y_train[is_train_row], Y_train[~is_train_row]

# Report the size of each subset (counted in token rows).
print(f"Train Features Shape: {X_train_split.shape}")
print(f"Validation Features Shape: {X_valid_split.shape}")
print(f"Train Labels Shape: {Y_train_split.shape}")
print(f"Validation Labels Shape: {Y_valid_split.shape}")


Train Features Shape: (59227, 2)
Validation Features Shape: (14807, 2)
Train Labels Shape: (59227,)
Validation Labels Shape: (14807,)


In this section, we prepared and split the dataset to facilitate the training and evaluation of a Named Entity Recognition (NER) model. The data was first structured into features (sentence_id and words) and corresponding labels (tag). The training dataset was further divided into training and validation subsets, with 80% of the data used for training and 20% for validation

**3-Training and evaluating a Named Entity Recognition (NER) model:**

**Setting Hyperparameters:**

In [None]:
from simpletransformers.ner import NERModel, NERArgs

# Tag inventory handed to the model.
label = [
    'B-LOC', 'B-date', 'I-cin', 'I-LOC',
    'B-cin', 'B-PERSON', 'O', 'I-PERSON',
]

# Training / evaluation hyperparameters.
args = NERArgs(
    num_train_epochs=40,
    learning_rate=1e-4,
    overwrite_output_dir=True,
    train_batch_size=32,
    eval_batch_size=32,
)

In [None]:
# Build a BERT token-classification model from the 'bert-base-cased'
# checkpoint with the label list and arguments defined above; CUDA is
# disabled for this instance.
model = NERModel(
    model_type='bert',
    model_name='bert-base-cased',
    labels=label,
    args=args,
    use_cuda=False,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

The parameters specified here control how the model trains and evaluates. The settings ensure that the model has sufficient epochs to learn, processes the data efficiently in batches, and uses a learning rate that promotes stable convergence.

**fine-tuning the BERT model for NER:**

In [None]:
!pip install simpletransformers

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.ner import NERModel, NERArgs

# Charger les données
dftrain = pd.read_csv('/content/sentences_tags.csv')
dftest = pd.read_csv('/content/sentencestest_tags.csv')

# Vérification du type des DataFrames
print("Type of dftrain:", type(dftrain))  # Devrait être <class 'pandas.core.frame.DataFrame'>
print("Type of dftest:", type(dftest))   # Devrait être <class 'pandas.core.frame.DataFrame'>

# Renommer les colonnes si nécessaire
dftrain.columns = ['sentence_id', 'words', 'tag']
dftest.columns = ['sentence_id', 'words', 'tag']

# Supprimer les lignes avec des valeurs manquantes
dftrain = dftrain.dropna(subset=['sentence_id', 'words', 'tag'])
dftest = dftest.dropna(subset=['sentence_id', 'words', 'tag'])

# S'assurer que sentence_id est une chaîne de caractères
dftrain['sentence_id'] = dftrain['sentence_id'].astype(str)
dftest['sentence_id'] = dftest['sentence_id'].astype(str)

# Créer les DataFrames pour l'entraînement et l'évaluation
train_data = []
for sentence_id, group in dftrain.groupby('sentence_id'):
    words = group['words'].tolist()
    tags = group['tag'].tolist()
    for word, tag in zip(words, tags):
        train_data.append({"sentence_id": sentence_id, "words": word, "labels": tag})

train_data = pd.DataFrame(train_data)

test_data = []
for sentence_id, group in dftest.groupby('sentence_id'):
    words = group['words'].tolist()
    tags = group['tag'].tolist()
    for word, tag in zip(words, tags):
        test_data.append({"sentence_id": sentence_id, "words": word, "labels": tag})

test_data = pd.DataFrame(test_data)

# Vérifier la structure de train_data et test_data
print("First 5 entries of train_data:", train_data.head())
print("First 5 entries of test_data:", test_data.head())

# Diviser les données d'entraînement en ensembles d'entraînement et de validation
train_data_split, valid_data_split = train_test_split(train_data, test_size=0.2, random_state=42)

# Initialiser le modèle NER
model_args = NERArgs()
model_args.num_train_epochs = 50
model_args.learning_rate = 1e-4
model_args.overwrite_output_dir = True
model_args.train_batch_size = 32
model_args.eval_batch_size = 32

# Get unique labels from both training and test data
all_labels = set(dftrain['tag'].unique()).union(set(dftest['tag'].unique()))
label = list(all_labels)  # Use all unique labels
print("All unique labels:", label)

# Initialiser le modèle BERT NER
model = NERModel("bert", "bert-base-cased", labels=label, args=model_args, use_cuda=True)

# Entraîner le modèle
print("Training the model...")
model.train_model(train_data_split, eval_data=valid_data_split,acc=accuracy_score)
model.save_model("bertner")
# Évaluer le modèle
print("Evaluating the model...")
result, model_outputs, preds_list = model.eval_model(test_data)

# Afficher les résultats de l'évaluation
print("Evaluation Results:", result)

Type of dftrain: <class 'pandas.core.frame.DataFrame'>
Type of dftest: <class 'pandas.core.frame.DataFrame'>
First 5 entries of train_data:   sentence_id      words    labels
0           1        Mr.         O
1           1     FATIMA  B-PERSON
2           1     ZAHRAE  I-PERSON
3           1    BELHADI  B-PERSON
4           1  Démeurant         O
First 5 entries of test_data:   sentence_id      words    labels
0           1        Mme         O
1           1    ZAKARIA  B-PERSON
2           1         EL  I-PERSON
3           1    METTAIY  I-PERSON
4           1  Démeurant         O
All unique labels: ['I-PERSON', 'B-LOC', 'I-LOC', 'I-cin', 'O', 'B-PERSON', 'B-date', 'B-cin']


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training the model...


  0%|          | 0/2 [00:00<?, ?it/s]

Epoch:   0%|          | 0/50 [00:00<?, ?it/s]

  scaler = amp.GradScaler()


Running Epoch 1 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

  with amp.autocast():


Running Epoch 2 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 3 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 4 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 5 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 6 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 7 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 8 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 9 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 10 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 11 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 12 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 13 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 14 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 15 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 16 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 17 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 18 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 19 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 20 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 21 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 22 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 23 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 24 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 25 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 26 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 27 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 28 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 29 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 30 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 31 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 32 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 33 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 34 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 35 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 36 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 37 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 38 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 39 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 40 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 41 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 42 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 43 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 44 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 45 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 46 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 47 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 48 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 49 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Running Epoch 50 of 50:   0%|          | 0/101 [00:00<?, ?it/s]

Evaluating the model...


  0%|          | 0/1 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/3 [00:00<?, ?it/s]

  with amp.autocast():


Evaluation Results: {'eval_loss': 0.6817167500654856, 'precision': 0.7009132420091324, 'recall': 0.7791878172588832, 'f1_score': 0.7379807692307692}




This step demonstrates fine-tuning the BERT model for NER tasks using the simpletransformers library. The dataset is loaded from CSV files containing sentence_id, words, and tags.

The Evaluation Metrics results we received from the model:

**eval_loss (0.6817):** The lower the loss, the better the model. In this case, a loss of 0.6817 suggests that the model is performing reasonably.

**recall (0.7792):** A recall of 0.7792 indicates that the model is correctly identifying around 78% of the actual entities.

**f1_score (0.7380):** The F1 score of 0.7380 suggests that the model has a reasonably good balance between precision and recall.


**performance of  named entity recognition (NER):**

The aim of this step is to use the fine-tuned BERT model for Named Entity Recognition (NER) on a given text, so that we can identify and classify the entities within the text.

In [None]:
# Importer les bibliothèques nécessaires
# Libraries needed for inference.
import os

from simpletransformers.ner import NERModel
import torch

# Load the fine-tuned model.
# The checkpoint directory name encodes the global step and epoch of one
# particular training run (5050 steps, epoch 50); it no longer exists as soon
# as the dataset size, batch size or number of epochs changes. Use it when
# present, otherwise fall back to the training output directory.
# NOTE(review): assumes training leaves the final model in /content/outputs --
# confirm against the Simple Transformers `output_dir` documentation.
checkpoint_path = '/content/outputs/checkpoint-5050-epoch-50'
model_path = checkpoint_path if os.path.isdir(checkpoint_path) else '/content/outputs'
model = NERModel('bert', model_path, use_cuda=torch.cuda.is_available())

# Example text to tag.
text = "Mr NAOUAL NOUINI Démeurant à : Quartier Douar Rja Fellah CP 80000 Agadir CIN NS260057 Né le 03.06.1971 à Ait Mme FATIMA BEKKAL Démeurant à : 33 Quartier Dakhla CP 80000 Agadir CIN SH399857 Né le 19.04.1982 à Ad"

# Predict the entities; predict() takes a list of texts.
print("Predicting entities...")
predictions, raw_outputs = model.predict([text])

# Show the predictions.
print(predictions)




Predicting entities...


  0%|          | 0/1 [00:00<?, ?it/s]

Running Prediction:   0%|          | 0/1 [00:00<?, ?it/s]

[[{'Mr': 'O'}, {'NAOUAL': 'B-PERSON'}, {'NOUINI': 'I-PERSON'}, {'Démeurant': 'O'}, {'à': 'O'}, {':': 'O'}, {'Quartier': 'I-LOC'}, {'Douar': 'I-LOC'}, {'Rja': 'I-LOC'}, {'Fellah': 'I-LOC'}, {'CP': 'I-LOC'}, {'80000': 'I-LOC'}, {'Agadir': 'O'}, {'CIN': 'O'}, {'NS260057': 'B-cin'}, {'Né': 'O'}, {'le': 'O'}, {'03.06.1971': 'B-date'}, {'à': 'O'}, {'Ait': 'O'}, {'Mme': 'O'}, {'FATIMA': 'B-PERSON'}, {'BEKKAL': 'I-PERSON'}, {'Démeurant': 'O'}, {'à': 'O'}, {':': 'O'}, {'33': 'B-LOC'}, {'Quartier': 'I-LOC'}, {'Dakhla': 'I-LOC'}, {'CP': 'I-LOC'}, {'80000': 'I-LOC'}, {'Agadir': 'O'}, {'CIN': 'O'}, {'SH399857': 'B-cin'}, {'Né': 'O'}, {'le': 'O'}, {'19.04.1982': 'B-date'}, {'à': 'O'}, {'Ad': 'O'}]]


In [None]:
# Manually annotated (gold) tags for the example text, written one
# "<token> <tag>" pair per line in the same layout as the corpus files.
_gold_pairs = """
Mr O
NAOUAL B-PERSON
NOUINI I-PERSON
Démeurant O
à O
: O
Quartier B-LOC
Douar I-LOC
Rja I-LOC
Fellah I-LOC
CP I-LOC
80000 I-LOC
Agadir I-LOC
CIN O
NS260057 B-cin
Né O
le O
03.06.1971 B-date
à O
Ait O
Mme O
FATIMA B-PERSON
BEKKAL I-PERSON
Démeurant O
à O
: O
33 B-LOC
Quartier I-LOC
Dakhla I-LOC
CP I-LOC
80000 I-LOC
Agadir I-LOC
CIN O
SH399857 B-cin
Né O
le O
19.04.1982 B-date
à O
Ad O
"""

# One single-entry {token: tag} dict per token, in text order.
true_annotations = [
    {token: tag}
    for token, tag in (entry.split() for entry in _gold_pairs.strip().splitlines())
]

# Exemple de prédictions retournées par le modèle (exemple formaté que vous avez mentionné)


# Token-level accuracy of the predicted tags against the manual annotations.
def calculate_accuracy(true_annotations, predictions):
    """Return the fraction of annotated tokens whose predicted tag is correct.

    Args:
        true_annotations: list of single-entry ``{token: tag}`` dicts, one per
            token of the text, in text order.
        predictions: list of per-text predictions, each a list of
            ``{token: tag}`` dicts; only the first text (``predictions[0]``)
            is scored.

    Returns:
        ``correct / total`` over all annotated tokens, or 0.0 when there are
        no annotated tokens.

    Every annotated token is counted. A token with no matching prediction
    (different token at that position, prediction list too short, or no
    predictions at all) is scored as if 'O' had been predicted, so truncated
    predictions can no longer inflate the accuracy.
    """
    predicted = predictions[0] if predictions else []
    correct = 0
    total = 0

    # Compare annotations and predictions position by position.
    for position, true in enumerate(true_annotations):
        # Past the end of the predictions there is nothing to look up.
        pred = predicted[position] if position < len(predicted) else {}
        for word, true_label in true.items():
            # A token missing from the prediction is treated as 'O'.
            if pred.get(word, 'O') == true_label:
                correct += 1
            total += 1

    return correct / total if total > 0 else 0.0

# Score the model predictions against the manual annotations and report the
# result as a percentage with two decimals.
accuracy = calculate_accuracy(true_annotations, predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))


Accuracy: 92.31%


The accuracy of 92.31% indicates that the fine-tuned BERT model successfully predicts Named Entity Recognition (NER) tags for the provided text with a high degree of correctness.

**Conclusion:**     

In this work, we fine-tuned a pre-trained BERT model for Named Entity Recognition (NER) tasks using the simpletransformers library, leveraging a custom dataset with entity labels for persons (B-PERSON, I-PERSON), locations (B-LOC, I-LOC), dates (B-date), and CIN numbers (B-cin, I-cin).

The evaluation results indicate that the model achieved reasonably good performance: precision (0.7009), recall (0.7792), F1 score (0.7380), and evaluation loss (0.6817).

After testing the fine-tuned model on an example text, we observed that it performed well in identifying and tagging entities, achieving a token-level accuracy of 92.31% on that example.