# Projet NLP Alexia Allal

Chosen project: 4 - Reconnaissance de références juridiques

# Import dependencies with pip

In [2]:
! pip install transformers datasets evaluate seqeval

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━

In [3]:
import os
import json
import re
import random

from datasets import DatasetDict, Dataset

import torch

# login to hugging face

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Use pretrained LLMs

## Bert (english)

Implementation found on [Hugging Face](https://huggingface.co/docs/transformers/tasks/token_classification#token-classification), that was adapted to the data from the project.

In [None]:
def get_labels_from_annotation(html_file):
    labels = []
    words = []
    inside = False
    for i, word in enumerate(html_file.split()):
        if "<a>" in word:
            inside = True
        elif "</a>" in word:
            inside = False
        else:
            if inside:
                labels.append(1)
                # if labels[-1] == 0:
                #     labels.append(1)
                # else:
                #     labels.append(2)
            else:
                labels.append(0)
            words.append(word)
    return labels, words

In [None]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test)

([0, 1, 0, 1, 1, 1, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

Important: You need a space between the word and the tag!

In [None]:
def read_annoted_data(file_path):
    # the target has been annotated by hand
    # the words to be extracted are between <a> and </a> tags
    list_labels = []
    list_words = []
    for folder in os.listdir(file_path):
        for file in os.listdir(file_path + folder):
            with open(file_path + folder + "/" + file, 'r') as f:
                html = f.read()
                # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
                # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
                html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
                if html:
                    # print(html[0])
                    labels, words = get_labels_from_annotation(html[0])
                    # divide the text into chunks of 100 words (for the tokenizer to work)
                    for i in range(0, len(labels), 250):
                        list_labels.append(labels[i:i+100])
                        list_words.append(words[i:i+100])
    return list_labels, list_words

In [None]:
# dezip file
import zipfile

with zipfile.ZipFile("./data_4/data_annoted.zip", 'r') as zip_ref:
    zip_ref.extractall("./data_4/data_annoted")

In [None]:
file_path = "./data_4/data_annoted/"
list_labels, list_words = read_annoted_data(file_path)
print(list_labels[0])
print(list_words[0])

[0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]
['class="dsr-identification">', '<h1>', 'ARRÊTÉ', 'du', '20', 'AVR.', '2020**', 'portant', 'autorisation', "d'exploiter", 'une', 'unité']


In [None]:
def build_data(file_path):
    list_labels, list_words = read_annoted_data(file_path)
    # data has to be in the form of a list of dictionaries
    # {'id': 'i',
    # 'tokens': ['word1', word2', ...],
    # 'ner_tags': [0, 0, 1, 2, 0, ...]}
    data = []
    for i in range(len(list_labels)):
        # print(i, list_labels[i], list_words[i])
        data.append({'id': str(i), 'tokens': list_words[i], 'ner_tags': list_labels[i]})

    # separate the data into train, validation and test sets
    random.seed(42)
    random.shuffle(data)

    train_size = int(0.8 * len(data))
    val_size = int(0.1 * len(data))

    train_data = data[:train_size]
    val_data = data[train_size:train_size+val_size]
    test_data = data[train_size+val_size:]

    train_dataset = Dataset.from_dict({
        'id': [d['id'] for d in train_data],
        'tokens': [d['tokens'] for d in train_data],
        'ner_tags': [d['ner_tags'] for d in train_data]
    })
    val_dataset = Dataset.from_dict({
        'id': [d['id'] for d in val_data],
        'tokens': [d['tokens'] for d in val_data],
        'ner_tags': [d['ner_tags'] for d in val_data]
    })
    test_dataset = Dataset.from_dict({
        'id': [d['id'] for d in test_data],
        'tokens': [d['tokens'] for d in test_data],
        'ner_tags': [d['ner_tags'] for d in test_data]
    })

    data = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })

    return data

file_path = "./data_4/data_annoted/"
data = build_data(file_path)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 22
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4
    })
})

In [None]:
label_list = ['O', 'B', 'I']

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [None]:
example = data["train"][0]
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

['[CLS]', 'consider', '##ant', 'que', 'la', 'demand', '##e', 'd', "'", 'aug', '##ment', '##ation', 'du', 'ton', '##nage', 'de', '1000', '##0', 'tonnes', 'a', '1500', '##0', 'tonnes', 'a', 'part', '##ir', 'du', '##quel', 'l', "'", 'exploit', '##ant', 'so', '##ume', '##t', 'au', 'pre', '##fe', '##t', 'sa', 'pre', '##station', 'de', 'service', 'en', 'rep', '##ons', '##e', 'au', 'si', '##vo', '##m', 'correspond', 'a', 'en', '##vir', '##on', '8', 'jo', '##urs', 'supplement', '##aire', '##s', 'sur', 'les', '16', 'initial', '##ement', 'pre', '##vus', ',', 'ne', 're', '##met', 'pas', 'en', 'cause', 'le', 'pri', '##nc', '##ipe', 'de', 'control', '##e', 'par', 'l', "'", 'auto', '##rite', 'administrative', 'tel', 'que', 'pre', '##vu', 'par', 'l', "'", 'ar', '##ret', '##e', 'd', "'", 'auto', '##ris', '##ation', 'initial', 'du', '20', 'av', '##ril', '2020', 'su', '##s', '##vis', '##e', ',', '<', '/', 'di', '##v', '>', '<', '/', 'header', '>', '[SEP]']


In [None]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
tokenized_data = data.map(tokenize_and_align_labels, batched=True)
print(tokenized_data["train"][0])

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

{'id': '18', 'tokens': ['CONSIDÉRANT', 'que', 'la', 'demande', "d'augmentation", 'du', 'tonnage', 'de', '10000', 'tonnes', 'à', '15000', 'tonnes', 'à', 'partir', 'duquel', "l'exploitant", 'soumet', 'au', 'préfet', 'sa', 'prestation', 'de', 'service', 'en', 'réponse', 'au', 'SIVOM', 'correspond', 'à', 'environ', '8', 'jours', 'supplémentaires', 'sur', 'les', '16', 'initialement', 'prévus,', 'ne', 'remet', 'pas', 'en', 'cause', 'le', 'principe', 'de', 'contrôle', 'par', "l'autorité", 'administrative', 'tel', 'que', 'prévu', 'par', "l'arrêté", "d'autorisation", 'initial', 'du', '20', 'avril', '2020', 'susvisé,', '</div>', '</header>'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'input_ids': [101, 5136, 4630, 10861, 2474, 5157, 2063, 1040, 1005, 15476, 3672, 3370, 4241, 10228, 27031, 2139, 6694, 2692, 11000, 1037, 10347, 2692,

In [None]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
import evaluate

seqeval = evaluate.load("seqeval")

In [None]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
id2label = {
    0: "O",
    1: "I",
    # 2: "I",
}
label2id = {
    "O": 0,
    "I": 1,
    # "I": 2
}

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
training_args = TrainingArguments(
    output_dir="ner-model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.296032,0.0,0.0,0.0,0.95098


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=2, training_loss=0.43594563007354736, metrics={'train_runtime': 15.4914, 'train_samples_per_second': 1.42, 'train_steps_per_second': 0.129, 'total_flos': 1635715568760.0, 'train_loss': 0.43594563007354736, 'epoch': 1.0})

In [None]:
trainer.push_to_hub()

events.out.tfevents.1737026013.f35fb2804064.206.7:   0%|          | 0.00/5.83k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alexia-allal/ner-model/commit/45d957b3e8cb1e6e647b21e911ed21a157ca8abc', commit_message='End of training', commit_description='', oid='45d957b3e8cb1e6e647b21e911ed21a157ca8abc', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alexia-allal/ner-model', endpoint='https://huggingface.co', repo_type='model', repo_id='alexia-allal/ner-model'), pr_revision=None, pr_num=None)

In [None]:
def read_test_data(file_path):
    # a random text file, to test the model
    list_text = []
    with open(file_path, 'r') as f:
        html = f.read()
        # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
        # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
        html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
        if html:
            # print(html[0])
            words = html[0].split()
            # divide the text into chunks of 100 words (for the tokenizer to work)
            for i in range(0, len(words), 250):
              words_chunk = words[i:i+100]
              text = " ".join(words_chunk)
              list_text.append(text)
    return list_text

In [None]:
list_text = read_test_data("./data_4/data_test.html")
print(list_text[0])
print(len(list_text))

<header class="dsr-header"> <div class="dsr-entity"> <div> Préfecture </div> <div> Direction des Collectivités Locales et des Produits Publics </div> <div> Bureau des Foyers Publics et Installations Classées </div> </div> <div class="dsr-identification"> <h1> ARRÊTÉ du 20 AVR. 2020** portant autorisation d'exploiter une unité de valorisation énergétique de combustibles solides de récupération (CSR), de déchets d'activité économique (DAE) et d'ordures ménagères (OM) sur le territoire de la commune de Bantzenheim à la société B+T ÉNERGIE France Sas en référence au titre VIII du livre I et au titre I° du livre V du code de l'environnement </h1> </div> <div class="dsr-visa"> VU le code de
6


In [None]:
from transformers import pipeline

classifier = pipeline("ner", model="alexia-allal/ner-model")
for i in range(len(list_text)):
  print(classifier(list_text[i]))

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

Device set to use cuda:0


[]
[]
[]
[]
[]
[{'entity': 'I', 'score': 0.52665764, 'index': 18, 'word': '##er', 'start': 54, 'end': 56}]


In [None]:
def classifier_to_text(text, classifier):
  tags = classifier(text)
  # put inference back to the original format with <a> </a> tags
  for tag in tags[::-1]:
    if tag['entity'] in ['B', 'I']:
      text = text[:tag['start']] + f"<a>{text[tag['start']:tag['end']]}" + f"</a>" + text[tag['end']:]
  return text

In [None]:
classifier_to_text(list_text[0], classifier)

'meilleures techniques disponibles relatives, notamment, à l\'incinération de déchets (BREF et WtE<a>)</a> ; </div> <div class="dsr-visa"> VU la directive 2003/87/CE du 13 octobre 2003 établissant un système d\'échange de quotas d\'émission de gaz à effet de serre dans la Communauté et modifiant la directive 96/61/CE du Conseil ; </div> <div class="dsr-visa"> VU l\'arrêté ministériel du 23 mai 2016 relatif aux installations de production de chaleur et/ou d\'électricité à partir de déchets non dangereux préparés sous forme de combustibles solides de récupération dans des installations prévues à cet effet associées ou non à un autre combustible et relevant de la rubrique'

This model doesn't work, because it was pretrained in english...

## XLM-Roberta

Pretrained model found on [Hugging Face](https://huggingface.co/FacebookAI/xlm-roberta-base/blob/main/README.md?code=true), trained on many languages including french.

In [185]:
def get_labels_from_annotation(html_file, nb_labels=2):
    labels = []
    words = []
    inside = False
    for i, word in enumerate(html_file.split()):
        if "<a>" in word:
            inside = True
        elif "</a>" in word:
            inside = False
        else:
            if inside:
                if nb_labels == 2:
                    labels.append(1)
                else:
                  if labels[-1] == 0:
                      labels.append(1)
                  else:
                      labels.append(2)
            else:
                labels.append(0)
            words.append(word)
    return labels, words

In [186]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test)

([0, 1, 0, 1, 1, 1, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

In [187]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test, nb_labels=3)

([0, 1, 0, 1, 2, 2, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

Important: You need a space between the word and the tag!

In [188]:
def read_annoted_data(file_path, nb_labels=2):
    # the target has been annotated by hand
    # the words to be extracted are between <a> and </a> tags
    list_labels = []
    list_words = []
    for folder in os.listdir(file_path):
        for file in os.listdir(file_path + folder):
            with open(file_path + folder + "/" + file, 'r') as f:
                html = f.read()
                # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
                # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
                html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
                if html:
                    # print(html[0])
                    labels, words = get_labels_from_annotation(html[0], nb_labels)
                    # divide the text into chunks for the tokenizer to work (max 512 for roberta)
                    for i in range(0, len(labels), 256):
                        list_labels.append(labels[i:i+100])
                        list_words.append(words[i:i+100])
    return list_labels, list_words

In [189]:
# dezip file
import zipfile

with zipfile.ZipFile("./data_4/data_annoted.zip", 'r') as zip_ref:
    zip_ref.extractall("./data_4/data_annoted")

In [190]:
file_path = "./data_4/data_annoted/"
list_labels, list_words = read_annoted_data(file_path, nb_labels=2)
span=[30, 35]
print(list_labels[0][span[0]:span[1]])
print(list_words[0][span[0]:span[1]])

[0, 1, 1, 1, 0]
['<h1>', 'ARRETE', 'N°', '06/1C/032', 'demandant']


In [191]:
def build_data(file_path):
    list_labels, list_words = read_annoted_data(file_path)
    # data has to be in the form of a list of dictionaries
    # {'id': 'i',
    # 'tokens': ['word1', word2', ...],
    # 'ner_tags': [0, 0, 1, 2, 0, ...]}
    data = []
    for i in range(len(list_labels)):
        # print(i, list_labels[i], list_words[i])
        data.append({'id': str(i), 'tokens': list_words[i], 'ner_tags': list_labels[i]})

    # separate the data into train, validation and test sets
    random.seed(42)
    random.shuffle(data)

    train_size = int(0.8 * len(data))
    val_size = int(0.1 * len(data))

    train_data = data[:train_size]
    val_data = data[train_size:train_size+val_size]
    test_data = data[train_size+val_size:]

    train_dataset = Dataset.from_dict({
        'id': [d['id'] for d in train_data],
        'tokens': [d['tokens'] for d in train_data],
        'ner_tags': [d['ner_tags'] for d in train_data]
    })
    val_dataset = Dataset.from_dict({
        'id': [d['id'] for d in val_data],
        'tokens': [d['tokens'] for d in val_data],
        'ner_tags': [d['ner_tags'] for d in val_data]
    })
    test_dataset = Dataset.from_dict({
        'id': [d['id'] for d in test_data],
        'tokens': [d['tokens'] for d in test_data],
        'ner_tags': [d['ner_tags'] for d in test_data]
    })

    data = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })

    return data

file_path = "./data_4/data_annoted/"
data = build_data(file_path)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 33
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5
    })
})

In [192]:
label_list = ['O', 'B', 'I']

In [193]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")

In [194]:
example = data["train"][0]
print(example)
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

{'id': '9', 'tokens': ['</div>', '<div>', '2.', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'les', 'conditions', "d'aménagement", 'et', "d'exploitation,", 'telles', 'que', 'définies', 'par', 'le', 'présent', 'arrêté,', 'permettent', 'de', 'prévenir', 'les', 'dangers', 'et', 'inconvénients', 'des', 'installations', 'pour', 'les', 'intérêts', 'mentionnés', 'à', "l'article", 'L', '511-1', 'du', 'Code', 'de', "l'Environnement,", 'notamment', 'pour', 'la', 'commodité', 'du', 'voisinage,', 'pour', 'la', 'santé,', 'la', 'sécurité,', 'la', 'salubrité', 'publiques', 'et', 'pour', 'la', 'protection', 'de', 'la', 'nature', 'et', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'des', 'dispositifs', 'de', 'réduction', 'des', 'risques', 'à', 'la', 'source,', 'notamment', 'le', 'système', 'de', 'sectionnement', 'associé', 'à', 'la', 'détection', 'de', 'fuite', 'accidentelle,', 'permettent', 'de', 'limiter', 'notablement', 'les', 'conséqu

In [195]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [196]:
tokenized_data = data.map(tokenize_and_align_labels, batched=True)
print(tokenized_data["train"][0])

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

{'id': '9', 'tokens': ['</div>', '<div>', '2.', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'les', 'conditions', "d'aménagement", 'et', "d'exploitation,", 'telles', 'que', 'définies', 'par', 'le', 'présent', 'arrêté,', 'permettent', 'de', 'prévenir', 'les', 'dangers', 'et', 'inconvénients', 'des', 'installations', 'pour', 'les', 'intérêts', 'mentionnés', 'à', "l'article", 'L', '511-1', 'du', 'Code', 'de', "l'Environnement,", 'notamment', 'pour', 'la', 'commodité', 'du', 'voisinage,', 'pour', 'la', 'santé,', 'la', 'sécurité,', 'la', 'salubrité', 'publiques', 'et', 'pour', 'la', 'protection', 'de', 'la', 'nature', 'et', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'des', 'dispositifs', 'de', 'réduction', 'des', 'risques', 'à', 'la', 'source,', 'notamment', 'le', 'système', 'de', 'sectionnement', 'associé', 'à', 'la', 'détection', 'de', 'fuite', 'accidentelle,', 'permettent', 'de', 'limiter', 'notablement', 'les', 'conséqu

In [197]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [198]:
import evaluate

seqeval = evaluate.load("seqeval")

In [199]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [200]:
id2label = {
    0: "O",
    1: "I",
    # 2: "I",
}
label2id = {
    "O": 0,
    "I": 1,
    # "I": 2
}

In [201]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained(
    "xlm-roberta-base",  num_labels=2, id2label=id2label, label2id=label2id)

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [202]:
training_args = TrainingArguments(
    output_dir="ner-model-roberta",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.491399,0.0,0.0,0.0,0.959427
2,No log,0.424451,0.0,0.0,0.0,0.959427


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=6, training_loss=0.5465536117553711, metrics={'train_runtime': 206.1454, 'train_samples_per_second': 0.32, 'train_steps_per_second': 0.029, 'total_flos': 9473538467364.0, 'train_loss': 0.5465536117553711, 'epoch': 2.0})

In [203]:
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/alexia-allal/ner-model-roberta/commit/e2b6f7d1abe6eacebded09b13af27f7d16032f70', commit_message='End of training', commit_description='', oid='e2b6f7d1abe6eacebded09b13af27f7d16032f70', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alexia-allal/ner-model-roberta', endpoint='https://huggingface.co', repo_type='model', repo_id='alexia-allal/ner-model-roberta'), pr_revision=None, pr_num=None)

In [204]:
def read_test_data(file_path):
    # a random text file, to test the model
    list_text = []
    with open(file_path, 'r') as f:
        html = f.read()
        # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
        # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
        html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
        if html:
            # print(html[0])
            words = html[0].split()
            # divide the text into chunks of 100 words (for the tokenizer to work)
            for i in range(0, len(words), 250):
              words_chunk = words[i:i+100]
              text = " ".join(words_chunk)
              list_text.append(text)
    return list_text

In [205]:
list_text = read_test_data("./data_4/data_test.html")
print(list_text[0])
print(len(list_text))

<header class="dsr-header"> <div class="dsr-entity"> <div> Préfecture </div> <div> Direction des Collectivités Locales et des Produits Publics </div> <div> Bureau des Foyers Publics et Installations Classées </div> </div> <div class="dsr-identification"> <h1> ARRÊTÉ du 20 AVR. 2020** portant autorisation d'exploiter une unité de valorisation énergétique de combustibles solides de récupération (CSR), de déchets d'activité économique (DAE) et d'ordures ménagères (OM) sur le territoire de la commune de Bantzenheim à la société B+T ÉNERGIE France Sas en référence au titre VIII du livre I et au titre I° du livre V du code de l'environnement </h1> </div> <div class="dsr-visa"> VU le code de
6


In [206]:
from transformers import pipeline

classifier = pipeline("ner", model="alexia-allal/ner-model-roberta")
for i in range(len(list_text)):
  print(classifier(list_text[i]))

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Device set to use cuda:0


[]
[]
[]
[]
[]
[]


In [207]:
tags = classifier(list_text[0])
label_list = [tag['entity'] for tag in tags]
print(set(label_list))

set()


In [208]:
classifier(list_text[0])

[]

In [209]:
def classifier_to_text(text, classifier):
  tags = classifier(text)
  # put inference back to the original format with <a> </a> tags
  # tags are of the form {'start': 659, 'end': 662}
  for i, tag in enumerate(tags[::-1]):
    print(i, tag)
    if i != len(tags)- 1 and tag['end'] == tags[i+1]['start']:
      # next tag is following directly so don't put <\a>
      text = text[:tag['start']] + "<a>" + text[tag['start']:]
    elif i != 0 and tag['start'] == tags[i-1]['end']:
      # previous tag is following directly so don't put <a>
      text = text[:tag['end']] + "</a>" + text[tag['end']:]
    else:
      text = text[:tag['start']] + "<a>" +text[tag['start']:tag['end']] + "</a>" + text[tag['end']:]
  return text

In [210]:
classifier_to_text(list_text[3], classifier)

'l\'environnement ; </div> <div class="dsr-visa"> VU** l\'avis de l\'Autorité Environnementale en date du 9 août 2019 et la réponse de l\'exploitant à l\'avis de l\'autorité environnementale en date du 12 septembre 2019 ; </div> <div class="dsr-visa"> VU** la décision du président du tribunal administratif de Strasbourg en date du 13 septembre 2019 portant non-nomination du commissaire enquêteur ; </div> <div class="dsr-visa"> VU** l\'arrêté préfectoral en date du 17 octobre 2019 ordonnant l\'organisation d\'une enquête publique pour une durée de 30 jours du 12 novembre 2019 au 12 décembre 2019 inclus sur le territoire des communes de Bantzenheim, Chalampé, Ottmarsheim, Hombourg'

## CamemBERT

Pretrained model found on [Hugging Face](https://huggingface.co/docs/transformers/model_doc/camembert), trained in french.

In [5]:
def get_labels_from_annotation(html_file, nb_labels=2):
    labels = []
    words = []
    inside = False
    for i, word in enumerate(html_file.split()):
        if "<a>" in word:
            inside = True
        elif "</a>" in word:
            inside = False
        else:
            if inside:
                if nb_labels == 2:
                    labels.append(1)
                else:
                    if labels[-1] == 0:
                        labels.append(1)
                    else:
                        labels.append(2)
            else:
                labels.append(0)
            words.append(word)
    return labels, words

In [6]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test)

([0, 1, 0, 1, 1, 1, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

In [7]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test, nb_labels=3)

([0, 1, 0, 1, 2, 2, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

Important: You need a space between the word and the tag!

In [8]:
def read_annoted_data(file_path, nb_labels=2):
    # the target has been annotated by hand
    # the words to be extracted are between <a> and </a> tags
    list_labels = []
    list_words = []
    for folder in os.listdir(file_path):
        for file in os.listdir(file_path + folder):
            with open(file_path + folder + "/" + file, 'r') as f:
                html = f.read()
                # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
                # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
                html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
                if html:
                    # print(html[0])
                    labels, words = get_labels_from_annotation(html[0], nb_labels)
                    # divide the text into chunks for the tokenizer to work (max 512 for roberta)
                    for i in range(0, len(labels), 256):
                        list_labels.append(labels[i:i+100])
                        list_words.append(words[i:i+100])
    return list_labels, list_words

In [215]:
# dezip file
import zipfile

with zipfile.ZipFile("./data_4/data_annoted.zip", 'r') as zip_ref:
    zip_ref.extractall("./data_4/data_annoted")

In [216]:
file_path = "./data_4/data_annoted/"
list_labels, list_words = read_annoted_data(file_path, nb_labels=2)
span=[30, 35]
print(list_labels[0][span[0]:span[1]])
print(list_words[0][span[0]:span[1]])

[0, 1, 1, 1, 0]
['<h1>', 'ARRETE', 'N°', '06/1C/032', 'demandant']


In [217]:
def build_data(file_path):
    list_labels, list_words = read_annoted_data(file_path)
    # data has to be in the form of a list of dictionaries
    # {'id': 'i',
    # 'tokens': ['word1', word2', ...],
    # 'ner_tags': [0, 0, 1, 2, 0, ...]}
    data = []
    for i in range(len(list_labels)):
        # print(i, list_labels[i], list_words[i])
        data.append({'id': str(i), 'tokens': list_words[i], 'ner_tags': list_labels[i]})

    # separate the data into train, validation and test sets
    random.seed(42)
    random.shuffle(data)

    train_size = int(0.8 * len(data))
    val_size = int(0.1 * len(data))

    train_data = data[:train_size]
    val_data = data[train_size:train_size+val_size]
    test_data = data[train_size+val_size:]

    train_dataset = Dataset.from_dict({
        'id': [d['id'] for d in train_data],
        'tokens': [d['tokens'] for d in train_data],
        'ner_tags': [d['ner_tags'] for d in train_data]
    })
    val_dataset = Dataset.from_dict({
        'id': [d['id'] for d in val_data],
        'tokens': [d['tokens'] for d in val_data],
        'ner_tags': [d['ner_tags'] for d in val_data]
    })
    test_dataset = Dataset.from_dict({
        'id': [d['id'] for d in test_data],
        'tokens': [d['tokens'] for d in test_data],
        'ner_tags': [d['ner_tags'] for d in test_data]
    })

    data = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })

    return data

file_path = "./data_4/data_annoted/"
data = build_data(file_path)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 33
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 4
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5
    })
})

In [218]:
label_list = ['O', 'B', 'I']

In [221]:
from transformers import AutoTokenizer
from transformers import CamembertTokenizer, CamembertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("camembert-base")

In [222]:
example = data["train"][0]
print(example)
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

{'id': '9', 'tokens': ['</div>', '<div>', '2.', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'les', 'conditions', "d'aménagement", 'et', "d'exploitation,", 'telles', 'que', 'définies', 'par', 'le', 'présent', 'arrêté,', 'permettent', 'de', 'prévenir', 'les', 'dangers', 'et', 'inconvénients', 'des', 'installations', 'pour', 'les', 'intérêts', 'mentionnés', 'à', "l'article", 'L', '511-1', 'du', 'Code', 'de', "l'Environnement,", 'notamment', 'pour', 'la', 'commodité', 'du', 'voisinage,', 'pour', 'la', 'santé,', 'la', 'sécurité,', 'la', 'salubrité', 'publiques', 'et', 'pour', 'la', 'protection', 'de', 'la', 'nature', 'et', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'des', 'dispositifs', 'de', 'réduction', 'des', 'risques', 'à', 'la', 'source,', 'notamment', 'le', 'système', 'de', 'sectionnement', 'associé', 'à', 'la', 'détection', 'de', 'fuite', 'accidentelle,', 'permettent', 'de', 'limiter', 'notablement', 'les', 'conséqu

In [223]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [224]:
tokenized_data = data.map(tokenize_and_align_labels, batched=True)
print(tokenized_data["train"][0])

Map:   0%|          | 0/33 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

{'id': '9', 'tokens': ['</div>', '<div>', '2.', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'les', 'conditions', "d'aménagement", 'et', "d'exploitation,", 'telles', 'que', 'définies', 'par', 'le', 'présent', 'arrêté,', 'permettent', 'de', 'prévenir', 'les', 'dangers', 'et', 'inconvénients', 'des', 'installations', 'pour', 'les', 'intérêts', 'mentionnés', 'à', "l'article", 'L', '511-1', 'du', 'Code', 'de', "l'Environnement,", 'notamment', 'pour', 'la', 'commodité', 'du', 'voisinage,', 'pour', 'la', 'santé,', 'la', 'sécurité,', 'la', 'salubrité', 'publiques', 'et', 'pour', 'la', 'protection', 'de', 'la', 'nature', 'et', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-motifs">', 'CONSIDÉRANT', 'que', 'des', 'dispositifs', 'de', 'réduction', 'des', 'risques', 'à', 'la', 'source,', 'notamment', 'le', 'système', 'de', 'sectionnement', 'associé', 'à', 'la', 'détection', 'de', 'fuite', 'accidentelle,', 'permettent', 'de', 'limiter', 'notablement', 'les', 'conséqu

In [225]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [226]:
import evaluate

seqeval = evaluate.load("seqeval")

In [227]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [228]:
id2label = {
    0: "O",
    1: "I",
    # 2: "I",
}
label2id = {
    "O": 0,
    "I": 1,
    # "I": 2
}

In [229]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base", num_labels=2, id2label=id2label, label2id=label2id)
# model = AutoModelForTokenClassification.from_pretrained(
#     "xlm-roberta-base",  num_labels=2, id2label=id2label, label2id=label2id)

model.safetensors:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [230]:
training_args = TrainingArguments(
    output_dir="ner-model-camembert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.578317,0.0,0.0,0.0,0.957041
2,No log,0.546247,0.0,0.0,0.0,0.959427


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=6, training_loss=0.6054340600967407, metrics={'train_runtime': 62.9418, 'train_samples_per_second': 1.049, 'train_steps_per_second': 0.095, 'total_flos': 9219896889048.0, 'train_loss': 0.6054340600967407, 'epoch': 2.0})

In [231]:
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

events.out.tfevents.1737975859.32bd4fe8808f.327.6:   0%|          | 0.00/6.46k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/alexia-allal/ner-model-camembert/commit/8a9a8c04659baa5309835cd895d845ac242a0c6b', commit_message='End of training', commit_description='', oid='8a9a8c04659baa5309835cd895d845ac242a0c6b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alexia-allal/ner-model-camembert', endpoint='https://huggingface.co', repo_type='model', repo_id='alexia-allal/ner-model-camembert'), pr_revision=None, pr_num=None)

In [232]:
def read_test_data(file_path):
    # a random text file, to test the model
    list_text = []
    with open(file_path, 'r') as f:
        html = f.read()
        # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
        # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
        html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
        if html:
            # print(html[0])
            words = html[0].split()
            # divide the text into chunks of 100 words (for the tokenizer to work)
            for i in range(0, len(words), 250):
              words_chunk = words[i:i+100]
              text = " ".join(words_chunk)
              list_text.append(text)
    return list_text

In [233]:
list_text = read_test_data("./data_4/data_test.html")
print(list_text[0])
print(len(list_text))

<header class="dsr-header"> <div class="dsr-entity"> <div> Préfecture </div> <div> Direction des Collectivités Locales et des Produits Publics </div> <div> Bureau des Foyers Publics et Installations Classées </div> </div> <div class="dsr-identification"> <h1> ARRÊTÉ du 20 AVR. 2020** portant autorisation d'exploiter une unité de valorisation énergétique de combustibles solides de récupération (CSR), de déchets d'activité économique (DAE) et d'ordures ménagères (OM) sur le territoire de la commune de Bantzenheim à la société B+T ÉNERGIE France Sas en référence au titre VIII du livre I et au titre I° du livre V du code de l'environnement </h1> </div> <div class="dsr-visa"> VU le code de
6


In [234]:
from transformers import pipeline

classifier = pipeline("ner", model="alexia-allal/ner-model-camembert")
for i in range(len(list_text)):
  print(classifier(list_text[i]))

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/811k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/374 [00:00<?, ?B/s]

Device set to use cuda:0


[]
[]
[]
[{'entity': 'I', 'score': 0.5018947, 'index': 127, 'word': '▁17', 'start': 461, 'end': 463}]
[]
[]


In [239]:
tags = classifier(list_text[3])
label_list = [tag['entity'] for tag in tags]
print(set(label_list))

{'I'}


In [240]:
classifier(list_text[3])

[{'entity': 'I',
  'score': 0.5018947,
  'index': 127,
  'word': '▁17',
  'start': 461,
  'end': 463}]

In [241]:
def classifier_to_text(text, classifier):
  tags = classifier(text)
  # put inference back to the original format with <a> </a> tags
  # tags are of the form {'start': 659, 'end': 662}
  for i, tag in enumerate(tags[::-1]):
    print(i, tag)
    if i != len(tags)- 1 and tag['end'] == tags[i+1]['start']:
      # next tag is following directly so don't put <\a>
      text = text[:tag['start']] + "<a>" + text[tag['start']:]
    elif i != 0 and tag['start'] == tags[i-1]['end']:
      # previous tag is following directly so don't put <a>
      text = text[:tag['end']] + "</a>" + text[tag['end']:]
    else:
      text = text[:tag['start']] + "<a>" +text[tag['start']:tag['end']] + "</a>" + text[tag['end']:]
  return text

In [242]:
classifier_to_text(list_text[3], classifier)

0 {'entity': 'I', 'score': 0.5018947, 'index': 127, 'word': '▁17', 'start': 461, 'end': 463}


'l\'environnement ; </div> <div class="dsr-visa"> VU** l\'avis de l\'Autorité Environnementale en date du 9 août 2019 et la réponse de l\'exploitant à l\'avis de l\'autorité environnementale en date du 12 septembre 2019 ; </div> <div class="dsr-visa"> VU** la décision du président du tribunal administratif de Strasbourg en date du 13 septembre 2019 portant non-nomination du commissaire enquêteur ; </div> <div class="dsr-visa"> VU** l\'arrêté préfectoral en date du <a>17</a> octobre 2019 ordonnant l\'organisation d\'une enquête publique pour une durée de 30 jours du 12 novembre 2019 au 12 décembre 2019 inclus sur le territoire des communes de Bantzenheim, Chalampé, Ottmarsheim, Hombourg'

## CamemBERT - fine-tuned with the files annotated using regex

Pre-trained model from [Hugging Face](https://huggingface.co/docs/transformers/model_doc/camembert), that we fine tune with the annoted files with regex.

In [122]:
def get_labels_from_annotation(html_file, nb_labels=2):
    labels = []
    words = []
    inside = False
    for i, word in enumerate(html_file.split()):
        if "<a>" in word:
            inside = True
        elif "</a>" in word:
            inside = False
        else:
            if inside:
                if nb_labels == 2:
                    labels.append(1)
                else:
                    if labels[-1] == 0:
                        labels.append(1)
                    else:
                        labels.append(2)
            else:
                labels.append(0)
            words.append(word)
    return labels, words

In [123]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test)

([0, 1, 0, 1, 1, 1, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

In [124]:
html_test = " The <a> quick </a> brown <a> fox jumps over </a> the lazy dog."
get_labels_from_annotation(html_test, nb_labels=3)

([0, 1, 0, 1, 2, 2, 0, 0, 0],
 ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog.'])

Important: You need a space between the word and the tag!

In [125]:
def read_annoted_data(file_path, nb_labels=2):
    # the target has been annotated by hand
    # the words to be extracted are between <a> and </a> tags
    list_labels = []
    list_words = []
    for folder in os.listdir(file_path):
        for file in os.listdir(file_path + folder):
            with open(file_path + folder + "/" + file, 'r') as f:
                # print(file_path + folder + "/" + file)
                html = f.read()
                # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
                # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
                html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
                if html:
                    # print(html[0])
                    labels, words = get_labels_from_annotation(html[0], nb_labels)
                    # divide the text into chunks for the tokenizer to work (max 512 for roberta)
                    for i in range(0, len(labels), 256):
                        list_labels.append(labels[i:i+100])
                        list_words.append(words[i:i+100])
    return list_labels, list_words

In [126]:
# dezip file
import zipfile

with zipfile.ZipFile("./data_4/annoted_regex.zip", 'r') as zip_ref:
    zip_ref.extractall("./data_4/annoted_regex")

In [127]:
file_path = "./data_4/annoted_regex/"
list_labels, list_words = read_annoted_data(file_path, nb_labels=2)
span=[30, 40]
print(list_labels[0][span[0]:span[1]])
print(list_words[0][span[0]:span[1]])

[0, 0, 1, 1, 1, 0, 0, 0, 0, 0]
['class="dsr-identification">', '<h1>', 'ARRETE', 'PREFECTORAL', 'COMPLEMENTAIRE', 'Commune', 'de', 'DOMFRONT', 'Societe', 'Fromagere']


In [128]:
def build_data(file_path):
    list_labels, list_words = read_annoted_data(file_path)
    # data has to be in the form of a list of dictionaries
    # {'id': 'i',
    # 'tokens': ['word1', word2', ...],
    # 'ner_tags': [0, 0, 1, 2, 0, ...]}
    data = []
    for i in range(len(list_labels)):
        # print(i, list_labels[i], list_words[i])
        data.append({'id': str(i), 'tokens': list_words[i], 'ner_tags': list_labels[i]})

    # separate the data into train, validation and test sets
    random.seed(42)
    random.shuffle(data)

    train_size = int(0.8 * len(data))
    val_size = int(0.1 * len(data))

    train_data = data[:train_size]
    val_data = data[train_size:train_size+val_size]
    test_data = data[train_size+val_size:]

    train_dataset = Dataset.from_dict({
        'id': [d['id'] for d in train_data],
        'tokens': [d['tokens'] for d in train_data],
        'ner_tags': [d['ner_tags'] for d in train_data]
    })
    val_dataset = Dataset.from_dict({
        'id': [d['id'] for d in val_data],
        'tokens': [d['tokens'] for d in val_data],
        'ner_tags': [d['ner_tags'] for d in val_data]
    })
    test_dataset = Dataset.from_dict({
        'id': [d['id'] for d in test_data],
        'tokens': [d['tokens'] for d in test_data],
        'ner_tags': [d['ner_tags'] for d in test_data]
    })

    data = DatasetDict({
        'train': train_dataset,
        'validation': val_dataset,
        'test': test_dataset
    })

    return data

file_path = "./data_4/annoted_regex/"
data = build_data(file_path)
data

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 372
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 46
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 47
    })
})

In [129]:
label_list = ['O', 'B', 'I']

In [130]:
from transformers import AutoTokenizer
from transformers import CamembertTokenizer, CamembertForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("camembert-base")

In [131]:
example = data["train"][0]
print(example)
tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
print(tokens)

{'id': '41', 'tokens': ['aux', 'installations', 'relevant', 'du', 'regime', 'de', "l'enregistrement", 'au', 'titre', 'de', 'la', 'rubrique', 'ndeg', '2230', 'de', 'la', 'nomenclature', 'des', 'installations', 'classees', 'pour', 'la', 'protection', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-visa">', 'VU**', "l'arrete", 'ministeriel', 'du', '27', 'fevrier', '2020', 'relatif', 'aux', 'meilleures', 'techniques', 'disponibles', '(MTD)', 'applicables', 'a', 'certaines', 'installations', 'classees', 'du', 'secteur', 'de', "l'agroalimentaire", 'relevant', 'du', 'regime', 'de', "l'autorisation", 'au', 'titre', 'des', 'rubriques', '3642,', '3643', 'ou', '3710', '(pour', 'lesquelles', 'la', 'charge', 'polluante', 'principale', 'provient', "d'installations", 'relevant', 'des', 'rubriques', '3642', 'ou', '3643)', 'de', 'la', 'nomenclature', 'des', 'installations', 'classees', 'pour', 'la', 'protection', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-visa">', 'VU**', "

In [132]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples[f"ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [133]:
tokenized_data = data.map(tokenize_and_align_labels, batched=True)
print(tokenized_data["train"][0])

Map:   0%|          | 0/372 [00:00<?, ? examples/s]

Map:   0%|          | 0/46 [00:00<?, ? examples/s]

Map:   0%|          | 0/47 [00:00<?, ? examples/s]

{'id': '41', 'tokens': ['aux', 'installations', 'relevant', 'du', 'regime', 'de', "l'enregistrement", 'au', 'titre', 'de', 'la', 'rubrique', 'ndeg', '2230', 'de', 'la', 'nomenclature', 'des', 'installations', 'classees', 'pour', 'la', 'protection', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-visa">', 'VU**', "l'arrete", 'ministeriel', 'du', '27', 'fevrier', '2020', 'relatif', 'aux', 'meilleures', 'techniques', 'disponibles', '(MTD)', 'applicables', 'a', 'certaines', 'installations', 'classees', 'du', 'secteur', 'de', "l'agroalimentaire", 'relevant', 'du', 'regime', 'de', "l'autorisation", 'au', 'titre', 'des', 'rubriques', '3642,', '3643', 'ou', '3710', '(pour', 'lesquelles', 'la', 'charge', 'polluante', 'principale', 'provient', "d'installations", 'relevant', 'des', 'rubriques', '3642', 'ou', '3643)', 'de', 'la', 'nomenclature', 'des', 'installations', 'classees', 'pour', 'la', 'protection', 'de', "l'environnement", ';', '</div>', '<div', 'class="dsr-visa">', 'VU**', "

In [134]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [135]:
import evaluate

seqeval = evaluate.load("seqeval")

In [136]:
import numpy as np

labels = [label_list[i] for i in example[f"ner_tags"]]


def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = seqeval.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [137]:
id2label = {
    0: "O",
    1: "I",
    # 2: "I",
}
label2id = {
    "O": 0,
    "I": 1,
    # "I": 2
}

In [138]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
model = CamembertForTokenClassification.from_pretrained(
    "camembert-base", num_labels=2, id2label=id2label, label2id=label2id)
# model = AutoModelForTokenClassification.from_pretrained(
#     "camembert-base",  num_labels=2, id2label=id2label, label2id=label2id)

Some weights of CamembertForTokenClassification were not initialized from the model checkpoint at camembert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [139]:
training_args = TrainingArguments(
    output_dir="ner-model-camembert",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=25,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.363955,0.0,0.0,0.0,0.873887
2,No log,0.264031,0.688427,0.431227,0.530286,0.903657
3,No log,0.224775,0.697581,0.643123,0.669246,0.919831
4,No log,0.216313,0.818182,0.60223,0.69379,0.932958
5,No log,0.16903,0.733558,0.80855,0.769231,0.938819
6,No log,0.176796,0.855814,0.684015,0.760331,0.945617
7,No log,0.183808,0.857798,0.695167,0.767967,0.947023
8,No log,0.159134,0.815789,0.806691,0.811215,0.952649
9,No log,0.168829,0.857143,0.758364,0.804734,0.953586
10,No log,0.159559,0.843137,0.799257,0.820611,0.955931


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=600, training_loss=0.08729827523231506, metrics={'train_runtime': 1294.9252, 'train_samples_per_second': 7.182, 'train_steps_per_second': 0.463, 'total_flos': 1327224213745920.0, 'train_loss': 0.08729827523231506, 'epoch': 25.0})

In [149]:
trainer.push_to_hub()

No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/alexia-allal/ner-model-camembert/commit/8361b50dd5a1693a9f87732272f6037906ce2d07', commit_message='End of training', commit_description='', oid='8361b50dd5a1693a9f87732272f6037906ce2d07', pr_url=None, repo_url=RepoUrl('https://huggingface.co/alexia-allal/ner-model-camembert', endpoint='https://huggingface.co', repo_type='model', repo_id='alexia-allal/ner-model-camembert'), pr_revision=None, pr_num=None)

In [150]:
def read_test_data(file_path):
    # a random text file, to test the model
    list_text = []
    with open(file_path, 'r') as f:
        html = f.read()
        # keep only sections where data-title="ABROGATION DE DISPOSITIONS ANTERIEURES"
        # html = re.findall(r'data-title="ABROGATION DE DISPOSITIONS ANTERIEURES".*?</section>', html, re.DOTALL)
        html = re.findall(r'<header class="dsr-header">.*?</header>', html, re.DOTALL)
        if html:
            # print(html[0])
            words = html[0].split()
            # divide the text into chunks of 100 words (for the tokenizer to work)
            for i in range(0, len(words), 250):
              words_chunk = words[i:i+100]
              text = " ".join(words_chunk)
              list_text.append(text)
    return list_text

In [151]:
list_text = read_test_data("./data_4/data_test.html")
print(list_text[0])
print(len(list_text))

<header class="dsr-header"> <div class="dsr-entity"> <div> Préfecture </div> <div> Direction des Collectivités Locales et des Produits Publics </div> <div> Bureau des Foyers Publics et Installations Classées </div> </div> <div class="dsr-identification"> <h1> ARRÊTÉ du 20 AVR. 2020** portant autorisation d'exploiter une unité de valorisation énergétique de combustibles solides de récupération (CSR), de déchets d'activité économique (DAE) et d'ordures ménagères (OM) sur le territoire de la commune de Bantzenheim à la société B+T ÉNERGIE France Sas en référence au titre VIII du livre I et au titre I° du livre V du code de l'environnement </h1> </div> <div class="dsr-visa"> VU le code de
6


In [None]:
from transformers import pipeline

classifier = pipeline("ner", model="alexia-allal/ner-model-camembert")

In [154]:
classifier(list_text[0])

[{'entity': 'I',
  'score': 0.966462,
  'index': 85,
  'word': '▁A',
  'start': 260,
  'end': 261},
 {'entity': 'I',
  'score': 0.9662905,
  'index': 86,
  'word': 'RR',
  'start': 261,
  'end': 263},
 {'entity': 'I',
  'score': 0.9660116,
  'index': 87,
  'word': '<unk>',
  'start': 263,
  'end': 264},
 {'entity': 'I',
  'score': 0.9659257,
  'index': 88,
  'word': 'TÉ',
  'start': 264,
  'end': 266},
 {'entity': 'I',
  'score': 0.96680003,
  'index': 89,
  'word': '▁du',
  'start': 267,
  'end': 269},
 {'entity': 'I',
  'score': 0.9668481,
  'index': 90,
  'word': '▁20',
  'start': 270,
  'end': 272},
 {'entity': 'I',
  'score': 0.9665248,
  'index': 91,
  'word': '▁A',
  'start': 273,
  'end': 274},
 {'entity': 'I',
  'score': 0.9660688,
  'index': 92,
  'word': 'VR',
  'start': 274,
  'end': 276},
 {'entity': 'I',
  'score': 0.9630096,
  'index': 93,
  'word': '.',
  'start': 276,
  'end': 277},
 {'entity': 'I',
  'score': 0.96546483,
  'index': 94,
  'word': '▁2020',
  'start': 27

In [155]:
def classifier_to_text(text, classifier):
  tags = classifier(text)
  # put inference back to the original format with <a> </a> tags
  # tags are of the form {'start': 659, 'end': 662}
  for i in range(1, len(tags)+1):
    n_tag = len(tags) - i
    tag = tags[n_tag]
    beg = n_tag != 0 and tag['start'] in [tags[n_tag-1]['end'], tags[n_tag-1]['end']+1]
    end = n_tag != len(tags)- 1 and tag['end'] in [tags[n_tag+1]['start'], tags[n_tag+1]['start']-1]
    if beg and end:
      pass
    elif beg:
      text = text[:tag['end']] + "</a>" + text[tag['end']:]
    elif end:
      text = text[:tag['start']] + "<a>" + text[tag['start']:]
    else:
      text = text[:tag['start']] + "<a>" +text[tag['start']:tag['end']] + "</a>" + text[tag['end']:]
  return text

In [156]:
classifier_to_text(list_text[0], classifier)

'<header class="dsr-header"> <div class="dsr-entity"> <div> Préfecture </div> <div> Direction des Collectivités Locales et des Produits Publics </div> <div> Bureau des Foyers Publics et Installations Classées </div> </div> <div class="dsr-identification"> <h1> <a>ARRÊTÉ du 20 AVR. 2020</a>** portant autorisation d\'exploiter une unité de valorisation énergétique de combustibles solides de récupération (CSR), de déchets d\'activité économique (DAE) et d\'ordures ménagères (OM) sur le territoire de la commune de Bantzenheim à la société B+T ÉNERGIE France Sas en référence au titre VIII du livre I et au titre I° du livre V du <a>code de l</a>\'environnement </h1> </div> <div class="dsr-visa"> VU le <a>code de</a>'