In [None]:
from tqdm.auto import tqdm
import random
import json
import os
import shutil

# Math utils
import statistics
import numpy as np
import pandas as pd

# ML utils
import torch
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification, get_scheduler
from datasets import load_dataset, load_metric, Features, ClassLabel, Value, Sequence

# Models used
from transformers import CamembertTokenizerFast, CamembertForTokenClassification

In [None]:
# Seed every random number generator used in the notebook so runs are repeatable.
SEED = 1312
for seed_everything in (
    random.seed,                 # Python standard library RNG
    np.random.seed,              # NumPy global RNG
    torch.manual_seed,           # PyTorch RNG
    torch.cuda.manual_seed_all,  # PyTorch RNGs of the CUDA devices
):
    seed_everything(SEED)

In [None]:
# Select the device used for training and evaluation.
# GPU_INDEX is the preferred CUDA device. When the machine exposes fewer GPUs
# than that, fall back to GPU 0 instead of crashing with an
# "invalid device ordinal" error.
GPU_INDEX = 2
device = torch.device("cpu")
if torch.cuda.is_available():
    if GPU_INDEX >= torch.cuda.device_count():
        print("GPU", GPU_INDEX, "is not available, falling back to GPU 0")
        GPU_INDEX = 0
    device = torch.device("cuda")
    # "cuda" without an index resolves to the current device set here.
    torch.cuda.set_device(GPU_INDEX)
    print("cuDNN enabled? ", torch.backends.cudnn.enabled)
    print("cuDNN version:", torch.backends.cudnn.version())
    print("cuda version:", torch.version.cuda)
    print("Device name? ", torch.cuda.get_device_name(torch.cuda.current_device()))

# Training French BERT models to detect tokens that can identify patients

The aim of this notebook is to train a BERT model, more specifically a *CamemBERT* model, to detect terms in clinical documents that make it possible to identify a patient directly or indirectly (names, ids, dates, etc.).

This kind of task can be seen as a Named Entity Recognition (NER) problem, also called a token classification problem.

First of all, let’s define the model we’ll use in this notebook.

In [None]:
# To try another model, change the prefix/suffix below and rerun the whole notebook.
model_name_prefix = "camembert-base"
model_name_suffix = "" #"-pretrained"
model_name = f"{model_name_prefix}{model_name_suffix}"

# A model with a suffix is loaded from the "best" checkpoint saved by a
# previous run; a model without suffix is loaded from the base models folder.
has_suffix = model_name_suffix != ""
dir_model = "models" + ("/saved_models/" + model_name + "/best" if has_suffix else "/" + model_name)
print(dir_model)

## Preparing datasets and dataloaders for Token Classification

To train a model for token classification we need two things:

1. defining classes to use to classify tokens
2. loading a dataset of text with tokens already classified

In our case we have 12 classes that we aim to recognize using a neural network, each class representing a piece of information that could lead to the identification of a patient:

* Postal Code (or Code postal in French), the code identifying a town/city and hence the location of a patient
* City (or Ville in French), the name of a town/city, which can reveal the location of a patient
* Street (or Voie in French), the street part of an address, which can reveal the location of a patient
* Locality (or Localité in French), the name of a country/region/state/department, which can reveal the location of a patient
* LastName/FirstName (or Nom/Prénom in French), the last name and/or the first name of a person, identifying a patient or a person linked to the patient (e.g. a doctor)
* Permanent Patient Identifier (or Identifiant Patient Permanent in French), an id that directly identifies a patient, such as a Social Security Number (or Numéro de Sécurité Sociale in French)
* File Number (or Numéro de Dossier in French), the id of a medical file in the hospital, which indirectly identifies a patient
* Phone Number (or Numéro de Téléphone in French), the phone number of a person (a patient or a person linked to the patient), which can reveal at least the location of a patient
* E-Mail, which can reveal the name of a person (a patient or a person linked to the patient)
* Organisation, the name of an organisation linked to the treatment of a patient, which indirectly identifies a patient
* Website (or site web in French), the URL of the website of a person or an organisation, which identifies this person or organisation and hence, indirectly, a patient
* Date, a date linked to the healthcare process of a patient

Because several successive words can be labelised under the same class, in token classification a distinction is made between the token starting a labelisation (noted *B-ClassName*) and tokens classified under a label already started (noted *I-ClassName*).

In addition, we use the *O* class to identify tokens with no class.

This gives us 25 classes (12 × 2 + 1).

In [None]:
# Entity types, in the order that fixes the numeric label ids used in the dataset.
entity_types = [
    "CodePostal", "Ville", "NomPrenom", "Voie", "IPP", "Date",
    "NoDossier", "Organisation", "SiteWeb", "EMail", "Localite", "Telephone",
]
# "O" takes id 0, then each entity contributes its B- label (odd id)
# immediately followed by its I- label (even id).
label_names = ["O"] + [
    f"{prefix}-{entity}" for entity in entity_types for prefix in ("B", "I")
]
# Schema of the JSON lines file: a document id, its words and one label per word.
features = Features({
    'id': Value(dtype='int64'),
    'tokens': Sequence(feature=Value(dtype='string')),
    'ner_tags': Sequence(feature=ClassLabel(names=label_names)),
})

### Loading train and validation datasets

To train our model on this identification task, 1240 medical documents have been labelled by 10 different people, with double-reading, according to the guidelines detailed in [LabelGuidelines.md](LabelGuidelines.md).

In [None]:
data_file = 'data/anonymisation-ner.jsonl'

# Count how many words carry each label across the whole annotated corpus
# (one JSON document per line of the file).
cpt_labels = {}
with open(data_file, "r", encoding="utf-8") as f:
    json_lines = f.readlines()
    for json_line in tqdm(json_lines):
        document = json.loads(json_line)
        for tag_id in document["ner_tags"]:
            tag_name = label_names[tag_id]
            cpt_labels[tag_name] = cpt_labels.get(tag_name, 0) + 1
cpt_labels

To validate the feasibility of our token classification problem with a BERT model, we’ll use cross-validation.

To do so, we’ll load our labelled documents into 10 different training datasets, each corresponding to 90% of the original dataset.

In [None]:
# One split expression per fold: everything except the k%..(k+10)% slice.
train_fold_splits = [
    f'train[:{k}%]+train[{k+10}%:]' for k in range(0, 100, 10)
]
raw_train_datasets = load_dataset(
    'json',
    split=train_fold_splits,
    data_files=data_file,
    features=features,
    download_mode='force_redownload',
)
print(raw_train_datasets)

And we load the 10% left of each training dataset into 10 validation datasets.

Therefore, for each training dataset we’ll perform a clear training and validate the model on the corresponding validation dataset.

This will give us more reliable results, avoiding cases where a model shows very good results only for a specific train/validation split.

In [None]:
# One split expression per fold: the k%..(k+10)% slice held out of training.
val_fold_splits = [f'train[{k}%:{k+10}%]' for k in range(0, 100, 10)]
raw_val_datasets = load_dataset(
    'json',
    split=val_fold_splits,
    data_files=data_file,
    features=features,
)
print(raw_val_datasets)

## Preprocessing datasets

Before starting the training of our model, we need to preprocess our datasets to:

- Have texts with a size treatable by our model
- Align labels and tokens

### Chunk texts of datasets

One problem with NLP models is that they have a limitation in size of text they can treat.

In our case, CamemBERT models can process texts of at most 512 tokens.

Different methods can be used to overcome this limitation. For example, we can simply use the 512 first tokens of each texts.

Because the documents we use can contain relevant information in all their contents, we choose to chunk our documents in subtexts of maximum 256 words.

256 words because tokenizer generally split words in prefixes and suffixes. So, 256 words easily give 512 tokens.

Fortunately, our datasets are already split into words, with a label associated with each word.

To chunk datasets, we first define the following function:

In [None]:
embedding_dim = 512                 # maximum number of model tokens per input
subtext_size = embedding_dim // 2   # maximum number of words per chunk
stride = subtext_size // 2          # step between the starts of two chunks


def chunk_example(example):
    """Split the documents of a batch into overlapping chunks of words.

    Meant to be used with ``Dataset.map(..., batched=True)``: ``example`` is a
    batch whose ``"tokens"`` and ``"ner_tags"`` entries are lists holding one
    list per document. Every document of the batch is processed, so the
    function works for any ``batch_size`` (and for an empty batch).

    A document of at most ``subtext_size`` words is kept as a single chunk.
    A longer one is cut into windows of ``2 * stride`` words whose starts are
    ``stride`` words apart, so consecutive chunks overlap by half.

    Args:
        example: batch dict with ``"tokens"`` and ``"ner_tags"`` columns.

    Returns:
        A dict with the same two keys, holding one entry per produced chunk.
    """
    chunk_tokens = []
    chunk_labels = []
    for tokens, labels in zip(example["tokens"], example["ner_tags"]):
        if len(tokens) <= subtext_size:
            chunk_tokens.append(tokens)
            chunk_labels.append(labels)
            continue

        # `i` is the centre of each window: [i - stride, i + stride).
        for i in range(stride, len(tokens), stride):
            sub_seq = list(tokens[i-stride:i+stride])
            sub_labels = list(labels[i-stride:i+stride])
            if sub_seq:
                chunk_tokens.append(sub_seq)
                chunk_labels.append(sub_labels)
    return {'tokens': chunk_tokens, 'ner_tags': chunk_labels}

Then we apply this function to each of text of each training dataset using the *map* function

In [None]:
# Chunk every training fold; the original columns are replaced by the chunks.
chunked_train_datasets = [
    train_fold.map(
        chunk_example,
        batched=True,
        batch_size=1,
        remove_columns=train_fold.column_names,
        load_from_cache_file=False,
    )
    for train_fold in raw_train_datasets
]
print(chunked_train_datasets)

We then obtain datasets with a mean size of 3838 texts per dataset.

And we apply the methods on validation datasets.

In [None]:
# Chunk every validation fold the same way as the training folds.
chunked_val_datasets = [
    val_fold.map(
        chunk_example,
        batched=True,
        batch_size=1,
        remove_columns=val_fold.column_names,
        load_from_cache_file=False,
    )
    for val_fold in raw_val_datasets
]
print(chunked_val_datasets)

We obtain validation datasets with a mean size of 429 texts.

### Align labels with tokenized texts

The second step in preprocessing our datasets consists of aligning the labels with the way words will be tokenized by the model.

For example, a text like:

```
Anne Honyme, habitant à Paris, consultation du 08/07/2022 aux HCL. 
```

Will appear in our dataset as follows:

```
tokens: ["Anne", "Honyme", ",", "habitant", "à", "Paris", ",", "consultation", "du", "08", "/", "07", "/", "2022", "aux", "HCL" ]

labels: ["B-NomPrenom", "I-NomPrenom", "O", "O", "O", "B-Ville", "O", "O", "O", "B-Date", "I-Date", "I-Date", "I-Date", "I-Date", "O", "B-Organisation"]

label_ids = [5, 6, 0, 0, 0, 3, 0, 0, 0, 11, 12, 12, 12, 12, 0, 15]
```

But if we tokenize this text with the tokenizer of our model we obtain the following tokens:

In [None]:
# Load the tokenizer shipped with the model directory (no download attempted).
tokenizer = CamembertTokenizerFast.from_pretrained(dir_model, local_files_only=True)

# Tokenize the pre-split example sentence to show how words get broken up.
example_words = [
    "Anne", "Honyme", ",", "habitant", "à", "Paris", ",", "consultation",
    "du", "08", "/", "07", "/", "2022", "aux", "HCL",
]
example_tokens = tokenizer.tokenize(example_words, is_split_into_words=True)
print(example_tokens)

We can see that the tokenizer creates more tokens than the initial text.

Consequently, we need to align the labels assigned to the original words with the corresponding tokens in the tokenized text.

To do so we define the following function that will, for each token get the corresponding word and associated label, then return a set of labels aligned with the new tokenized text.

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level label ids to one label id per tokenizer token.

    Args:
        labels: label ids, one per original word.
        word_ids: for each token, the index of the word it comes from, or
            ``None`` for a special token.

    Returns:
        A list as long as ``word_ids``: ``-100`` for special tokens, the word's
        label for the first token of a word, and for the following tokens of
        the same word the word's label with any odd id (B-XXX) bumped to the
        next id (I-XXX).
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: carries no label and interrupts the current word.
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id == previous_word:
            # Continuation of the same word: a B-XXX id (odd) becomes I-XXX.
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)
        else:
            # First token of a new word keeps the word's label as is.
            previous_word = word_id
            aligned.append(word_label)

    return aligned

If we get back our example we obtain:

In [None]:
# Run the alignment on the example sentence: first the word index of every
# token is printed, then the label id assigned to every token.
example_tokens = tokenizer(
    ["Anne", "Honyme", ",", "habitant", "à", "Paris", ",", "consultation",
     "du", "08", "/", "07", "/", "2022", "aux", "HCL"],
    is_split_into_words=True,
)
example_labels = [5, 6, 0, 0, 0, 3, 0, 0, 0, 11, 12, 12, 12, 12, 0, 15]
example_wordids = example_tokens.word_ids(0)
print(example_wordids)
print(align_labels_with_tokens(example_labels, example_wordids))

We now define a function to apply this function to a set of documents.

In [None]:
def tokenize_and_align_labels(documents):
    """Tokenize a batch of pre-split documents and attach token-level labels.

    Args:
        documents: batch with a ``"tokens"`` column (list of word lists) and a
            ``"ner_tags"`` column (list of word-level label id lists).

    Returns:
        The tokenizer output for the batch, truncated to ``embedding_dim``
        tokens per document, with an extra ``"labels"`` entry produced by
        ``align_labels_with_tokens`` for each document.
    """
    encoded = tokenizer(
        documents["tokens"],
        max_length=embedding_dim,
        truncation=True,
        is_split_into_words=True,
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(doc_index))
        for doc_index, word_labels in enumerate(documents["ner_tags"])
    ]
    return encoded

Then we can apply this function to our training datasets using the *map* function.

In [None]:
# Tokenize every chunked training fold; word-level columns are dropped.
tokenized_train_datasets = [
    train_chunks.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=train_chunks.column_names,
        load_from_cache_file=False,
    )
    for train_chunks in chunked_train_datasets
]

print(tokenized_train_datasets)

And to our validation dataset using the same method.

In [None]:
# Tokenize every chunked validation fold; word-level columns are dropped.
tokenized_val_datasets = [
    val_chunks.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=val_chunks.column_names,
        load_from_cache_file=False,
    )
    for val_chunks in chunked_val_datasets
]

print(tokenized_val_datasets)

### Creation of DataLoader for Token Classification

Now that we have prepared our datasets, we still need to define the DataLoader that will generate the batches to train our model.

To do so, we’ll need to define a Data Collator based on our tokenizer and the size of batches for our training.

In [None]:
# Collator built on the notebook's tokenizer; it is passed as `collate_fn`
# to the dataloaders created by `create_dataloaders`.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Number of examples per batch, used for both training and validation dataloaders.
batch_size = 8

Then, because we’ll train and retrain our model on each training dataset, we define the following function to generate a training dataloader and a validation dataloader for each training dataset.

In [None]:
def create_dataloaders(train_dataset, val_dataset):
    """Build the dataloaders of one cross-validation fold.

    Both loaders share the notebook-level ``data_collator`` and ``batch_size``;
    only the training loader shuffles its examples.

    Returns:
        A ``(train_dataloader, eval_dataloader)`` tuple.
    """
    shared_options = dict(collate_fn=data_collator, batch_size=batch_size)
    return (
        DataLoader(train_dataset, shuffle=True, **shared_options),
        DataLoader(val_dataset, **shared_options),
    )

## Fine-tuning the model for Token Classification

Now that we have prepared our datasets and dataloaders, we can set up and train our model.

### Define Output

First, let us define where and how the fine-tuned models will be saved.

In [None]:
# Root folder receiving the fine-tuned checkpoints and the evaluation file.
dir_output_model = f"models/saved_models/{model_name}-finetuned"

def save_model(model, save_dir):
    """Save a model and the notebook's tokenizer into ``save_dir``.

    The directory is created when it does not exist yet. When ``model`` is a
    wrapper exposing a ``module`` attribute, the wrapped model is the one
    that gets saved.
    """
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    unwrapped_model = getattr(model, 'module', model)
    for artefact in (unwrapped_model, tokenizer):
        artefact.save_pretrained(save_dir)

### Define evaluation methods

To evaluate the performance of the token classification, we’ll use *seqeval* to compute all true/false positives/negatives, and then the recall, precision and F1 scores of our model.

In [None]:
# Metric loaded from the local script "my_seqeval.py"; `evaluate_model` feeds it
# batches of predictions/references and then calls `compute()` on it.
# NOTE(review): `load_metric` appears to be deprecated in recent `datasets`
# releases — confirm the pinned version still provides it.
metric = load_metric("my_seqeval.py")

Then, we can define a function that, for a given model and validation dataloader, will return the performances of this model on this validation dataloader.

In [None]:
def evaluate_model(model, eval_dataloader):
    """Evaluate a token classification model on a validation dataloader.

    The model is switched to evaluation mode and run, without gradient
    tracking, on every batch. Positions whose reference label is ``-100``
    are dropped, the remaining label ids are converted to label names and
    accumulated in the notebook-level ``metric``.

    Args:
        model: token classification model already placed on ``device``.
        eval_dataloader: dataloader yielding batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.

    Returns:
        The result of ``metric.compute()`` over all batches (the training
        loop reads its ``"overall_f1"`` entry).
    """
    model.eval()
    for batch in tqdm(eval_dataloader):
        with torch.no_grad():
            b_input_ids = batch['input_ids'].to(device)
            # Batches are padded to a common length: the attention mask keeps
            # the model from attending to the padding tokens.
            b_attention_mask = batch['attention_mask'].to(device)
            outputs = model(b_input_ids, attention_mask=b_attention_mask)

        predictions = outputs.logits.argmax(dim=-1).detach().cpu().numpy()
        # Labels are only needed on the host, no round-trip through the device.
        labels = batch["labels"].detach().cpu().numpy()

        # Remove ignored index (special tokens) and convert to labels
        true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
        true_predictions = [
            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]

        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()

    return results

We also define the following function to save the results obtained in a given directory

In [None]:
# To avoid encoding errors
class NpEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy scalars and arrays."""

    def default(self, obj):
        """Convert NumPy values to plain Python ones.

        NumPy arrays become lists, NumPy floats become ``float`` and NumPy
        integers become ``int``; anything else is left to the base encoder.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        for numpy_kind, python_type in ((np.integer, int), (np.floating, float)):
            if isinstance(obj, numpy_kind):
                return python_type(obj)
        return super().default(obj)


def save_model_evaluation(results, dir_output):
    """Append one evaluation row to ``<dir_output>/eval-finetuning.csv``.

    The file is a ``;``-separated table. When it does not exist yet, it is
    created (together with ``dir_output`` if needed) and a header line is
    written first. Each row starts with the model name prefix and whether a
    model suffix is set, followed by every score rounded to 5 decimals.

    Args:
        results: mapping of score name to either a number or a dict of
            numbers; nested scores get the column name ``<key>-<sub key>``.
        dir_output: directory holding the evaluation file.

    NOTE(review): the header is derived from the first ``results`` written;
    later rows are assumed to contain the same keys in the same order.
    """
    def _flatten(scores):
        # Yield (column name, value) pairs, expanding one level of nesting.
        for key, value in scores.items():
            if isinstance(value, dict):
                for sub_key, sub_value in value.items():
                    yield key+"-"+sub_key, sub_value
            else:
                yield key, value

    columns = list(_flatten(results))

    os.makedirs(dir_output, exist_ok=True)
    output = dir_output+"/eval-finetuning.csv"
    write_header = not os.path.exists(output)

    # The context manager closes the file even if formatting a value fails.
    with open(output, "a", encoding="utf-8") as f:
        if write_header:
            f.write("model;pretrained")
            f.write("".join(";"+name for name, _ in columns))
            f.write("\n")

        f.write(model_name_prefix+";"+str(model_name_suffix != ""))
        f.write("".join(";"+str(round(value, 5)) for _, value in columns))
        f.write("\n")

### Prepare models to train

To prepare our model, we first need to tell it how to get a label from its id, and vice versa.

In [None]:
# Mappings stored in the model configuration. Ids are kept as integers so
# that `label2id` maps each label name to a real class index rather than to
# a string such as "0".
id2label = dict(enumerate(label_names))
label2id = {label: label_id for label_id, label in id2label.items()}

Then, because we want to train our model for each dataset, we define the following function that will create and return a clean version of our model.

In [None]:
def prepare_model(dir_model):
    """Load a fresh token classification model and move it to ``device``.

    Args:
        dir_model: local directory holding the pretrained weights (no
            download is attempted).

    Returns:
        The model configured with the notebook's ``id2label``/``label2id``
        mappings, placed on ``device``.
    """
    fresh_model = CamembertForTokenClassification.from_pretrained(
        dir_model,
        local_files_only=True,
        id2label=id2label,
        label2id=label2id,
    )
    fresh_model.to(device)
    return fresh_model

### Setup training

Last step before launch the training loop, we have to define the optimizers and the scheduler for our training.

First, let us define the hyperparameters for our optimizer (learning rate and adam epsilon).

In [None]:
# Hyperparameters of the AdamW optimizer built by `prepare_optimizer`.
learning_rate = 1e-4  # passed as `lr`
adam_epsilon = 1e-8   # passed as `eps`

Then, we define the following function to create an optimizer for given model.

In [None]:
def prepare_optimizer(model):
    """Return an AdamW optimizer over all parameters of ``model``.

    Uses the notebook-level ``learning_rate`` and ``adam_epsilon``.
    """
    adamw_options = {"lr": learning_rate, "eps": adam_epsilon}
    return torch.optim.AdamW(model.parameters(), **adamw_options)

Now, we have to define the number of epochs for our training, in other words the number of times our model will see the whole training dataset during its training.

In [None]:
# Number of passes over the training dataloader for each cross-validation fold;
# also used by `prepare_scheduler` to compute the total number of steps.
num_epochs = 10

Then, we define the following function to generate a scheduler for a given optimizer and training dataloader.

In [None]:
def prepare_scheduler(optimizer, train_dataloader):
    """Return a "linear" learning-rate scheduler without warmup.

    The schedule spans one step per training batch for ``num_epochs`` epochs.
    """
    batches_per_epoch = len(train_dataloader)
    return get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=batches_per_epoch * num_epochs,
    )

### Launch training

Finally, we can start the training of our model.

For each training dataset, and its corresponding validation dataset, we create a training and a validation dataloaders.

Then, we create a new model with a new optimizer and scheduler.

Once the training done, we evaluate the model on the validation dataset and save its results. And if its overall f1-score is better than previous models, we save this model.

In [None]:
# Start from a clean output folder so that results of previous runs do not
# get mixed with the ones of this run.
if os.path.exists(dir_output_model):
    shutil.rmtree(dir_output_model)

# Best overall F1 seen so far across folds; decides which model is kept as "best".
best_fscore = 0.0

# NOTE(review): only the first 4 folds are trained here, although one
# training/validation split per fold is prepared above — confirm this is intended.
for i in tqdm(range(4), desc="cross-validation"):

    train_dataloader, eval_dataloader = create_dataloaders(tokenized_train_datasets[i], tokenized_val_datasets[i])

    # Every fold starts from a fresh model, optimizer and scheduler.
    model = prepare_model(dir_model)
    optimizer = prepare_optimizer(model)
    scheduler = prepare_scheduler(optimizer, train_dataloader)

    barepochs = tqdm(range(num_epochs))
    for j in barepochs:

        total_loss = 0

        model.train()

        for step, batch in enumerate(tqdm(train_dataloader, desc="batches")):
            b_input_ids = batch['input_ids'].to(device)
            # Batches are padded to a common length: the attention mask keeps
            # the model from attending to the padding tokens.
            b_attention_mask = batch['attention_mask'].to(device)
            b_input_label = batch['labels'].to(device)

            model.zero_grad()

            outputs = model(
                b_input_ids,
                attention_mask=b_attention_mask,
                labels=b_input_label
            )

            # When labels are given, the first output is the loss.
            loss = outputs[0]
            total_loss += loss.item()

            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_loss / len(train_dataloader)

        barepochs.set_description(
            desc="loss: "+str(round(avg_train_loss, 5))
        )

        # Overwritten after every epoch: always holds the latest weights.
        save_model(model, dir_output_model+"/last")

    results = evaluate_model(model, eval_dataloader)
    if results["overall_f1"] > best_fscore :
        best_fscore = results["overall_f1"]
        save_model(model, dir_output_model+"/best")

    save_model_evaluation(results, dir_output_model)

## Evaluation

Let’s load the file where the results of the cross-validation have been written.

In [None]:
# Load the ";"-separated evaluation file written by `save_model_evaluation`
# (one row per evaluated fold).
cross_results = pd.read_csv(dir_output_model+"/eval-finetuning.csv", sep=";")

In [None]:
# Display the raw per-fold results table.
cross_results

In [None]:
# Print, for every entity type, the mean ± sample standard deviation of each
# score over the evaluated folds. Scores missing from the results are skipped.
summary_entities = ["CodePostal", "Ville", "NomPrenom", "Voie", "IPP", "Date", "NoDossier", "Organisation", "SiteWeb", "EMail", "Localite", "Telephone"]
summary_scores = ["precision", "recall", "f1"]

print("label\t\tprecision\t\trecall\t\tf1-score")
for label in summary_entities:
    row_cells = []
    for score in summary_scores:
        key = label+"-"+score
        if key not in cross_results:
            continue
        fold_values = cross_results[key]
        mean_text = str(round(statistics.mean(fold_values), 3))
        stdev_text = str(round(statistics.stdev(fold_values), 3))
        row_cells.append("\t\t"+mean_text+" ± "+stdev_text)
    print(label+"".join(row_cells))