In [None]:
from tqdm.auto import tqdm
import random
import json
import os
import sys

# Math utils
import statistics
import numpy as np

# ML utils
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import DataCollatorForLanguageModeling, get_linear_schedule_with_warmup
from tokenizers.pre_tokenizers import Whitespace

# Models used
from transformers import pipeline, CamembertTokenizerFast, CamembertForMaskedLM

In [None]:
# Seed every random number generator used in this notebook to reproduce results
SEED = 1312
for _seed_fn in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_fn(SEED)

In [None]:
# setup CPU/GPU to use
# Preferred GPU on a multi-GPU machine; GPU 0 is used when this index does not exist.
GPU_INDEX = 2

device = torch.device("cpu")
if torch.cuda.is_available():
    # Selecting a non-existent GPU raises, so fall back to GPU 0 on smaller machines.
    gpu_index = GPU_INDEX if GPU_INDEX < torch.cuda.device_count() else 0
    # Carry the index in `device` so tensors moved to it land on the selected GPU.
    device = torch.device("cuda", gpu_index)
    torch.cuda.set_device(device)
    print("cuDNN enabled? ", torch.backends.cudnn.enabled)
    print("cuDNN version:", torch.backends.cudnn.version())
    print("cuda version:", torch.version.cuda)
    print("Device name? ", torch.cuda.get_device_name(torch.cuda.current_device()))

# Pretraining french BERT models on french clinical documents

A first limitation of BERT models to treat documents from a specific domain, here healthcare, is their absence of knowledge on the vocabulary used in this domain.

In our case, we aim to use CamemBERT models on French clinical documents. Since CamemBERT was trained on general French documents, it has a good knowledge of the French language, but not of French clinical language (as we’ll see later in this document).

Therefore, to potentially improve the detection of identifying data on documents, we first want to pretrain CamemBERT models on a large panel of various clinical documents.

First, let's define the model we’ll use in this notebook.

In [None]:
# To test with another model, change the name below and re-run the whole notebook
model_name = "camembert-base"
dir_model = f"models/{model_name}"

## Preparing datasets and dataloaders for Masked Language Modeling

To teach a new language to a model, we generally train it on a Fill-Mask problem.

A Fill-Mask problem simply refers to a text with missing words that the model has to find.

To do so, we transform the texts from our datasets and randomly "mask" some words, or more specifically some "tokens".

We’ll use the following text, from the validation dataset, to explain the process in the sections below:

```
Contrôle 1 an 1/2 après cystoprostatectomie radicale avec Bricker Va bien , asymptomatique Pas d'infection urinaire symptomatique La stomie est jolie L'appareillage se fait sans difficulté La créatinine est normale = 7,7 Le scanner thoraco-abdomino-pelvienne est normal et supperposable aux scanners précédents Les reins ne sont pas dilatés.
```

### Loading train and validation datasets

To build datasets for pretraining, we extracted close to one million heterogeneous clinical documents. However, some of these files had encoding errors (special symbols, unreadable documents, missing letters, etc.).

After cleaning, we obtained 613650 documents (~1.4Gb).

Then, we used the bootstrapping method to create a dataset for training and a dataset for test and validation.

The code used to bootstrap the raw dataset can be found in [Bootstrapping.ipynb](Boostrapping.ipynb).

So, let's load the raw texts from the training and validation datasets.

In [None]:
raw_trainingset = []
with open("data/data-for-trf-train.json", "r", encoding="utf-8") as f:
    print("loading json...")
    jsonfile = json.load(f)
    print("json loaded")
    # Keep only the text content of each record
    for datum in tqdm(jsonfile):
        raw_trainingset.append(datum["file.contenu"])
print(len(raw_trainingset))

In [None]:
raw_validset = []
with open("data/data-for-trf-validation.json", "r", encoding="utf-8") as f:
    print("loading json...")
    jsonfile = json.load(f)
    print("json loaded")
    # Keep only the text content of each record
    for datum in tqdm(jsonfile):
        raw_validset.append(datum["file.contenu"])
print(len(raw_validset))

We obtain a training dataset and a validation dataset with respectively 613650 texts (same as the raw dataset, but with duplications due to bootstrapping) and 225794 texts. Both datasets are totally disjoint.

### Preprocessing datasets

One problem with NLP models is that they have a limitation in size of text they can treat.

In our case, CamemBERT models can process texts of at most 512 tokens.

Different methods can be used to overcome this limitation. For example, we can simply use the first 512 tokens of each text.

Because the documents we use can contain relevant information in all their contents, we choose to chunk our documents in subtexts of maximum 256 words.

We use 256 words because, as we’ll see later, the tokenizer generally splits words into several sub-word pieces (prefixes and suffixes), so 256 words can easily yield 512 tokens.

To split texts into words, we use the *Whitespace* pre-tokenizer from **Hugging Face**.

In [None]:
# Pre-tokenizer used below to split raw texts into words before chunking them
pre_tokenizer = Whitespace()

With our example we obtain:

In [None]:
# Example sentence taken from the validation dataset
example_text = "Contrôle 1 an 1/2 après cystoprostatectomie radicale avec Bricker Va bien , asymptomatique Pas d'infection urinaire symptomatique La stomie est jolie L'appareillage se fait sans difficulté La créatinine est normale = 7,7 Le scanner thoraco-abdomino-pelvienne est normal et supperposable aux scanners précédents Les reins ne sont pas dilatés."
pre_tokenized_example = pre_tokenizer.pre_tokenize_str(example_text)
print("Number of words:", len(pre_tokenized_example))
print(pre_tokenized_example)

For texts with more than 256 words, the idea is to chunk texts using a stride of 128 words to obtain subtexts in a staggered arrangement.

We define then the sizes we’ll need.

In [None]:
# Maximum sequence length (in tokens) passed to the tokenizer
embedding_dim = 512
# A chunk holds at most half as many words, since one word may become several tokens
subtext_size = embedding_dim // 2
# Consecutive chunks are shifted by half a chunk
stride = subtext_size // 2

And we pretokenize texts of our two datasets and chunk them if necessary. (for demonstration we limit the number of texts to tokenize and chunk)

In [None]:
pretokenized_trainingset = []
for document in tqdm(raw_trainingset[:1000]):
    words = pre_tokenizer.pre_tokenize_str(document)
    n_words = len(words)

    # Short documents fit in a single chunk and are kept untouched
    if n_words <= subtext_size:
        pretokenized_trainingset.append(document)
        continue

    # Slide a window of 2 * stride words, moving forward by `stride` words each time
    for center in range(stride, n_words, stride):
        window = words[center - stride:center + stride]
        # Each entry carries character offsets: cut the original string from the
        # start of the first word to the end of the last word of the window
        first_char = window[0][1][0]
        last_char = window[-1][1][1]
        chunk = document[first_char:last_char]
        if chunk:
            pretokenized_trainingset.append(chunk)
print(len(pretokenized_trainingset))

In [None]:
pretokenized_validset = []
for document in tqdm(raw_validset[:100]):
    words = pre_tokenizer.pre_tokenize_str(document)
    n_words = len(words)

    # Short documents fit in a single chunk and are kept untouched
    if n_words <= subtext_size:
        pretokenized_validset.append(document)
        continue

    # Slide a window of 2 * stride words, moving forward by `stride` words each time
    for center in range(stride, n_words, stride):
        window = words[center - stride:center + stride]
        # Each entry carries character offsets: cut the original string from the
        # start of the first word to the end of the last word of the window
        first_char = window[0][1][0]
        last_char = window[-1][1][1]
        chunk = document[first_char:last_char]
        if chunk:
            pretokenized_validset.append(chunk)
print(len(pretokenized_validset))

We then obtain two augmented datasets for training and validation, in which each text is a chunk of at most 256 words.

### Learn clinical vocabulary

To train our model on a new language, we first need to learn the vocabulary of this language by training the tokenizer of CamemBERT.

In [None]:
# Load the original tokenizer from the local model directory (no download attempted)
old_tokenizer = CamembertTokenizerFast.from_pretrained(
        dir_model,
        local_files_only=True
    )

print(old_tokenizer)

With our example, the tokenizer of CamemBERT gives us the following result: 

In [None]:
# Tokenize the example sentence with the original tokenizer
tokens = old_tokenizer.tokenize("Contrôle 1 an 1/2 après cystoprostatectomie radicale avec Bricker Va bien , asymptomatique Pas d'infection urinaire symptomatique La stomie est jolie L'appareillage se fait sans difficulté La créatinine est normale = 7,7 Le scanner thoraco-abdomino-pelvienne est normal et supperposable aux scanners précédents Les reins ne sont pas dilatés.")
print(tokens)

We can see that terms specific to the clinical language such as "cystoprostatectomie", "créatinine" or "thoraco-abdomino-pelvienne" are split into very small pieces, sometimes single letters. This is because the tokenizer of CamemBERT doesn’t know the vocabulary specific to clinical documents.

To train the tokenizer of CamemBERT to learn a new vocabulary we’ll use the method *train_new_from_iterator* (available only for *Fast* tokenizers).

First, we have to define a function to iterate over the documents of the training dataset.

In [None]:
def get_training_corpus(texts=None, batch_size=1000):
    """Yield the training texts in consecutive batches, for tokenizer training.

    Args:
        texts: sequence of raw documents to batch. Defaults to the notebook-level
            ``raw_trainingset`` so that ``get_training_corpus()`` keeps working.
        batch_size: maximum number of documents per batch (the last batch may
            be shorter).

    Returns:
        A single-use generator of slices of ``texts``. Call the function again
        to iterate over the corpus a second time.

    Raises:
        ValueError: if ``batch_size`` is not strictly positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))
    if texts is None:
        texts = raw_trainingset
    return (
        texts[start:start + batch_size]
        for start in range(0, len(texts), batch_size)
    )


# The generator returned here is single-use: once iterated, it is exhausted
training_corpus = get_training_corpus()

Then, we can train the tokenizer of CamemBERT on texts from the training dataset.

In [None]:
# Build a fresh corpus generator on every run: a generator is exhausted after one
# pass, so reusing an already-consumed one when this cell is re-run would train
# the tokenizer on an empty corpus.
# The vocabulary size is kept equal to the original tokenizer's.
tokenizer = old_tokenizer.train_new_from_iterator(
    get_training_corpus(),
    vocab_size=old_tokenizer.vocab_size,
)

print(tokenizer)

With our example, we obtain now:

In [None]:
# Tokenize the same example sentence with the retrained tokenizer
tokens = tokenizer.tokenize("Contrôle 1 an 1/2 après cystoprostatectomie radicale avec Bricker Va bien , asymptomatique Pas d'infection urinaire symptomatique La stomie est jolie L'appareillage se fait sans difficulté La créatinine est normale = 7,7 Le scanner thoraco-abdomino-pelvienne est normal et supperposable aux scanners précédents Les reins ne sont pas dilatés.")
print(tokens)

We can observe that clinical terms like "cystoprostatectomie", "créatinine" or "thoraco-abdomino-pelvienne" are better tokenized.

### Create data loaders for fill-mask

Finally, we need to create dataloaders that will generate texts with masked tokens and their corresponding tokens to find.

First, we have to create a *Dataset* adapted to our needs.

In [None]:
class MaskedLMDataset(Dataset):
    """Dataset of pre-encoded token-id sequences for masked language modeling.

    All texts are tokenized once, at construction time; each item is one row of
    the resulting ``input_ids`` tensor. Only ``input_ids`` are kept: any other
    field returned by the tokenizer (e.g. the attention mask) is discarded.
    """

    def __init__(self, text_lines, tokenizer, max_len):
        """Encode ``text_lines`` with ``tokenizer``.

        Args:
            text_lines: list of raw texts to encode.
            tokenizer: callable returning a mapping with an ``"input_ids"``
                entry when called on a list of texts.
            max_len: maximum sequence length; longer texts are truncated.
        """
        self.tokenizer = tokenizer
        self.lines = text_lines
        self.max_len = max_len
        self.ids = self.encode_lines(self.lines)

    def encode_lines(self, lines):
        """Tokenize ``lines`` in one call and return the padded ``input_ids``.

        Returns an empty ``(0, 0)`` tensor for an empty list instead of calling
        the tokenizer with nothing to encode.
        """
        if len(lines) == 0:
            return torch.empty((0, 0), dtype=torch.long)

        batch_encoding = self.tokenizer(
            lines,
            add_special_tokens=True,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=self.max_len
        )
        return batch_encoding["input_ids"]

    def __len__(self):
        """Number of encoded texts."""
        return len(self.lines)

    def __getitem__(self, idx):
        """Return an independent copy of the ``idx``-th row as a ``long`` tensor."""
        # torch.tensor() on an existing tensor emits a copy-construct UserWarning
        # for every item; as_tensor + clone yields the same copy without it.
        return torch.as_tensor(self.ids[idx], dtype=torch.long).detach().clone()

Then, we create a data collator that will assemble our already tokenized texts into batches and randomly determine which tokens to mask.

The probability for a token to be masked is set at 15%.

In [None]:
# Collator configured for masked language modeling (mlm=True) with a 15% masking probability
data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=True,
        mlm_probability=0.15
    )

After that, we can create a dataloader that generates batches of texts with masked tokens, based on the texts from our training dataset.

The size of batches is set at 4 to avoid memory errors.

In [None]:
# Number of sequences per batch, shared by every dataloader below
batch_size = 4

In [None]:
# Encode the chunked training texts once, up front
training_dataset = MaskedLMDataset(
    text_lines=pretokenized_trainingset,
    tokenizer=tokenizer,
    max_len=embedding_dim,
)

# Batches are drawn in a new random order at every epoch; the collator builds each batch
training_dataloader = DataLoader(
    dataset=training_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

Then, we do the same on 1% of the validation dataset to create a dataloader to briefly test the model during its training.

In [None]:
# 1 per cent of the validation dataset is used to test model during training.
# Keep at least one text: with fewer than 100 texts, 1% rounds down to 0 and the
# test set would be empty.
test_size = max(1, int(len(pretokenized_validset) * 0.01))
# Shuffled in place: the validation dataset is later built from the same list,
# taking everything after the first `test_size` texts.
random.shuffle(pretokenized_validset)

test_dataset = MaskedLMDataset(
        pretokenized_validset[:test_size],
        tokenizer,
        embedding_dim
    )

test_dataloader = DataLoader(
        test_dataset,
        batch_size=batch_size,
        collate_fn=data_collator
    )

Finally, we create the dataloader for the rest of the validation dataset, which we’ll use to validate our model once its training is done.

In [None]:
# Everything after the first `test_size` texts is kept for the final validation
validation_texts = pretokenized_validset[test_size:]
validation_dataset = MaskedLMDataset(
    text_lines=validation_texts,
    tokenizer=tokenizer,
    max_len=embedding_dim,
)

validation_dataloader = DataLoader(
    dataset=validation_dataset,
    batch_size=batch_size,
    collate_fn=data_collator,
)

## Training Camembert for Masked Language Modeling

Now that we have prepared our datasets, we can set up the parameters for the training of our model.

### Define output

First, let's define where and how the model will be saved during and after its training.

Note that we save the new tokenizer in the same directory as our model.

In [None]:
# Root directory for the checkpoints written during and after training
dir_name = f"models/saved_models/{model_name}-pretrained/"


def save_model(model, save_dir, tokenizer_to_save=None):
    """Save a model and its tokenizer side by side in ``save_dir``.

    Args:
        model: model exposing ``save_pretrained``. If it has a ``module``
            attribute (as wrappers such as ``torch.nn.DataParallel`` do), the
            wrapped module is saved instead.
        save_dir: destination directory, created with its parents if missing.
        tokenizer_to_save: tokenizer exposing ``save_pretrained``. Defaults to
            the notebook-level ``tokenizer``.
    """
    # exist_ok avoids the separate check-then-create step and is safe to call repeatedly
    os.makedirs(save_dir, exist_ok=True)

    model_to_save = model.module if hasattr(model, 'module') else model
    model_to_save.save_pretrained(save_dir)

    if tokenizer_to_save is None:
        tokenizer_to_save = tokenizer
    tokenizer_to_save.save_pretrained(save_dir)

### Define evaluation function

Secondly, let's define the function we will use to evaluate the model during and after its training.

This function computes the perplexity of a given model’s results for each batch of a given dataloader.

Perplexity evaluates the probability the model assigns to the correct token for each masked position, given the surrounding (unmasked) tokens. Lower perplexity indicates better performance.

Perplexity can be obtained from the loss of the model after a prediction: it is the exponential of the cross-entropy loss.

The function returns the perplexities obtained for all batches, allowing us to compute some statistics.

In [None]:
def compute_perplexity(model, dataloader):
    """Compute the masked-LM perplexity of ``model`` on each batch of ``dataloader``.

    Args:
        model: model called as ``model(input_ids, labels=labels)`` whose first
            output is the loss of the batch.
        dataloader: iterable of batches providing ``'input_ids'`` and
            ``'labels'`` tensors.

    Returns:
        A list with one perplexity (``exp(loss)``) per batch, as Python floats.

    Note:
        The model is switched to evaluation mode and left in that mode.
    """
    model.eval()
    perplexities = []

    # No gradients are needed for evaluation
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="compute perplexity"):
            b_input_ids = batch['input_ids'].to(device)
            b_input_label = batch['labels'].to(device)

            outputs = model(b_input_ids, labels=b_input_label)
            loss = outputs[0]
            # Perplexity is the exponential of the cross-entropy loss; the raw
            # loss alone is not a perplexity.
            perplexities.append(torch.exp(loss).item())

    return perplexities

### Prepare CamemBERT Model

Now, we can prepare the CamemBERT model for training.

First, we load it from local files.

In [None]:
# Load the masked-LM model from the local model directory (no download attempted)
model = CamembertForMaskedLM.from_pretrained(
        dir_model,
        local_files_only=True
    )

And then, we can load the model on the predefined GPU (or CPU)

In [None]:
# Move the model to the selected device; the trailing ';' hides the cell output
model.to(device);

### Setup training

Now that we have prepared our model, we can make our last settings before starting our training loop.

First, we define the hyperparameters to use *AdamW* optimizer.

A learning rate of $1 \times 10^{-4}$ to start, and an epsilon of $1 \times 10^{-8}$ to avoid dividing by zero.

In [None]:
# hyperparameters
learning_rate = 1e-4
adam_epsilon = 1e-8

Then, we define the *AdamW* optimizer for our model.

In [None]:
# AdamW over all model parameters, using the hyperparameters defined above
optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        eps=adam_epsilon
    )

After that, we only have to define the scheduling of our training loop.

To do so, we first define the number of epochs we want to run and compute the total number of training steps.

In [None]:
# Number of passes over the training dataloader
num_epochs = 10
# One step per batch, so total steps = batches per epoch * number of epochs
total_steps = len(training_dataloader) * num_epochs
print(total_steps)

Then, we can define our scheduler based on this total number of steps to adapt the learning rate during the training.

In [None]:
# Linear learning-rate schedule spanning `total_steps`, with no warm-up steps
scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )

### Start training

Now, we have everything we need to train our model.

For each epoch, we train our model on the whole training dataset.

At the end of each epoch, we compute the perplexity of our model on the test dataset.

If the perplexity is better than before, we save the model in the *best* subdirectory.
In all cases, we save the last version of the model in the *last* subdirectory.

In [None]:
best_perplexity = sys.float_info.max
barepochs = tqdm(range(num_epochs))
for epoch in barepochs:
    # compute_perplexity() leaves the model in eval mode, so switch back every epoch
    model.train()
    running_loss = 0

    for batch in tqdm(training_dataloader, desc="batches"):
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass: the first output is the loss of the batch
        model.zero_grad()
        loss = model(input_ids, labels=labels)[0]
        running_loss += loss.item()

        # Backward pass with gradient-norm clipping, then one optimizer/scheduler step
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    avg_train_loss = running_loss / len(training_dataloader)

    # Score the model on the small test set; keep the best checkpoint and the latest one
    test_perplexity = statistics.mean(compute_perplexity(model, test_dataloader))
    if test_perplexity < best_perplexity:
        best_perplexity = test_perplexity
        save_model(model, dir_name+"best")
    save_model(model, dir_name+"last")

    barepochs.set_description(
        desc=f"loss: {round(avg_train_loss, 4)}, test: {round(test_perplexity, 2)}"
    )

## Evaluating and validating models

Now that our training is done, let's evaluate the performance of the pretrained model in comparison to the original CamemBERT model.

### Evaluate perplexities

First, let's evaluate the average perplexity obtained by each model on our validation dataset.

In [None]:
# Checkpoints to compare: the original model directory and the best checkpoint saved during training
models = {
        "not-pretrained": dir_model,
        "pretrained": dir_name+"best"
    }

In [None]:
# Results are written under dir_name: make sure it exists
os.makedirs(dir_name, exist_ok=True)

for key, checkpoint in models.items():
    # load model
    model = CamembertForMaskedLM.from_pretrained(
                            checkpoint,
                            local_files_only=True
                        )

    model.to(device);

    # Encode the validation texts with the tokenizer stored next to this
    # checkpoint. The retrained tokenizer assigns different ids to tokens, so
    # feeding its ids to the original model would score it on a vocabulary it
    # was never trained with. The fill-mask pipelines below load one tokenizer
    # per checkpoint in the same way.
    model_tokenizer = CamembertTokenizerFast.from_pretrained(
                            checkpoint,
                            local_files_only=True
                        )
    model_dataloader = DataLoader(
        MaskedLMDataset(
            pretokenized_validset[test_size:],
            model_tokenizer,
            embedding_dim
        ),
        batch_size=batch_size,
        collate_fn=DataCollatorForLanguageModeling(
            tokenizer=model_tokenizer,
            mlm=True,
            mlm_probability=0.15
        )
    )

    perplexities = compute_perplexity(model, model_dataloader)
    # stdev needs at least two values
    spread = statistics.stdev(perplexities) if len(perplexities) > 1 else 0.0
    print(
        "Average Perplexity for", key, ":",
        str(statistics.mean(perplexities)),
        "±",
        str(spread)
    )

    # save results
    with open(dir_name+key+"_perplexity.json", "w", encoding="utf-8") as f:
        json.dump(perplexities, f, ensure_ascii=False, indent=4)

### Example of results

Finally, let's compare our models on some examples from our validation dataset.

In [None]:
# Maps each original sentence (key) to the same sentence with one word replaced by <mask> (value)
example_dataset = {
    "Contrôle 1 an 1/2 après cystoprostatectomie radicale avec Bricker.":
    "Contrôle 1 an 1/2 après <mask> radicale avec Bricker.",
    "A ce stade, il existe des ondes lentes diphasiques dans les deux régions frontales intermittentes.":
    "A ce stade, il existe des ondes lentes <mask> dans les deux régions frontales intermittentes.",
    "Ordonnance bi-zone Prescriptions relatives au traitement de l'affection de longue durée.":
    "Ordonnance bi-zone <mask> relatives au traitement de l'affection de longue durée.",
    "Le contrôle de la fistule huméro-basilique gauche est plutôt bon puisque la fistule est hyper-débitante, environ 2L avec des IR à 0.53 sans sténose significative retrouvée.":
    "Le contrôle de la <mask> huméro-basilique gauche est plutôt bon puisque la fistule est hyper-débitante, environ 2L avec des IR à 0.53 sans sténose significative retrouvée.",
    "Dévitaion du dorsum nasal obcjctivée et subjective, vers la gauche avec enfoncement des OPN droits gène à la ventilation nasale rappportée spontanément par le patient":
    "Dévitaion du dorsum <mask> obcjctivée et subjective, vers la gauche avec enfoncement des OPN droits gène à la ventilation nasale rappportée spontanément par le patient"
}

In [None]:
# One fill-mask pipeline per checkpoint, keyed by a display label
classifiers = {
    label: pipeline("fill-mask", checkpoint)
    for label, checkpoint in (
        ("not-pretrained", dir_model),
        ("pretrained", dir_name+"best"),
    )
}

In [None]:
for text, masked_text in example_dataset.items():
    # Show the original sentence, then its masked version
    print(text)
    print(masked_text)

    for classifier_name, classifier in classifiers.items():
        print(classifier_name+":")

        # Each result carries the proposed token and its score
        for result in classifier(masked_text):
            print(f"{result['token_str']} ({round(result['score'], 2)})")