# Named entity recognition
This notebook is made to train a model for named entity recognition using transformers applied to a public dataset from the DEFT 2020 challenge.

## Preparation of the environment

**Download needed packages.**

In [None]:
# Import needed packages
!pip install transformers["sentencepiece"]
!pip install datasets
!pip install seqeval
!pip install tensorboard

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[sentencepiece]
  Downloading transformers-4.19.3-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 4.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.5 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 28.8 MB/s 
Collecting sentencepiece!=0.1.92,>=0.1.91
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |██████████

In [None]:
import nltk
# Download the 'punkt' resource, which provides the sentence-tokenizer data
# loaded later from 'tokenizers/punkt/french.pickle'.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Preparation of the data
We use the DEFT dataset stored on our local machine (not on Google Drive). To make it available, we upload it directly to the runtime environment of this notebook.

### Files treatment per set
We start by processing the data files so that they only contain the annotations we need.

**Unzip the imported data file (.tar dataset version).**


In [None]:
# Unzip Dataset file
import tarfile

# Extract the gzipped DEFT archive into the current working directory.
# The context manager closes the archive even when extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only run it on archives obtained from a trusted source.
with tarfile.open("DEFT2020-cas-cliniques.tar.gz") as archive:
  archive.extractall("./")

**Unzip the imported data file (.zip dataset version).**

In [None]:
# Unzip Dataset file
# Extracts the train (t3-appr) and test (t3-test) archives into the working directory.

import zipfile

# Archives expected in the current working directory.
path_to_zip_files = ["t3-appr.zip", "t3-test.zip"]

# Destination of the extracted content.
directory_to_extract_to = "./"

for path_to_zip_file in path_to_zip_files:
  # The context manager closes each archive once its content is extracted.
  with zipfile.ZipFile(path_to_zip_file, 'r') as zip_ref:
    zip_ref.extractall(directory_to_extract_to)

**Get the number of files for the training and test sets.**

In [None]:
# Check the dataset
import os
import glob

directories = ["t3-appr", "t3-test"]

totalNumberOfFiles = 0
for directory in directories:
  # Count the text (.txt) and annotation (.ann) files of this directory.
  numberOfTextFiles = len(glob.glob(f'{directory}/*.txt'))
  numberOfAnnotationFiles = len(glob.glob(f'{directory}/*.ann'))
  numberOfFiles = numberOfTextFiles + numberOfAnnotationFiles
  # The dataset-wide total is reported as "text and annotation files", so it
  # must accumulate both kinds (it previously accumulated .txt files only).
  totalNumberOfFiles += numberOfFiles
  print(f'The number of text and annotation files in the directory: {directory} is: {numberOfFiles}')
  # Every entry of the directory, whatever its extension.
  print(f'The total number of files in the directory: {directory} is: {len(os.listdir(directory))}')

print(f'The number of text and annotation files in the dataset: {totalNumberOfFiles}')

The number of text and annotation files in the directory: t3-appr is: 200
The total number of files in the directory: t3-appr is: 204
The number of text and annotation files in the directory: t3-test is: 67
The total number of files in the directory: t3-test is: 71
The number of text and annotation files in the dataset: 167


**Rewrite the annotation files (to change the annotations) for the training set.**

In [None]:
import csv
import os
from collections import defaultdict

grouped_files = defaultdict(int)
EXTENSIONS = {'.ann', '.txt'}
# Entity types removed from the annotation files. The labels that remain are:
# 'valeur', 'dose', 'mode', 'substance', 'examen', 'traitement', 'anatomie', 'moment', 'sosy', 'pathologie'
EXCLUDED_LABELS = ["age", "genre", "issue", "origine", "frequence", "date", "duree"]

# Count, per document name (path without extension), how many of the expected files exist.
for f in os.listdir('t3-appr'):
  name, ext = os.path.splitext(os.path.join('t3-appr', f))
  if ext in EXTENSIONS:
    grouped_files[name] += 1

for name in grouped_files:
  ann_path = '{}.ann'.format(name)
  # A name is registered as soon as either extension exists, so a document may
  # have a .txt but no .ann: there is nothing to filter in that case.
  if not os.path.exists(ann_path):
    continue
  with open(ann_path, "r", encoding="utf-8") as file:
    annotations = file.read()

  # Select the lines to keep before reopening the file for writing.
  kept_lines = []
  for annotation in annotations.split("\n"):
    # Fields are "<id> <label> <start> <end> <text>" once tabs are turned into spaces.
    elements = annotation.replace("\t", " ").split(" ", 4)
    # Keep only text-bound annotations (ids starting with 'T') whose label is not excluded.
    if len(elements) > 1 and elements[0].startswith('T') and elements[1] not in EXCLUDED_LABELS:
      kept_lines.append(annotation + "\n")

  # Overwrite the annotation file in place with the filtered lines.
  with open(ann_path, "w", encoding="utf-8") as ann_file:
    ann_file.writelines(kept_lines)

**Rewrite the annotation files (to change the annotations) for the test set.**

In [None]:
import csv
import os
from collections import defaultdict

grouped_files = defaultdict(int)
EXTENSIONS = {'.ann', '.txt'}
# Entity types removed from the annotation files. The labels that remain are:
# 'valeur', 'dose', 'mode', 'substance', 'examen', 'traitement', 'anatomie', 'moment', 'sosy', 'pathologie'
EXCLUDED_LABELS = ["age", "genre", "issue", "origine", "frequence", "date", "duree"]

# Count, per document name (path without extension), how many of the expected files exist.
for f in os.listdir('t3-test'):
  name, ext = os.path.splitext(os.path.join('t3-test', f))
  if ext in EXTENSIONS:
    grouped_files[name] += 1

for name in grouped_files:
  # The test annotations are read from the DEFT-cas-cliniques folder and the
  # filtered version is written next to the text file in t3-test.
  # basename() replaces the former fixed-width slice name[8:].
  source_path = '{}.ann'.format('DEFT-cas-cliniques/' + os.path.basename(name))
  if not os.path.exists(source_path):
    # No reference annotation for this document: nothing to write.
    continue
  with open(source_path, "r", encoding="utf-8") as file:
    annotations = file.read()

  kept_lines = []
  for annotation in annotations.split("\n"):
    # Fields are "<id> <label> <start> <end> <text>" once tabs are turned into spaces.
    elements = annotation.replace("\t", " ").split(" ", 4)
    # Keep only text-bound annotations (ids starting with 'T') whose label is not excluded.
    if len(elements) > 1 and elements[0].startswith('T') and elements[1] not in EXCLUDED_LABELS:
      kept_lines.append(annotation + "\n")

  with open('{}.ann'.format(name), "w", encoding="utf-8") as ann_file:
    ann_file.writelines(kept_lines)

**Zip new obtained train data folder.**

In [None]:
# !zip -r /content/t3-appr-two_classes.zip /content/t3-appr

**Zip the new obtained test data folder.**

In [None]:
# !zip -r /content/t3-test-two_classes.zip /content/t3-test

### Prepare data per set
In this phase, we convert the data from the brat format to the CoNLL format to make it usable by the transformers model.

**Track the frequency of each annotation.**

In [None]:
# Occurrence counter for every entity type of the corpus, all starting at zero.
# The counters are incremented while the annotation files are parsed.
annotations_frequency = dict.fromkeys(
    [
        'valeur', 'issue', 'dose', 'mode', 'genre', 'substance',
        'origine', 'sosy', 'frequence', 'examen', 'traitement', 'anatomie',
        'age', 'date', 'pathologie', 'moment', 'duree',
    ],
    0,
)

**Retrieve the text and annotations of a file.**

In [None]:
# Load data
def load_data(fileName):
  """Read the text and the annotations of one document.

  Args:
    fileName: path of the document without extension; '<fileName>.txt' and
      '<fileName>.ann' are both opened as UTF-8.

  Returns:
    A tuple (text, annotations, script):
      text: the raw text split on four consecutive newlines; only its first
        element is normalised (every newline becomes a space, then the ends
        are stripped).
      annotations: content of the .ann file with tabs replaced by spaces and
        one trailing newline removed when there is one.
      script: the raw, unmodified content of the .txt file.

  Raises:
    OSError: if either file cannot be opened.
  """
  with open('{}.txt'.format(fileName), "r", encoding="utf-8") as txt_file, \
    open('{}.ann'.format(fileName), "r", encoding="utf-8") as ann_file:
    script = txt_file.read()
    annotations = ann_file.read().replace("\t", " ")

  # Newlines are replaced one-for-one so the replacement itself does not shift
  # character positions; the final strip() can remove leading whitespace.
  text = script.split("\n\n\n\n")
  text[0] = text[0].replace(".\n", ". ").replace("\n", " ").strip()

  # Remove the trailing newline only when it exists. Dropping the last character
  # unconditionally truncated the last entity of files without a final newline.
  if annotations.endswith("\n"):
    annotations = annotations[:-1]
  return text, annotations, script

**Get the tokens of a text.**

In [None]:
def get_tokens(text):
  """Split the first element of ``text`` on single space characters.

  Consecutive spaces produce empty strings in the result, because the split
  uses an explicit " " separator.
  """
  first_segment = text[0]
  return first_segment.split(" ")

**Remove nested annotations from the list of annotations of a file, keeping only the longest ones.**

In [None]:
def remove_nested_annotations(annotated_data):
  """Drop the annotations that start inside an already kept annotation.

  Rows are visited in the order given. A row is kept when its start offset
  does not fall inside the span of a previously kept row; otherwise it is
  removed. The list is modified in place.

  Args:
    annotated_data: list of dicts holding a 'positions' entry whose first two
      items are the start and end offsets.

  Returns:
    A tuple (annotated_data, nested_annotations). ``annotated_data`` is the
    same list object after removal. ``nested_annotations`` is 0 when nothing
    was removed, otherwise the number of removed rows plus one (the existing
    counting convention of this notebook).
  """
  kept_spans = []
  nested_indices = []
  # enumerate() gives each row its own position. The previous list.index(row)
  # lookup was quadratic and resolved equal rows to their first occurrence.
  for index, row in enumerate(annotated_data):
    start = row["positions"][0]
    end = row["positions"][1]
    # NOTE(review): both bounds are inclusive, so a row starting exactly at the
    # end offset of a kept row is also treated as nested - confirm this is
    # intended for end-exclusive offsets.
    if any(first <= start <= last for first, last in kept_spans):
      nested_indices.append(index)
    else:
      kept_spans.append((start, end))

  nested_annotations = len(nested_indices) + 1 if nested_indices else 0
  # Delete from the end so the remaining indices stay valid.
  for index in reversed(nested_indices):
    del annotated_data[index]
  return annotated_data, nested_annotations

**Structure annotations to make use of them.**

In [None]:
def structure_annotations(annotations):
  """Parse the annotations of one file and remove the nested ones.

  Each line is expected to look like "<id> <label> <start> <end> <text>"
  (space separated). Only lines whose id starts with 'T' and whose label is
  not one of the excluded types are kept. The module-level
  ``annotations_frequency`` counter is incremented for every kept label.

  Args:
    annotations: annotation lines of one file, joined by newlines.

  Returns:
    A tuple (annotated_data, nested_annotations, real_annotations):
      annotated_data: list of {'labels', 'positions', 'entities'} dicts
        sorted by start offset, after removal of nested annotations.
      nested_annotations: value reported by remove_nested_annotations.
      real_annotations: number of kept annotations before nested removal.
  """
  excluded_labels = ["age", "genre", "issue", "origine", "frequence", "date", "duree"]
  annotated_data = []
  # Go through annotations and for each one get its different parts
  for annotation in annotations.split("\n"):
    elements = annotation.split(" ", 4)
    # Lines with fewer than five fields cannot provide label, offsets and text.
    if len(elements) < 5 or not elements[0].startswith('T'):
      continue
    if elements[1] in excluded_labels:
      continue
    annotations_frequency[elements[1]] += 1
    annotated_data.append({
      'labels': elements[1],
      'positions': (int(elements[2]), int(elements[3])),
      'entities': elements[4]
    })
  # Sort by start offset and, for equal starts, longest span first: the nested
  # removal keeps the first row it meets, so this keeps the longest annotation
  # among those sharing a start offset.
  annotated_data = sorted(annotated_data, key=lambda x: (x['positions'][0], -x['positions'][1]))
  real_annotations = len(annotated_data)
  annotated_data, nested_annotations = remove_nested_annotations(annotated_data)
  return annotated_data, nested_annotations, real_annotations

**Transform the data to a BIO annotation format for a given text.**

In [None]:
def annotate_data(text, annotated_data):
  """Turn a text and its annotations into parallel token / BIO-label lists.

  Args:
    text: list whose first element is the document text.
    annotated_data: annotation dicts ('labels', 'positions', 'entities'),
      visited in the given order.

  Returns:
    A tuple (token_docs, label_docs, annotated, not_annotated):
      token_docs: tokens, split on single spaces.
      label_docs: 'O', 'B-<label>' or 'I-<label>' for each token.
      annotated: number of annotations processed.
      not_annotated: number of text stretches found outside annotations.
  """
  document = text[0]
  token_docs, label_docs = [], []
  annotated = 0
  not_annotated = 0

  def emit_outside(segment):
    # Every non-empty piece of an unannotated stretch is labelled 'O'.
    for piece in segment.split(" "):
      if piece:
        token_docs.append(piece)
        label_docs.append('O')

  cursor = 0
  for row in annotated_data:
    start, end = row["positions"][0], row["positions"][1]
    # Text between the previous annotation and this one.
    if cursor != start:
      not_annotated += 1
      emit_outside(document[cursor:start])
    annotated += 1
    # First token of the entity opens it (B-), the following ones continue it (I-).
    for order, piece in enumerate(row["entities"].split(" ")):
      prefix = 'B' if order == 0 else 'I'
      token_docs.append(piece)
      label_docs.append(f'{prefix}-{row["labels"]}')
    cursor = end

  # Text remaining after the last annotation.
  if cursor != len(document):
    not_annotated += 1
    emit_outside(document[cursor:])
  return token_docs, label_docs, annotated, not_annotated

**Load NLTK tokenizer to use it in the text division into sentences.**

In [None]:
# Load NLTK tokenizer
# Loads the French Punkt model from the 'punkt' resource; nltk.download('punkt')
# must have been run beforehand.
nltk_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')

**Create a BIO annotated dataset (with text split into sentences) from a list of files.**

In [None]:
def get_data(grouped_files):
  """Build the BIO-annotated token and label lists for a set of documents.

  Args:
    grouped_files: mapping from a document path (without extension) to the
      number of expected files found for it.

  Returns:
    A tuple (token_docs, label_docs, nested):
      token_docs: one list of tokens per saved document.
      label_docs: one list of BIO labels per saved document.
      nested: the non-zero nested-annotation counts, one per document.

  A document is saved only when at least one of its labels is not 'O'.
  Statistics about the processed documents are printed before returning.
  NOTE(review): tokens and labels are stored per document; the sentence split
  is only used for the printed sentence count.
  """
  number_of_annotations = 0
  cpt = 0
  numberRealAnnotations = 0
  numberOfAnnotations = 0
  numberNotAnnotations = 0
  numberOfSkipped = 0

  token_docs = []
  label_docs = []
  nested = []

  numberOfDocs = 0
  numberOfSentences = 0
  # Iterate through files to get data
  for name in grouped_files:
    # load_data needs both the .txt and the .ann file: skip incomplete
    # documents instead of failing on the missing one.
    if grouped_files[name] != len(EXTENSIONS):
      numberOfSkipped += 1
      continue

    # Get the text and annotations
    text, annotations, script = load_data(name)

    # Structure annotations and remove the nested ones
    annotated_data, nested_annotations, real_annotations = structure_annotations(annotations)
    number_of_annotations += len(annotated_data)

    if nested_annotations != 0:
      nested.append(nested_annotations)

    cpt += nested_annotations
    numberRealAnnotations += real_annotations

    # Annotate data
    tokens, labels, annotated, not_annotated = annotate_data(text, annotated_data)

    numberOfAnnotations += annotated
    numberNotAnnotations += not_annotated

    # Save the processed text and annotations
    if labels.count('O') != len(labels):
      token_docs.append(tokens)
      label_docs.append(labels)
      # These two counters were printed but never updated.
      numberOfDocs += 1
      # A tokenizer sentence spanning several lines counts once per non-empty line.
      for sentence in nltk_tokenizer.tokenize(script):
        numberOfSentences += sum(1 for line in sentence.split("\n") if len(line) != 0)

  print("Number of nested annotations =", cpt)
  print("Number of real annotations =", numberRealAnnotations)
  print("------------------------------------------------------------------------")
  print("Number of annotated entities =", numberOfAnnotations)
  print("Number of not annotated entities =", numberNotAnnotations)
  print("Number of annotations =", number_of_annotations)
  print("------------------------------------------------------------------------")
  print("Number of sentences (tokens) =", len(token_docs))
  print("Number of sentences (labels) =", len(label_docs))
  print("------------------------------------------------------------------------")
  print("Number of docs =", numberOfDocs)
  print("Number of sentences =", numberOfSentences)
  print("Number of skipped incomplete documents =", numberOfSkipped)
  print("------------------------------------------------------------------------")
  print("nested =", nested)
  return token_docs, label_docs, nested

**Get the list of tokens and respective labels per set.**

In [None]:
import os
from collections import defaultdict
import pprint

EXTENSIONS = {'.ann', '.txt'}
directories = ["t3-appr", "t3-test"]


def _count_files_per_name(directory):
  """Map each path without extension to the number of .ann/.txt files found for it."""
  counts = defaultdict(int)
  for entry in os.listdir(directory):
    stem, extension = os.path.splitext(os.path.join(directory, entry))
    if extension in EXTENSIONS:
      counts[stem] += 1
  return counts


# Training set: first directory of the list.
train_grouped_files = _count_files_per_name(directories[0])
train_tokens, train_labels, train_nested = get_data(train_grouped_files)

# Test set: second directory of the list.
test_grouped_files = _count_files_per_name(directories[1])
test_tokens, test_labels, test_nested = get_data(test_grouped_files)

Number of nested annotations = 2112
Number of real annotations = 7844
------------------------------------------------------------------------
Number of annotated entities = 5832
Number of not annotated entities = 5931
Number of annotations = 5832
------------------------------------------------------------------------
Number of sentences (tokens) = 100
Number of sentences (labels) = 100
------------------------------------------------------------------------
Number of docs = 0
Number of sentences = 0
------------------------------------------------------------------------
nested = [14, 33, 36, 28, 9, 20, 20, 5, 16, 37, 26, 19, 9, 42, 22, 30, 20, 11, 5, 4, 5, 23, 26, 40, 24, 24, 9, 35, 12, 19, 19, 7, 34, 9, 17, 20, 20, 30, 7, 41, 22, 63, 19, 13, 23, 17, 42, 41, 42, 21, 19, 19, 3, 18, 5, 19, 14, 24, 11, 13, 13, 18, 26, 2, 11, 4, 17, 25, 16, 18, 16, 21, 17, 23, 9, 24, 18, 11, 12, 24, 3, 14, 26, 24, 17, 58, 13, 27, 20, 30, 13, 38, 21, 31, 59, 21, 13, 25, 12, 47]
Number of nested annotatio

**Save obtained test data in .tsv file (BIO format).**

In [None]:
# save data in IOB format
import csv

# One "token<TAB>label" row per token; an empty row closes each document.
with open('DEFT_test_2_classes_records_sentences.tsv', 'w', newline='\n', encoding="utf-8") as tsvfile:
    writer = csv.writer(tsvfile, delimiter='\t')
    for tokens, labels in zip(test_tokens, test_labels):
      writer.writerows(zip(tokens, labels))
      writer.writerow([])

## Pre-processing of the data

### Stats from the datasets

**Get the frequency of each annotation.**

In [None]:
# Display the per-label counters filled while the annotation files were parsed.
annotations_frequency

**Check the tokens and labels count for the training set.**

In [None]:
# Check the number of tokens and labels in the corpus
# Both totals are computed over the same document pairs and are expected to match.
paired_docs = list(zip(train_tokens, train_labels))
tokens_count = sum(len(doc_tokens) for doc_tokens, _ in paired_docs)
labels_count = sum(len(doc_labels) for _, doc_labels in paired_docs)

print(tokens_count)
print(labels_count)

**Visualize the distribution of sentences by their length.**

In [None]:
# Let's visualize how the entries of train_tokens are distributed by their length
# (number of tokens per entry).
import matplotlib.pyplot as plt

plt.style.use("ggplot")
# 50-bin histogram of the token counts.
plt.hist([len(tokens) for tokens in train_tokens], bins=50)
plt.show()

### Map labels

Map labels to numbers and vice-versa in order to be able to use them by the transformers model.

In [None]:
# Define the labels mapping
# Sort the label set so that ids are identical on every run: the iteration
# order of a set of strings can change between interpreter sessions, which
# would silently change the label <-> id mapping.
unique_labels = sorted({label for doc in train_labels for label in doc})
label2id = {label: index for index, label in enumerate(unique_labels)}
id2label = {index: label for label, index in label2id.items()}

print(label2id)
print(id2label)

### Split data **(data already divided - section commented)**

**Split data (80% train, 20% test).**

In [None]:
# # Split data into text and test sets
# from sklearn.model_selection import train_test_split
# train_texts, test_texts, train_labels, test_labels = train_test_split(token_docs, label_docs, test_size=.2, random_state=42)

# # Print lengths of each part of the data
# print(len(train_texts), len(train_labels))
# print(len(test_texts), len(test_labels))

### Define the used model

**Define the pretrained model used for encoding the data and fine-tuning the named entity recognition model.**

In [None]:
# Define the model id
# Checkpoint name passed to the tokenizer and to the token-classification model.
# The commented lines are alternative checkpoints.

# model_id = 'camembert/camembert-large'
model_id = 'camembert-base'
# model_id = "bert-base-cased"

### Encode data

**Import the tokenizer.**

In [None]:
# Import a tokenizer
from transformers import AutoTokenizer

# model_max_length=256 is the length used by padding="max_length" and
# truncation=True when the tokens are encoded.
tokenizer = AutoTokenizer.from_pretrained(model_id, model_max_length=256)

**Encode the tokens for the training and test sets.**

In [None]:
# Encode text tokens
# The inputs are already split into words (is_split_into_words=True). The offset
# mapping is requested because it is used afterwards to align labels with tokens.
# NOTE(review): with truncation=True, anything beyond the tokenizer's maximum
# length is dropped - confirm this is acceptable for long documents.
train_text_encodings = tokenizer(
    train_tokens, is_split_into_words=True, 
    return_offsets_mapping=True, padding="max_length", 
    truncation=True
)

# Same encoding settings for the test set.
test_text_encodings = tokenizer(
    test_tokens, is_split_into_words=True, 
    return_offsets_mapping=True, padding="max_length", 
    truncation=True
)

**Encode labels for each list of encoded tokens.**

In [None]:
# Encode labels
import numpy as np

def encode_labels(tags, encodings, position):
  """Align the labels of one example with its encoded tokens.

  Args:
    tags: list of label lists, one per example.
    encodings: tokenizer output providing an "offset_mapping" entry.
    position: index of the example to encode.

  Returns:
    An integer array with one value per encoded token: the label id for
    tokens whose offset starts at 0 and ends after 0, and -100 elsewhere.
    Labels are consumed in order and assignment stops when they run out.
  """
  offsets = encodings["offset_mapping"][position]
  tag_ids = [label2id[label] for label in tags[position]]
  # Every position starts as "ignored" (-100).
  encoded_labels = np.full(len(offsets), -100, dtype=int)
  next_tag = 0
  for idx, mapping in enumerate(offsets):
    starts_word = mapping[0] == 0 and mapping[1] != 0
    if starts_word and next_tag < len(tag_ids):
      encoded_labels[idx] = tag_ids[next_tag]
      next_tag += 1
  return encoded_labels

**Encode training labels.**

In [None]:
# Encode labels of the training set
# One aligned label array per training example, in dataset order.
train_label_encodings = [
  encode_labels(train_labels, train_text_encodings, example_index)
  for example_index in range(len(train_labels))
]

**Encode test labels.**

In [None]:
# Encode labels of the test set
# One aligned label array per test example, in dataset order.
test_label_encodings = [
  encode_labels(test_labels, test_text_encodings, example_index)
  for example_index in range(len(test_labels))
]

### Create training and test datasets

**Define the features for the training and test sets.**

In [None]:
import datasets
import pandas as pd


def _collect_columns(text_encodings, label_encodings):
  """Gather input ids, attention masks and labels, one entry per encoded label list."""
  count = len(label_encodings)
  return {
    "input_ids": [text_encodings['input_ids'][index] for index in range(count)],
    "attention_mask": [text_encodings['attention_mask'][index] for index in range(count)],
    "labels": list(label_encodings),
  }


annotated_train_data = _collect_columns(train_text_encodings, train_label_encodings)
annotated_test_data = _collect_columns(test_text_encodings, test_label_encodings)

# Column types of both datasets; the label names come from label2id.
features = datasets.Features({
  "input_ids": datasets.Sequence(feature=datasets.Value(dtype="int32")),
  "attention_mask": datasets.Sequence(feature=datasets.Value(dtype="int8")),
  "labels": datasets.Sequence(feature=datasets.ClassLabel(num_classes=len(label2id), names=list(label2id)))
})

train_dataset = datasets.Dataset.from_pandas(pd.DataFrame(annotated_train_data), features=features)
test_dataset = datasets.Dataset.from_pandas(pd.DataFrame(annotated_test_data), features=features)

**Set the format and columns for the training and test datasets.**

In [None]:
# Return the three listed columns in 'torch' format when the datasets are iterated.
train_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'labels'])

## Training & evaluation with CamemBERT and CamemBERTCRF

### CamemBERT

#### Model definition

**Set kernel to GPU.**

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

**Define the model for fine tuning.**

In [None]:
from transformers import AdamW, get_scheduler, AutoModelForTokenClassification

# Token-classification model built from the checkpoint named by model_id,
# with one output per label and the label <-> id mappings attached.
model = AutoModelForTokenClassification.from_pretrained(model_id, 
                                                        num_labels=len(unique_labels),
                                                        label2id=label2id,
                                                        id2label=id2label)

**Add model to GPU.**

In [None]:
# Move the model to the selected device.
model.to(device)

**Set hyperparameters.**

In [None]:
# AdamW optimizer over all the model parameters.
optimizer = AdamW(
  model.parameters(),
  lr = 4e-5,
)

epochs = 80

# One step per example and per epoch.
num_training_steps = epochs * len(train_dataset)
print(epochs, len(train_dataset), num_training_steps)

# Linear schedule without warm-up over the whole training.
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

#### Fine tuning

**Train the model (Fine tuning).**

In [None]:
# Train phase
train_loss_set = []
# All examples are padded/truncated to the same length by the tokenizer, so the
# length is read from the data. The former hard-coded 512 did not match the
# tokenizer's model_max_length of 256 and made the reshape below fail.
sequence_length = len(train_dataset[0]["input_ids"])

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(0, epochs):
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  print(f'Epoch number = {epoch}')
  # Train the model
  model.train()
  # The dataset is iterated example by example, so each step is a batch of size 1.
  for step, batch in enumerate(train_dataset):
    # Read the tensors by column name (not by position), move them to the
    # device and add the batch dimension.
    b_input_ids = batch["input_ids"].to(device).reshape(1, sequence_length)
    b_input_mask = batch["attention_mask"].to(device).reshape(1, sequence_length)
    b_labels = batch["labels"].to(device).reshape(1, sequence_length)

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    outputs = model(b_input_ids,
                    token_type_ids = None,
                    attention_mask = b_input_mask,
                    labels = b_labels
    )

    # Get loss value (first element of the outputs when labels are given)
    loss = outputs[0]
    # Add it to train loss list
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Advance the learning-rate schedule. The former bare `lr_scheduler`
    # expression did nothing, so the learning rate never changed.
    lr_scheduler.step()
    progress_bar.update(1)

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

#### Test of the fine-tuned model

**Test the fine-tuned model.**

In [None]:
# Test Phase
from datasets import load_metric
import numpy as np

# seqeval metric used by evaluate() below.
# NOTE(review): load_metric may not be available in recent versions of the
# `datasets` package - confirm against the installed version.
metric = load_metric("seqeval")

def evaluate(model, dataset, ner_labels):
  """Compute the seqeval metrics of ``model`` on ``dataset``.

  Args:
    model: model called with input ids and an attention mask; the first
      element of its output is used as the logits.
    dataset: iterable of examples providing 'input_ids', 'attention_mask'
      and 'labels' tensors.
    ner_labels: list mapping a label id to its label name.

  Returns:
    The result of ``metric.compute`` on the predicted and reference labels.
    Positions whose reference label is -100 are ignored.

  The function does not switch the model to evaluation mode; the caller is
  expected to call ``model.eval()`` first.
  NOTE(review): all examples are concatenated into one sequence before the
  metric is computed - confirm this is the intended scoring unit.
  """
  cpt = 0
  all_predictions = []
  all_labels = []
  nb_test_steps = 0
  for batch in dataset:
    # Read tensors by column name and add the batch dimension with
    # unsqueeze(0): this no longer relies on the global `sequence_length`
    # defined in the training cell.
    b_input_ids = batch["input_ids"].to(device).unsqueeze(0)
    b_input_mask = batch["attention_mask"].to(device).unsqueeze(0)
    labels = batch["labels"].unsqueeze(0).cpu().numpy()
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids,
                      token_type_ids = None,
                      attention_mask = b_input_mask,
                      )
      logits = outputs[0]
    # Move logits to CPU if GPU is used
    logits = logits.detach().cpu().numpy()

    predictions = np.argmax(logits, axis = -1)
    for prediction, label in zip(predictions, labels):
      for predicted_idx, label_idx in zip(prediction, label):
        # -100 marks the positions without a label.
        if label_idx == -100:
          cpt += 1
          continue
        all_predictions.append(ner_labels[predicted_idx])
        all_labels.append(ner_labels[label_idx])
    nb_test_steps += 1
  # Number of ignored positions and number of evaluated examples.
  print(cpt, nb_test_steps)
  print(len(all_predictions), len(all_labels))
  return metric.compute(predictions=[all_predictions], references=[all_labels], zero_division=0)

**Test the model on the training data.**

In [None]:
# Check results of train dataset
import pprint

# Switch the model to evaluation mode before computing the metrics.
model.eval()

results = evaluate(model, train_dataset, ner_labels=unique_labels)
pprint.pprint(results)

**Test the model on the test data.**

In [None]:
# Metrics on the test set (overwrites the previous `results` value).
results = evaluate(model, test_dataset, ner_labels=unique_labels)
pprint.pprint(results)

#### Save the model

**Save fine tuned model.**

In [None]:
# Save the tokenizer and the fine-tuned model in the same directory.
tokenizer.save_pretrained("./deft_examen_camembert")
model.save_pretrained("./deft_examen_camembert")

## CamemBERT CRF

#### Prepare packages

**Install needed package to use CRF.**

In [None]:
!pip install pytorch-crf

**Import needed packages.**

In [None]:
# Import packages
from transformers import AutoModel, CamembertModel, CamembertTokenizerFast
from torch import nn
from torch.nn import CrossEntropyLoss
import torch
from torchcrf import CRF

#### Model definition

**Combine CamemBERT and CRF.**

In [None]:
class CamemBERTCRF(nn.Module):
  """CamemBERT encoder followed by a linear tag projection and a CRF layer.

  Args:
    num_labels: number of tags scored by the classifier and the CRF.
    model_name: pretrained CamemBERT checkpoint to load (defaults to
      "camembert-base", the checkpoint used so far).
  """
  def __init__(self, num_labels, model_name="camembert-base"):
    super(CamemBERTCRF, self).__init__()

    self.encoder = CamembertModel.from_pretrained(model_name)

    self.config = self.encoder.config
    self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
    self.classifier = nn.Linear(self.config.hidden_size, num_labels)
    self.crf = CRF(num_tags=num_labels, batch_first=True)

  @staticmethod
  def _front_order(keep):
    """Return, per row, the index order that puts kept timesteps first (relative order preserved)."""
    # Stable sort of the inverted mask: kept positions (0) come before dropped ones (1).
    _, order = torch.sort((~keep).to(torch.uint8), dim=1, stable=True)
    return order

  def forward(
      self,
      input_ids=None,
      attention_mask=None,
      token_type_ids=None,
      position_ids=None,
      head_mask=None,
      inputs_embeds=None,
      labels=None,
      output_attentions=None,
      output_hidden_states=None,
  ):
      r"""
      labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
          Labels for computing the token classification loss. Indices should be in ``[0, ..., config.num_labels -
          1]``; positions set to ``-100`` are excluded from the CRF.

      Returns a tuple ``(loss, tags, ...)`` when ``labels`` is given and ``(tags, ...)`` otherwise.
      ``tags`` has shape ``(batch_size, sequence_length)`` and holds the decoded tag of every position
      scored by the CRF, and ``-100`` elsewhere.

      Raises whatever the CRF raises when a row has no position to score.
      """
      outputs = self.encoder(
          input_ids,
          attention_mask=attention_mask,
          token_type_ids=token_type_ids,
          position_ids=position_ids,
          head_mask=head_mask,
          inputs_embeds=inputs_embeds,
          output_attentions=output_attentions,
          output_hidden_states=output_hidden_states,
      )

      sequence_output = outputs.last_hidden_state
      sequence_output = self.dropout(sequence_output)
      logits = self.classifier(sequence_output)

      # Positions scored by the CRF: labelled positions when labels are given
      # (-100 is not a valid tag index), attended positions otherwise.
      if labels is not None:
          keep = labels != -100
          if attention_mask is not None:
              keep = keep & attention_mask.bool()
      elif attention_mask is not None:
          keep = attention_mask.bool()
      else:
          keep = torch.ones(logits.shape[:2], dtype=torch.bool, device=logits.device)

      # The CRF mask must be on at the first timestep of every row, so the kept
      # positions are moved to the front of each sequence.
      order = self._front_order(keep)
      keep_front = torch.gather(keep, 1, order)
      emissions = torch.gather(logits, 1, order.unsqueeze(-1).expand(-1, -1, logits.size(-1)))

      loss = None
      if labels is not None:
          # Masked-out positions receive a dummy valid tag; the mask keeps them out of the score.
          tags_front = torch.gather(labels, 1, order).masked_fill(~keep_front, 0)
          log_likelihood = self.crf(emissions, tags_front, mask=keep_front)
          loss = 0 - log_likelihood

      # Decode, then put every decoded tag back at its original position.
      paths = self.crf.decode(emissions, mask=keep_front)
      tags = torch.full(logits.shape[:2], -100, dtype=torch.long, device=logits.device)
      for row, path in enumerate(paths):
          tags[row, order[row, :len(path)]] = torch.tensor(path, dtype=torch.long, device=logits.device)

      output = (tags,) + outputs[2:]
      return ((loss,) + output) if loss is not None else output

**Set kernel to GPU.**

In [None]:
import torch
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

**Define the model for fine tuning.**

In [None]:
from transformers import AdamW, get_scheduler, AutoModelForTokenClassification

# Replaces the previous `model` with the CamemBERT + CRF model, one tag per label.
model = CamemBERTCRF(num_labels=len(unique_labels))

**Add model to GPU.**

In [None]:
# Move the model to the selected device.
model.to(device)

**Set hyperparameters.**

In [None]:
# AdamW optimizer over all the model parameters.
optimizer = AdamW(
  model.parameters(),
  lr = 4e-5,
)

epochs = 20

# One step per example and per epoch.
num_training_steps = epochs * len(train_dataset)
print(epochs, len(train_dataset), num_training_steps)

# Linear schedule without warm-up over the whole training.
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

#### Fine tuning

**Train the model (Fine tuning).**

In [None]:
# Train phase
train_loss_set = []
# All examples are padded/truncated to the same length by the tokenizer, so the
# length is read from the data instead of being hard-coded.
sequence_length = len(train_dataset[0]["input_ids"])

from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

for epoch in range(0, epochs):
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  print(f'Epoch number = {epoch}')
  # Train the model
  model.train()
  # The dataset is iterated example by example, so each step is a batch of size 1.
  for step, batch in enumerate(train_dataset):
    # Read the tensors by column name (not by position), move them to the
    # device and add the batch dimension.
    b_input_ids = batch["input_ids"].to(device).reshape(1, sequence_length)
    b_input_mask = batch["attention_mask"].to(device).reshape(1, sequence_length)
    b_labels = batch["labels"].to(device).reshape(1, sequence_length)

    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    outputs = model(b_input_ids,
                    token_type_ids = None,
                    attention_mask = b_input_mask,
                    labels = b_labels
    )

    # Get loss value (first element of the outputs when labels are given)
    loss = outputs[0]
    # Add it to train loss list
    train_loss_set.append(loss.item())
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()

    # Advance the learning-rate schedule. The former bare `lr_scheduler`
    # expression did nothing, so the learning rate never changed.
    lr_scheduler.step()
    progress_bar.update(1)

    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

#### Test of the fine-tuned model

In [None]:
# from transformers import BertTokenizerFast, Trainer, TrainingArguments
# from sklearn.metrics import classification_report, f1_score
# from transformers.trainer_utils import IntervalStrategy

# def compute_metrics(pred):
#     labels = pred.label_ids.flatten()
#     preds = pred.predictions.flatten()
#     f1 = f1_score(labels, preds, average='macro')
#     print(classification_report(labels, preds))
#     return {
#         'f1': f1
#     }

In [None]:
# training_args = TrainingArguments(
#     output_dir='./results',
#     num_train_epochs=5,
#     # per_device_train_batch_size=32,
#     # per_device_eval_batch_size=32,
#     learning_rate=2e-5,
#     warmup_steps=200,
#     weight_decay=0.01,
#     evaluation_strategy="epoch",
#     save_strategy=IntervalStrategy.EPOCH,
#     logging_dir='./logs',
# )

In [None]:
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     compute_metrics=compute_metrics,
#     train_dataset=train_dataset,
#     eval_dataset=test_dataset
# )

In [None]:
# trainer.train()

In [None]:
# print(trainer.evaluate())