# A simple NER
For this project we preprocess the conllu dataset, train a custom model for token classification (roberta embeddings + lstm layer) for which we try to find the optimal hyperparameters and finally we train and evaluate the final model

In [1]:
#===============================================================================
#
#           FILE: 01_train_models.ipynb
#         AUTHOR: Bianca Ciobanica
#          EMAIL: bianca.ciobanica@student.uclouvain.be
#
#           BUGS: 
#        VERSION: 3.11.4
#        CREATED: 20-05-2024 
#
#===============================================================================
#    DESCRIPTION:  
#    
#   DEPENDENCIES:  torch, transformers, accelerate, evaluate, datasets, tqdm
#                  polars, ray, numpy
#
#          USAGE: jupyter notebook 01_train_models.ipynb 
#===============================================================================

## Imports

In [36]:
import sys
import os

import torch
from torch import nn
from torch import cuda
from torch.nn import CrossEntropyLoss, LSTM, Module, Linear, Dropout, LayerNorm
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers.utils import logging
from transformers import RobertaTokenizerFast, RobertaModel, RobertaConfig, RobertaForTokenClassification, DataCollatorForTokenClassification, get_scheduler
from transformers.modeling_outputs import TokenClassifierOutput

from tqdm.auto import tqdm
from functools import partial
import os
import tempfile
from pathlib import Path
from ray import tune
from ray import train
from ray.train import Checkpoint, get_checkpoint
from ray.tune.schedulers import ASHAScheduler
import ray.cloudpickle as pickle

import numpy as np
from accelerate import Accelerator
from datasets import Dataset
import evaluate
import polars as pl

In [3]:
logging.set_verbosity_info()

print("Cuda is available : ", torch.cuda.is_available())
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Cuda is available :  True


### File paths

In [4]:
# DATASET
train_file = "./corpus/dataset/train.conllu"
dev_file = "./corpus/dataset/val.conllu"
test_file = "./corpus/dataset/test.conllu"

In [43]:
# MODEL output dir
os.makedirs('./models', exist_ok=True)

## Data preprocessing

### Corpus Loading
<b>Reference:</b> [Introduction to the CoNLL-2003 Shared Task: Language-Independent Named Entity Recognition](https://aclanthology.org/W03-0419) (Tjong Kim Sang & De Meulder, CoNLL 2003)

In [5]:
def load_conllu_file(input_file, encoding):
    data = None

    X = []
    Y = []
    tagset = {}

    with open(input_file, "r", encoding="utf-8") as fin:
        current_sentence_tokens = []
        current_sentence_labels = []
        
        line = fin.readline()
        while line:
            line = line.rstrip()
            if not line:
                X.append(current_sentence_tokens)
                Y.append(current_sentence_labels)
                current_sentence_tokens = []
                current_sentence_labels = []

                line = fin.readline()
                continue

            data = line.split()
            
            token = data[1]
            label = data[2]

            if label not in tagset:
                tagset[label] = len(tagset)
            current_sentence_tokens.append(token)
            current_sentence_labels.append(label)

            line = fin.readline()

    return X, Y, tagset

Here we split train and test and also remap id to labels

In [6]:
train_x, train_y, tagset = load_conllu_file(train_file, "utf-8")
test_x, test_y, _ = load_conllu_file(test_file, "utf-8")
dev_x, dev_y, _ = load_conllu_file(dev_file, "utf-8")

tagset_label2id = {'O': 0,'B-ORG': 1, 'I-ORG': 2, 'B-MISC': 3, 'I-MISC': 4, 'B-PER': 5, 'I-PER': 6, 'B-LOC': 7, 'I-LOC': 8}
tagset_id2label = dict(zip(tagset_label2id.values(), tagset_label2id.keys()))

# check if no errros
# print(tagset.keys() == tagset_custom.keys())

sys.stdout.write("Tagset  |  label id\n")
for label, label_id in tagset_label2id.items():
    sys.stdout.write("%6s      %i\n" % (label ,label_id))


Tagset  |  label id
     O      0
 B-ORG      1
 I-ORG      2
B-MISC      3
I-MISC      4
 B-PER      5
 I-PER      6
 B-LOC      7
 I-LOC      8


In [7]:
total_len = len(dev_x) + len(train_x) + len(test_x)
train_len = len(train_x) / total_len
test_len = len(test_x) / total_len
dev_len = len(dev_x) / total_len

print("total size : ", total_len)
print("train size : ", round(train_len, 2)) # 70 %
print("test size  : ", round(test_len, 2)) # 15 %
print("dev size   : ", round(dev_len, 2)) # 16 %

total size :  22134
train size :  0.68
test size  :  0.17
dev size   :  0.16


In [8]:
print(tagset_id2label)

{0: 'O', 1: 'B-ORG', 2: 'I-ORG', 3: 'B-MISC', 4: 'I-MISC', 5: 'B-PER', 6: 'I-PER', 7: 'B-LOC', 8: 'I-LOC'}


In [9]:
# example
#index = 0
#sentence = train_x[index]
#tags = train_y[index]
#sys.stdout.write("\nTokens: [%d]\n  %s\n\n" % (len(sentence), str(sentence)))
#sys.stdout.write("Labels: [%d]\n  %s\n" % (len(tags), str(tags)))

### Dataset creation
We create a dataset object with train and test split which will be used for our training

In [10]:
train_set = Dataset.from_dict({"tokens": train_x, "labels" : train_y})
test_set = Dataset.from_dict({"tokens": test_x, "labels" : test_y})
dev_set = Dataset.from_dict({"tokens": dev_x, "labels" : dev_y})

#dataset = DatasetDict({"train" : train_set, "test" : test_set})

In [11]:
train_max_len = len(max(train_x, key = len))
test_max_len = len(max(test_x, key = len))
print("Longest sequence in train : ", train_max_len) 
print("Longest sequence in test  : ", test_max_len)

Longest sequence in train :  113
Longest sequence in test  :  124


### Tokenizer initialization

In [12]:
tokenizer = RobertaTokenizerFast.from_pretrained('roberta-base', add_prefix_space=True)

loading file vocab.json from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/vocab.json
loading file merges.txt from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/merges.txt
loading file tokenizer.json from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/tokenizer_config.json
loading configuration file config.json from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "_name_or_path": "roberta-base",

In [13]:
def tokenize_func(row):
    return tokenizer(row,
                    is_split_into_words=True,
                     add_special_tokens=False
                    )

### Align tokens with new labels
We use `RobertaTokenizerFast` for tokenizing our inputs then use the method `words_ids()` which gives the position the original token. Thus, if the original token is separated into pieces after using the RobertaTokenizer, its index will repeat. We create a for loop that if next word is current word and current word has a **B-XXX** label (meaning the original token is split in pieces), we add **I-XXX** label. Given that we mapped the ids for the labels in pairs (uneven id == **B-XXX** and even id == **I-XXX**) we are able to create the new desired.

In [14]:
def print_example(words, labels):
    line1 = "Tokens    : "
    line2 = "Labels    :"
    line3 = "Label ids :"
    for word, label in zip(words, labels):
        #full_label = label_names[label]
        max_length = max(len(words), len(labels))
        line1 += word + " " * (max_length - len(word) + 1)
        line2 += label + " " * (max_length - len(label) + 1)
        line3 += str(tagset[label]) + " " * (max_length - len(str(tagset[label])) + 1)
    print(line1)
    print(line2)
    print(line3)
def helper_visualizing():
    index = 12  # example
    
    print_example(train_x[index], train_y[index])
    
    inputs = tokenizer(train_x[index], is_split_into_words=True, add_special_tokens=False)
    print("\nOriginal number of tokens:", len(train_x[index]))
    print("Number of input_ids:", len(inputs["input_ids"]), "\n")
    
    
    
    for i, token_id in enumerate(inputs['input_ids']):
        if i < len(train_x[index]):
            token = tokenizer.decode([token_id])
            original_token = train_x[index][i]
            print(f"Input ID: {token_id:<10} Original Token: {original_token:<15} Tokenized Token: {token}")
        else:
            print(f"Input ID: {token_id:<10} Original Token: None            Tokenized Token: {tokenizer.decode([token_id])}")
    
helper_visualizing()

Tokens    : Only      France    and       Britain   backed    Fischler  's        proposal  .         
Labels    :O         B-LOC     O         B-LOC     O         B-PER     O         O         O         
Label ids :1         5         1         5         1         3         1         1         1         

Original number of tokens: 9
Number of input_ids: 12 

Input ID: 4041       Original Token: Only            Tokenized Token:  Only
Input ID: 1470       Original Token: France          Tokenized Token:  France
Input ID: 8          Original Token: and             Tokenized Token:  and
Input ID: 1444       Original Token: Britain         Tokenized Token:  Britain
Input ID: 4094       Original Token: backed          Tokenized Token:  backed
Input ID: 274        Original Token: Fischler        Tokenized Token:  F
Input ID: 13239      Original Token: 's              Tokenized Token: isch
Input ID: 1371       Original Token: proposal        Tokenized Token: ler
Input ID: 128        Original

In [15]:
def align_labels_with_tokens(labels, word_ids):
    # word id is the original position in the input sequence
    new_labels = []
    current_word = None
    
    for word_id in word_ids:
        # if not same word
        if word_id != current_word:    
            current_word = word_id
            label = -100 if word_id is None else labels[word_id]
            new_labels.append(label)
            
        elif word_id is None:
            new_labels.append(-100)
            
       # same word as previous (piece of token)
        else:
            label = labels[word_id]
            # if the label is B-XXX we change it to I-XXX
            if label % 2 == 1:
                label += 1
            new_labels.append(label)

    return new_labels

In [16]:
# test alignment function for new labels
def test_align():
    print("Check to see if new labels are correct\n")
    index = 2

    inputs = tokenizer(train_x[index], is_split_into_words=True, add_special_tokens=False)
    labels = list(map(lambda label : tagset_label2id[label], train_y[index]))
    new_tokens = " ".join(tokenizer.convert_ids_to_tokens(inputs['input_ids'])).replace("Ġ", " ").split()
    
    print("Original tokens:", train_x[index])
    print("New Tokens : ",  new_tokens)
    word_ids = inputs.word_ids()
    print("position of new tokens in original tokens: ", word_ids)
    print("Original labels:", train_y[index])
    print("Original label ids : ", labels)
    
    print()
    print()
    
    new_labels_id = align_labels_with_tokens(labels, word_ids)
    new_labels = [tagset_id2label[label_id] for label_id in new_labels_id]
    print()
    print("new labels : ")
    print_example(new_tokens, new_labels)
    
test_align()

Check to see if new labels are correct

Original tokens: ['BRUSSELS', '1996-08-22']
New Tokens :  ['BR', 'USS', 'ELS', '1996', '-', '08', '-', '22']
position of new tokens in original tokens:  [0, 0, 0, 1, 1, 1, 1, 1]
Original labels: ['B-LOC', 'O']
Original label ids :  [7, 0]



new labels : 
Tokens    : BR       USS      ELS      1996     -        08       -        22       
Labels    :B-LOC    I-LOC    I-LOC    O        O        O        O        O        
Label ids :5        8        8        1        1        1        1        1        


In [17]:
def tokenize_and_align_labels(dataset):
    
    all_tokens = dataset.pop("tokens")
    all_labels = dataset.pop("labels")

    tokenized_dataset = tokenize_func(all_tokens)
    max_len = len(max(tokenized_dataset['input_ids'], key=len))

    new_labels = []
    all_labels_id = []
    
    # go through each labels
    for i, labels in enumerate(all_labels):
        # conver labels to their id
        labels_id = [tagset_label2id[label] for label in labels]
        token_ids = tokenized_dataset.word_ids(i)

        new_labels.append(align_labels_with_tokens(labels_id, token_ids))
        all_labels_id.append(labels_id)

    #tokenized_dataset["new_labels"] = new_labels
    tokenized_dataset["labels"] = new_labels
    
    
    return tokenized_dataset

### Final train and test set structure

In [18]:
# map train and test with tokenizing func and add new labels column
train_set = train_set.map(
    tokenize_and_align_labels,
    batched=True,
)
dev_set = dev_set.map(
    tokenize_and_align_labels,
    batched=True,
)
test_set = test_set.map(
    tokenize_and_align_labels,
    batched=True,
)

Map:   0%|          | 0/14986 [00:00<?, ? examples/s]

Map:   0%|          | 0/3465 [00:00<?, ? examples/s]

Map:   0%|          | 0/3683 [00:00<?, ? examples/s]

In [19]:
# save to disk for tuning with ray_gridsearch.py
train_set.save_to_disk('corpus/dataset/train')
dev_set.save_to_disk('corpus/dataset/dev')
test_set.save_to_disk('corpus/dataset/test')

Saving the dataset (0/1 shards):   0%|          | 0/14986 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3465 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3683 [00:00<?, ? examples/s]

In [20]:
input_train_max_size = len(max(train_set['input_ids'], key=len))
input_test_max_size = len(max(test_set['input_ids'], key=len))

In [21]:
#print(train_set.num_rows)
#print(test_set.num_rows / (train_set.num_rows + test_set.num_rows))
#
#print(train_set.features)

`train_set` has 14986 elements (80 %)  and `test_set` has 3683 elements (20%). Both have `input_ids`, `attention_mask`, `new_labels` , `new_labels_id` as features that will serve for our model.

The tagset contains 9 labels.

## Compute metrics for evaluation
We use [seqeval][1] for sequence labeling evaluation. It takes the lists of labels as strings, not integers, so we will need to fully decode the predictions and labels before passing them to the metric.

[1]: https://github.com/chakki-works/seqeval?tab=readme-ov-file

We also create a post processing function in order to convert ids to their original labels

In [22]:
def convert_ids_2_labels(model_predictions, true_labels):

    # go through each batch and ignore when id = -100 (padding token)
    true_labels = [
        [tagset_id2label[label_id] 
        for label_id in batch if label_id != -100 ] 
        for batch in true_labels ]
    
    model_predictions = [
        [tagset_id2label[pred] 
         for (pred, lab) in zip(prediction, label) if lab != -100 ]
        for prediction, label in zip(model_predictions, true_labels) ]
    
    return model_predictions, true_labels # both are same length

In [23]:
def compute_metrics(model_predictions):
    """ input : ŷ,  logits (log prob of model's prediction)
                y, true label of current prediction

                converts prediction ids to labels in string for computing scores
                
        returns dict with overall precision, recall, f1, accuracy
    """
    logits, true_labels = model_predictions

    predictions = np.argmax(logits, axis=1) # take the predicted tag with highest score

    return metric.compute(predictions=predictions, references=true_labels)

## Model initialization

First, we setup the [model][2]. We tried the base version (125M params) of `RobertaForTokenClassification`, a RoBERTa[1] Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.

The model returns `loss`, `scores`, `hidden_states`, `attentions`.

[1]: Liu, Yinhan, et al. (2019). RoBERTa: A Robustly Optimized BERT Pretraining Approach. 

[2]: https://huggingface.co/FacebookAI/roberta-large


In [24]:
config = RobertaConfig.from_pretrained("roberta-base", return_dict=False)

loading configuration file config.json from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "return_dict": false,
  "transformers_version": "4.41.1",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 50265
}



In [25]:
print("Config of the RobertaForTokenClassification model")
robertaClass_model = RobertaForTokenClassification.from_pretrained('roberta-base', num_labels=len(tagset), label2id=tagset_label2id, id2label=tagset_id2label)
robertaClass_model.to(device)

loading configuration file config.json from cache at /home/cbianc/.cache/huggingface/hub/models--roberta-base/snapshots/e2da8e2f811d1448a5b465c236feacd80ffbac7b/config.json
Model config RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-ORG",
    "2": "I-ORG",
    "3": "B-MISC",
    "4": "I-MISC",
    "5": "B-PER",
    "6": "I-PER",
    "7": "B-LOC",
    "8": "I-LOC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 7,
    "B-MISC": 3,
    "B-ORG": 1,
    "B-PER": 5,
    "I-LOC": 8,
    "I-MISC": 4,
    "I-ORG": 2,
    "I-PER": 6,
    "O": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id"

Config of the RobertaForTokenClassification model


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

### LSTM layer
We then modified the structure in order to add an [LSTM][1] layer with 2 hidden layers (stacked LSTM).

simple lstm unit

![lstm unit](./pictures/lstm_unit.png)

forget gate, input gate, output gate

![lstm_structure](./pictures/lstm_structure.png)

[1]: http://web.stanford.edu/~jurafsky/slp3/9.pdf

#### Parameters
https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html

- **input_size**: The number of expected features in the input x.
- **hidden_size**: The number of features in the hidden state h.
- **num_layers**: Number of recurrent layers. E.g., setting `num_layers=2` would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1.
- **bias**: If `False`, then the layer does not use bias weights `b_ih` and `b_hh`. Default: `True`.
- **batch_first**: If `True`, then the input and output tensors are provided as `(batch, seq, feature)` instead of `(seq, batch, feature)`. Note that this does not apply to hidden or cell states. See the Inputs/Outputs sections below for details. Default: `False`.
- **dropout**: If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to `dropout`. Default: 0.
- **bidirectional**: If `True`, becomes a bidirectional LSTM. Default: `False`.
- **proj_size**: If > 0, will use LSTM with projections of corresponding size. Default: 0.

#### Input

**Inputs**: `input`, `(h_0, c_0)`

- `input`: tensor of shape `(L, Hin)` for unbatched input, `(L, N, Hin)` when `batch_first=False` or `(N, L, Hin)` when `batch_first=True` containing the features of the input sequence. The input can also be a packed variable length sequence. See `torch.nn.utils.rnn.pack_padded_sequence()` or `torch.nn.utils.rnn.pack_sequence()` for details.
- `h_0`: tensor of shape `(D * num_layers, Hout)` for unbatched input or `(D * num_layers, N, Hout)` containing the initial hidden state for each element in the input sequence. Defaults to zeros if `(h_0, c_0)` is not provided.
- `c_0`: tensor of shape `(D * num_layers, Hcell)` for unbatched input or `(D * num_layers, N, Hcell)` containing the initial cell state for each element in the input sequence. Defaults to zeros if `(h_0, c_0)` is not provided.

**Note**: <span style="color:red;">Here our input is the last hidden layer (embeddings) of the `RobertaModel`.</span> We get the outputs of the `roberta` model ([BaseModelOutputWithPoolingAndCrossAttentions][1]) and the first element is the last hidden layer. This is what we need to pass to the LSTM as input.

[1]: https://github.com/huggingface/transformers/blob/v4.41.2/src/transformers/models/roberta/modeling_roberta.py#L679


#### Stucture

In [26]:
class CustomRobertaTokenClassWithLSTM(Module):
    def __init__(self, config, num_labels, hidden_dim, n_hidden_layers, lstm_dropout):
        super().__init__()
        self.num_labels = num_labels # number of classes to predict

        # layer 1
        self.roberta = RobertaModel(config, add_pooling_layer=False)

        # layer 2
        self.dropout = Dropout(0.3) # same shape as input
        
        # layer 3
        self.lstm = LSTM(config.hidden_size, hidden_size=hidden_dim, num_layers=n_hidden_layers, batch_first=True) # takes last hidden layer output from roberta

        self.lnorm = LayerNorm(hidden_dim)
        
        self.lstm_dropout = Dropout(lstm_dropout) # same shape as input
        
        self.fc = Linear(hidden_dim, num_labels)

        # Initialize weights and apply final processing
        self.roberta.post_init()
       
    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                position_ids=None, head_mask=None, inputs_embeds=None, output_attentions=None,
                output_hidden_states=None, labels=None, return_dict=True):
        
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0] # output of last hidden layer
        sequence_output = self.dropout(sequence_output)
        sequence_output,_ = self.lstm(sequence_output) # LSTM Outputs: output, (h_n, c_n) -> we only need output for token classification
        sequence_output = self.lstm_dropout(sequence_output)
        sequence_output = self.lnorm(sequence_output)

        logits = self.fc(sequence_output)
      
        loss = None
        if labels is not None:
            labels = labels.to(logits.device)
            loss_fct = CrossEntropyLoss()
            # Only keep active parts of the loss
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        return (loss, logits)
        

#### Diagram
![model diagram](./pictures/ner_lstm_diagram.svg)

In [27]:
custom_model = CustomRobertaTokenClassWithLSTM(config, num_labels=len(tagset), hidden_dim=256, n_hidden_layers=1, lstm_dropout=0.2)
custom_model = custom_model.to(device)

print("Config of the RoBERTA + LSTM \n",custom_model)

Config of the RoBERTA + LSTM 
 CustomRobertaTokenClassWithLSTM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features

In [28]:
#print("Model parameters:")
#for name, param in custom_model.named_parameters():
#    print(f"Parameter name: {name}, Size: {param.size()}, Requires grad: {param.requires_grad}")

## Check before training
The output layer of our model is a probability distribution over our classes (we take the log of softmax activation function) as we want to predict the y possible tag $c$ among the 9 others classes $C$. The initial loss is then defined by 
$-ln(1/$ number of classes$) = -2.19$ (The natural log penalizes more the model). We use the [cross entropy loss][1] function ($l_n$) for our model:
$$ l_n = -w_{y_n} \log \left( \frac{\exp(x_{n,y_n})}{\sum_{c=1}^{C} \exp(x_{n,c})} \right) $$

Given that before training weights are initialized at random, the probability distribution should be uniform 
(& the mean is $1/C$). Thus at the beginning the loss should be close to $ln(1/9)$. Let's compute this and check for our case.

We give one batch to the model (input ids, true labels, attention mask as we padded) and get the outputs (loss and logits) and then compare the mean of the loss $log(1/9)$ for each tokens with the initial loss -2.19 . We also check the shape of the logits (shape of ŷ) which is (*batch_size*, *sequence_length*, *num_labels*).

[1]: https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html

In [29]:
#### Check before trainig ####
def check_dimensions(model, name=None):

    print('\n_________________________________________')
    print(f'Small check on {name} before training\n')
    batch = [train_set[0]]
    batch_size = len(batch)
    
    ids = train_set[0]["input_ids"] + [1] * input_train_max_size
    targets = train_set[0]["labels"] + [-100] * input_train_max_size
    mask = train_set[0]["attention_mask"] + [0] * input_train_max_size
    
    input_ids_size = len(ids)
    
    print("ID 2 token  | Target class\n")
    for token, label in zip(tokenizer.convert_ids_to_tokens(ids), targets):
        if label == -100:
            continue
        print('{0:10}  {1}  {2}'.format(token.replace("Ġ", ""), tagset_id2label[label], label))
    
    print("\nWe pad inputs and labels to longest sequence in batch, here we used longest sequence in train")
    
    # create batch of size 1
    ids_tensor = torch.tensor(ids).to(device).unsqueeze(0)
    targets_tensor = torch.tensor(targets).to(device).unsqueeze(0)
    mask_tensor = torch.tensor(mask).to(device).unsqueeze(0)
    
    outputs = model(input_ids=ids_tensor, labels=targets_tensor, attention_mask=mask_tensor) # First elem is loss and second the logits
    initial_loss = outputs[0]
    
    print("\nModel's loss : ", initial_loss.item(), "~ initial loss 2.19 \n")

    logits = outputs[1]
    predictions =  logits.argmax(dim=-1)
    true_preds, true_labels = convert_ids_2_labels(predictions.cpu().numpy(), [targets])
    print("True predictions : ", true_preds)
    print("True labels : ", true_labels)
    
    #print(compute_metrics((training_logits.argmax(dim=-1).cpu().numpy(), [batch[0]['labels']])))
    # check the shape
    print("Correct shape : ")
    print("batch size   :", batch_size, "\ninput_size   :", input_ids_size, "\ntarget size  :", len(tagset))
    print("\nLogit tensor shape : ", logits.shape)

check_dimensions(robertaClass_model, "Roberta4TokenClass") # roberta4TokenClassif
check_dimensions(custom_model, "Roberta+LSTM") # robertaWithLSTM


_________________________________________
Small check on Roberta4TokenClass before training

ID 2 token  | Target class

EU          B-ORG  1
rejects     O  0
German      B-MISC  3
call        O  0
to          O  0
boycott     O  0
British     B-MISC  3
lamb        O  0
.           O  0

We pad inputs and labels to longest sequence in batch, here we used longest sequence in train

Model's loss :  2.3993735313415527 ~ initial loss 2.19 

True predictions :  [['B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER', 'B-PER']]
True labels :  [['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']]
Correct shape : 
batch size   : 1 
input_size   : 163 
target size  : 9

Logit tensor shape :  torch.Size([1, 163, 9])

_________________________________________
Small check on Roberta+LSTM before training

ID 2 token  | Target class

EU          B-ORG  1
rejects     O  0
German      B-MISC  3
call        O  0
to          O  0
boycott     O  0
British     B-MISC  3
lamb     

## Dataloaders and hyperparameters
We setup the [dataloaders][1] so we can iterate over batches. Dataloaders also enables dynamic padding for sequences with different lengths which is more efficient than doing it on the whole corpus beforehand.

Here we use the `collate_fn` argument to pass `data_collator` that will be called to transform the list of samples into a batch. This will apply padding also labels and not only tokens.

[1]: <https://pytorch.org/tutorials/beginner/basics/data_tutorial.html>

In [30]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [31]:
def load_train_dev(batch_size):
    
    train_dataloader = DataLoader(
        train_set,
        collate_fn=data_collator,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True
    )
    test_dataloader = DataLoader(
        test_set,
        collate_fn=data_collator,
        batch_size=batch_size,
        shuffle=True,
        pin_memory=True
    )

    return train_dataloader, test_dataloader
    

We use the [AdamW][1] optimizer for the gradient descent and finally the scheduler to manage
the learning during the training process.

[1]: https://pytorch.org/docs/stable/generated/torch.optim.Adam.html

## Training loop
In this section we do the magic with some maths. We build the training and evaluation. For each epoch, a first loop iterates over each batch of the train dataloader. The purpose of the training loop is to minimize the loss of the model through backpropagation using an adapive gradient descent.

**Training loop:**

(1) Get the outputs of the model and the loss. The shape is *batch_size*, *sequence_len*, *num_output_classes* \
(2) We do backpropagation of the gradient \
(3) Then we update the parameters (i.e. weights and biases) \
(4) Then we update the learning rate \
(5) Finally, we reset the gradients to prevent an accumulation

Once the training loop has been perform, the model mode changes to evaluation to verify if the model learns something through epochs. This evaluation loop iterates over each batch of the `test_dataloader`.

**Evaluation loop:**

(1) Each batch passes through the model and we collect the outputs \
(2) From those outputs we extract the logits on the CPU in the form of numpy arrays in order to **postprocess** them

Once both lists are completed, the **compute_metrics** computes the metrics for classification. It provides the **exact-match** score, **f1-score**, **accuracy**, **precision**, **recall** for this epoch.

In [32]:
def train_model(model, n_epochs, optimizer, lr_scheduler, train_dataloader):
    
    for epoch in range(n_epochs):
        model.train()
        for batch in train_dataloader:
            batch = accelerator.prepare(batch).to(device)
            
            outputs = model(**batch) 
            loss = outputs[0]
            
            #print(f"Loss: {loss.item()}")
            
            accelerator.backward(loss)
            #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad() 
            progress_bar.update(1)

In [33]:
def test_model(model, test_loader):
    metric = evaluate.load('seqeval')

    ### Evaluation ###
    model.eval()
    for batch in test_loader:
        batch = accelerator.prepare(batch).to(device)
        
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs[1].argmax(dim=-1) 
        labels = batch["labels"] 

        predictions_gathered = accelerator.gather(predictions).detach().cpu().clone().numpy()
        labels_gathered = accelerator.gather(labels).detach().cpu().clone().numpy()

        true_predictions, true_labels = convert_ids_2_labels(predictions_gathered, labels_gathered) 
        metric.add_batch(predictions=true_predictions, references=true_labels) 

    metrics = metric.compute()
    print(metrics)
    print({key: round(metrics[f"overall_{key}"],3) for key in ["precision", "recall", "f1", "accuracy"]})

    # save
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    torch.save(unwrapped_model.state_dict(), f"./models/final_model_lstm")

    if accelerator.is_main_process:
        tokenizer.save_pretrained(f"./models/final_lstm_tokenizer")

    return {'metrics': metrics, 'model_name' : "RobertaLSTMForTokenClass"}


## Optimal hyperparameters
After we searched for optimal hyperparamters using a separate script `ray_gridsearch.py` we load the best config and train our final model

In [34]:
### Initialize optimal hyperparameters
dropout = 0.745
num_lstm_layers = 3
hidden_dim = 256
n_epochs = 5
batch_size = 16

### Train and test set
train_loader, test_loader = load_train_dev(batch_size) 

n_training_steps = n_epochs * len(train_loader)  
progress_bar = tqdm(range(n_training_steps))

### accelerator ###
accelerator = Accelerator(mixed_precision='fp16')

### Model, Otpimizer
final_model = CustomRobertaTokenClassWithLSTM(config, len(tagset), hidden_dim, num_lstm_layers, dropout).to(device)
final_optimizer = AdamW(final_model.parameters(), lr=1e-5)
final_lr_scheduler = get_scheduler(optimizer=final_optimizer, name="linear", num_warmup_steps=0, num_training_steps=n_training_steps)

### Run train
train_model(final_model, n_epochs, final_optimizer, final_lr_scheduler, train_loader) # ready for the spaghetti

  0%|          | 0/4685 [00:00<?, ?it/s]

In [42]:
### Run eval on test set
results = test_model(final_model, test_loader)

{'LOC': {'precision': 0.5437956204379562, 'recall': 0.6252997601918465, 'f1': 0.5817066369213608, 'number': 1668}, 'MISC': {'precision': 0.19047619047619047, 'recall': 0.10256410256410256, 'f1': 0.13333333333333333, 'number': 702}, 'ORG': {'precision': 0.3971915747241725, 'recall': 0.4768211920529801, 'f1': 0.4333789329685362, 'number': 1661}, 'PER': {'precision': 0.3840676457498887, 'recall': 0.533704390847248, 'f1': 0.4466873706004141, 'number': 1617}, 'overall_precision': 0.4237417775738106, 'overall_recall': 0.4904390934844193, 'overall_f1': 0.45465736561345915, 'overall_accuracy': 0.8895520931594184}
{'precision': 0.424, 'recall': 0.49, 'f1': 0.455, 'accuracy': 0.89}


tokenizer config file saved in ./models/final_lstm_tokenizer/tokenizer_config.json
Special tokens file saved in ./models/final_lstm_tokenizer/special_tokens_map.json


## Evaluation
Here we create a df with overall metrics for each model and for each epoch

In [44]:
def write_eval_2_csv(results):
    results_df = [{
        'model_name': results['model_name'],
        'overall_f1': results['metrics']['overall_f1'],
        'overall_recall': results['metrics']['overall_recall'],
        'overall_precision': results['metrics']['overall_precision'],
        'overall_accuracy': results['metrics']['overall_accuracy'],
    }]
    
    
    eval_df = pl.DataFrame(results_df)
    
    eval_df.write_csv("evaluations_df.csv")

    return eval_df

In [45]:
final_results = write_eval_2_csv(results)
print("Scores after training loop")
final_results

Scores after training loop


model_name,overall_f1,overall_recall,overall_precision,overall_accuracy
str,f64,f64,f64,f64
"""RobertaLSTMForTokenClass""",0.454657,0.490439,0.423742,0.889552


### Results
These are the final results after training our custom Roberta + LSTM layer

My scores after eval loop

| model_name               | overall_f1 | overall_recall | overall_precision | overall_accuracy |
|--------------------------|------------|----------------|-------------------|------------------|
| RobertaLSTMForTokenClass | 0.469755   | 0.499115       | 0.443658          | 0.892496         |


It seems like the model is not able to capture all true positive instances as we have a very low f1. This could be due to imbalanced classes. Ideally, we should have checked the proportion for each set of tags and resample if needed in order to avoid unbalanced classes as unbalanced data can lead to underpredicted or conversely overpredicted classes. We could also find more optimal hyperparameters and perform cross-validation to increase the generalization ability of the model.