# Name Entity Recognition of Text Corpus Using Transformer Encoder-Based Model

This Python notebook does Name Entity Recognition of a Text Corpus, and is broken down as following:

### 1. Tokenizer, Data Loader and Dataset

### 2. Transformer Encoder-Based Model

### 3. Training and Validation Methods

### 4. F1 Score Calculation and Test Split Predictions

In [3]:
from typing import Dict, List, Optional
from collections import Counter
import os
import csv
!pip install torchmetrics
!pip install pytorch-metric-learning
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
!pip install pytorch-lightning
import torch.optim as optim
import torchmetrics
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import TransformerEncoder, TransformerEncoderLayer, TransformerDecoder
import numpy

[0m

## 1. Tokenizer, Data Loader and Dataset


The Tokenizer class contains information about tokenizing a given sentence. The encode method encodes a given text corpus, creating a dictionary of the words that appear in the corpus. The data loader method takes the text corpus and its associated tags and creates a dictionary with the corpus tensor in as 'text' and its associated tags tensor as 'tags'. Finally, the NERDataset creates the final encoded tensor that is ready to be fed into the Transformer model.

In [4]:
class Tokenizer:
    def __init__(self):
        # two special tokens for padding and unknown
        self.token2idx = {"<pad>": 0, "<unk>": 1}
        self.idx2token = ["<pad>", "<unk>"]
        self.is_fit = False
    
    @property
    def pad_id(self):
        return self.token2idx["<pad>"]
    
    def __len__(self):
        return len(self.idx2token)
    
    def fit(self, train_texts: List[str]):
        counter = Counter()
        for text in train_texts:
            counter.update(text.lower().strip().split())
        
        # manually set a vocabulary size for the data set
        vocab_size = 20000
        self.idx2token.extend([token for token, count in counter.most_common(vocab_size - 2)])
        for (i, token) in enumerate(self.idx2token):
            self.token2idx[token] = i
            
        self.is_fit = True
                
    def encode(self, text: str, max_length: Optional[int] = None) -> List[int]:
        if not self.is_fit:
            raise Exception("Please fit the tokenizer on the training tokens")
            
        #first, remove punctuation from text.
        splitText = text.lower().strip().split()
        
        #create the tokens list
        limit = -1

        tokens = [0]*len(splitText)
        if max_length is not None:
            if max_length < len(splitText):
                limit = max_length
                tokens = [0]*max_length
        
        for i in range(len(tokens)):
            if i < len(splitText):
                if splitText[i] in self.token2idx:
                    curToken = self.token2idx[splitText[i]]
                else:
                    curToken = self.token2idx["<unk>"]
                tokens[i] = curToken
            if i == limit:
                break
        
        if max_length is not None:
            if max_length > len(splitText):
                for x in range(len(splitText), max_length):
                    tokens.append(0)
        return tokens


In [5]:
def load_raw_data(filepath: str, with_tags: bool = True):
    data = {'text': []}
    if with_tags:
        data['tags'] = []
        with open(filepath) as f:
            reader = csv.reader(f)
            for text, tags in reader:
                data['text'].append(text)
                data['tags'].append(tags)
    else:
        with open(filepath) as f:
            for line in f:
                data['text'].append(line.strip())
    return data

In [7]:
data_dir = "/NERProject/"
tokenizer = Tokenizer()
train_raw = load_raw_data(os.path.join(data_dir, "train.csv"))
val_raw = load_raw_data(os.path.join(data_dir, "val.csv"))
test_raw = load_raw_data(os.path.join(data_dir, "test_tokens.txt"), with_tags=False)
# fit the tokenizer on the training tokens
tokenizer.fit(train_raw['text'])

In [8]:
class NERDataset: 
    tag2idx = {'O': 1, 'B-PER': 2, 'I-PER': 3, 'B-ORG': 4, 'I-ORG': 5, 'B-LOC': 6, 'I-LOC': 7, 'B-MISC': 8, 'I-MISC': 9}
    idx2tag = ['<pad>', 'O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG','B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']
  
    def __init__(self, raw_data: Dict[str, List[str]], tokenizer: Tokenizer, max_length: int = 128):
        self.tokenizer = tokenizer
        self.token_ids = []
        self.tag_ids = []
        self.with_tags = False
        for text in raw_data['text']:
            self.token_ids.append(tokenizer.encode(text, max_length=max_length))
        if 'tags' in raw_data:
            self.with_tags = True
            for tags in raw_data['tags']:
                self.tag_ids.append(self.encode_tags(tags, max_length=max_length))
    
    def encode_tags(self, tags: str, max_length: Optional[int] = None):
        tag_ids = [self.tag2idx[tag] for tag in tags.split()]
        if max_length is None:
            return tag_ids
        # truncate the tags if longer than max_length
        if len(tag_ids) > max_length:
            return tag_ids[:max_length]
        # pad with 0s if shorter than max_length
        else:
            return tag_ids + [0] * (max_length - len(tag_ids))  # 0 as padding for tags
        
    def __len__(self):
        return len(self.token_ids)
    
    def __getitem__(self, idx):
        token_ids = torch.LongTensor(self.token_ids[idx])
        
        mask = token_ids == self.tokenizer.pad_id  # padding tokens
        
        if self.with_tags:
            # for training and validation
            return token_ids, mask, torch.LongTensor(self.tag_ids[idx])
        else:
            # for testing
            return token_ids, mask
        

In [9]:
tr_data = NERDataset(train_raw, tokenizer, max_length = 128)
va_data = NERDataset(val_raw, tokenizer, max_length = 128)
te_data = NERDataset(test_raw, tokenizer, max_length = 128)

## 2. Transformer Encoder-Based Model

I used 10 torch.nn.TransformerEncoderLayer layers, a positional encoder and 2 linear layers, along with uniform weight initialization. I found the right hyperparameters by conducting a localized binary search, with the assumption that accuracy as a function of a given hyperparameter is continuous locally.

In [10]:
class TransformerModel(nn.Module):
    
    def __init__(self, vocab_size: int, d_model: int, nhead: int, d_hid: int,
                 nlayers: int, dropout: float = 0.5, batch_first = False):
        super().__init__()
        self.model_type = 'Transformer'
        
        self.encoder = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)
        
        encoder_layers = TransformerEncoderLayer(d_model, nhead, d_hid, dropout, batch_first = batch_first)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        
        self.d_model = d_model
        self.linear1 = nn.Linear(d_model, 64)
        self.ReLU = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(64, 10)

        self.init_weights()

    def init_weights(self) -> None:
        initrange = 0.25
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.linear1.bias.data.zero_()
        self.linear1.weight.data.uniform_(-initrange, initrange)
        self.linear2.bias.data.zero_()
        self.linear2.weight.data.uniform_(-initrange, initrange)

    def forward(self, src: torch.Tensor, src_mask: torch.Tensor) -> torch.Tensor:
        """
        Args:
            src: Tensor, shape [seq_len, batch_size]
            src_mask: Tensor, shape [seq_len, seq_len]

        Returns:
            output Tensor of shape [seq_len, batch_size, ntoken]
        """
        src = self.encoder(src) * math.sqrt(self.d_model)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src.transpose(1,0), src_key_padding_mask = src_mask)
        output = self.linear1(output.transpose(1,0))
        output = self.dropout(output)
        output = self.ReLU(output)
        output = self.linear2(output)
        return output
    

class PositionalEncoding(nn.Module):

    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(position * div_term)
        pe[:, 0, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [seq_len, batch_size, embedding_dim]
        """
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)


## 3. Training and Validation Methods

Training and validation methods. In the validation method, the inputs are through the model and the resulting logits (output) are used to calculate loss against the validate split labels. In the training method, the same thing happens except we call loss.backwards(), which calculates the derivative of the loss with respect to the parameters in the model at every point in the computational graph (which is implicitly maintained by PyTorch). Then, calling optimizer.step() updates the parameters of the model using Adam (the optimizer we are using for this project).

In [11]:
#modify as required
def validate(
    model: nn.Module, 
    dataloader: DataLoader, 
    device: torch.device,
):
    acc_metric = torchmetrics.Accuracy(task = 'multiclass', num_classes = 10, compute_on_step=False).to(device)
    loss_metric = torchmetrics.MeanMetric(compute_on_step=False).to(device)
    model.eval()
    
    with torch.no_grad():
        for batch in tqdm(dataloader):
            input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
            # output shape: (batch_size, max_length, num_classes)
            logits = model(input_ids, input_mask)
            # ignore padding index 0 when calculating loss
            loss = F.cross_entropy(logits.reshape(-1,10), tags.reshape(-1), ignore_index = 0)

            loss_metric.update(loss, input_mask.numel() - input_mask.sum())
            is_active = torch.logical_not(input_mask)  # non-padding elements
            # only consider non-padded tokens when calculating accuracy
            acc_metric.update(logits[is_active], tags[is_active])

    
    print(f"| Validate | loss {loss_metric.compute():.4f} | acc {acc_metric.compute():.4f} |")

In [12]:
def train(
    model: nn.Module, 
    dataloader: DataLoader, 
    optimizer: optim.Optimizer,
    device: torch.device,
    epoch: int,
):
    acc_metric = torchmetrics.Accuracy(task = 'multiclass', num_classes = 10, compute_on_step=True).to(device)
    loss_metric = torchmetrics.MeanMetric(compute_on_step=True).to(device)
    model.train()
    # loop through all batches in the training
    for batch in tqdm(dataloader):
        input_ids, input_mask, tags = batch[0].to(device), batch[1].to(device), batch[2].to(device)
        optimizer.zero_grad()

        # output shape: (batch_size, max_length, num_classes)
        logits = model(input_ids, input_mask)
        # ignore padding index 0 when calculating loss
        loss = F.cross_entropy(logits.reshape(-1,10), tags.reshape(-1), ignore_index = 0)
        
        loss.backward()
        optimizer.step()
        
        loss_metric.update(loss, input_mask.numel() - input_mask.sum())
        is_active = torch.logical_not(input_mask)  # non-padding elements
        # only consider non-padded tokens when calculating accuracy
        acc_metric.update(logits[is_active], tags[is_active])
    
    print(f"| Epoch {epoch} | loss {loss_metric.compute():.4f} | acc {acc_metric.compute():.4f} |")
    

In [13]:
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# data loaders
train_dataloader = DataLoader(tr_data, batch_size=32, shuffle=True, drop_last=False)
val_dataloader = DataLoader(va_data, batch_size=32,drop_last = False)
test_dataloader = DataLoader(te_data, batch_size=32, drop_last = False)

# move the model to device
model = TransformerModel(vocab_size = len(tokenizer), 
    d_model = 128, 
    d_hid = 128,
    nhead = 8, 
    nlayers = 10, dropout = 0.2, batch_first = False).to(device)


optimizer = optim.Adam(model.parameters())



In [21]:
for epoch in range(10):
    train(model, train_dataloader, optimizer, device, epoch)
    validate(model, val_dataloader, device)

100%|██████████| 439/439 [00:33<00:00, 13.12it/s]


| Epoch 0 | loss 0.1674 | acc 0.9505 |


100%|██████████| 102/102 [00:02<00:00, 46.86it/s]


| Validate | loss 0.2635 | acc 0.9296 |


100%|██████████| 439/439 [00:33<00:00, 13.14it/s]


| Epoch 1 | loss 0.1454 | acc 0.9572 |


100%|██████████| 102/102 [00:02<00:00, 46.71it/s]


| Validate | loss 0.2902 | acc 0.9327 |


100%|██████████| 439/439 [00:33<00:00, 13.15it/s]


| Epoch 2 | loss 0.1294 | acc 0.9617 |


100%|██████████| 102/102 [00:02<00:00, 46.86it/s]


| Validate | loss 0.2640 | acc 0.9339 |


100%|██████████| 439/439 [00:33<00:00, 13.14it/s]


| Epoch 3 | loss 0.1171 | acc 0.9652 |


100%|██████████| 102/102 [00:02<00:00, 46.27it/s]


| Validate | loss 0.2467 | acc 0.9337 |


100%|██████████| 439/439 [00:33<00:00, 13.05it/s]


| Epoch 4 | loss 0.1073 | acc 0.9671 |


100%|██████████| 102/102 [00:02<00:00, 46.96it/s]

| Validate | loss 0.2952 | acc 0.9350 |





## 4. F1 Score Calculation and Test Split Predictions

Defines the predict method, in which inputs are fed into the model, and the resulting logits are used to predict the NER tags of the input. 

Runs those tags through conlleval, a script that is used to measure the f1 score of NER. 

In [22]:
def predict(model: nn.Module, dataloader: DataLoader, device: torch.device) -> List[List[str]]:
    model.eval()
    preds = []
    with torch.no_grad():
        for i,batch in enumerate(tqdm(dataloader)):
            input_ids, input_mask = batch[0].to(device), batch[1].to(device)
            logits = model(input_ids, input_mask)
            currPred = numpy.argmax(logits.detach().cpu().numpy(), axis=2)
            for index,row in enumerate(currPred):
                sentTags = []
                for rowIndex, value in enumerate(row):
                    if input_mask[index][rowIndex]:
                        break
                    else:
                        sentTags.append(NERDataset.idx2tag[value])
                preds.append(sentTags)
    return preds

In [23]:
!wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
from conlleval import evaluate

--2023-09-07 06:40:06--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py.2’


2023-09-07 06:40:06 (50.7 MB/s) - ‘conlleval.py.2’ saved [7502/7502]



In [24]:
# use the conlleval script to measure the entity-level f1
pred_tags = []
tag_count = 0
append_O_count = 0
len_idx = []
for idx, tags in enumerate(predict(model, val_dataloader, device)):
    tag_count += len(tags)
    len_idx.append(len(tags))
    pred_tags.extend(tags)
    append_O_count += 1
    pred_tags.append('O')
        
true_tags = []
true_tag_count = 0
append_O_count = 0

for idx,tags in enumerate(val_raw['tags']):
    true_tag_count += len(tags.strip().split())
    true_tags.extend(tags.strip().split())
    append_O_count += 1
    true_tags.append('O')
    
evaluate(true_tags, pred_tags, verbose=True)

100%|██████████| 102/102 [00:03<00:00, 33.85it/s]


processed 54612 tokens with 5942 phrases; found: 5472 phrases; correct: 4011.
accuracy:  64.85%; (non-O)
accuracy:  93.88%; precision:  73.30%; recall:  67.50%; FB1:  70.28
              LOC: precision:  85.03%; recall:  79.80%; FB1:  82.34  1724
             MISC: precision:  76.44%; recall:  73.21%; FB1:  74.79  883
              ORG: precision:  63.97%; recall:  60.10%; FB1:  61.98  1260
              PER: precision:  66.29%; recall:  57.76%; FB1:  61.73  1605


(73.30043859649122, 67.50252440255807, 70.28210968985456)

In [25]:
preds = predict(model, test_dataloader, device)
with open("submission.txt", "w") as f:
    for tags in preds:
        f.write(" ".join(tags) + "\n")

100%|██████████| 108/108 [00:02<00:00, 36.16it/s]
