# NER Using BERT

## Import / Download Library

In [2]:
!pip install pytorch-pretrained-bert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[K     |████████████████████████████████| 123 kB 6.1 MB/s 
Collecting boto3
  Downloading boto3-1.24.32-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 58.2 MB/s 
Collecting botocore<1.28.0,>=1.27.32
  Downloading botocore-1.27.32-py3-none-any.whl (9.0 MB)
[K     |████████████████████████████████| 9.0 MB 57.1 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.7.0,>=0.6.0
  Downloading s3transfer-0.6.0-py3-none-any.whl (79 kB)
[K     |████████████████████████████████| 79 kB 4.4 MB/s 
Collecting urllib3<1.27,>=1.25.4
  Downloading urllib3-1.26.10-py2.py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 35.9 MB/s 
  Downloading urllib3-1.25.11-py2.py3-none-any.whl

In [3]:
import torch
from torch.optim import Adam
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertForTokenClassification, BertAdam
from tqdm import tqdm, trange

import pandas as pd
import numpy as np

## Load & Preprocessing data 

In [4]:
#Loading data
data= pd.read_csv("https://raw.githubusercontent.com/yrnigam/Named-Entity-Recognition-NER-using-LSTMs/master/ner_dataset.csv",encoding="latin1")
data = data.fillna(method="ffill")

#Show data 
print("Number of sentences: ", len(data.groupby(['Sentence #'])))

words = list(set(data["Word"].values))
n_words = len(words)
print("Number of words in the dataset: ", n_words)

tags = list(set(data["Tag"].values))
print("Tags:", tags)
n_tags = len(tags)
print("Number of Labels: ", n_tags)

print("What the dataset looks like:")
# Show the first 3 rows
data.head(n=3) 

Number of sentences:  47959
Number of words in the dataset:  35178
Tags: ['B-gpe', 'B-tim', 'I-per', 'B-geo', 'I-tim', 'B-nat', 'B-eve', 'I-nat', 'I-geo', 'B-art', 'I-art', 'I-gpe', 'B-per', 'O', 'I-org', 'I-eve', 'B-org']
Number of Labels:  17
What the dataset looks like:


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O


In [5]:
# Preprocess data 
#Creating a class to get data in desired formate. i.e. Word,POS,Tag
class SentanceGetter(object):
  def __init__(self,data):
    self.n_sent = 1 #counter
    self.data = data
    # aggregate function to extract Word,POS,Tag
    agg_func = lambda s:[(w,p,t) for w,p,t in zip(s['Word'].tolist(),s['POS'].tolist(),s['Tag'].tolist())]
    self.grouped = self.data.groupby("Sentence #").apply(agg_func)
    self.sentances = [s for s in self.grouped]

In [6]:
# get sentences and tags
getter = SentanceGetter(data)
sentances = getter.sentances
sentences = [" ".join([s[0] for s in sent]) for sent in getter.sentances] 
print(sentences[0]) # sentence str (w)
labels = [[s[2] for s in sent] for sent in getter.sentances]
print(labels[0]) # label of sentence (tag)

Thousands of demonstrators have marched through London to protest the war in Iraq and demand the withdrawal of British troops from that country .
['O', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'B-gpe', 'O', 'O', 'O', 'O', 'O']


In [7]:
# mapping 
word2idx =  {w : i+1 for i,w in enumerate(words)}
tag2idx  =  {t : i for i,t in enumerate(tags)}

## Model

In [8]:
# parameters
MAX_LEN = 40
bs =64
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [9]:
#pretrained tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

100%|██████████| 231508/231508 [00:00<00:00, 361575.21B/s]


In [10]:
# tokenized text 
tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

In [11]:
# tokenized text to id as input for later
input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [12]:
# tags to id as input for later
tags = pad_sequences([[tag2idx.get(l) for l in lab] for lab in labels],
                     maxlen=MAX_LEN, value=tag2idx["O"], padding="post",
                     dtype="long", truncating="post")

In [13]:
# bert attention mask
attention_masks = [[float(i>0) for i in ii] for ii in input_ids]

In [14]:
# train / test data loader preprocess 
tr_inputs, val_inputs, tr_tags, val_tags = train_test_split(input_ids, tags, 
                                                            random_state=2018, test_size=0.3)
tr_masks, val_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.3)

In [15]:
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)

In [16]:
train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler=valid_sampler, batch_size=bs)

In [17]:
# bert classification ner model 
model = BertForTokenClassification.from_pretrained("bert-base-uncased", num_labels=len(tag2idx))

100%|██████████| 407873900/407873900 [00:33<00:00, 12127392.24B/s]


In [18]:
model.cuda()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): BertLayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
      

In [19]:
# finetuning the parameters for ADAM 
FULL_FINETUNING = True
if FULL_FINETUNING:
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay_rate': 0.01}
         ,
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 
         'weight_decay_rate': 0.0}
    ]
else:
    param_optimizer = list(model.classifier.named_parameters()) 
    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

# adam optimizer with finetuens parameters 
optimizer = Adam(optimizer_grouped_parameters, lr=3e-5)

In [20]:
def accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=2).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [21]:
# Main training loop 
epochs = 3
max_grad_norm = 1.0

for _ in trange(epochs, desc="Epoch"):
    model.train()
    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0
    # traing on batch 
    for step, batch in enumerate(train_dataloader):
        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # forward pass
        loss = model(b_input_ids, token_type_ids=None,
                     attention_mask=b_input_mask, labels=b_labels)
        
        
        # backward pass
        loss.backward()

        # track train loss
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        # gradient clipping to prevent exploding gradients in very deep networks
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)

        # update parameters to take one gradient step
        optimizer.step()
        model.zero_grad()
   
    # VALIDATION on validation batch set
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []
    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        
        with torch.no_grad():
            tmp_eval_loss = model(b_input_ids, token_type_ids=None,
                                  attention_mask=b_input_mask, labels=b_labels)
            logits = model(b_input_ids, token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
        true_labels.append(label_ids)
        
        tmp_eval_accuracy = accuracy(logits, label_ids)
        
        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy
        
        nb_eval_examples += b_input_ids.size(0)
        nb_eval_steps += 1
    eval_loss = eval_loss/nb_eval_steps
    pred_tags = [tags[p_i] for p in predictions for p_i in p]
    valid_tags = [tags[l_ii] for l in true_labels for l_i in l for l_ii in l_i]

    # print  loss/accuracy per epoch
    train_loss=tr_loss/nb_tr_steps
    print(f'\tTrain Loss: {train_loss:.3f}')
    eval_acc=eval_accuracy/nb_eval_steps
    print(f'\tEval Loss: {eval_loss:.3f} | Eval Acc: {eval_acc*100:.2f}%')


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.3583867723033542
Validation loss: 0.20298737042480045
Validation Accuracy: 0.9375801282051286


Epoch:  33%|███▎      | 1/3 [05:24<10:48, 324.15s/it]

Train loss: 0.18293541420073736
Validation loss: 0.1624317960275544
Validation Accuracy: 0.9511224626068374


Epoch:  67%|██████▋   | 2/3 [10:48<05:24, 324.38s/it]

Train loss: 0.1426045602134296
Validation loss: 0.14312040236261156
Validation Accuracy: 0.9517135416666662


Epoch: 100%|██████████| 3/3 [16:10<00:00, 323.53s/it]
