#### **Importing Python Libraries and preparing the environment**

* pandas
* numpy
* sklearn
* pytorch
* transformers
* seqeval

In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [2]:
from torch import cuda

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


#### Reading Data

In [3]:
# Load the token-level NER table: one row per word with its POS tag and entity tag.
# "Sentence #" is only filled in on the first word of each sentence.
data = pd.read_csv('ner_dataset1.txt')
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,,of,IN,O
2,,demonstrators,NNS,O
3,,have,VBP,O
4,,marched,VBN,O


In [4]:
# Non-null values per column; a lower count means the column has missing cells.
data.count()

Sentence #      47959
Word          1048565
POS           1048575
Tag           1048575
dtype: int64

In [5]:
# Count the rows carrying each tag, then report how many distinct tags exist.
frequencies = data.Tag.value_counts()
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies

Number of tags: 17


Tag
O        887908
B-geo     37644
B-tim     20333
B-org     20143
I-per     17251
B-per     16990
I-org     16784
B-gpe     15870
I-geo      7414
I-tim      6528
B-art       402
B-eve       308
I-art       297
I-eve       253
B-nat       201
I-gpe       198
I-nat        51
Name: count, dtype: int64

In [6]:
# Total occurrences per entity type, merging the B- and I- variants of each type.
tags = {}
for tag, count in frequencies.items():
    if tag == "O":
        # "O" marks words outside any entity, so it has no entity type.
        continue
    entity_type = tag[2:5]  # strip the "B-" / "I-" prefix
    tags[entity_type] = tags.get(entity_type, 0) + count

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('geo', 45058), ('org', 36927), ('per', 34241), ('tim', 26861), ('gpe', 16068), ('art', 699), ('eve', 561), ('nat', 252)]


##### Let's relabel the rare "art", "eve" and "nat" entities as "O", since the model will probably perform poorly with so few examples of them

In [7]:
entities_to_replace = ["B-art", "I-art", "B-eve", "I-eve", "B-nat", "I-nat"]
# Keep the rows (so sentences stay intact) and only overwrite the rare tags with "O".
data.loc[data.Tag.isin(entities_to_replace), 'Tag'] = 'O'

In [8]:
# Ids follow the order in which tags first appear in the data, so the two maps
# are exact inverses of each other.
unique_tags = data.Tag.unique()
ids_to_labels = dict(enumerate(unique_tags))
labels_to_ids = {label: idx for idx, label in ids_to_labels.items()}
labels_to_ids

{'O': 0,
 'B-geo': 1,
 'B-gpe': 2,
 'B-per': 3,
 'I-geo': 4,
 'B-org': 5,
 'I-org': 6,
 'B-tim': 7,
 'I-per': 8,
 'I-gpe': 9,
 'I-tim': 10}

In [9]:
# Inverse lookup (integer id -> tag string), used to turn model predictions back into tags.
ids_to_labels

{0: 'O',
 1: 'B-geo',
 2: 'B-gpe',
 3: 'B-per',
 4: 'I-geo',
 5: 'B-org',
 6: 'I-org',
 7: 'B-tim',
 8: 'I-per',
 9: 'I-gpe',
 10: 'I-tim'}

In [10]:
# Only the first word of each sentence carries a "Sentence #" value, so propagate the
# last non-missing value downwards. DataFrame.ffill() is the supported spelling of the
# deprecated fillna(method='ffill').
# NOTE(review): this fills every column, so the few missing "Word" cells are also
# replaced by the previous word — confirm that is intended.
data = data.ffill()
data.head()

  data = data.fillna(method='ffill')


Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 1,Thousands,NNS,O
1,Sentence: 1,of,IN,O
2,Sentence: 1,demonstrators,NNS,O
3,Sentence: 1,have,VBP,O
4,Sentence: 1,marched,VBN,O


In [19]:
# data[['Sentence #','Word']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))

In [18]:
# Every row of a sentence receives the same two strings:
#   "sentence"    - the sentence's words joined by single spaces
#   "word_labels" - the matching tags joined by commas
data['sentence'] = data.groupby('Sentence #')['Word'].transform(lambda words: ' '.join(words))
data['word_labels'] = data.groupby('Sentence #')['Tag'].transform(lambda word_tags: ','.join(word_tags))
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 1,Thousands,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Sentence: 1,of,IN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
2,Sentence: 1,demonstrators,NNS,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
3,Sentence: 1,have,VBP,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
4,Sentence: 1,marched,VBN,O,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."


In [20]:
# Collapse the word-level table to one row per distinct (sentence, word_labels) pair
# and renumber the rows from 0.
sentence_columns = ["sentence", "word_labels"]
data = data[sentence_columns].drop_duplicates()
data = data.reset_index(drop=True)
data.head()

Unnamed: 0,sentence,word_labels
0,Thousands of demonstrators have marched throug...,"O,O,O,O,O,O,B-geo,O,O,O,O,O,B-geo,O,O,O,O,O,B-..."
1,Families of soldiers killed in the conflict jo...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-per,O,O,..."
2,They marched from the Houses of Parliament to ...,"O,O,O,O,O,O,O,O,O,O,O,B-geo,I-geo,O"
3,"Police put the number of marchers at 10,000 wh...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
4,The protest comes on the eve of the annual con...,"O,O,O,O,O,O,O,O,O,O,O,B-geo,O,O,B-org,I-org,O,..."


#### **Preparing the dataset and dataloader**

In [36]:
# The "fast" tokenizer is required because the dataset class asks for offset mappings.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [37]:
class dataset(Dataset):
    """Torch dataset that tokenizes sentences and aligns word-level tags to word pieces.

    Each row of ``dataframe`` must provide a ``sentence`` column (whitespace-separated
    words) and a ``word_labels`` column (comma-separated tags, one per word). Tags are
    converted to ids with the module-level ``labels_to_ids`` mapping.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        """Store the frame, the tokenizer and the fixed sequence length.

        Args:
            dataframe: Frame with ``sentence`` and ``word_labels`` columns, indexed 0..n-1.
            tokenizer: Callable tokenizer that supports ``return_offsets_mapping``.
            max_len: Length every encoded sequence is padded/truncated to.
        """
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sentence at ``index`` as a dict of tensors.

        The dict holds every field produced by the tokenizer plus ``labels``: the tag id
        on the first word piece of each word and -100 everywhere else (special tokens,
        continuation pieces, padding).
        """
        import warnings  # local import keeps this class self-contained

        # step 1: get the sentence and word labels
        words = self.data.sentence[index].strip().split()
        word_labels = self.data.word_labels[index].split(",")

        # step 2: encode the pre-split sentence (padding/truncation up to max length);
        # the offset mapping tells us which word pieces start a word
        encoding = self.tokenizer(words,
                                  is_split_into_words=True,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        # step 3: create token labels only for the first word piece of each word
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        labels = [labels_to_ids[label] for label in word_labels]
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        next_label = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            # a first word piece starts at offset 0 and is non-empty;
            # special/padding tokens have the offset (0, 0)
            if mapping[0] == 0 and mapping[1] != 0:
                if next_label >= len(labels):
                    # More words than tags: keep what is aligned so far (the rest stays
                    # -100), but report it instead of silently swallowing the error.
                    warnings.warn(
                        "Row {}: found more words than word labels ({}); "
                        "remaining tokens are left unlabeled.".format(index, len(labels))
                    )
                    break
                encoded_labels[idx] = labels[next_label]
                next_label += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return self.len

In [38]:
# --- hyperparameters -------------------------------------------------------
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
train_size = 0.8

# --- train / test split ----------------------------------------------------
# Draw a reproducible random sample for training; the rows that were not drawn
# become the test set. Both frames are renumbered from 0 so the dataset class
# can index them by position.
train_rows = data.sample(frac=train_size, random_state=200)
test_dataset = data.drop(index=train_rows.index).reset_index(drop=True)
train_dataset = train_rows.reset_index(drop=True)

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

FULL Dataset: (47607, 2)
TRAIN Dataset: (38086, 2)
TEST Dataset: (9521, 2)


In [39]:
# for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
#   print('{0:10}  {1}'.format(token, label))

In [40]:
# Loader settings, kept as dicts so they can be inspected or tweaked separately.
# NOTE(review): the evaluation loader is shuffled as well; that is not needed for
# scoring — confirm whether it is intentional.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [41]:
# (number of training batches, number of training sentences)
len(training_loader),len(training_set)

(9522, 38086)

In [42]:
# Pretrained BERT encoder plus a token-classification head with one output per tag
# in labels_to_ids; the model is then moved to the selected device.
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [43]:
# Adam over all model parameters (encoder and classification head) at LEARNING_RATE.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [44]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].type(torch.LongTensor).to(device, dtype = torch.long)
        mask = batch['attention_mask'].type(torch.LongTensor).to(device, dtype = torch.long)
        labels = batch['labels'].type(torch.LongTensor).to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels,return_dict = False)
        # print(" loss, tr_logits - ",loss, tr_logits)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [45]:
# Fine-tune for EPOCHS passes over the training loader; train() prints its own progress.
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

#### **Evaluating the model**

In [46]:
def valid(model, testing_loader):
    """Predict tags for every batch of ``testing_loader``.

    Positions whose label is -100 are dropped. Uses the module-level ``device`` and
    ``ids_to_labels``.

    Args:
        model: Token-classification model exposing ``num_labels``; with
            ``return_dict=False`` its first output must be the logits.
        testing_loader: Iterable of batches with ``input_ids``, ``attention_mask``
            and ``labels`` tensors.

    Returns:
        ``(labels, predictions)``: two lists with one inner list of tag strings per
        batch (the sentences of a batch are concatenated).
    """
    # put model in evaluation mode
    model.eval()

    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for batch in testing_loader:

            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)

            eval_logits = model(ids, attention_mask=mask, return_dict=False)[0]

            flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

            # keep only positions that carry a real label
            active_positions = flattened_targets != -100

            # one transfer per batch instead of one .item() call per token
            eval_labels.append(torch.masked_select(flattened_targets, active_positions).tolist())
            eval_preds.append(torch.masked_select(flattened_predictions, active_positions).tolist())

    labels = [[ids_to_labels[label_id] for label_id in batch_ids] for batch_ids in eval_labels]
    predictions = [[ids_to_labels[label_id] for label_id in batch_ids] for batch_ids in eval_preds]

    return labels, predictions

In [51]:
# Gold and predicted tag sequences for the held-out set, as lists of tag strings.
labels, predictions = valid(model, testing_loader)

In [52]:
# The report is broken down per entity type (geo, gpe, ...) rather than per B-/I- tag.
from seqeval.metrics import classification_report
print(classification_report(labels, predictions))

              precision    recall  f1-score   support

         geo       0.82      0.89      0.86      7426
         gpe       0.95      0.94      0.95      3236
         org       0.64      0.57      0.60      3970
         per       0.72      0.77      0.75      3328
         tim       0.84      0.86      0.85      4098

   micro avg       0.80      0.82      0.81     22058
   macro avg       0.80      0.81      0.80     22058
weighted avg       0.80      0.82      0.81     22058



#### **Inference**

In [2]:
sentence = "The Bush administration will ask Congress for more than $ 240 billion to cover the cost of military operations in Iraq and Afghanistan for the next two fiscal years ."

inputs = tokenizer(sentence.split(),
                    # is_pretokenized=True,
                    is_split_into_words=True,
                    return_offsets_mapping=True,
                    padding='max_length',
                    truncation=True,
                    max_length=MAX_LEN,
                    return_tensors="pt")

# move to the selected device
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)
# forward pass: inference only, so switch off dropout and skip autograd bookkeeping
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  # only the prediction on the first word piece of each word is kept
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])

print(sentence.split())
print(prediction)

#### **Saving the model for future use**

In [55]:
import os

directory = "./model"
# Create the target folder first so saving also works on a fresh checkout.
# NOTE(review): save_vocabulary appears to need an existing directory for the fast tokenizer — confirm.
os.makedirs(directory, exist_ok=True)
# save the tokenizer vocabulary
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)

#### **Prediction Pipeline**

In [21]:
# sentence.split()

In [3]:
import torch
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

# Use the GPU when available instead of assuming one, so the pipeline also runs on CPU-only machines.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Tag <-> id mapping, written out so this cell can run without the training cells.
# It has to be the same mapping the saved model was trained with.
labels_to_ids = {'O': 0, 'B-geo': 1, 'B-gpe': 2, 'B-per': 3, 'I-geo': 4, 'B-org': 5, 'I-org': 6, 'B-tim': 7, 'I-per': 8, 'I-gpe': 9, 'I-tim': 10}
ids_to_labels = {0: 'O', 1: 'B-geo', 2: 'B-gpe', 3: 'B-per', 4: 'I-geo', 5: 'B-org', 6: 'I-org', 7: 'B-tim', 8: 'I-per', 9: 'I-gpe', 10: 'I-tim'}
# NOTE(review): the save cell writes to "./model" but this loads "./model_1" — confirm which directory is intended.
new_model = BertForTokenClassification.from_pretrained('./model_1', num_labels=len(labels_to_ids))
tokenizer = BertTokenizerFast.from_pretrained('./model_1')
new_model.to(device)
MAX_LEN = 128

def get_prediction_bert(sentence):
    """Predict one tag per whitespace-separated word of ``sentence``.

    Uses the module-level ``tokenizer``, ``new_model``, ``device``, ``MAX_LEN`` and
    ``ids_to_labels``. Only the prediction on the first word piece of each word is kept.

    Args:
        sentence: Text whose words are separated by whitespace.

    Returns:
        List of tag strings in word order; empty when ``sentence`` has no words.
        Words cut off by truncation to ``MAX_LEN`` tokens get no entry.
    """
    words = sentence.split()
    if not words:
        # nothing to tag; skip the tokenizer and the model entirely
        return []

    inputs = tokenizer(words,
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       padding='max_length',
                       truncation=True,
                       max_length=MAX_LEN,
                       return_tensors="pt")

    # move to the selected device
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # forward pass without autograd bookkeeping (inference only)
    with torch.no_grad():
        outputs = new_model(ids, attention_mask=mask)
    logits = outputs[0]

    active_logits = logits.view(-1, new_model.num_labels)  # shape (batch_size * seq_len, num_labels)
    predicted_ids = torch.argmax(active_logits, dim=1).tolist()  # one label id per token
    offsets = inputs["offset_mapping"][0].tolist()  # (start, end) of each token within its word

    # a first word piece starts at offset 0 and is non-empty; special/padding tokens are (0, 0)
    return [
        ids_to_labels[label_id]
        for label_id, (start, end) in zip(predicted_ids, offsets)
        if start == 0 and end != 0
    ]

def get_span(prediction):
    """Group a word-level tag sequence into spaCy ``Span`` objects.

    A span starts at a ``B-`` tag and runs over the tags that follow until an ``O``
    or the next ``B-`` tag. A non-``O`` tag with no open span (for example an ``I-``
    tag at the start) opens a span instead of raising. Each span is labeled with the
    ``B-`` form of its first tag.

    Positions in ``prediction`` are passed unchanged as token indices of the
    module-level ``doc``.
    NOTE(review): this assumes ``doc`` has exactly one token per predicted word — confirm.

    Args:
        prediction: Iterable of tag strings such as ``'O'``, ``'B-per'``, ``'I-per'``.

    Returns:
        List of ``Span`` objects in order of appearance.
    """
    out = []
    start = None  # index of the first token of the open span, or None
    end = 0       # exclusive end index of the open span
    tag = None    # label of the open span

    for position, label in enumerate(prediction):
        if label == 'O':
            if start is not None:
                out.append(Span(doc, start, end, tag))
                start = None
        elif label[0] == 'B' or start is None:
            # a new entity begins; close the previous one first so that
            # adjacent entities are not merged or lost
            if start is not None:
                out.append(Span(doc, start, end, tag))
            start = position
            end = position + 1
            tag = 'B' + label[1:]
        else:
            # continuation of the open entity
            end = position + 1

    if start is not None:
        # the sequence ended while an entity was still open
        out.append(Span(doc, start, end, tag))
    return out

In [9]:
import spacy
from spacy import displacy
from spacy.tokens import Span

sentence = "The Bush administration will ask Congress for more than $ 240 billion to cover the cost of military operations in Iraq and Afghanistan for the next two fiscal years . "
# sentence = "@HuggingFace is a company based in New York but is also has employees working in Paris"
prediction = get_prediction_bert(sentence)

# "<tag> - <word>" for every word, for a quick visual check of the alignment
temp = [tag + ' - ' + word for tag, word in zip(prediction, sentence.split())]

# A blank English pipeline only tokenizes; the predicted spans are attached under
# the "sc" key and rendered inline.
nlp = spacy.blank("en")
doc = nlp(sentence)
doc.spans["sc"] = get_span(prediction)
displacy.render(doc, style="span", jupyter=True)

In [10]:
# Word-by-word view of the predictions built in the previous cell ("<tag> - <word>").
temp

['O - The',
 'B-per - Bush',
 'O - administration',
 'O - will',
 'O - ask',
 'B-org - Congress',
 'O - for',
 'O - more',
 'O - than',
 'O - $',
 'O - 240',
 'O - billion',
 'O - to',
 'O - cover',
 'O - the',
 'O - cost',
 'O - of',
 'O - military',
 'O - operations',
 'O - in',
 'B-geo - Iraq',
 'O - and',
 'B-geo - Afghanistan',
 'O - for',
 'O - the',
 'O - next',
 'B-tim - two',
 'I-tim - fiscal',
 'O - years',
 'O - .']

In [6]:
# Literal example: the sentence paired with (start_char, end_char, label) tuples,
# e.g. (4, 8, 'per') covers the characters of "Bush".
['The Bush administration will ask Congress for more than $ 240 billion to cover the cost of military operations in Iraq and Afghanistan for the next two fiscal years . ',
 [(4, 8, 'per'),
  (33, 41, 'org'),
  (114, 118, 'geo'),
  (123, 134, 'geo'),
  (148, 158, 'tim')]]

['The Bush administration will ask Congress for more than $ 240 billion to cover the cost of military operations in Iraq and Afghanistan for the next two fiscal years . ',
 [(4, 8, 'per'),
  (33, 41, 'org'),
  (114, 118, 'geo'),
  (123, 134, 'geo'),
  (148, 158, 'tim')]]