In [1]:
!pip install -U transformers

Collecting transformers
  Downloading transformers-4.12.2-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 5.2 MB/s 
[?25hCollecting huggingface-hub>=0.0.17
  Downloading huggingface_hub-0.0.19-py3-none-any.whl (56 kB)
[K     |████████████████████████████████| 56 kB 4.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 38.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 21.5 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 41.6 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  At

In [1]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import BertTokenizer, BertTokenizerFast, BertForTokenClassification
from transformers import pipeline

In [2]:
import pandas as pd
import numpy as np

## Use of BERT-based NER for identifying the names of individuals in a given sentence

### Step 1: Compare the use of:

(1) AutoTokenizer vs BertTokenizer and BertTokenizerFast

(2) Grouped entities

In [3]:
# One checkpoint id feeds both the tokenizer and the token-classification model.
NER_CHECKPOINT = "dslim/bert-base-NER"
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT)

# Plain (ungrouped) NER pipeline: the result is a list with one dict per tagged token.
nlp = pipeline(task="ner", model=model, tokenizer=tokenizer)

# Sample sentence; the text is kept verbatim because the 'start'/'end' character
# offsets used in the later cells index directly into this string.
example = "To take us through this results of the quarter and answer your question, we have with us the from the top \
Management of Heranba, Mr. R. K. Shetty - Managing Director; Mr. Raunak Shetty - Executive Director, and \
Mr. Raj Kumar Bafna - Chief Financial Officer."

ner_results = nlp(example)
print(ner_results)

[{'entity': 'B-ORG', 'score': 0.89626545, 'index': 23, 'word': 'Management', 'start': 106, 'end': 116}, {'entity': 'B-LOC', 'score': 0.93232435, 'index': 25, 'word': 'Her', 'start': 120, 'end': 123}, {'entity': 'I-LOC', 'score': 0.9942275, 'index': 26, 'word': '##an', 'start': 123, 'end': 125}, {'entity': 'I-LOC', 'score': 0.94235843, 'index': 27, 'word': '##ba', 'start': 125, 'end': 127}, {'entity': 'B-PER', 'score': 0.9997056, 'index': 31, 'word': 'R', 'start': 133, 'end': 134}, {'entity': 'B-PER', 'score': 0.9962091, 'index': 32, 'word': '.', 'start': 134, 'end': 135}, {'entity': 'I-PER', 'score': 0.9978155, 'index': 33, 'word': 'K', 'start': 136, 'end': 137}, {'entity': 'I-PER', 'score': 0.9964576, 'index': 34, 'word': '.', 'start': 137, 'end': 138}, {'entity': 'I-PER', 'score': 0.99959683, 'index': 35, 'word': 'She', 'start': 139, 'end': 142}, {'entity': 'I-PER', 'score': 0.9988771, 'index': 36, 'word': '##tty', 'start': 142, 'end': 145}, {'entity': 'B-PER', 'score': 0.99972147, '

In [5]:
# Entity grouping merges consecutive tokens into one span, but it does not
# guarantee that all word pieces of a single word end up in the same group:
# on the full example "Ra" is NOT joined with "##unak Shetty", and in the short
# phrase below "Her" and "##anba" come back as two groups with different labels.
#
# `aggregation_strategy="simple"` is the current spelling of the deprecated
# `grouped_entities=True` flag and selects the same grouping.
# NOTE(review): the "first" strategy is documented to give every piece of a
# word the tag of its first piece - worth trying before the manual merge in Step 2.
nlp_grp = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
ner_grp_results = nlp_grp('Management of Heranba')
print(ner_grp_results)

[{'entity_group': 'ORG', 'score': 0.7859322, 'word': 'Her', 'start': 14, 'end': 17}, {'entity_group': 'LOC', 'score': 0.64316285, 'word': '##anba', 'start': 17, 'end': 21}]


In [6]:
# Same checkpoint, but loaded through the explicit BERT classes.
# With the plain BertTokenizer (as opposed to BertTokenizerFast) the pipeline
# does NOT report character offsets: 'start' and 'end' come back as None.
BERT_NER_CHECKPOINT = "dslim/bert-base-NER"
tokenizer_bert = BertTokenizer.from_pretrained(BERT_NER_CHECKPOINT)
model_bert = BertForTokenClassification.from_pretrained(BERT_NER_CHECKPOINT)

nlp_bert = pipeline(task="ner", tokenizer=tokenizer_bert, model=model_bert)
ner_bert_results = nlp_bert(example)
print(ner_bert_results)

[{'entity': 'B-ORG', 'score': 0.8962654, 'index': 23, 'word': 'Management', 'start': None, 'end': None}, {'entity': 'B-LOC', 'score': 0.93232465, 'index': 25, 'word': 'Her', 'start': None, 'end': None}, {'entity': 'I-LOC', 'score': 0.9942275, 'index': 26, 'word': '##an', 'start': None, 'end': None}, {'entity': 'I-LOC', 'score': 0.9423585, 'index': 27, 'word': '##ba', 'start': None, 'end': None}, {'entity': 'B-PER', 'score': 0.9997056, 'index': 31, 'word': 'R', 'start': None, 'end': None}, {'entity': 'B-PER', 'score': 0.9962091, 'index': 32, 'word': '.', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.9978155, 'index': 33, 'word': 'K', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.9964576, 'index': 34, 'word': '.', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.99959683, 'index': 35, 'word': 'She', 'start': None, 'end': None}, {'entity': 'I-PER', 'score': 0.9988771, 'index': 36, 'word': '##tty', 'start': None, 'end': None}, {'entity': 'B-PER', 'sco

In [7]:
# Explicit BERT classes again, this time with BertTokenizerFast.
# With the fast tokenizer the pipeline DOES report 'start' and 'end'
# character offsets for every tagged token.
BERT_NER_CHECKPOINT = "dslim/bert-base-NER"
tokenizer_bert = BertTokenizerFast.from_pretrained(BERT_NER_CHECKPOINT)
model_bert = BertForTokenClassification.from_pretrained(BERT_NER_CHECKPOINT)

nlp_bert = pipeline(task="ner", tokenizer=tokenizer_bert, model=model_bert)
ner_bert_results = nlp_bert(example)
print(ner_bert_results)

[{'entity': 'B-ORG', 'score': 0.8962654, 'index': 23, 'word': 'Management', 'start': 106, 'end': 116}, {'entity': 'B-LOC', 'score': 0.93232465, 'index': 25, 'word': 'Her', 'start': 120, 'end': 123}, {'entity': 'I-LOC', 'score': 0.9942275, 'index': 26, 'word': '##an', 'start': 123, 'end': 125}, {'entity': 'I-LOC', 'score': 0.9423585, 'index': 27, 'word': '##ba', 'start': 125, 'end': 127}, {'entity': 'B-PER', 'score': 0.9997056, 'index': 31, 'word': 'R', 'start': 133, 'end': 134}, {'entity': 'B-PER', 'score': 0.9962091, 'index': 32, 'word': '.', 'start': 134, 'end': 135}, {'entity': 'I-PER', 'score': 0.9978155, 'index': 33, 'word': 'K', 'start': 136, 'end': 137}, {'entity': 'I-PER', 'score': 0.9964576, 'index': 34, 'word': '.', 'start': 137, 'end': 138}, {'entity': 'I-PER', 'score': 0.99959683, 'index': 35, 'word': 'She', 'start': 139, 'end': 142}, {'entity': 'I-PER', 'score': 0.9988771, 'index': 36, 'word': '##tty', 'start': 142, 'end': 145}, {'entity': 'B-PER', 'score': 0.99972147, 'in

### Step 2: Identify entities by merging each B-* token with the I-* tokens that follow it and with any B-* token that starts exactly where it ends

In [8]:
# Tabulate the token-level NER output: one row per tagged token and one
# column per key of the result dictionaries.
bert_ner_list = ner_results
bert_ner_matrix = pd.DataFrame.from_records(data=bert_ner_list)
bert_ner_matrix

Unnamed: 0,entity,score,index,word,start,end
0,B-ORG,0.896265,23,Management,106,116
1,B-LOC,0.932325,25,Her,120,123
2,I-LOC,0.994228,26,##an,123,125
3,I-LOC,0.942358,27,##ba,125,127
4,B-PER,0.999706,31,R,133,134
5,B-PER,0.996209,32,.,134,135
6,I-PER,0.997815,33,K,136,137
7,I-PER,0.996458,34,.,137,138
8,I-PER,0.999597,35,She,139,142
9,I-PER,0.998877,36,##tty,142,145


In [9]:
# Row positions of every token whose tag opens a new entity ("B-..." prefix).
# na=False keeps the mask strictly boolean, so missing tags count as "not B-".
begin_flag = bert_ner_matrix['entity'].str.startswith("B-", na=False)
begin_idx = np.flatnonzero(begin_flag.to_numpy())
begin_idx

array([ 0,  1,  4,  5, 10, 11, 15])

In [10]:
# Collapse every run of tokens that starts at a "B-" row into a single record:
# the tag of the run's first token, the 'start' offset of its first token and
# the 'end' offset of its last token.
#
# The records are collected first and the frame is built once. Pre-filling a
# float frame with zeros and then writing tag strings into it row by row
# relies on an implicit float -> object upcast that pandas deprecates (PDEP-6).
num_elements = bert_ner_matrix.shape[0]
num_entities = len(begin_idx)

# Run i covers rows [run_bounds[i], run_bounds[i + 1]). Appending the row count
# closes the last run, so it needs no special case, and an empty begin_idx
# simply yields no runs instead of failing on begin_idx[-1].
run_bounds = [*begin_idx, num_elements]

entity_spans = []
for start_pt, end_pt in zip(run_bounds[:-1], run_bounds[1:]):
    temp_frame = bert_ner_matrix.iloc[start_pt:end_pt]
    entity_spans.append(
        {
            'Entity_Type': temp_frame['entity'].iloc[0],
            'Start': temp_frame['start'].iloc[0],
            'End': temp_frame['end'].iloc[-1],
        }
    )

# Start/End stay float64, as they were when the frame was seeded with np.zeros,
# so the following cells (which cast back to int before slicing) see the same dtypes.
start_end_matrix = pd.DataFrame(
    entity_spans, columns=['Entity_Type', 'Start', 'End']
).astype({'Start': 'float64', 'End': 'float64'})

start_end_matrix

Unnamed: 0,Entity_Type,Start,End
0,B-ORG,106.0,116.0
1,B-LOC,120.0,127.0
2,B-PER,133.0,134.0
3,B-PER,134.0,145.0
4,B-PER,171.0,173.0
5,B-PER,173.0,184.0
6,B-PER,215.0,230.0


In [11]:
# Gap between consecutive entities: Start of row i+1 minus End of row i.
# Both slices are re-indexed from 0 so the subtraction pairs them by position.
next_starts = start_end_matrix['Start'].iloc[1:].reset_index(drop=True)
prev_ends = start_end_matrix['End'].iloc[:-1].reset_index(drop=True)
diff_vec = next_starts - prev_ends
diff_vec

0     4.0
1     6.0
2     0.0
3    26.0
4     0.0
5    31.0
dtype: float64

In [12]:
# Attach the gaps as a 'Diff' column. The first row has no predecessor, so it
# gets a placeholder gap; any non-zero value works, since only Diff == 0 is
# treated specially in the following cells.
first_row_gap = 100.0
start_end_matrix['Diff'] = [first_row_gap, *diff_vec.to_list()]
start_end_matrix

Unnamed: 0,Entity_Type,Start,End,Diff
0,B-ORG,106.0,116.0,100.0
1,B-LOC,120.0,127.0,4.0
2,B-PER,133.0,134.0,6.0
3,B-PER,134.0,145.0,0.0
4,B-PER,171.0,173.0,26.0
5,B-PER,173.0,184.0,0.0
6,B-PER,215.0,230.0,31.0


In [13]:
# Walk the rows bottom-up. A row with Diff == 0 starts exactly where the row
# above ends, so its End is copied onto that row, which then spans both.
# Going bottom-up lets a chain of touching rows accumulate into the top one.
END_COL, DIFF_COL = 2, 3  # positional columns of 'End' and 'Diff'
num_elements_new = start_end_matrix.shape[0]

for idx in reversed(range(1, num_elements_new)):
    if start_end_matrix.iloc[idx, DIFF_COL] != 0:
        continue
    start_end_matrix.iloc[idx - 1, END_COL] = start_end_matrix.iloc[idx, END_COL]

start_end_matrix

Unnamed: 0,Entity_Type,Start,End,Diff
0,B-ORG,106.0,116.0,100.0
1,B-LOC,120.0,127.0,4.0
2,B-PER,133.0,145.0,6.0
3,B-PER,134.0,145.0,0.0
4,B-PER,171.0,184.0,26.0
5,B-PER,173.0,184.0,0.0
6,B-PER,215.0,230.0,31.0


In [14]:
# Keep only the rows whose Diff is non-zero. The copy detaches the result from
# start_end_matrix, so columns added to entity_matrix later do not touch it.
keep_mask = start_end_matrix['Diff'] != 0
entity_matrix = start_end_matrix.loc[keep_mask].copy()
entity_matrix

Unnamed: 0,Entity_Type,Start,End,Diff
0,B-ORG,106.0,116.0,100.0
1,B-LOC,120.0,127.0,4.0
2,B-PER,133.0,145.0,6.0
4,B-PER,171.0,184.0,26.0
6,B-PER,215.0,230.0,31.0


In [15]:
# Add the text corresponding to each entity: the slice of `example` between
# the entity's Start and End character offsets.
num_elements_entity_matrix = entity_matrix.shape[0]

# Columns 1 and 2 hold Start and End; cast the offsets to int so they can be
# used as slice bounds.
span_bounds = entity_matrix.iloc[:, [1, 2]].to_numpy().astype(int)
entity_matrix['Entity_Name'] = [example[lo:hi] for lo, hi in span_bounds]

entity_matrix

Unnamed: 0,Entity_Type,Start,End,Diff,Entity_Name
0,B-ORG,106.0,116.0,100.0,Management
1,B-LOC,120.0,127.0,4.0,Heranba
2,B-PER,133.0,145.0,6.0,R. K. Shetty
4,B-PER,171.0,184.0,26.0,Raunak Shetty
6,B-PER,215.0,230.0,31.0,Raj Kumar Bafna
