For the NER task, we use **BertForTokenClassification** which is included in the **Transformers library** by **HuggingFace**.

### Downloading and preprocessing the data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [15]:
ROOT_PATH = "/content/drive/Shareddrives/CRP/"
data_model_path = ROOT_PATH + 'data-model.json'
DATA_PATH = ROOT_PATH + 'annotations/'

In [16]:
f = open(data_model_path)
data_model = json.load(f)
data_model['tags']

[{'color': '#666666',
  'customId': 'b7a1039b-bf2e-43d2-844e-f68b0e15d7f3',
  'id': 'b7a1039b-bf2e-43d2-844e-f68b0e15d7f3',
  'shortcut': 'p',
  'title': 'Place'},
 {'color': '#32BCB1',
  'customId': '631b8086-5020-4570-ab50-dd68f4312bdd',
  'id': '631b8086-5020-4570-ab50-dd68f4312bdd',
  'shortcut': 'f',
  'title': 'Founder'},
 {'color': '#881D81',
  'customId': 'd40de1cc-50d4-422b-822d-18a693795e62',
  'id': 'd40de1cc-50d4-422b-822d-18a693795e62',
  'shortcut': 's',
  'title': 'StartUpName'},
 {'color': '#D32455',
  'customId': '0565d95d-7cef-4f63-90ca-eb7ef00e4ad5',
  'id': '0565d95d-7cef-4f63-90ca-eb7ef00e4ad5',
  'shortcut': 'i',
  'title': 'Investors'},
 {'color': '#6570FF',
  'customId': '363c1bbe-fc12-4ddd-966c-b3c1650a12cc',
  'id': '363c1bbe-fc12-4ddd-966c-b3c1650a12cc',
  'shortcut': 'y',
  'title': 'Year'}]

In [3]:
import json
import os
import pandas as pd
f = open(data_model_path)
data_model = json.load(f)
data_model = data_model['tags']
data_model = pd.json_normalize(data_model)
data_model

Unnamed: 0,id,customId,title,shortcut,color
0,b7a1039b-bf2e-43d2-844e-f68b0e15d7f3,b7a1039b-bf2e-43d2-844e-f68b0e15d7f3,Place,p,#666666
1,631b8086-5020-4570-ab50-dd68f4312bdd,631b8086-5020-4570-ab50-dd68f4312bdd,Founder,f,#32BCB1
2,d40de1cc-50d4-422b-822d-18a693795e62,d40de1cc-50d4-422b-822d-18a693795e62,StartUpName,s,#881D81
3,0565d95d-7cef-4f63-90ca-eb7ef00e4ad5,0565d95d-7cef-4f63-90ca-eb7ef00e4ad5,Investors,i,#D32455
4,363c1bbe-fc12-4ddd-966c-b3c1650a12cc,363c1bbe-fc12-4ddd-966c-b3c1650a12cc,Year,y,#6570FF


In [4]:
annot1_path = DATA_PATH + '1.json'
f = open(annot1_path)
annot1 = json.load(f)
annotation = pd.json_normalize(annot1['annotations'][0]['value']['tags'])
annot_df = pd.json_normalize(annot1['tokenized'])

In [5]:
annotation = annotation.merge(data_model[['id', 'shortcut']], left_on='tag', right_on='id')[['begin', 'end', 'shortcut']]
annotation.head()

Unnamed: 0,begin,end,shortcut
0,0,0,p
1,5,5,s
2,11,11,s
3,98,98,s
4,143,143,s


We use IOB-tagging here because named entities usually comprise more than 1 word (e.g. a startup's name can be ECO CORPORATION).

In [6]:
import itertools
def IOB_transfer(annotation):
  id = list(annotation['begin'])
  tag = list('B-'+(annotation.shortcut))
  I = annotation[annotation['begin']!=annotation['end']]
  I['begin'] +=1
  tmpid = []
  tmptag = []
  for i in I.index:
    tmpid.append([j for j in range(I.begin[i], I.end[i]+1)])
    tmptag.append(["I-"+I.shortcut[i] for j in range(I.begin[i], I.end[i]+1)])
  id_I = list(itertools.chain.from_iterable(tmpid))
  tag_I = list(itertools.chain.from_iterable(tmptag))
  id += id_I
  tag+=tag_I
  return pd.DataFrame({'id':id, 'tag':tag})
annot = IOB_transfer(annotation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
import numpy as np
annot_df = annot_df.merge(annot, how='left', on='id')
annot_df = annot_df[['id', 'sentenceIdx', 'word', 'tag']]
annot_df = annot_df.fillna("O")
annot_df

Unnamed: 0,id,sentenceIdx,word,tag
0,0,0,PHILADELPHIA,B-p
1,1,0,--(,O
2,2,0,BUSINESS,O
3,3,0,WIRE,O
4,4,0,)--,O
...,...,...,...,...
519,519,13,s,O
520,520,13,Board,O
521,521,13,of,O
522,522,13,Directors,O


In [8]:
#ID - Label conversion
labels_to_ids = {'O': 0, 'B-s': 1, 'B-p': 2, 'B-i': 3, 'B-f': 4, 'B-y': 5, 'I-s': 6, 'I-p': 7, 'I-i': 8, 'I-f': 9, 'I-y': 10}
ids_to_labels = {0:'O', 1:'B-s', 2:'B-p', 3:'B-i', 4:'B-f', 5:'B-y', 6:'I-s', 7:'I-p', 8:'I-i', 9:'I-f', 10:'I-y'}

In [9]:
pd.json_normalize(annot1['tokenized'])

Unnamed: 0,word,cumulatedX,sentenceIdx,id
0,PHILADELPHIA,0,0,0
1,--(,12,0,1
2,BUSINESS,15,0,2
3,WIRE,24,0,3
4,)--,28,0,4
...,...,...,...,...
519,s,2933,13,519
520,Board,2935,13,520
521,of,2941,13,521
522,Directors,2944,13,522


In [10]:
def json_to_df(json_path, data_model):
  f = open(json_path)
  js_raw = json.load(f)
  try:
    annotation = pd.json_normalize(js_raw['annotations'][0]['value']['tags'])
    annotation = annotation.merge(data_model[['id', 'shortcut']], left_on='tag', right_on='id')[['begin', 'end', 'shortcut']]
    annot = IOB_transfer(annotation)

    annot_df = pd.json_normalize(js_raw['tokenized'])
    annot_df = annot_df.merge(annot, how='left', on='id')[['id', 'sentenceIdx', 'word', 'tag']].fillna("O")
    annot_df['sentence'] = annot_df[['sentenceIdx','word','tag']].groupby(['sentenceIdx'])['word'].transform(lambda x: ' '.join(x))
    annot_df['word_label'] = annot_df[['sentenceIdx','word','tag']].groupby(['sentenceIdx'])['tag'].transform(lambda x: ','.join(x))
    annot_df = annot_df[["sentence", "word_label"]].drop_duplicates().reset_index(drop=True)
    annot_df['num_O'] = annot_df.apply(lambda x:x[1].count('O'), axis=1)
    annot_df['length'] = annot_df.apply(lambda x:x[1].count(',')+1, axis=1)
    annot_df = annot_df.drop(annot_df[annot_df['num_O']==annot_df['length']].index)
    annot_df = annot_df.reset_index(drop=True)
    idx=[]
    s = 0
    l = 0
    length = list(annot_df['length'])
    for i in range(len(length)):
      if i==0:
        l += length[i]
        
      else:
        if l+length[i]>128:
          s+=1
          l = length[i]
        else: 
          l += length[i]
      idx.append(s)
    annot_df['s_id'] = idx  
    annot_df['sentences'] = annot_df[['s_id','sentence','word_label']].groupby(['s_id'])['sentence'].transform(lambda x: ' '.join(x))
    annot_df['word_labels'] = annot_df[['s_id','sentence','word_label']].groupby(['s_id'])['word_label'].transform(lambda x: ','.join(x))  
    annot_df = annot_df[["sentences", "word_labels"]].drop_duplicates().reset_index(drop=True)
    return annot_df
  except:
    return None

In [11]:
json_to_df(annot1_path, data_model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,sentences,word_labels
0,PHILADELPHIA --( BUSINESS WIRE )-- Ceptur Ther...,"B-p,O,O,O,O,B-s,O,O,O,O,O,B-s,O,O,O,O,O,O,O,O,..."
1,“ We are extremely grateful for the support of...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-s,O,O,O,O,O,..."
2,"Colin Walsh , Ph . D ., Partner at Qiming Vent...","O,O,O,O,O,O,O,O,O,B-i,I-i,I-i,I-i,O,O,O,O,O,O,..."


In [12]:
json_names = os.listdir(DATA_PATH)
ner_data = pd.DataFrame()
for i in json_names:
  json_path = DATA_PATH + i
  tmp_df = json_to_df(json_path, data_model)
  if tmp_df is not None:
    ner_data = ner_data.append(tmp_df, ignore_index=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [13]:
ner_data.head()

Unnamed: 0,sentences,word_labels
0,After spending more than two years in stealth ...,"O,O,O,O,O,O,O,O,O,O,B-s,I-s,O,O,O,O,O,O,O,O,O,..."
1,"Based in Cambridge , MA , they focus on enabli...","O,O,B-p,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,The Meat4All project was awarded the Horizon 2...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,"Commenting in a statement , Iñigo Charola , CE...","O,O,O,O,O,B-f,I-f,O,O,O,B-s,I-s,O,O,O,O,O,O,O,..."
4,Talus Bio aims to revolutionize drug developme...,"B-s,I-s,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [14]:
ner_data

Unnamed: 0,sentences,word_labels
0,After spending more than two years in stealth ...,"O,O,O,O,O,O,O,O,O,O,B-s,I-s,O,O,O,O,O,O,O,O,O,..."
1,"Based in Cambridge , MA , they focus on enabli...","O,O,B-p,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,The Meat4All project was awarded the Horizon 2...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,"Commenting in a statement , Iñigo Charola , CE...","O,O,O,O,O,B-f,I-f,O,O,O,B-s,I-s,O,O,O,O,O,O,O,..."
4,Talus Bio aims to revolutionize drug developme...,"B-s,I-s,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...
561,"CAMBRIDGE , Mass .--( BUSINESS WIRE )-- Aviced...","B-p,O,B-p,O,O,O,O,B-s,I-s,O,O,O,O,O,O,O,O,O,O,..."
562,ImmuneID is a precision immunology company tha...,"B-s,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
563,ImmuneID was founded by Christoph Westphal and...,"B-s,O,O,O,B-f,I-f,O,B-f,I-f,O,O,B-y,O,O,O,O,O,..."
564,Columbia University spinout Kallyope launched ...,"O,O,O,B-s,O,O,O,B-y,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [None]:
!pip install transformers~=3.0.2
#Install transformers (older version)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers~=3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[K     |████████████████████████████████| 769 kB 5.3 MB/s 
Collecting tokenizers==0.8.1.rc1
  Downloading tokenizers-0.8.1rc1-cp37-cp37m-manylinux1_x86_64.whl (3.0 MB)
[K     |████████████████████████████████| 3.0 MB 56.6 MB/s 
[?25hCollecting sentencepiece!=0.1.92
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 48.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 47.6 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.53-py3-none-any.whl size=895260 sha256=a838703f337bdf4458ef5d561d2475755a3447

### Preparing the dataset and dataloader

In [None]:
import numpy as np
from sklearn.metrics import balanced_accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast, BertConfig, BertForTokenClassification

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [None]:
# define some key variables that will be used later on in the training/evaluation process
# epoch is set to 20 when we apply weighted loss function
# otherwise epoch around 5 is enough for the training process
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 20
LEARNING_RATE = 5e-06
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

BERT relies on wordpiece tokenization, rather than word tokenization. Therefore we also define the labels at the wordpiece-level, rather than the word-level.

Below, we define a regular PyTorch dataset class. Here, each sentence gets tokenized, the special tokens that BERT expects are added, the tokens are padded or truncated based on the max length of the model, the attention mask is created and the labels are created based on the dictionary which we defined above. Word pieces that should be ignored have a label of -100 (which is the default ignore_index of PyTorch's CrossEntropyLoss).

In [None]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.sentences[index].strip().split()  
        word_labels = self.data.word_labels[index].split(",") 

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             is_pretokenized=True, 
                             #In the new transformers v4, the tokenizer encoding argument is_pretokenized becomes is_split_into_words.
                             #is_split_into_words=True,
                             return_offsets_mapping=True, 
                             padding='max_length', 
                             truncation=True, 
                             max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

  def __len__(self):
        return self.len

In [None]:
# based on the class we defined above, we create 2 datasets for training and for testing.
train_size = 0.8
train_dataset = ner_data.sample(frac=train_size,random_state=200)
test_dataset = ner_data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(ner_data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (566, 2)
TRAIN Dataset: (453, 2)
TEST Dataset: (113, 2)


In [None]:
# verify that the input ids and corresponding targets are correct
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
year        0
founded     0
:           0
2019        5
location    0
:           0
cambridge   2
,           0
ma          2
or          1
##na        -100
therapeutic  6
##s         -100
’           0
business    0
is          0
centered    0
on          0
inn         0
##ova       -100
##ting      -100
and         0
creating    0
treatments  0
based       0
on          0
circular    0
rna         0
(           0
or          0
##na        -100
)           0
.           -100
the         0
company     0
uses        0
advanced    0
technology  0
to          0
develop     0
or          0
##na        -100
the         0
##ra        -100
##pies      -100
that        0
overcome    0
the         0
issues      0
related     0
to          0
linear      0
messenger   0
rna         0
(           0
mrna        0
)           0
therapeutic  0
development  0
.           0
or          0
##na        -100
,           0
which       0
sprung      0
from        0
research    0
at          

In [None]:
# define the corresponding PyTorch dataloaders
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

### Defining the model

In [None]:
# to apply weighted loss function, we modify the modeling_bert.py file inside the downloaded transformers package
# the path of the file is as below:
# /usr/local/lib/python3.7/dist-packages/transformers/modeling_bert.py
# specifically, we change the line from:
#   loss_fct = CrossEntropyLoss()
# to:
#   weights = torch.tensor([0.01, 0.09, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1])
#   loss_fct = CrossEntropyLoss(weights=weights)
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(labels_to_ids))
model.to(device)

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

### Training the model

Before training the model, we perform a sanity check to see if the initial loss of our model is close to -ln(1/number of classes) = -ln(1/10) = 2.40.

In [None]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(2.6211, device='cuda:0', grad_fn=<NllLossBackward0>)

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 11])

In [None]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 11])

In [None]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [None]:
# defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = balanced_accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.721761465072632




Training loss per 100 training steps: 0.8293197686129278
Training loss epoch: 0.7751651999720356
Training accuracy epoch: 0.12464097019882002
Training epoch: 2
Training loss per 100 training steps: 0.4923681616783142
Training loss per 100 training steps: 0.33071873624726095
Training loss epoch: 0.32167413033414305
Training accuracy epoch: 0.1370823590498385
Training epoch: 3
Training loss per 100 training steps: 0.4334253668785095




Training loss per 100 training steps: 0.21706272034656884
Training loss epoch: 0.2126705069421676
Training accuracy epoch: 0.23472319651379014
Training epoch: 4
Training loss per 100 training steps: 0.12815460562705994




Training loss per 100 training steps: 0.1542347884635524




Training loss epoch: 0.1532084951994189
Training accuracy epoch: 0.4944578132087239
Training epoch: 5
Training loss per 100 training steps: 0.11496679484844208




Training loss per 100 training steps: 0.11078484245751163




Training loss epoch: 0.11562907842821196
Training accuracy epoch: 0.7163928812507162
Training epoch: 6
Training loss per 100 training steps: 0.09094753116369247




Training loss per 100 training steps: 0.09314707050671672




Training loss epoch: 0.09294364902011135
Training accuracy epoch: 0.7892559192364391
Training epoch: 7
Training loss per 100 training steps: 0.03400127962231636




Training loss per 100 training steps: 0.0768939601534074




Training loss epoch: 0.07639403164125326
Training accuracy epoch: 0.8185202991888867
Training epoch: 8
Training loss per 100 training steps: 0.03535307198762894




Training loss per 100 training steps: 0.06498710565859138




Training loss epoch: 0.06521515708351344
Training accuracy epoch: 0.8524807532746523
Training epoch: 9
Training loss per 100 training steps: 0.11784877628087997




Training loss per 100 training steps: 0.05602737371394835




Training loss epoch: 0.0539279428980591
Training accuracy epoch: 0.8834252360262549
Training epoch: 10
Training loss per 100 training steps: 0.07517123222351074




Training loss per 100 training steps: 0.04781223734896077




Training loss epoch: 0.047561758116149065
Training accuracy epoch: 0.9101669121691175
Training epoch: 11
Training loss per 100 training steps: 0.03747338056564331




Training loss per 100 training steps: 0.04034418993085475




Training loss epoch: 0.04102091546366481
Training accuracy epoch: 0.9090174108365612
Training epoch: 12
Training loss per 100 training steps: 0.02824500948190689




Training loss per 100 training steps: 0.03543489634308337




Training loss epoch: 0.035228143499249166
Training accuracy epoch: 0.9218074945274919
Training epoch: 13
Training loss per 100 training steps: 0.029539786279201508




Training loss per 100 training steps: 0.029912035225309653




Training loss epoch: 0.030742408311517352
Training accuracy epoch: 0.9335070930389479
Training epoch: 14
Training loss per 100 training steps: 0.04318073019385338




Training loss per 100 training steps: 0.024544823739567016




Training loss epoch: 0.024699901077455205
Training accuracy epoch: 0.9395296098340943
Training epoch: 15
Training loss per 100 training steps: 0.048148516565561295




Training loss per 100 training steps: 0.024052937911583645




Training loss epoch: 0.02330624468199778
Training accuracy epoch: 0.9488000082561072
Training epoch: 16
Training loss per 100 training steps: 0.029133042320609093




Training loss per 100 training steps: 0.01978913166255939




Training loss epoch: 0.019779215485118982
Training accuracy epoch: 0.9572374947262342
Training epoch: 17
Training loss per 100 training steps: 0.005471902899444103




Training loss per 100 training steps: 0.0186460504209217




Training loss epoch: 0.0186204749165642
Training accuracy epoch: 0.9629766888241627
Training epoch: 18
Training loss per 100 training steps: 0.0028416519053280354




Training loss per 100 training steps: 0.0168317515509074




Training loss epoch: 0.01624219967950985
Training accuracy epoch: 0.9616057620058408
Training epoch: 19
Training loss per 100 training steps: 0.005771413445472717




Training loss per 100 training steps: 0.015723869893792094




Training loss epoch: 0.015658369097907684
Training accuracy epoch: 0.9683302166438856
Training epoch: 20
Training loss per 100 training steps: 0.007452077232301235




Training loss per 100 training steps: 0.013431814997134233




Training loss epoch: 0.0138406475613776
Training accuracy epoch: 0.9713181555601291


### Evaluating the model

In [None]:
def valid(model, testing_loader):
    # put model in evaluation mode
    model.eval()
    
    eval_loss, eval_accuracy = 0, 0
    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []
    
    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            
            ids = batch['input_ids'].to(device, dtype = torch.long)
            mask = batch['attention_mask'].to(device, dtype = torch.long)
            labels = batch['labels'].to(device, dtype = torch.long)
            
            loss, eval_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
            
            eval_loss += loss.item()

            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)
        
            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")
              
            # compute evaluation accuracy
            flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            
            # only compute accuracy at active labels
            active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        
            labels = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)
            
            eval_labels.extend(labels)
            eval_preds.extend(predictions)
            
            tmp_eval_accuracy = balanced_accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]
    
    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [None]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.0638757050037384




Validation Loss: 0.12449547997267361
Validation Accuracy: 0.8861215876625133


However, the accuracy metric, even though it's for imbalannced data, can be misleading. A lot of labels are "outside" (O), even after omitting predictions on the [PAD] tokens. What is important is looking at the precision, recall and f1-score of the individual tags. For this, we use **sklearn.metrics.classification_report**.

In [None]:
import sklearn
print(sklearn.metrics.classification_report(labels, predictions))

              precision    recall  f1-score   support

         B-f       0.74      0.91      0.82        77
         B-i       0.71      0.78      0.75       105
         B-p       0.64      0.82      0.72        50
         B-s       0.65      0.79      0.72        92
         B-y       0.97      0.88      0.92        32
         I-f       0.79      0.93      0.85        85
         I-i       0.82      0.82      0.82       137
         I-p       0.56      0.22      0.31        23
         I-s       0.71      0.89      0.79        36
         I-y       0.00      0.00      0.00         1
           O       0.99      0.98      0.99      8574

    accuracy                           0.97      9212
   macro avg       0.69      0.73      0.70      9212
weighted avg       0.98      0.97      0.97      9212



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Saving the model for future use

In [None]:
directory = "./model"mo

if not os.path.exists(directory):
    os.makedirs(directory)

# save vocabulary of the tokenizer
tokenizer.save_vocabulary(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')

All files saved


### Inference

In [None]:
# input a list of news articles for NER
# suppose the name of the file is 'news.csv', with columns including 'Source', 'Title', 'Date', 'Content', 'URL'
news_articles = pd.read_csv('news.csv')

In [None]:
# function for returning list of recognized labels for each article
def inference_ner(news_article):
  sentence = news_article

  inputs = tokenizer(sentence.split(),
                      is_pretokenized=True, 
                      return_offsets_mapping=True, 
                      padding='max_length', 
                      truncation=True, 
                      max_length=MAX_LEN,
                      return_tensors="pt")

  # move to gpu
  ids = inputs["input_ids"].to(device)
  mask = inputs["attention_mask"].to(device)
  # forward pass
  outputs = model(ids, attention_mask=mask)
  logits = outputs[0]

  active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
  flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

  tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
  token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
  wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

  prediction = []
  for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
    #only predictions on first word pieces are important
    if mapping[0] == 0 and mapping[1] != 0:
      prediction.append(token_pred[1])
    else:
      continue
  return(prediction)

In [None]:
# inference for all the input articles
inference_result = pd.DataFrame(columns = ['Startup', 'Founder(s)', 'Investor(s)', 'City/Country', 'Year Founded', 'Source', 'Title', 'Date', 'Content', 'URL'])
for _, news_article in news_articles.iterrows(): 
  prediction = inference_ner(news_article['Content'])
  news_article_split = news_article['Content'].split()
  try:
    startup = set([news_article_split[i] for i in [index for (index, item) in enumerate(prediction) if item == 'B-s' or item == 'I-s']])
  except:
    startup = 'NA'
  try:
    founder = set([news_article_split[i] for i in [index for (index, item) in enumerate(prediction) if item == 'B-f' or item == 'I-f']])
  except:
    founder = 'NA'
  try:
    investor = set([news_article_split[i] for i in [index for (index, item) in enumerate(prediction) if item == 'B-i' or item == 'I-i']])
  except:
    investor = 'NA'
  try:
    place = set([news_article_split[i] for i in [index for (index, item) in enumerate(prediction) if item == 'B-p' or item == 'I-p']])
  except:
    place = 'NA'
  try:
    year_founded = set([news_article_split[i] for i in [index for (index, item) in enumerate(prediction) if item == 'B-y' or item == 'I-y']])
  except:
    year_founded = 'NA'
  inference_result = inference_result.append({
    'Startup': startup, 
    'Founder(s)': founder, 
    'Investor(s)': investor, 
    'City/Country': place, 
    'Year Founded': year_founded, 
    'Source': news_article['Source'], 
    'Title': news_article['Title'], 
    'Date': news_article['Date'], 
    'Content': news_article['Content'], 
    'URL': news_article['URL']
    }, ignore_index=True)



In [None]:
inference_result

Unnamed: 0,Startup,Founder(s),Investor(s),City/Country,Year Founded,Source,Title,Date,Content,URL
0,"{Bio,, Upstream}","{Aaron, Deykin}",{Truex},{},{},biopharmadive,"Upstream Bio, a richly funded startup, reveals...",2-Jun-22,An inflammatory disease drug cast aside by Ast...,https://www.biopharmadive.com/news/upstream-bi...
1,"{Forcyte’s, Forcyte, Biotechnologies,}",{},"{Milestone, Acequia, Jude, Y, Capital,, Gomila...","{LOS, ANGELES:}",{2018},businesswire,Mechano-therapeutics Discovery Startup Forcyte...,1-Jun-22,"LOS ANGELES: Forcyte Biotechnologies, a tech-e...",https://www.businesswire.com/news/home/2022060...


### Conversion of annotated files

In [None]:
# convert annotated files to a list of information for building database
import pandas as pd
import numpy as np
import glob
files_list = glob.glob("/content/drive/Shareddrives/CRP/annotations/*")

def info_retract(begin, end, data_json):
  info = ''
  for i in range(begin, end):
    info += data_json['tokenized'][i]['word']+' '
  return info

def annot_extract(DATA_PATH):
  df_new = pd.DataFrame(columns=np.arange(5))
  df_new.columns =['Name', 'Founder(s)', 'City/Country', 'Investor(s)', 'Year Founded']
  f = open(DATA_PATH)
  data_json = json.load(f)
  place = []
  founder = []
  startup = []
  investor = []
  year_founded = []
  for annot_i in data_json['annotations'][0]['value']['tags']:
    begin = annot_i['begin']
    end = annot_i['end']+1
    if annot_i['tag'] == 'b7a1039b-bf2e-43d2-844e-f68b0e15d7f3':
      place.append(info_retract(begin, end, data_json))
    elif annot_i['tag'] == '631b8086-5020-4570-ab50-dd68f4312bdd':
      founder.append(info_retract(begin, end, data_json))
    elif annot_i['tag'] == 'd40de1cc-50d4-422b-822d-18a693795e62':
      startup.append(info_retract(begin, end, data_json))
    elif annot_i['tag'] == '0565d95d-7cef-4f63-90ca-eb7ef00e4ad5':
      investor.append(info_retract(begin, end, data_json))
    elif annot_i['tag'] == '363c1bbe-fc12-4ddd-966c-b3c1650a12cc':
      year_founded.append(info_retract(begin, end, data_json))
  df_new = df_new.append({
        'Name': set(startup) if bool(set(startup)) else np.NaN,
        'Founder(s)': set(founder) if bool(set(founder)) else np.NaN, 
        'City/Country': set(place) if bool(set(place)) else np.NaN, 
        'Investor(s)': set(investor) if bool(set(investor)) else np.NaN, 
        'Year Founded': set(year_founded) if bool(set(year_founded)) else np.NaN}, ignore_index=True)  
  return df_new

df_new = pd.DataFrame(columns=np.arange(5))
df_new.columns =['Name', 'Founder(s)', 'City/Country', 'Investor(s)', 'Year Founded'] 
for i in files_list:
  df_new = df_new.append(annot_extract(i))
df_new = df_new.dropna(axis=0, how='all')
df_new.to_csv('annnotation_result.csv')