# <img src="https://img.icons8.com/external-flaticons-lineal-color-flat-icons/64/undefined/external-big-data-smart-technology-flaticons-lineal-color-flat-icons-2.png"/> **NLP Research<br>BERT Named Entity Recognition**
## <img src="https://img.icons8.com/external-fauzidea-flat-fauzidea/64/undefined/external-man-avatar-avatar-fauzidea-flat-fauzidea.png"/> **`Dimas Dwi Putra`**

# **Connect Google Drive Storage**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Install Required Python Libraries**

In [None]:
# Install the notebook's third-party dependencies into the Colab runtime.
# NOTE(review): transformers is pinned to 3.0.1 — presumably because the tokenizer
# calls in later cells pass `is_pretokenized=True`; confirm before changing the pin.
! pip install transformers==3.0.1
# NOTE(review): `transformers` is listed again here without a version pin, together
# with seqeval[gpu]; presumably pip keeps the already-installed 3.0.1 — verify.
! pip install transformers seqeval[gpu]

# **Importing Python Libraries and preparing the environment**

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertTokenizerFast, BertConfig, BertForTokenClassification, BertModel

# **Set the default device**

In [4]:
import torch
from torch import cuda

# Probe CUDA availability and pick the device used by every `.to(device)` call below.
USE_CUDA = torch.cuda.is_available()
device = 'cuda' if USE_CUDA else 'cpu'

print(f"Device: {device}")

if not USE_CUDA:
    print("\nNo GPU available, using the CPU instead.")
else:
    # Replace the plain string with a torch.device object and report the GPU model.
    device = torch.device("cuda")
    print("\nUsing GPU")
    print('\nDevice name:', torch.cuda.get_device_name(0))

Device: cuda

Using GPU

Device name: Tesla P100-PCIE-16GB


# **Preprocessing the data**

In [5]:
data = pd.read_csv("/content/drive/MyDrive/bert_bilstm_crf_named_entity_recognition/BERT-NER/input/ner_dataset.csv", encoding='unicode_escape')
data.head(20)

Unnamed: 0,Sentence #,Word,POS,Tag
0,Sentence: 0,studies,NNS,O
1,Sentence: 0,on,IN,O
2,Sentence: 0,magnesium,NN,O
3,Sentence: 0,s,NN,O
4,Sentence: 0,mechanism,NN,O
5,Sentence: 0,of,IN,O
6,Sentence: 0,action,NN,O
7,Sentence: 0,in,IN,O
8,Sentence: 0,digitalis,NN,plant
9,Sentence: 0,induced,VBD,O


In [6]:
data.count()

Sentence #    38205
Word          38205
POS           38205
Tag           38205
dtype: int64

In [7]:
print("Number of tags: {}".format(len(data.Tag.unique())))
frequencies = data.Tag.value_counts()
frequencies

Number of tags: 3


O          32002
disease     3559
plant       2644
Name: Tag, dtype: int64

In [8]:
# Sum the token counts per entity tag (keyed by the first 10 characters of the tag),
# ignoring the outside tag "O", then print them from most to least frequent.
tags = {}
for tag, count in frequencies.items():
    if tag == "O":
        continue
    prefix = tag[0:10]
    tags[prefix] = tags.get(prefix, 0) + count

print(sorted(tags.items(), key=lambda x: x[1], reverse=True))

[('disease', 3559), ('plant', 2644)]


In [9]:
# Map every distinct tag to an integer id (in order of first appearance in the data),
# and build the reverse lookup by inverting that same mapping.
labels_to_ids = {tag: idx for idx, tag in enumerate(data.Tag.unique())}
ids_to_labels = {idx: tag for tag, idx in labels_to_ids.items()}
labels_to_ids

{'O': 0, 'disease': 2, 'plant': 1}

In [10]:
data['sentence'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Word'].transform(lambda x: ' '.join(x))
data['word_labels'] = data[['Sentence #','Word','Tag']].groupby(['Sentence #'])['Tag'].transform(lambda x: ','.join(x))

In [11]:
data.head()

Unnamed: 0,Sentence #,Word,POS,Tag,sentence,word_labels
0,Sentence: 0,studies,NNS,O,studies on magnesium s mechanism of action in ...,"O,O,O,O,O,O,O,O,plant,O,disease"
1,Sentence: 0,on,IN,O,studies on magnesium s mechanism of action in ...,"O,O,O,O,O,O,O,O,plant,O,disease"
2,Sentence: 0,magnesium,NN,O,studies on magnesium s mechanism of action in ...,"O,O,O,O,O,O,O,O,plant,O,disease"
3,Sentence: 0,s,NN,O,studies on magnesium s mechanism of action in ...,"O,O,O,O,O,O,O,O,plant,O,disease"
4,Sentence: 0,mechanism,NN,O,studies on magnesium s mechanism of action in ...,"O,O,O,O,O,O,O,O,plant,O,disease"


In [12]:
data = data[["sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)

In [13]:
data.head()

Unnamed: 0,sentence,word_labels
0,studies on magnesium s mechanism of action in ...,"O,O,O,O,O,O,O,O,plant,O,disease"
1,the mechanism by which magnesium affects digit...,"O,O,O,O,O,O,plant,O,disease,O,O,O,O,O,O,O,O,O"
2,magnesium s direct effect on calcium and potas...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,inhibitory effect of green tea on the growth o...,"O,O,O,plant,plant,O,O,O,O,O,disease,disease,O"
4,in 10 separate experiments mice with establish...,"O,O,O,O,O,O,O,O,O,O,O,O,O,disease,disease,O,O,..."


In [14]:
len(data)

1157

In [15]:
data.iloc[1].sentence

'the mechanism by which magnesium affects digitalis induced arrhythmias was studied in dogs with and without beta receptor'

In [16]:
data.iloc[1].word_labels

'O,O,O,O,O,O,plant,O,disease,O,O,O,O,O,O,O,O,O'

# **Preparing the dataset and dataloader**

In [17]:
MAX_LEN = 300
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = BertTokenizerFast.from_pretrained('/content/drive/MyDrive/bert_bilstm_crf_named_entity_recognition/BERT-NER/model/biobert')

In [18]:
class dataset(Dataset):
    """Token-classification dataset built from a frame with `sentence` and `word_labels` columns.

    Each item is the tokenizer encoding of one sentence (padded/truncated to
    `max_len`) plus a `labels` tensor. Only the first word piece of every word
    carries the word's label id; every other position holds -100, the value the
    training and evaluation loops mask out with `labels != -100`.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # Split the stored strings back into parallel lists of words and tags.
        words = self.data.sentence[index].strip().split()
        tags = self.data.word_labels[index].split(",")

        # Encode the pre-split words; the offset mapping lets us spot the
        # first word piece of each word afterwards.
        encoding = self.tokenizer(
            words,
            is_pretokenized=True,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
        )

        # Translate tags to ids via the module-level `labels_to_ids` mapping.
        tag_ids = [labels_to_ids[tag] for tag in tags]

        # Start with -100 everywhere (approach based on
        # https://huggingface.co/transformers/custom_datasets.html#tok-ner).
        offsets = encoding["offset_mapping"]
        encoded_labels = np.full(len(offsets), -100, dtype=int)

        # A span that starts at 0 and is non-empty is the first piece of a word;
        # special and padding tokens have an empty (0, 0) span and are skipped.
        word_idx = 0
        for position, span in enumerate(offsets):
            if span[0] != 0 or span[1] == 0:
                continue
            encoded_labels[position] = tag_ids[word_idx]
            word_idx += 1

        # Convert every encoding field, and the labels, to tensors.
        item = {name: torch.as_tensor(values) for name, values in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)

        return item

    def __len__(self):
        return self.len

In [19]:
train_size = 0.8
train_dataset = data.sample(frac=train_size,random_state=200)
test_dataset = data.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (1157, 2)
TRAIN Dataset: (926, 2)
TEST Dataset: (231, 2)


In [20]:
training_set[0]

{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0

In [21]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
among       0
specific    0
foods       0
high        0
in          0
car         0
##ote       -100
##no        -100
##ids       -100
br          0
##occo      -100
##li        -100
and         0
spin        1
##ach       -100
were        0
most        0
consistently  0
associated  0
with        0
a           0
lower       0
risk        0
of          0
cat         2
##ara       -100
##ct        -100
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[

In [22]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# **Defining the model - Biobert**

In [23]:
labels_to_ids

{'O': 0, 'disease': 2, 'plant': 1}

In [24]:
model = BertForTokenClassification.from_pretrained(
    '/content/drive/MyDrive/bert_bilstm_crf_named_entity_recognition/BERT-NER/model/biobert', 
    num_labels=len(labels_to_ids),
    id2label={i: label for i, label in enumerate(labels_to_ids)},
    label2id={label: i for i, label in enumerate(labels_to_ids)}
    )
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/bert_bilstm_crf_named_entity_recognition/BERT-NER/model/biobert and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

# **Training the model - Biobert**

In [25]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(1.2529, device='cuda:0', grad_fn=<NllLossBackward0>)

In [26]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 300, 3])

In [27]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [28]:
def train(epoch):
    """Run one training epoch over `training_loader` and print loss/accuracy.

    Uses the module-level `model`, `training_loader`, `optimizer`, `device`
    and `MAX_GRAD_NORM`. Accuracy is computed per batch on positions whose
    label is not -100 and then averaged over the batches.

    Args:
        epoch: index of the current epoch (not used inside the function; kept
            for the existing call signature).
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(training_loader):

        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)

        # the model output is unpacked as (loss, logits) when labels are supplied
        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        tr_loss += loss.item()

        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per training steps: {loss_step}")

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = flattened_targets != -100 # boolean mask, shape (batch_size * seq_len,)

        active_labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tmp_tr_accuracy = accuracy_score(active_labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # backward pass: clear stale gradients, then compute this step's gradients
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping must run AFTER backward() so it acts on the gradients
        # of the current step (clipping before zero_grad() has no effect)
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [29]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per training steps: 1.167625069618225
Training loss per training steps: 0.40473906976161617
Training loss epoch: 0.379166390811061
Training accuracy epoch: 0.8590877631671815
Training epoch: 2
Training loss per training steps: 0.15398240089416504
Training loss per training steps: 0.14274888138959904
Training loss epoch: 0.13885187550351538
Training accuracy epoch: 0.943066372640314
Training epoch: 3
Training loss per training steps: 0.1465419977903366
Training loss per training steps: 0.10589381565551946
Training loss epoch: 0.10442506827028661
Training accuracy epoch: 0.956090830911119
Training epoch: 4
Training loss per training steps: 0.16753114759922028
Training loss per training steps: 0.08724692522889317
Training loss epoch: 0.08678522461961055
Training accuracy epoch: 0.9606928174567579
Training epoch: 5
Training loss per training steps: 0.06250385195016861
Training loss per training steps: 0.07473600776589448
Training loss epoch: 0.07466819799697

# **Evaluating the model - Biobert**

In [30]:
def valid(model, testing_loader):
    """Evaluate `model` on `testing_loader` and return gold/predicted label names.

    Runs without gradient tracking, prints the running loss every 100 batches
    and the final mean loss and mean per-batch accuracy. Only positions whose
    label is not -100 are scored.

    Args:
        model: token-classification model; called with `input_ids`,
            `attention_mask` and `labels`, its output is unpacked as (loss, logits).
        testing_loader: iterable of batches with `input_ids`, `attention_mask`
            and `labels` entries.

    Returns:
        (labels, predictions): two parallel lists of label names, translated
        through the module-level `ids_to_labels` mapping.
    """
    # put model in evaluation mode
    model.eval()

    running_loss = 0
    running_accuracy = 0
    step_count = 0
    example_count = 0
    gold_ids = []
    predicted_ids = []

    with torch.no_grad():
        for step, batch in enumerate(testing_loader):
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention = batch['attention_mask'].to(device, dtype=torch.long)
            targets = batch['labels'].to(device, dtype=torch.long)

            loss, logits = model(input_ids=input_ids, attention_mask=attention, labels=targets)

            running_loss += loss.item()
            step_count += 1
            example_count += targets.size(0)

            if step % 100 == 0:
                print(f"Validation loss per 100 evaluation steps: {running_loss / step_count}")

            # Flatten to (batch_size * seq_len,) and take the arg-max class per token.
            flat_targets = targets.view(-1)
            flat_predictions = torch.argmax(logits.view(-1, model.num_labels), axis=1)

            # Keep only positions that carry a real label.
            keep = flat_targets != -100
            batch_gold = torch.masked_select(flat_targets, keep)
            batch_pred = torch.masked_select(flat_predictions, keep)

            gold_ids.extend(batch_gold)
            predicted_ids.extend(batch_pred)

            running_accuracy += accuracy_score(batch_gold.cpu().numpy(), batch_pred.cpu().numpy())

    # Translate the collected ids back to label names.
    labels = [ids_to_labels[gold.item()] for gold in gold_ids]
    predictions = [ids_to_labels[pred.item()] for pred in predicted_ids]

    eval_loss = running_loss / step_count
    eval_accuracy = running_accuracy / step_count
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions

In [31]:
labels, predictions = valid(model, testing_loader)

Validation loss per 100 evaluation steps: 0.16138367354869843
Validation Loss: 0.11911071915629096
Validation Accuracy: 0.9547230753118077


In [32]:
print(labels)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'disease', 'disease', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'O', 'O', 'O', 'O', 

In [33]:
print(predictions)

['O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'disease', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 

In [34]:
y_true = labels
y_pred = predictions

In [35]:
from sklearn.metrics import classification_report

In [36]:
scores_classification_reports = classification_report(y_true, y_pred, target_names=None, output_dict=True)
df_data_scores = pd.DataFrame(scores_classification_reports).transpose()
df_data_scores = df_data_scores.rename_axis('Entities')
# print(classification_report(y_true, y_pred, target_names=None))
print(df_data_scores)

              precision    recall  f1-score      support
Entities                                                
O              0.973489  0.976749  0.975116  5376.000000
disease        0.846316  0.768642  0.805611   523.000000
plant          0.872236  0.941645  0.905612   377.000000
accuracy       0.957298  0.957298  0.957298     0.957298
macro avg      0.897347  0.895679  0.895447  6276.000000
weighted avg   0.956809  0.957298  0.956816  6276.000000


In [None]:
! pip install xlsxwriter

import xlsxwriter

# Write the classification report to an Excel sheet on Drive. The context manager
# saves and closes the workbook on exit (even if `to_excel` raises), replacing the
# explicit `writer.save()` call, which newer pandas releases no longer provide.
with pd.ExcelWriter('/content/drive/MyDrive/bert_bilstm_crf_named_entity_recognition/BERT-NER/BERT_Report.xlsx', engine='xlsxwriter') as writer:
    df_data_scores.to_excel(writer, sheet_name='Uji1', index=True)

# **Check the Trained Model on a Sample Sentence**

In [38]:
sentence = "examination of the data from all ten experiments revealed that complete tumor tumor regression occurred in 14 of 346 papilloma bearing mice 4 that were treated with green green tea tea in the drinking water or with i p injections of green green tea tea constituents whereas none of the 220 papilloma bearing control mice treated with only vehicle exhibited complete tumor tumor"

# Encode the whitespace-split words; the offset mapping is used below to find
# the first word piece of every word.
inputs = tokenizer(sentence.split(),
                    is_pretokenized=True, 
                    return_offsets_mapping=True, 
                    padding='max_length', 
                    truncation=True, 
                    max_length=MAX_LEN,
                    return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# forward pass: inference only, so make sure the model is in evaluation mode
# (this cell should not depend on valid() having run first) and do not track gradients
model.eval()
with torch.no_grad():
    outputs = model(ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [ids_to_labels[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

prediction = []
for token_pred, mapping in zip(wp_preds, inputs["offset_mapping"].squeeze().tolist()):
  # only predictions on first word pieces are important: their offset starts at 0
  # and is non-empty (special/padding tokens have an empty (0, 0) span)
  if mapping[0] == 0 and mapping[1] != 0:
    prediction.append(token_pred[1])

print(sentence.split())
print(prediction)

['examination', 'of', 'the', 'data', 'from', 'all', 'ten', 'experiments', 'revealed', 'that', 'complete', 'tumor', 'tumor', 'regression', 'occurred', 'in', '14', 'of', '346', 'papilloma', 'bearing', 'mice', '4', 'that', 'were', 'treated', 'with', 'green', 'green', 'tea', 'tea', 'in', 'the', 'drinking', 'water', 'or', 'with', 'i', 'p', 'injections', 'of', 'green', 'green', 'tea', 'tea', 'constituents', 'whereas', 'none', 'of', 'the', '220', 'papilloma', 'bearing', 'control', 'mice', 'treated', 'with', 'only', 'vehicle', 'exhibited', 'complete', 'tumor', 'tumor']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'plant', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'plant', 'plant', 'plant', 'plant', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'disease', 'disease']


# **Saving the model for future use**

In [39]:
import os

directory = "/content/drive/MyDrive/bert_bilstm_crf_named_entity_recognition/BERT-NER/output/biobert-plant-disease"

# Create the output folder if needed; exist_ok avoids the separate existence check.
os.makedirs(directory, exist_ok=True)

# save the tokenizer: vocabulary plus its configuration files, so that the folder
# can be reloaded with from_pretrained() using the same tokenizer settings
# NOTE(review): the original saved only the vocabulary via save_vocabulary();
# confirm nothing downstream expects the folder to contain vocab.txt alone.
tokenizer.save_pretrained(directory)
# save the model weights and its configuration file
model.save_pretrained(directory)
print('All files saved')
print('completed')

All files saved
completed


# **Test Sentence**

In [40]:
def prepare_sentence(sentence, tokenizer, maxlen):
    """Turn a raw sentence into fixed-length BERT input tensors.

    The sentence is tokenized, wrapped in [CLS] ... [SEP], truncated or padded
    with [PAD] to exactly `maxlen` tokens, and converted to ids.

    Truncation is applied to the sentence tokens *before* the special tokens
    are added, so an over-long sentence still ends with [SEP] (slicing the
    already-wrapped sequence would cut [SEP] off).

    Args:
        sentence: text to encode.
        tokenizer: object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.
        maxlen: total output length, special tokens included.

    Returns:
        dict with
            'ids':  LongTensor of `maxlen` token ids,
            'mask': LongTensor of `maxlen` entries, 1 for real tokens and 0 for padding.
    """
    # step 1: tokenize the sentence
    tokenized_sentence = tokenizer.tokenize(sentence)

    # step 2: truncate the body, reserving two slots for the special tokens,
    # then add them
    body_len = max(maxlen - 2, 0)
    tokenized_sentence = ["[CLS]"] + tokenized_sentence[:body_len] + ["[SEP]"]
    # only has an effect for degenerate maxlen < 2; keeps the length contract
    tokenized_sentence = tokenized_sentence[:maxlen]

    # step 3: pad up to maxlen
    n_real = len(tokenized_sentence)
    n_pad = maxlen - n_real
    tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad

    # step 4: attention mask from the real length, so a literal "[PAD]" token
    # occurring inside the sentence is not mistaken for padding
    attn_mask = [1] * n_real + [0] * n_pad

    # step 5: convert tokens to input ids
    ids = tokenizer.convert_tokens_to_ids(tokenized_sentence)

    return {
        'ids': torch.tensor(ids, dtype=torch.long),
        'mask': torch.tensor(attn_mask, dtype=torch.long),
    }

In [41]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Tokenize a sentence word by word and repeat each word's label for
    every word piece it produces.

    `sentence` is split on whitespace and `text_labels` on commas; the two
    lists are paired positionally. Tokenizing one word at a time keeps the
    word-to-piece correspondence explicit, at the cost of one tokenizer call
    per word.

    Returns a tuple (word_pieces, piece_labels) of equal length.
    """
    words = sentence.strip().split()
    word_tags = text_labels.split(",")

    pieces = []
    piece_tags = []

    for word, tag in zip(words, word_tags):
        sub_tokens = tokenizer.tokenize(word)
        pieces += sub_tokens
        # one copy of the word-level tag per word piece
        piece_tags += [tag] * len(sub_tokens)

    return pieces, piece_tags

In [42]:
class dataset(Dataset):
    """Token-classification dataset that labels every word piece.

    Each sentence is tokenized word by word with `tokenize_and_preserve_labels`
    (every word piece inherits its word's label), wrapped in [CLS] ... [SEP],
    truncated or padded to `max_len`, and converted to id tensors. [CLS], [SEP]
    and [PAD] positions are labelled "O".

    Items are dicts with 'ids', 'mask' and 'targets' LongTensors of length `max_len`.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: tokenize (and adapt corresponding labels)
        sentence = self.data.sentence[index]
        word_labels = self.data.word_labels[index]
        tokenized_sentence, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: truncate the body first (two slots are reserved for the special
        # tokens) so an over-long sentence still ends with [SEP], then add the
        # special tokens and their "O" labels.
        maxlen = self.max_len
        body_len = max(maxlen - 2, 0)
        tokenized_sentence = ["[CLS]"] + tokenized_sentence[:body_len] + ["[SEP]"]
        # The [SEP] label is appended at the END; `insert(-1, ...)` would place it
        # before the last label and shift that label onto [SEP].
        labels = ["O"] + labels[:body_len] + ["O"]
        # only has an effect for degenerate max_len < 2; keeps the length contract
        tokenized_sentence = tokenized_sentence[:maxlen]
        labels = labels[:maxlen]

        # step 3: pad tokens and labels up to maxlen
        n_real = len(tokenized_sentence)
        n_pad = maxlen - n_real
        tokenized_sentence = tokenized_sentence + ['[PAD]'] * n_pad
        labels = labels + ["O"] * n_pad

        # step 4: attention mask — 1 for real tokens, 0 for padding
        attn_mask = [1] * n_real + [0] * n_pad

        # step 5: convert tokens to input ids, labels to label ids
        ids = self.tokenizer.convert_tokens_to_ids(tokenized_sentence)
        label_ids = [labels_to_ids[label] for label in labels]

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [43]:
sentence = "the mechanism by which magnesium affects digitalis induced arrhythmias was studied in dogs with and without beta receptor".strip().split()

inputs = tokenizer(sentence, is_pretokenized=True, return_offsets_mapping=True, padding='max_length', truncation=True)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
token_offsets = inputs["offset_mapping"]
print(tokens)
print(token_offsets)

['[CLS]', 'the', 'mechanism', 'by', 'which', 'ma', '##gnesium', 'affects', 'digital', '##is', 'induced', 'a', '##rr', '##hy', '##th', '##mia', '##s', 'was', 'studied', 'in', 'dogs', 'with', 'and', 'without', 'beta', 'receptor', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]'

In [44]:
word = "arrhythmias"

inputs = tokenizer(word, return_offsets_mapping=True, padding='max_length', truncation=True)
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"])
token_offsets = inputs["offset_mapping"]
print(tokens)
print(token_offsets)

['[CLS]', 'a', '##rr', '##hy', '##th', '##mia', '##s', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', 