In [0]:
import json
import transformers
from transformers import BertTokenizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
# import keras
import nltk
from nltk.tokenize.util import align_tokens
import mlflow
nltk.download('punkt')
# from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [0]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [0]:
basePath ='/dbfs/FileStore/Chanchal/model_path'

In [0]:
#Importing tokenizer

from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [0]:
#create dataframe from data
def read_file(path):
  data=open(path, "r")
  data=json.load(data)
  # print(data)
  string_labels = []
  for item in data:
    text = item['Text']
    id= item['id']
    sentences_split = nltk.word_tokenize(item["Text"])
    # print(sentences_split)
    try:
        token_spans = align_tokens(sentences_split,text)
    except:
        print("Error")
        # print(token_spans)
    for i in range(len(token_spans)):
        token_in_annotation = False
        for annotation in item['annotation']:
            if int(annotation["start"])<= token_spans[i][0] and int(annotation['end'])>=token_spans[i][1]:
                string_labels.append((text[token_spans[i][0]:token_spans[i][1]],annotation['type']))
                token_in_annotation = True
        if(token_in_annotation==False):
            string_labels.append((text[token_spans[i][0]:token_spans[i][1]],'O'))
  df = pd.DataFrame(string_labels, columns =['token', 'label'])
  return df


In [0]:
path="/dbfs/FileStore/Chanchal/Datasets/biored.json"
df=read_file(path)
df['label'].unique()
frequencies = df.label.value_counts()
frequencies


Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error
Error


O                             141466
DiseaseOrPhenotypicFeature      8963
GeneOrGeneProduct               8168
ChemicalEntity                  4634
SequenceVariant                 2738
OrganismTaxon                   2195
CellLine                         164
Name: label, dtype: int64

In [0]:
target_labels = ['DiseaseOrPhenotypicFeature','O']
df = df[df['label'].isin(target_labels)]

In [0]:

df['group'] = (df['token'] == '.').cumsum()
new_df = df.groupby('group').agg({'token': ' '.join, 'label': ' '.join}).reset_index(drop=True)

new_df.rename(columns = {'token':'text', 'label':'labels'}, inplace = True)

In [0]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in new_df['labels'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]


# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'DiseaseOrPhenotypicFeature': 0, 'O': 1}


In [0]:
MAX_LEN = 512
batch_size=16
epochs = 3
lr = 2e-05
MAX_GRAD_NORM = 10

In [0]:
# Tokenizing the input
def tokenizing(text):
  txt=text.values.tolist()
  text_tokenized = tokenizer(txt, padding='max_length', max_length=MAX_LEN , truncation=True, return_tensors="pt")
  input_ids=text_tokenized['input_ids']
  attention_masks=text_tokenized['attention_mask']
  return input_ids,attention_masks

In [0]:
#Balancing the labels according to the tokens
def align_label(texts, labels):
    tokenized_inputs = tokenizer(texts, padding='max_length', max_length=MAX_LEN , truncation=True)
    label_all_tokens=True

    word_ids = tokenized_inputs.word_ids()

    previous_word_idx = None
    label_ids = []

    for word_idx in word_ids:

        if word_idx is None:
            label_ids.append(-100)

        elif word_idx != previous_word_idx:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]])
            except:
                label_ids.append(-100)
        else:
            try:
                label_ids.append(labels_to_ids[labels[word_idx]] if label_all_tokens else -100)
            except:
                label_ids.append(-100)
        previous_word_idx = word_idx


    return label_ids

In [0]:
#Converting tokenized inputs to tensors

lb = [i.split() for i in new_df['labels'].values.tolist()]
txt = new_df['text'].values.tolist()
new_label=[align_label(i,j) for i,j in zip(txt, lb)]
input_ids,attention_masks=tokenizing(new_df['text'])

In [0]:
input_ids[4]

tensor([  101,   119,  1109, 12645,  1104,  9686,  2137,  1108,  6321,  1272,
         1104,  6873,  5552,  1301,  2875,  1874,   117,  1105,  1376,  1191,
         1251,  1146, 13482,  1118,  1103, 21153, 16219,  1105, 21718, 24186,
         1616, 26310,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [0]:
# converting input into tensors for bert
pt_input_ids = torch.stack(list(input_ids), dim=0)

pt_attention_masks = torch.stack(list(attention_masks), dim=0)

pt_labels = torch.tensor(new_label, dtype=torch.long)

In [0]:
from torch.utils.data import TensorDataset, random_split
import random
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(pt_input_ids, pt_attention_masks, pt_labels)
total_size = len(dataset)
train_size = int(0.6 * total_size)
val_size = int(0.20 * total_size)
test_size = total_size - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))
print('{:>5,} testing samples'.format(test_size))

3,821 training samples
1,273 validation samples
1,275 testing samples


In [0]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler




train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = batch_size )

validation_dataloader = DataLoader(val_dataset, sampler = SequentialSampler(val_dataset), batch_size = batch_size   )

test_dataloader=DataLoader(train_dataset, sampler = SequentialSampler(test_dataset), batch_size = batch_size )


In [0]:
from transformers import BertForTokenClassification, AdamW, BertConfig
model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(labels_to_ids))
model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [0]:
ids = train_dataset[0][0].unsqueeze(0)
mask = train_dataset[0][1].unsqueeze(0)
targets = train_dataset[0][2].unsqueeze(0)
ids = ids.to(device)
mask = mask.to(device)
targets = targets.to(device)
outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
initial_loss = outputs[0]
initial_loss


tensor(0.9558, device='cuda:0', grad_fn=<NllLossBackward0>)

In [0]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 512, 2])

In [0]:
# defining the optimizer
optimizer = AdamW(model.parameters(),
                  lr
                )


from transformers import get_linear_schedule_with_warmup
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

import time
import datetime

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))



In [0]:
import mlflow

In [0]:
mlflow.start_run()
loss_values = []
val_loss_vaues=[]
total_t0 = time.time()
for epoch in range(epochs):
  print('======== Epoch {:} / {:} ========'.format(epoch + 1, epochs))
  print('Training...')
  # Measure how long the training epoch takes.
  training_stats = []
  model.train()
  total_loss = 0
  t0 = time.time()
  for step, batch in enumerate(train_dataloader):
    if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)

    optimizer.zero_grad()
    loss, logits = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask, 
                            labels=b_labels).to_tuple()
    total_loss += loss.item()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
    optimizer.step()
    scheduler.step()

  avg_train_loss = total_loss / len(train_dataloader)
  loss_values.append(avg_train_loss)
  # Measure how long this epoch took.
  training_time = format_time(time.time() - t0)
  
  print("")
  print("  Average training loss: {0:.2f}".format(avg_train_loss))
  # print("  Average training loss: {0:.2f}".format(avg_train_loss))
  print("  Training epcoh took: {:}".format(training_time))
  # Validation
  print("Running Validation...")


  t0 = time.time()



  model.eval()
  total_eval_loss = 0
  val_predictions = []
  for batch in validation_dataloader:
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
      loss, logits = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask,labels=b_labels).to_tuple()
    total_eval_loss += loss.item()
    predicted_labels = torch.argmax(logits, dim=2)
    val_predictions.extend(predicted_labels.detach().cpu().numpy())

  avg_val_loss = total_eval_loss / len(validation_dataloader)
  val_loss_vaues.append(avg_val_loss)
  validation_time = format_time(time.time() - t0)

  print("  Average validation loss: {0:.2f}".format(avg_val_loss))
  mlflow.log_param("lr", lr)
  mlflow.log_metric("epoch", epoch + 1)
  mlflow.log_metric("average_train_loss", avg_train_loss, step=epoch+1)
  mlflow.log_metric("average_val_loss", avg_val_loss,step=epoch+1)
  
print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))

mlflow.end_run()

Training...
  Batch    40  of    239.    Elapsed: 0:00:50.
  Batch    80  of    239.    Elapsed: 0:01:42.
  Batch   120  of    239.    Elapsed: 0:02:35.
  Batch   160  of    239.    Elapsed: 0:03:29.
  Batch   200  of    239.    Elapsed: 0:04:24.

  Average training loss: 0.17
  Training epcoh took: 0:05:18
Running Validation...
  Average validation loss: 0.13
Training...
  Batch    40  of    239.    Elapsed: 0:00:54.
  Batch    80  of    239.    Elapsed: 0:01:49.
  Batch   120  of    239.    Elapsed: 0:02:44.
  Batch   160  of    239.    Elapsed: 0:03:39.
  Batch   200  of    239.    Elapsed: 0:04:34.

  Average training loss: 0.10
  Training epcoh took: 0:05:27
Running Validation...
  Average validation loss: 0.12
Training...
  Batch    40  of    239.    Elapsed: 0:00:55.
  Batch    80  of    239.    Elapsed: 0:01:50.
  Batch   120  of    239.    Elapsed: 0:02:45.
  Batch   160  of    239.    Elapsed: 0:03:40.
  Batch   200  of    239.    Elapsed: 0:04:35.

  Average training loss: 0

In [0]:
model.save_pretrained(basePath + "/biored_bert")


In [0]:
# print('Predicting labels for {:,} test sentences...'.format(len(pt_input_ids)))

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in test_dataloader:
  # Add batch to GPU
  b_input_ids = batch[0].to(device)
  b_input_mask = batch[1].to(device)
  b_labels = batch[2].to(device)
  
  # Telling the model not to compute or store gradients, saving memory and 

  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)

  logits = outputs[0]

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

print('    DONE.')

    DONE.


In [0]:
from sklearn.metrics import f1_score

# First, combine the results across the batches.
all_predictions = np.concatenate(predictions, axis=0)
all_true_labels = np.concatenate(true_labels, axis=0)

print("After flattening the batches, the predictions have shape:")
print("    ", all_predictions.shape)

# Next, let's remove the third dimension (axis 2), which has the scores
# for all 18 labels. 

# For each token, pick the label with the highest score.
predicted_label_ids = np.argmax(all_predictions, axis=2)

print("\nAfter choosing the highest scoring label for each token:")
print("    ", predicted_label_ids.shape) 


# Eliminate axis 0, which corresponds to the sentences.
predicted_label_ids = np.concatenate(predicted_label_ids, axis=0)
all_true_labels = np.concatenate(all_true_labels, axis=0)

print("\nAfter flattening the sentences, we have predictions:")
print("    ", predicted_label_ids.shape)
print("and ground truth:")
print("    ", all_true_labels.shape)

After flattening the batches, the predictions have shape:
     (1275, 512, 2)

After choosing the highest scoring label for each token:
     (1275, 512)

After flattening the sentences, we have predictions:
     (652800,)
and ground truth:
     (652800,)


In [0]:
# Construct new lists of predictions which don't include any null tokens.
real_token_predictions = []
real_token_labels = []

# For each of the input tokens in the dataset...
for i in range(len(all_true_labels)):

    # If it's not a token with a null label...
    if not all_true_labels[i] == -100:
        
        # Add the prediction and the ground truth to their lists.
        real_token_predictions.append(predicted_label_ids[i])
        real_token_labels.append(all_true_labels[i])

print("Before filtering out `null` tokens, length = {:,}".format(len(all_true_labels)))
print(" After filtering out `null` tokens, length = {:,}".format(len(real_token_labels)))

Before filtering out `null` tokens, length = 652,800
 After filtering out `null` tokens, length = 39,149


In [0]:
from sklearn.metrics import f1_score
# from seqeval.metrics import classification_report


# f1 = f1_score(real_token_labels, real_token_predictions, average='micro')
labels = [ids_to_labels[id.item()] for id in real_token_labels]
predictions = [ids_to_labels[id.item()] for id in real_token_predictions]

# print ("F1 score: {:.2%}".format(f1))

In [0]:
# misclassified_tokens = []

# # For each token's index and corresponding true/predicted label...
# for i, (true_label, predicted_label) in enumerate(zip(labels, predictions)):
#     # Check if the token is misclassified
#     if true_label != predicted_label:
#         # Retrieve the token text from its index in the tokenizer's vocabulary
#         token = tokenizer.convert_ids_to_tokens([i])[0]
#         # Add the misclassified token's information to the list
#         misclassified_tokens.append((i, token, true_label, predicted_label))

# # Print misclassified tokens
# for i, token, true_label, predicted_label in misclassified_tokens:
#     print(f"Token: {token}, True Label: {true_label}, Predicted Label: {predicted_label}")

In [0]:
# from seqeval.metrics import classification_report
from sklearn.metrics import classification_report

print(classification_report(labels, predictions))

                            precision    recall  f1-score   support

DiseaseOrPhenotypicFeature       0.90      0.83      0.86      3463
                         O       0.98      0.99      0.99     35686

                  accuracy                           0.98     39149
                 macro avg       0.94      0.91      0.93     39149
              weighted avg       0.98      0.98      0.98     39149



In [0]:
report = classification_report(labels, predictions, output_dict=True)

df = pd.DataFrame(report).transpose()

In [0]:
df.to_csv("/dbfs/FileStore/Chanchal/logs/diseasebio,cdr--bio.csv", index = True)