In [8]:
#Install the required libraries
!git lfs install
!pip install datasets
!pip install transformers

Git LFS initialized.
Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_

In [25]:
#Import the required libraries
import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import datasets
from datasets import load_dataset,config
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
#NOTE(review): presumably intended to silence the `datasets` progress bars - confirm that
#`datasets.config` actually reads a `progress_bar_type` attribute; the library documents
#dedicated helpers for disabling progress bars, which may be what is needed here.
config.progress_bar_type = None

In [26]:
#Load the NCBI disease dataset used to fine-tune the pre-trained model
med_dis = load_dataset("ncbi_disease")

In [11]:
#Explore the dataset
#It consists of train, validation and test splits
med_dis

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 5433
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 924
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 941
    })
})

In [12]:
#Feature schema (column names and types) of the train split
med_dis["train"].features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'B-Disease', 'I-Disease'], id=None), length=-1, id=None)}

In [13]:
#Tag/label names defined in the NCBI dataset for named-entity recognition (NER);
#a tag id in `ner_tags` is an index into this list
med_dis["train"].features["ner_tags"].feature.names

['O', 'B-Disease', 'I-Disease']

In [14]:
#Load the tokenizer of the pre-trained BiomedNLP-PubMedBERT model from Hugging Face
tokenizer=AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [15]:
#The first example (one sentence with its word list and per-word tag ids) in the train split
text_1=med_dis["train"][0]
print(text_1)

{'id': '0', 'tokens': ['Identification', 'of', 'APC2', ',', 'a', 'homologue', 'of', 'the', 'adenomatous', 'polyposis', 'coli', 'tumour', 'suppressor', '.'], 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]}


In [16]:
#The example is already a list of words, so tell the tokenizer not to split on whitespace itself
tokenize_inp=tokenizer(text_1["tokens"],is_split_into_words=True)


In [17]:
#Turn the input ids back into token strings, and fetch for every token the index of the word it came from
tokens=tokenizer.convert_ids_to_tokens(tokenize_inp["input_ids"])
word_ids=tokenize_inp.word_ids()

In [18]:
#Inspect the raw encoding: input_ids, token_type_ids and attention_mask
tokenize_inp

{'input_ids': [2, 4824, 1927, 9187, 1028, 16, 43, 17986, 1927, 1920, 30390, 28641, 5178, 6383, 9241, 18, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [19]:
tokens #17 tokens instead of 14 words: [CLS] and [SEP] are added, and 'APC2' is split into 'apc' + '##2'

['[CLS]',
 'identification',
 'of',
 'apc',
 '##2',
 ',',
 'a',
 'homologue',
 'of',
 'the',
 'adenomatous',
 'polyposis',
 'coli',
 'tumour',
 'suppressor',
 '.',
 '[SEP]']

In [20]:
#One entry per token: the index of its source word, or None for the special tokens;
#a word split into several sub-words repeats its index (here word 2 appears twice)
word_ids

[None, 0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, None]

In [21]:
#The tokenized sequence is longer than the original tag list (one tag per word),
#so the tags must be re-aligned to the tokens before fine-tuning
len(text_1["ner_tags"]),len(tokenize_inp["input_ids"])

(14, 17)

In [31]:
#Preprocessing: tokenization and label alignment
def tokenize_align(text_1,label_all_tokens=True):
  """Tokenize a batch of pre-split sentences and align the NER tags to the resulting tokens.

  The tokenizer may add special tokens and split one word into several sub-word
  tokens, so the per-word tags no longer line up with the token sequence. For
  every token this function emits:

  * -100 when the token has no source word (word id is None, i.e. a special
    token); -100 is the index PyTorch's cross-entropy loss ignores by default;
  * the tag of its source word when it is the first token of that word;
  * for a follow-up sub-word of the same word: the word's tag again when
    `label_all_tokens` is True, otherwise -100.

  Args:
    text_1: batch mapping with a "tokens" entry (one list of words per sentence)
      and a "ner_tags" entry (one list of tag ids per sentence, one id per word).
    label_all_tokens: whether follow-up sub-words inherit the tag of their word
      (True, the default) or are masked with -100 (False).

  Returns:
    The tokenizer output with an extra "tag" entry holding one aligned label
    list per sentence.

  Relies on the notebook-level `tokenizer` being defined.
  """
  tokenize_inp=tokenizer(text_1["tokens"],truncation=True,is_split_into_words=True)
  aligned_tags=[]
  for row_idx,word_tags in enumerate(text_1["ner_tags"]):
    word_ids=tokenize_inp.word_ids(batch_index=row_idx)
    prev_word_id=None
    row_labels=[]
    for word_id in word_ids:
      if word_id is None:
        #No source word (special token): mask it out
        row_labels.append(-100)
      elif word_id!=prev_word_id or label_all_tokens:
        #First token of a word, or a follow-up sub-word that should inherit the word's tag.
        #NOTE(review): a follow-up sub-word of a B- word is labelled B- again here - confirm that is intended.
        row_labels.append(word_tags[word_id])
      else:
        #Follow-up sub-word with label_all_tokens disabled
        row_labels.append(-100)
      prev_word_id=word_id
    aligned_tags.append(row_labels)
  #NOTE(review): the labels are stored under "tag"; Hugging Face collators/models usually look for
  #"labels" - confirm that the downstream code renames or reads this key.
  tokenize_inp["tag"]=aligned_tags
  return tokenize_inp

In [32]:
#Sanity check: run the alignment on a batch made of the first two training examples
r=tokenize_align(med_dis["train"][0:2])
print(r)

{'input_ids': [[2, 4824, 1927, 9187, 1028, 16, 43, 17986, 1927, 1920, 30390, 28641, 5178, 6383, 9241, 18, 3], [2, 1920, 30390, 28641, 5178, 12, 9187, 13, 6383, 17, 9241, 2213, 3562, 1920, 7028, 8076, 3374, 2007, 7402, 43, 2796, 1956, 13302, 8921, 4016, 6341, 9215, 12, 9822, 17, 6341, 9215, 13, 16, 19911, 1022, 19, 7569, 1921, 1930, 4666, 11281, 7392, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'tag': [[-100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0, -100], [-100, 0, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]]}
