In [None]:
#Install the required libraries
!git lfs install
!pip install datasets
!pip install transformers
!pip install accelerate -U

: 

In [None]:
#Import the required libraries
import sklearn
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import datasets
from datasets import load_dataset,config
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import DataCollatorForTokenClassification
# NOTE(review): presumably meant to silence the datasets progress bars —
# confirm that `datasets.config` honours a `progress_bar_type` attribute;
# if it does not, this assignment has no effect.
config.progress_bar_type = None

: 

In [None]:
# Load the NCBI disease dataset that is used to fine-tune the pre-trained model.
med_dis = load_dataset("ncbi_disease")

: 

In [None]:
# Explore the dataset object.
# It consists of train, validation and test data.
med_dis

: 

In [None]:
import pandas as pd
# Materialise each split as its own pandas DataFrame.
df_train, df_validation, df_test = (
    pd.DataFrame(med_dis[split_name][:])
    for split_name in ("train", "validation", "test")
)

: 

In [None]:
# Stack the three splits into one frame. ignore_index=True gives every row a
# unique label instead of repeating each split's own 0..n-1 index.
df_new = pd.concat([df_train, df_validation, df_test], ignore_index=True)

: 

**Pre-processing the data**

In [None]:
import nltk
import string
# Download the NLTK stop-word corpus (read via stopwords.words('english') in preprocess).
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
# Download the WordNet corpus used by WordNetLemmatizer.
nltk.download('wordnet')

: 

In [None]:
from nltk.corpus import stopwords

def preprocess(tokens):
    """Normalise one tokenised sentence for exploratory analysis.

    Steps, in order: drop punctuation-only and digit-only tokens, lower-case
    the rest, drop English stop words, then lemmatise with WordNet.

    Args:
        tokens: iterable of word strings (one sentence).

    Returns:
        list[str]: the cleaned tokens, in their original order.
    """
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    punctuation_chars = set(string.punctuation)

    cleaned = []
    for word in tokens:
        # A token counts as punctuation when every character is a punctuation
        # mark, so multi-character tokens such as "--" are removed too.
        # (all() over an empty string is True, so "" is dropped as before.)
        if word.isdigit() or all(char in punctuation_chars for char in word):
            continue
        lowered = word.lower()
        if lowered in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(lowered))

    return cleaned
# Clean every row's token list and keep the result in a new column.
df_new['cleaned_tokens'] = df_new['tokens'].apply(preprocess)


: 

**EDA on the cleaned tokens**

In [None]:
from collections import Counter
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Tally how often each cleaned token occurs across all sentences.
word_counts = Counter()
for sentence_tokens in df_new['cleaned_tokens']:
    word_counts.update(sentence_tokens)

# Build the word cloud from the token frequencies.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate_from_frequencies(word_counts)

# Render the cloud without axes.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

: 

The most commonly occurring words in the NCBI dataset are words such as mutation, disease, patient and gene.

In [None]:
# Inspect the combined frame, which now carries the cleaned_tokens column.
df_new

: 

In [None]:
# Flatten the per-sentence token lists into one long list.
flattened_tokens = []
for sentence_tokens in df_new["cleaned_tokens"]:
    flattened_tokens.extend(sentence_tokens)

# The unique tokens form the vocabulary.
vocab = set(flattened_tokens)

: 

In [None]:
# Sort while converting to a list so the vocabulary order is reproducible
# between runs (iteration order of a set of strings is not), and so re-running
# this cell yields the same result.
vocab = sorted(vocab)

: 

In [None]:
# Number of unique cleaned tokens.
len(vocab)

: 

In [None]:
# Features (columns and their types) of the train split.
med_dis["train"].features

: 

In [None]:
# Tag/label names defined in the NCBI dataset for NER.
med_dis["train"].features["ner_tags"].feature.names

: 

In [None]:
# Load the tokenizer of the pre-trained BiomedNLP PubMedBERT checkpoint from Hugging Face.
tokenizer=AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext")

: 

In [None]:
# The first example (sentence) in the train split.
text_1=med_dis["train"][0]
print(text_1)

: 

In [None]:
# Tokenize the example; its "tokens" field is already split into words.
tokenize_inp=tokenizer(text_1["tokens"],is_split_into_words=True)


: 

In [None]:
# Map the ids back to token strings, and get for each token the index of the
# word it came from (None marks positions that belong to no word).
tokens=tokenizer.convert_ids_to_tokens(tokenize_inp["input_ids"])
word_ids=tokenize_inp.word_ids()

: 

In [None]:
# Inspect the raw tokenizer output.
tokenize_inp

: 

In [None]:
tokens  # Why did we receive 17 tokens instead of 14?

: 

In [None]:
# Word index for every token position.
word_ids

: 

In [None]:
# The number of NER tags (one per original word) differs from the number of
# ids produced by the tokenizer, so the tags have to be aligned to the tokens.
len(text_1["ner_tags"]),len(tokenize_inp["input_ids"])

: 

**Tokenizing based on pre-trained model**

In [None]:
#Preprocessing: tokenizing and alignment
def tokenize_align(text_1,label_all_tokens=True):
  """Tokenize a batch of pre-split sentences and align the NER tags to the tokens.

  Positions whose word id is None (special tokens) are labelled -100. The first
  piece of every word receives that word's tag; later pieces of the same word
  receive the tag too when label_all_tokens is true, otherwise -100.

  Args:
    text_1: batch mapping with "tokens" (a list of word lists) and "ner_tags"
      (a list of tag-id lists, one tag per word).
    label_all_tokens: whether continuation pieces of a word keep the word's tag.

  Returns:
    The tokenizer output with the aligned label ids stored under "labels", the
    column name the Hugging Face Trainer and DataCollatorForTokenClassification
    read as labels. The same ids are kept under "tag" for backward compatibility.
  """
  tokenize_inp=tokenizer(text_1["tokens"],truncation=True,is_split_into_words=True)
  tags=[]
  for row,tag in enumerate(text_1["ner_tags"]):
    word_ids=tokenize_inp.word_ids(batch_index=row)
    pre_word_id=None
    label_id=[]
    for word in word_ids:
      if word is None:
        # Special token: no word behind it, so it carries no label.
        label_id.append(-100)
      elif word!=pre_word_id:
        # First piece of a new word.
        label_id.append(tag[word])
      else:
        # Continuation piece of the current word.
        label_id.append(tag[word] if label_all_tokens else -100)
      pre_word_id=word
    tags.append(label_id)
  # Without a "labels" column the model is never given targets during training.
  tokenize_inp["labels"]=tags
  tokenize_inp["tag"]=tags
  return tokenize_inp

: 

In [None]:
# Smoke-test the alignment on the first two training sentences.
r=tokenize_align(med_dis["train"][0:2])
print(r)

: 

In [None]:
# Tokenize and align every split in batches; the raw word and tag columns are
# removed from the result.
encoded_ds = med_dis.map(
    tokenize_align,
    batched=True,
    remove_columns=['ner_tags', 'tokens'],
)

: 

In [None]:
# Collator that assembles token-classification batches using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

: 

In [None]:
# Label id -> label name, in the order of the label ids.
id2label = dict(enumerate(("O", "B-Disease", "I-Disease")))

# Label names in id order, and the reverse lookup name -> id.
tag_values = [*id2label.values()]
label2id = {name: index for index, name in id2label.items()}
print(label2id)

: 

In [None]:
# Build the token-classification model that will be fine-tuned.
# The weights are loaded from the same checkpoint as the tokenizer: input ids
# are only meaningful for the vocabulary they were produced with.
# Checkpoint and device are resolved here instead of via MODEL_CKPT / DEVICE,
# which this notebook only assigns in a later cell (NameError on a fresh
# Restart & Run All).
model_checkpoint = tokenizer.name_or_path
model_device = torch.device("cpu")
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(id2label),
    id2label=id2label,
    label2id=label2id,
).to(model_device)

: 

In [None]:
#Create a metric evaluation function

from sklearn.metrics import classification_report

def compute_metrics(p):
    """Build a per-class classification report for token-level predictions.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds either
            per-class scores with one more axis than ``labels`` or
            already-decoded class ids with the same shape as ``labels``.
            Positions whose label is -100 are excluded from the report.

    Returns:
        dict: the result of ``classification_report(..., output_dict=True)``
        computed over the flattened, non-ignored positions.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    predictions, labels = p
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Reduce per-class scores to the id of the highest-scoring class.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # Keep only positions that carry a real label (not the -100 filler).
    keep = labels != -100
    flat_true_labels = labels[keep].tolist()
    flat_true_predictions = predictions[keep].tolist()

    report = classification_report(flat_true_labels, flat_true_predictions, output_dict=True)

    return report


: 

In [None]:
# Training configuration constants.
# NOTE(review): "bert-base-cased" is not the checkpoint the tokenizer was
# loaded from earlier in this notebook — confirm which one is intended.
MODEL_CKPT = "bert-base-cased"
MODEL_NAME = f"{MODEL_CKPT}-finetuned-ner-NCBI_Disease"  # first argument of TrainingArguments
NUM_OF_EPOCHS = 3
BATCH_SIZE = 16  # used for both the train and the eval batch size
STRATEGY = "epoch"  # used for both evaluation_strategy and save_strategy
REPORTS_TO = "tensorboard"
WEIGHT_DECAY = 0.01
LR = 2e-5
DEVICE = torch.device("cpu")
STEPS = 35  # passed as logging_steps

: 

In [None]:
# Create the training arguments from the configuration constants.
# NOTE(review): newer transformers releases name `evaluation_strategy`
# `eval_strategy` — confirm against the installed version.
# NOTE(review): push_to_hub=True presumably requires being logged in to the
# Hugging Face Hub; no login step is visible in this part of the notebook.
from transformers import Trainer, TrainingArguments
args = TrainingArguments(
    MODEL_NAME,
    log_level="error",
    logging_first_step=True,
    learning_rate=LR,
    num_train_epochs=NUM_OF_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    evaluation_strategy=STRATEGY,
    report_to=REPORTS_TO,
    disable_tqdm=False,
    logging_steps=STEPS,
    weight_decay=WEIGHT_DECAY,
    save_strategy=STRATEGY,
    hub_private_repo=True,
    push_to_hub=True
)

: 

: 

: 