In [1]:
!pip install datasets
!pip install transformers
!pip install sentence-transformers
!pip install accelerate -U
!pip install nltk

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
# importing the necessary libraries
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn.functional as F
import re

from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from nltk.translate.bleu_score import sentence_bleu
import json, os

In [3]:
# Loading the dataset
data = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 112165
    })
})

In [4]:
# Slecting the first fifteen thousand rows of the dataset for training
df = data['train'].select(range(15000))

In [5]:
# Splitting the dataset
train_test_valid = df.train_test_split(test_size=0.2, seed=42)
test_valid = train_test_valid['test'].train_test_split(test_size=0.5, seed=42)

train_df = train_test_valid['train']
val_df = test_valid['train']
test_df = test_valid['test']

In [6]:
# extracting unique outputs fo use as unique labels
unique_labels = list(set(train_df['output'] + val_df['output'] + test_df['output']))

# label mapping
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# saving the label mapping to a json file
label_mapping_path = "./label_mapping.json"
with open(label_mapping_path, 'w') as file:
    json.dump(label_mapping, file)

In [7]:
# Load a pre-trained sentence transformer model
sent_transf = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all possible responses
response = sent_transf.encode(list(label_mapping.keys()))

# Save the response embeddings
response_path = "./response_embeddings.npy"
np.save(response_path, response)

# Save the semantic model
sent_transf_path = "./sent_transf"
os.makedirs(sent_transf_path, exist_ok=True)
sent_transf.save(sent_transf_path)


def semantic_search(query, top_k=5):
    # Encode
    query_embedding = sent_transf.encode([query])

    # Cosine similarity
    similarities = cosine_similarity(query_embedding, response)[0]

    # Get top-k similar responses
    top_indices = similarities.argsort()[-top_k:][::-1]

    return [(list(label_mapping.keys())[i], similarities[i]) for i in top_indices]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [8]:
# mapping the text labels to numerical labels
def map_label(example):
    example['label'] = label_mapping[example['output']]
    return example

# applying the label mappings
train_df = train_df.map(map_label)
val_df = val_df.map(map_label)
test_df = test_df.map(map_label)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [9]:
# loading the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

**Pre-processing**

- **Text Cleaning and Normalization:** The goal is to remove unwanted characters, reduce noise, and prepare the text for tokenization. This step is important as we need to handle variations in medical terminology and descriptions before tokenization.

In [10]:
# text cleaning
def clean_text(text):
    # Lowercase as this removes case sensitivity thereby helps to reduce vocabulary size
    text = text.lower()
    # removing special characters so that we can focus on the meaningful words
    text = re.sub(r'[^a-z\s]', '', text)
    # removing extra whitespace as this ensures a consistent format
    text = re.sub(r'\s+', ' ', text).strip()
    return text

- **Tokenization** process that is splitting text into smaller units (tokens) that the model can process. Since we are using the BERT model, we use the AutoTokenizer which uses the WordPiece tokenization method.

In [11]:
# tokenization
def tokenize(examples):

    # clean the instruction and input text fields
    cleaning = [clean_text(inst) for inst in examples['instruction']]
    cleaned_inputs = [clean_text(inp) for inp in examples['input']]

    # combine cleaned instruction and input into a single string as this allows the model tp understand the context better.
    inputs = [f"{inst} {inp}" for inst, inp in zip(cleaning, cleaned_inputs)]

    # The BERT tokenizer is used to split the text into tokens, pad the sequences to the same length, and truncate if necessary.
    tokenized_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)

    # Add the label to the tokenized input
    tokenized_inputs["labels"] = examples["label"]
    return tokenized_inputs


# tokenize datasets
tokenized_train = train_df.map(tokenize, batched=True)
tokenized_val = val_df.map(tokenize, batched=True)
tokenized_test = test_df.map(tokenize, batched=True)

# formatting for pytorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [38]:
#defining the model
num_labels = len(label_mapping)
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=num_labels)

for param in model.parameters():
    if not param.is_contiguous():
        param.data = param.data.contiguous()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
# define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none"
)



In [40]:


# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.get("labels")
#         if labels is None:
#             raise ValueError("Labels should not be None")
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         loss = F.cross_entropy(logits, labels)
#         return (loss, outputs) if return_outputs else loss


# trainer = CustomTrainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_train,
#     eval_dataset=tokenized_val,
# )

# defining a custom Trainer to calculate loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("Labels should not be None")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        logits = logits.contiguous()
        labels = labels.contiguous()

        loss = F.cross_entropy(logits, labels)

        return (loss, outputs) if return_outputs else loss

# initializing the trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [41]:
# training the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,9.6181,9.687219
2,9.6176,9.978241


Epoch,Training Loss,Validation Loss
1,9.6181,9.687219
2,9.6176,9.978241
3,9.483,11.4497


TrainOutput(global_step=4500, training_loss=9.572035915798612, metrics={'train_runtime': 3629.1633, 'train_samples_per_second': 9.92, 'train_steps_per_second': 1.24, 'total_flos': 1.0729306939392e+16, 'train_loss': 9.572035915798612, 'epoch': 3.0})

In [42]:
# saving the model
model_path = "./saved_model"
os.makedirs(model_path, exist_ok=True)
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/vocab.txt',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [43]:
# Evaluating the model
evaluation = trainer.evaluate()
evaluation

{'eval_loss': 9.68721866607666,
 'eval_runtime': 41.9402,
 'eval_samples_per_second': 35.765,
 'eval_steps_per_second': 4.483,
 'epoch': 3.0}

In [47]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Evaluate model on test data
predictions = trainer.predict(tokenized_test)

# Get the unique labels in the predictions
unique_labels_in_data = np.unique(predictions.label_ids)

# Check if all unique labels are present in the label mapping
filtered_label_mapping = {
    label: idx for label, idx in label_mapping.items() if idx in unique_labels_in_data
}

# Ensure target_names includes all labels in unique_labels_in_data as strings
# Sort to maintain the order between labels and target_names
target_names = [
    str(label) for idx, label in sorted(filtered_label_mapping.items(), key=lambda item: item[1])
]

# Function to compute metrics
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)

    # Ensure the report uses the correct label names
    report = classification_report(labels, preds, labels=unique_labels_in_data, target_names=target_names)
    conf_matrix = confusion_matrix(labels, preds, labels=unique_labels_in_data)
    return report, conf_matrix

# Compute the metrics
report, conf_matrix = compute_metrics(predictions)
print("Classification Report:\n", report)
print("Confusion Matrix:\n", conf_matrix)


Classification Report:
               precision    recall  f1-score   support

           7       0.00      0.00      0.00       1.0
          11       0.00      0.00      0.00       1.0
          21       0.00      0.00      0.00       1.0
          22       0.00      0.00      0.00       1.0
          24       0.00      0.00      0.00       1.0
          45       0.00      0.00      0.00       1.0
          68       0.00      0.00      0.00       1.0
          88       0.00      0.00      0.00       1.0
          89       0.00      0.00      0.00       1.0
          93       0.00      0.00      0.00       1.0
          96       0.00      0.00      0.00       1.0
         122       0.00      0.00      0.00       1.0
         170       0.00      0.00      0.00       1.0
         171       0.00      0.00      0.00       1.0
         178       0.00      0.00      0.00       1.0
         196       0.00      0.00      0.00       1.0
         203       0.00      0.00      0.00       1.0
   

In [48]:
# Generating a response category
def predict_category(instruction, input_text):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Clean the instruction and input text
    cleaned_instruction = clean_text(instruction)
    cleaned_input = clean_text(input_text)

    # Getting the BERT prediction
    inputs = tokenizer(f"{cleaned_instruction} {cleaned_input}", return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_id = torch.argmax(logits, dim=-1).item()
    bert_prediction = list(label_mapping.keys())[list(label_mapping.values()).index(predicted_id)]

    # Getting the semantic search results
    semantic_results = semantic_search(f"{cleaned_instruction} {cleaned_input}")

    if bert_prediction in [result[0] for result in semantic_results]:
        return bert_prediction
    else:
        return semantic_results[0][0]



instruction_input = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "I have a 13 month old baby who is very congested with a terrible cough. Its rattly/raspy and croupy sounding cough. She started choking on her coughs and the mucous that has come up. She also has a fever and runny nose. Should i take her to urgent care?"
predicted_label = predict_category(instruction_input, sample_input)
print(f"Instruction: {instruction_input}")
print(f"Input: {sample_input}")
print(f"Predicted Response: {predicted_label}")

Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: I have a 13 month old baby who is very congested with a terrible cough. Its rattly/raspy and croupy sounding cough. She started choking on her coughs and the mucous that has come up. She also has a fever and runny nose. Should i take her to urgent care?
Predicted Response: Hi, fever, cough and cold in a 13-month-old child suggest upper respiratory tract infection. Since you are already giving antipyretic medicine for 5 days, the child should be given antibiotic depending on the chest examination. Kindly get the child examined by doctor so that antibiotic can be started. Blood test like complete blood count, CRP level should be done to know the causes of fever. Take care.


In [49]:
# Getting model and tokenizer
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [53]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=e7cc54cd2cc54efb110bfd1a25391e99624fd2c391a60a41c815671943e5a884
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [54]:
import torch
from sklearn.metrics import classification_report, confusion_matrix
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

# Load GPT-2 model and tokenizer for Perplexity calculation
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Ensure models are on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
gpt2_model.to(device)  # Move GPT-2 model to device

def calculate_perplexity(text):
    input_ids = gpt2_tokenizer.encode(text, return_tensors='pt').to(device)  # Move input to device
    with torch.no_grad():
        outputs = gpt2_model(input_ids, labels=input_ids)
    loss = outputs.loss
    perplexity = torch.exp(loss).item()
    return perplexity

# Function to calculate BLEU, ROUGE, and Perplexity scores
def calculate_metrics(prediction, true_output):
    bleu_score = sentence_bleu([true_output.split()], prediction.split())
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge_scorer_obj.score(true_output, prediction)
    perplexity = calculate_perplexity(prediction)
    metrics = {
        "BLEU": bleu_score,
        "ROUGE-1": rouge_scores['rouge1'].fmeasure,
        "ROUGE-L": rouge_scores['rougeL'].fmeasure,
        "Perplexity": perplexity
    }
    return metrics

# Function to generate response category and calculate metrics
def predict_category(instruction, input_text, true_output):
    # Clean the instruction and input text
    cleaned_instruction = clean_text(instruction)
    cleaned_input = clean_text(input_text)

    # Get BERT prediction
    inputs = tokenizer(
        f"{cleaned_instruction} {cleaned_input}",
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    ).to(device)  # Move inputs to device

    model.to(device)  # Move BERT model to device

    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    bert_prediction = list(label_mapping.keys())[list(label_mapping.values()).index(predicted_class_id)]

    # Get semantic search results
    semantic_results = semantic_search(f"{cleaned_instruction} {cleaned_input}")

    # If BERT prediction is in top semantic results, return it; otherwise, return top semantic result
    if bert_prediction in [result[0] for result in semantic_results]:
        prediction = bert_prediction
    else:
        prediction = semantic_results[0][0]  # Return the top semantic search result

    # Calculate metrics
    metrics = calculate_metrics(prediction, true_output)

    return prediction, metrics

# Test the model with a sample input
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "my husband was working on a project in the house and all of a sudden a bump about the size of a half dollar appeard on his left leg inside below the knee. He is 69 years old and had triple by pass surgery 7 years ago. It stung when it first happened. Doesn t hurt now. He is seated with his leg ellevated. Is this an emergency?"
true_output = "Hello. It could be a blood collection due to minor injury or a vein rupture which is also common at this age. It is not an emergency, but you should apply compression bandage and warm compresses if six hours have past. Furthermore, it should get relieved over the next few days but if it continues to increase or persist then you should see a Doctor who can examine the patient. Take care. Chat Doctor."

predicted_label, metrics = predict_category(sample_instruction, sample_input, true_output)
print(f"Instruction: {sample_instruction}")
print(f"Input: {sample_input}")
print(f"True Output: {true_output}")
print(f"Predicted Response: {predicted_label}")
print(f"Metrics: {metrics}")


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: my husband was working on a project in the house and all of a sudden a bump about the size of a half dollar appeard on his left leg inside below the knee. He is 69 years old and had triple by pass surgery 7 years ago. It stung when it first happened. Doesn t hurt now. He is seated with his leg ellevated. Is this an emergency?
True Output: Hello. It could be a blood collection due to minor injury or a vein rupture which is also common at this age. It is not an emergency, but you should apply compression bandage and warm compresses if six hours have past. Furthermore, it should get relieved over the next few days but if it continues to increase or persist then you should see a Doctor who can examine the patient. Take care. Chat Doctor.
Predicted Response: Hi, happy to help you on your health concern on part of Chat Doctor! In the present case of below-knee swelling or lump, if