In [None]:
!pip install datasets
!pip install transformers
!pip install sentence-transformers
!pip install accelerate -U
!pip install nltk
!pip install rouge_score

In [1]:
# import necessary libraries
import os
from datasets import load_dataset
from transformers import AutoTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
import torch.nn.functional as F
import re

from sentence_transformers import SentenceTransformer
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer

import json

In [2]:
# load dataset
ds = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

ds

Downloading readme:   0%|          | 0.00/542 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 112165
    })
})

In [4]:
# # shuffle and select a subset of the data
# full_dataset = ds['train'].shuffle(seed=42).select(range(11000))


# select the first 15000 rows of the dataset
full_dataset = ds['train'].select(range(15000))

full_dataset

Dataset({
    features: ['instruction', 'input', 'output'],
    num_rows: 15000
})

In [5]:
# split the data
train_testvalid = full_dataset.train_test_split(test_size=0.2, seed=42)
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)

train_dataset = train_testvalid['train']
val_dataset = test_valid['train']
test_dataset = test_valid['test']

In [6]:
# extract unique outputs to use as labels
unique_labels = list(set(train_dataset['output'] + val_dataset['output'] + test_dataset['output']))

# create a label mapping
label_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# save the label mapping to a json file
label_mapping_path = "./label_mapping.json"
with open(label_mapping_path, 'w') as file:
    json.dump(label_mapping, file)

print(f"Label mapping saved to {label_mapping_path}")

Label mapping saved to ./label_mapping.json


In [7]:
# --- semantic search --- #
# Load a pre-trained sentence transformer model
semantic_model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all possible responses
response_embeddings = semantic_model.encode(list(label_mapping.keys()))

# Save the response embeddings
response_embeddings_path = "./response_embeddings.npy"
np.save(response_embeddings_path, response_embeddings)
print(f"Response embeddings saved to {response_embeddings_path}")

# Save the semantic model
semantic_model_save_path = "./semantic_model"
os.makedirs(semantic_model_save_path, exist_ok=True)
semantic_model.save(semantic_model_save_path)
print(f"Semantic model saved to {semantic_model_save_path}")


def semantic_search(query, top_k=5):
    # Encode the query
    query_embedding = semantic_model.encode([query])

    # Calculate cosine similarity
    similarities = cosine_similarity(query_embedding, response_embeddings)[0]

    # Get top-k similar responses
    top_indices = similarities.argsort()[-top_k:][::-1]

    return [(list(label_mapping.keys())[i], similarities[i]) for i in top_indices]
# --- semantic search --- #

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Response embeddings saved to ./response_embeddings.npy
Semantic model saved to ./semantic_model


In [8]:
# map text labels to numeric labels
def map_labels(example):
    example['label'] = label_mapping[example['output']]
    return example


# apply the label mapping
train_dataset = train_dataset.map(map_labels)
val_dataset = val_dataset.map(map_labels)
test_dataset = test_dataset.map(map_labels)

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [9]:
# load the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [10]:
# text cleaning
def clean_text(text):
    # convert to lowercase
    text = text.lower()
    # remove special characters and digits
    text = re.sub(r'[^a-z\s]', '', text)
    # remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [11]:
# tokenization
def tokenize_function(examples):

    # clean the instruction and input text
    cleaned_instructions = [clean_text(inst) for inst in examples['instruction']]
    cleaned_inputs = [clean_text(inp) for inp in examples['input']]

    # combine cleaned instruction and input
    inputs = [f"{inst} {inp}" for inst, inp in zip(cleaned_instructions, cleaned_inputs)]

    tokenized_inputs = tokenizer(inputs, truncation=True, padding="max_length", max_length=512)
    tokenized_inputs["labels"] = examples["label"]
    return tokenized_inputs


# tokenize datasets
tokenized_train = train_dataset.map(tokenize_function, batched=True)
tokenized_val = val_dataset.map(tokenize_function, batched=True)
tokenized_test = test_dataset.map(tokenize_function, batched=True)

# format for PyTorch
tokenized_train.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [12]:
# define model
num_labels = len(label_mapping)
model = BertForSequenceClassification.from_pretrained("emilyalsentzer/Bio_ClinicalBERT", num_labels=num_labels)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
# define a custom Trainer to calculate loss
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels")
        if labels is None:
            raise ValueError("Labels should not be None")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        loss = F.cross_entropy(logits, labels)
        return (loss, outputs) if return_outputs else loss


# initialize trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
)

In [None]:
# train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,9.6506,9.805124
2,9.6441,10.115467
3,9.5129,11.774781


TrainOutput(global_step=4500, training_loss=9.591654947916666, metrics={'train_runtime': 12996.4726, 'train_samples_per_second': 2.77, 'train_steps_per_second': 0.346, 'total_flos': 1.0729306939392e+16, 'train_loss': 9.591654947916666, 'epoch': 3.0})

In [16]:
# save the model
model_save_path = "./saved_model"
os.makedirs(model_save_path, exist_ok=True)
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model and tokenizer saved to {model_save_path}")

Model and tokenizer saved to ./saved_model


In [17]:
# evaluate the model
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

Evaluation results: {'eval_loss': 9.805124282836914, 'eval_runtime': 67.6456, 'eval_samples_per_second': 22.174, 'eval_steps_per_second': 2.779, 'epoch': 3.0}


In [18]:
# generate response category
def predict_category(instruction, input_text):
    # Clean the instruction and input text
    cleaned_instruction = clean_text(instruction)
    cleaned_input = clean_text(input_text)

    # Get BERT prediction
    inputs = tokenizer(f"{cleaned_instruction} {cleaned_input}", return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    bert_prediction = list(label_mapping.keys())[list(label_mapping.values()).index(predicted_class_id)]

    # Get semantic search results
    semantic_results = semantic_search(f"{cleaned_instruction} {cleaned_input}")

    # If BERT prediction is in top semantic results, return it. otherwise, return top semantic result
    if bert_prediction in [result[0] for result in semantic_results]:
        return bert_prediction
    else:
        return semantic_results[0][0]  # Return the top semantic search result


# Test the model with a sample input
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "I have a 13 month old baby who is very congested with a terrible cough. Its rattly/raspy and croupy sounding cough. She started choking on her coughs and the mucous that has come up. She also has a fever and runny nose. Should i take her to urgent care?"
predicted_label = predict_category(sample_instruction, sample_input)
print(f"Instruction: {sample_instruction}")
print(f"Input: {sample_input}")
print(f"Predicted Response: {predicted_label}")

Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: I have a 13 month old baby who is very congested with a terrible cough. Its rattly/raspy and croupy sounding cough. She started choking on her coughs and the mucous that has come up. She also has a fever and runny nose. Should i take her to urgent care?
Predicted Response: Hi, fever, cough and cold in a 13-month-old child suggest upper respiratory tract infection. Since you are already giving antipyretic medicine for 5 days, the child should be given antibiotic depending on the chest examination. Kindly get the child examined by doctor so that antibiotic can be started. Blood test like complete blood count, CRP level should be done to know the causes of fever. Take care.


In [24]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

In [28]:
# Load GPT-2 model and tokenizer for Perplexity calculation
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2")
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

def calculate_perplexity(text):
    input_ids = gpt2_tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        outputs = gpt2_model(input_ids, labels=input_ids)
    loss = outputs.loss
    perplexity = torch.exp(loss).item()
    return perplexity

# Function to calculate BLEU, ROUGE, and Perplexity scores
def calculate_metrics(prediction, true_output):
    bleu_score = sentence_bleu([true_output.split()], prediction.split())
    rouge_scorer_obj = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    rouge_scores = rouge_scorer_obj.score(true_output, prediction)
    perplexity = calculate_perplexity(prediction)
    metrics = {
        "BLEU": bleu_score,
        "ROUGE-1": rouge_scores['rouge1'].fmeasure,
        "ROUGE-L": rouge_scores['rougeL'].fmeasure,
        "Perplexity": perplexity
    }
    return metrics

# Function to generate response category and calculate metrics
def predict_category(instruction, input_text, true_output):
    # Clean the instruction and input text
    cleaned_instruction = clean_text(instruction)
    cleaned_input = clean_text(input_text)

    # Get BERT prediction
    inputs = tokenizer(f"{cleaned_instruction} {cleaned_input}", return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    bert_prediction = list(label_mapping.keys())[list(label_mapping.values()).index(predicted_class_id)]

    # Get semantic search results
    semantic_results = semantic_search(f"{cleaned_instruction} {cleaned_input}")

    # If BERT prediction is in top semantic results, return it. otherwise, return top semantic result
    if bert_prediction in [result[0] for result in semantic_results]:
        prediction = bert_prediction
    else:
        prediction = semantic_results[0][0]  # Return the top semantic search result

    # Calculate metrics
    metrics = calculate_metrics(prediction, true_output)

    return prediction, metrics

# Test the model with a sample input
sample_instruction = "If you are a doctor, please answer the medical questions based on the patient's description."
sample_input = "my husband was working on a project in the house and all of a sudden a bump about the size of a half dollar appeard on his left leg inside below the knee. He is 69 years old and had triple by pass surgery 7 years ago. It stung when it first happened. Doesn t hurt now. He is seated with his leg ellevated. Is this an emergency?"
true_output = "Hello. It could be a blood collection due to minor injury or a vein rupture which is also common at this age. It is not an emergency, but you should apply compression bandage and warm compresses if six hours have past. Furthermore, it should get relieved over the next few days but if it continues to increase or persist then you should see a Doctor who can examine the patient. Take care. Chat Doctor."

predicted_label, metrics = predict_category(sample_instruction, sample_input, true_output)
print(f"Instruction: {sample_instruction}")
print(f"Input: {sample_input}")
print(f"True Output: {true_output}")
print(f"Predicted Response: {predicted_label}")
print(f"Metrics: {metrics}")

Instruction: If you are a doctor, please answer the medical questions based on the patient's description.
Input: my husband was working on a project in the house and all of a sudden a bump about the size of a half dollar appeard on his left leg inside below the knee. He is 69 years old and had triple by pass surgery 7 years ago. It stung when it first happened. Doesn t hurt now. He is seated with his leg ellevated. Is this an emergency?
True Output: Hello. It could be a blood collection due to minor injury or a vein rupture which is also common at this age. It is not an emergency, but you should apply compression bandage and warm compresses if six hours have past. Furthermore, it should get relieved over the next few days but if it continues to increase or persist then you should see a Doctor who can examine the patient. Take care. Chat Doctor.
Predicted Response: Hi, happy to help you on your health concern on part of Chat Doctor! In the present case of below-knee swelling or lump, if

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [23]:
# from transformers import AutoModelForCausalLM
# import math

# # Load the model for calculating perplexity
# causal_model = AutoModelForCausalLM.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# # Function to calculate BLEU score
# def calculate_bleu(reference, candidate):
#     reference = [reference.split()]
#     candidate = candidate.split()
#     score = sentence_bleu(reference, candidate)
#     return score

# # Function to calculate ROUGE score
# def calculate_rouge(reference, candidate):
#     scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
#     scores = scorer.score(reference, candidate)
#     return scores

# # Function to calculate perplexity
# def calculate_perplexity(text):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
#     with torch.no_grad():
#         outputs = causal_model(**inputs, labels=inputs["input_ids"])
#     loss = outputs.loss
#     perplexity = math.exp(loss.item())
#     return perplexity

# # Evaluate the model with additional metrics
# def evaluate_model(eval_dataset):
#     bleu_scores = []
#     rouge_scores = []
#     perplexities = []

#     for example in eval_dataset:
#         instruction = example['instruction']
#         input_text = example['input']
#         reference = example['output']
#         predicted_label = predict_category(instruction, input_text)

#         # Calculate BLEU score
#         bleu = calculate_bleu(reference, predicted_label)
#         bleu_scores.append(bleu)

#         # Calculate ROUGE score
#         rouge = calculate_rouge(reference, predicted_label)
#         rouge_scores.append(rouge)

#         # Calculate Perplexity
#         perplexity = calculate_perplexity(predicted_label)
#         perplexities.append(perplexity)

#     # Calculate average scores
#     avg_bleu = np.mean(bleu_scores)
#     avg_rouge = {
#         'rouge1': np.mean([score['rouge1'].fmeasure for score in rouge_scores]),
#         'rouge2': np.mean([score['rouge2'].fmeasure for score in rouge_scores]),
#         'rougeL': np.mean([score['rougeL'].fmeasure for score in rouge_scores])
#     }
#     avg_perplexity = np.mean(perplexities)

#     return {
#         'avg_bleu': avg_bleu,
#         'avg_rouge': avg_rouge,
#         'avg_perplexity': avg_perplexity
#     }

# # Run evaluation on the validation set
# eval_results = evaluate_model(val_dataset)
# print(f"BLEU Score: {eval_results['avg_bleu']}")
# print(f"ROUGE Score: {eval_results['avg_rouge']}")
# print(f"Perplexity: {eval_results['avg_perplexity']}")


If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains

BLEU Score: 0.286250037088088
ROUGE Score: {'rouge1': 0.47370848519673897, 'rouge2': 0.31219912194738914, 'rougeL': 0.386691549496684}
Perplexity: 20513793.50274735


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
