<a href="https://colab.research.google.com/github/Avinashrkrk/CLinicalBERT-Medical-ChatBot/blob/main/CLinicalBERT_medical_chatbot_safehealth.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets torch sentencepiece

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments
from datasets import load_dataset, load_metric
import nltk
import numpy as np

In [None]:
# NLTK data: sentence-splitting models used by nltk.sent_tokenize.
# 'punkt' serves older NLTK releases; NLTK >= 3.9 looks up 'punkt_tab' instead,
# so both are fetched (an unknown resource only logs an error, it does not raise).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Fetch the patient/doctor dialogue corpus by its repository id
dataset = load_dataset(path="ruslanmv/ai-medical-chatbot")

In [None]:
# Splitting the dataset into train (90%) and test (10%).
# A fixed seed keeps the split identical across kernel restarts, so metrics are comparable.
# NOTE: this rebinds `dataset` to the split result; re-running only this cell
# would split the already-reduced train portion again.
dataset = dataset['train'].train_test_split(test_size=0.1, seed=42)
train_dataset = dataset['train']
test_dataset = dataset['test']

In [None]:
# Peek at the first training record to check the available fields
train_dataset[0]

{'Description': 'Should I be concerned about having pain in the left side of my back?',
 'Patient': 'I have been having left side pain on the middle of my side but it can be in my back, chest and even my lower right side at times. No other symptoms at all. I had hernia repair with mesh 1 year ago. THe pain is tolerable just annoying. Should I be concerned?',
 'Doctor': 'Hello, The pain on one side of the back can be postural in nature. Standing for long periods and lack of adequate exercise can cause such a pain. Oral muscle relaxants and regular exercises can help. Hope I have answered your query. Let me know if I can assist you further. Take care Regards, Dr Praveen Tayal, Orthopaedic Surgeon'}

In [None]:
# Peek at a second training record; left as the cell's last expression so the
# notebook renders it itself instead of going through print()
train_dataset[1]

{'Description': 'Q. My cuticles have developed blood specks. Is it scleroderma?', 'Patient': "Hi doctor,I am contacting you because I have Raynaud's, and the past few months my cuticles have developed blood specks. I know these symptoms point to scleroderma. It has me very worried. I need to visit a doctor for tests but I have to wait a few days and I would really like to get some clarification on a few things. I get the impression that once you show these signs of specks in cuticles it definitely indicates scleroderma or some other autoimmune issue. Is it possible that I am wrong? I stopped getting manicures and the cuticles started getting very overgrown. Is it possible the specks in my cuticles are just because of this? Or maybe because my hands are dry and I have many hangnails? Another question I have is if I do ultimately have limited scleroderma CREST, is it common for these CREST patients to develop full-blown diffuse scleroderma? I am very stressed out about this. I am 26 year

In [None]:
# Loading the tokenizer and model
# Both must come from the same checkpoint: the model's embedding rows are indexed
# by that checkpoint's own vocabulary. A BERT WordPiece tokenizer (e.g.
# Bio_ClinicalBERT) must not be paired with T5, because its ids point at unrelated
# embedding rows and it never emits the EOS token T5 uses to stop generation.
MODEL_CHECKPOINT = "t5-small"

tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Using a seq2seq model like T5
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_CHECKPOINT)

In [None]:
# Preprocessing the data
def preprocess_data(examples):
    """Tokenize a batch of patient messages (inputs) and doctor replies (labels).

    Args:
        examples: Batch mapping with 'Patient' and 'Doctor' entries, each a
            list of strings (the form passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the patient texts, with an added "labels"
        entry holding the token ids of the doctor texts. Both sides are
        truncated and padded to 512 tokens.
    """
    inputs = examples['Patient']
    targets = examples['Doctor']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    # `text_target=` is the supported replacement for the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(text_target=targets, max_length=512, truncation=True, padding='max_length')
    # NOTE(review): padded label positions keep the pad-token id (they are not
    # replaced by -100), so they still contribute to the training loss.
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize the data: run preprocess_data over both splits in batched mode
tokenized_train_dataset, tokenized_test_dataset = (
    split.map(preprocess_data, batched=True)
    for split in (train_dataset, test_dataset)
)

Map:   0%|          | 0/231224 [00:00<?, ? examples/s]



Map:   0%|          | 0/25692 [00:00<?, ? examples/s]

In [None]:
training_args = Seq2SeqTrainingArguments(
    # Where checkpoints are written and how many are kept on disk
    output_dir="./results",
    save_total_limit=3,
    # Optimisation settings
    num_train_epochs=8,
    learning_rate=5e-5,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate once per epoch, using generate() so text metrics can be computed
    evaluation_strategy="epoch",
    predict_with_generate=True,
)



In [None]:
# BLEU scorer (sacreBLEU) consumed by compute_metrics
# NOTE(review): load_metric appears to be deprecated in favour of the separate
# `evaluate` package; confirm before upgrading `datasets`.
metric = load_metric(path="sacrebleu")

In [None]:
def compute_metrics(eval_pred):
    """Score generated replies against reference replies with BLEU.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays. Either
            array may contain -100 as a fill value for ignored positions.

    Returns:
        Dict with "bleu" (the metric's ``score``) and "gen_len" (mean number
        of non-padding tokens per prediction).
    """
    predictions, labels = eval_pred
    # Some models hand back (sequences, extras); only the token ids are scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a vocabulary id and cannot be decoded, so map it to padding first.
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    # sacreBLEU takes a *list of references per prediction*, so each single
    # reference is wrapped in its own list.
    decoded_labels = [["\n".join(nltk.sent_tokenize(label.strip()))] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}
    # Counted after the -100 replacement so fill values are not taken for tokens.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [None]:

# Wire the model, both tokenized splits and the BLEU callback into the trainer
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_test_dataset,
    train_dataset=tokenized_train_dataset,
)

In [None]:
# Run fine-tuning; epochs, batch sizes and evaluation cadence come from training_args
trainer.train()

In [None]:
# Persist the fine-tuned weights and the tokenizer files into the same directory
trainer.save_model(output_dir="clinical_bert_seq2seq_model")
tokenizer.save_pretrained(save_directory="clinical_bert_seq2seq_model")

In [None]:
# Reload the saved artefacts from disk so the chat cells use the stored checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained("clinical_bert_seq2seq_model")
tokenizer = AutoTokenizer.from_pretrained("clinical_bert_seq2seq_model")

In [None]:
def chat_with_ai(model, tokenizer, input_text):
    """Generate a reply to ``input_text`` using beam search.

    Args:
        model: Seq2seq model exposing ``generate`` (and, optionally, ``device``).
        tokenizer: Tokenizer paired with ``model``.
        input_text: The user's message.

    Returns:
        The first generated sequence decoded to text, special tokens removed.
    """
    # A single prompt needs no padding; it is only truncated to 512 tokens.
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]

    # Keep the tensors on the same device as the model weights (e.g. after GPU training).
    device = getattr(model, "device", None)
    if device is not None:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

    # Only the two tensors generate() needs are forwarded: tokenizers may return
    # extra keys the model does not accept, and the mask is passed explicitly
    # rather than left for generate() to infer.
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=512,
        num_beams=5,
        early_stopping=True,
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

In [None]:
def chat():
    """Run an interactive console loop that answers user messages.

    Reads lines from standard input and prints the reply produced by
    ``chat_with_ai`` using the notebook-level ``model`` and ``tokenizer``.
    The loop ends when the user types 'exit' (case-insensitive, surrounding
    whitespace ignored), when input is closed, or when it is interrupted.
    """
    print("Welcome to the Medical Chatbot - Safe Health. Type 'exit' to end the conversation.")
    while True:
        try:
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or an interrupted kernel ends the session like 'exit'.
            break
        if user_input.lower() == 'exit':
            break
        if not user_input:
            # Nothing to answer; prompt again instead of generating from empty text.
            continue
        response = chat_with_ai(model, tokenizer, user_input)
        print(f"AI: {response}")

In [None]:
# Start the interactive session; type 'exit' at the prompt to stop
chat()