<a href="https://colab.research.google.com/github/Bfestus/medical_chatbot_qa/blob/main/medical_chatbot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

✅ Step 1: Setup & Data Preparation

In [None]:
# Basic Libraries
import pandas as pd
import numpy as np
import os
import random
import tensorflow as tf

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load dataset
file_path = '/content/drive/MyDrive/Colab Notebooks/Medical_chatbot/medical_qa_doctor_style_refined.csv'
df = pd.read_csv(file_path)

# Clean dataset.
# Whitespace is normalised *before* de-duplication so that rows differing only
# by leading/trailing spaces collapse into one, and strings that are empty
# after stripping are turned into missing values so dropna() removes them too.
for text_column in ('question', 'answer'):
    stripped = df[text_column].str.strip()
    df[text_column] = stripped.mask(stripped == '')
df = df.dropna().drop_duplicates().reset_index(drop=True)

# Preview
print(f"Total cleaned samples: {len(df)}")

# Rename to the column names the tokenization step reads, and prepend the
# task prefix that is also prepended to questions at inference time.
df = df.rename(columns={'question': 'input_text', 'answer': 'target_text'})
df['input_text'] = 'healthcare question: ' + df['input_text']


Mounted at /content/drive
Total cleaned samples: 285


✅ Step 2: Train/Val/Test Split

In [None]:
from sklearn.model_selection import train_test_split

# 80-10-10 split: hold out 20% first, then halve the hold-out into
# validation and test. A fixed random_state keeps the split reproducible.
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Convert to Hugging Face Datasets.
# After the shuffled split each frame carries a non-contiguous index; without
# preserve_index=False it would be stored as an extra `__index_level_0__`
# column in every split.
from datasets import Dataset, DatasetDict
dataset = DatasetDict({
    'train': Dataset.from_pandas(train_df, preserve_index=False),
    'validation': Dataset.from_pandas(val_df, preserve_index=False),
    'test': Dataset.from_pandas(test_df, preserve_index=False)
})


✅ Step 3: Tokenization

In [None]:
from transformers import AutoTokenizer

# Checkpoint shared by the tokenizer here and the model loaded in Step 4.
model_checkpoint = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Questions and answers are capped at the same token budget.
max_input_length = max_target_length = 128

def preprocess(example):
    """Tokenize questions and answers into model inputs and training labels.

    Works on a single example or on a batch (as passed by ``map`` with
    ``batched=True``): ``example['input_text']`` / ``example['target_text']``
    may each be one string or a list of strings.

    Returns the tokenizer output for the inputs with an added ``labels``
    entry holding the target token ids. Both sides are padded/truncated to
    their fixed maximum lengths. Padding positions in ``labels`` are replaced
    with -100, the ignore index of the Hugging Face loss, so that the model
    is not trained (or validated) on predicting pad tokens.
    """
    inputs = tokenizer(example['input_text'], max_length=max_input_length, padding="max_length", truncation=True)
    targets = tokenizer(example['target_text'], max_length=max_target_length, padding="max_length", truncation=True)

    pad_id = tokenizer.pad_token_id
    label_ids = targets['input_ids']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        inputs['labels'] = [
            [-100 if token == pad_id else token for token in sequence]
            for sequence in label_ids
        ]
    else:
        # Single example: a flat list of token ids.
        inputs['labels'] = [-100 if token == pad_id else token for token in label_ids]
    return inputs

# Tokenize every split in batches and drop the original columns so that only
# the fields returned by preprocess() remain.
tokenized_datasets = dataset.map(
    function=preprocess,
    batched=True,
    remove_columns=dataset['train'].column_names,
)


✅ Step 4: Model Setup

In [None]:
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, create_optimizer

# Load model
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Data collator (returns TensorFlow tensors for each collated batch)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model, return_tensors="tf")

# Training parameters
batch_size = 8
epochs = 30
learning_rate = 5e-5
# Ceiling division: the batch generator in Step 5 also yields the final,
# shorter batch, so an epoch has ceil(n / batch_size) optimizer steps.
# Flooring here would make the learning-rate schedule finish before training
# does whenever the training set size is not a multiple of batch_size.
num_train_steps = ((len(tokenized_datasets['train']) + batch_size - 1) // batch_size) * epochs
optimizer, schedule = create_optimizer(init_lr=learning_rate, num_warmup_steps=0, num_train_steps=num_train_steps)


✅ Step 5: TF Dataset Creation

In [None]:
def create_tf_dataset_from_hf(dataset, data_collator, batch_size, shuffle=False):
    """Wrap a tokenized dataset in a ``tf.data.Dataset`` of collated batches.

    Args:
        dataset: iterable of examples exposing ``input_ids``,
            ``attention_mask`` and ``labels``.
        data_collator: callable that turns a list of examples into a batch
            indexable by those same three keys.
        batch_size: number of examples per batch; the last batch may be
            smaller.
        shuffle: when True, the example order is re-drawn with
            ``np.random.shuffle`` every time the generator is started.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` pairs, where
        ``features`` is a dict with ``input_ids`` and ``attention_mask``.
        All three tensors are declared as 2-D int32.
    """
    wanted_keys = ("input_ids", "attention_mask", "labels")
    # Materialise only the fields the collator needs, once, up front.
    examples = [{key: row[key] for key in wanted_keys} for row in dataset]

    int_matrix = tf.TensorSpec(shape=(None, None), dtype=tf.int32)
    signature = (
        {"input_ids": int_matrix, "attention_mask": int_matrix},
        int_matrix,
    )

    def data_generator():
        order = list(range(len(examples)))
        if shuffle:
            np.random.shuffle(order)
        for start in range(0, len(order), batch_size):
            chunk = order[start:start + batch_size]
            collated = data_collator([examples[position] for position in chunk])
            features = {
                name: np.array(collated[name])
                for name in ("input_ids", "attention_mask")
            }
            yield features, np.array(collated["labels"])

    return tf.data.Dataset.from_generator(data_generator, output_signature=signature)

# Build TF datasets: only the training split is shuffled; validation keeps
# its original order.
tf_train_dataset = create_tf_dataset_from_hf(
    dataset=tokenized_datasets["train"],
    data_collator=data_collator,
    batch_size=batch_size,
    shuffle=True,
)
tf_val_dataset = create_tf_dataset_from_hf(
    dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    batch_size=batch_size,
    shuffle=False,
)


✅ Step 6: Model Training & Saving

In [None]:
# Compile and train (only the optimizer is supplied to compile; no explicit
# loss argument is passed)
model.compile(optimizer=optimizer)
model.fit(
    tf_train_dataset,
    validation_data=tf_val_dataset,
    epochs=epochs,
)

# Save model and tokenizer side by side in the same Drive folder
output_dir = "/content/drive/MyDrive/Colab Notebooks/Medical_chatbot/healthcare-chatbot-model"
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")


Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Model saved to /content/drive/MyDrive/Colab Notebooks/Medical_chatbot/healthcare-chatbot-model


✅ Step 7: Inference Function

In [None]:
# Vocabulary used to decide whether a question is in scope for the chatbot.
# Matching is by plain substring on the lower-cased question (see below).
medical_keywords = [
    "symptom",
    "diagnose",
    "treatment",
    "medicine",
    "disease",
    "doctor",
    "covid",
    "cancer",
    "diabetes",
    "bipolar",
    "stroke",
    "fever",
    "infection",
    "pain",
    "mental",
    "health",
    "hospital",
    "vaccine",
    "prescription",
    "disorder",
    "diagnosed",
    "asthma",
    "epilepsy",
    "hypertension",
    "depression",
    "anxiety",
    "hiv",
    "ibuprofen",
    "lisinopril",
    "side effects",
    "paracetamol",
    "atorvastatin",
    "metformin",
    "checkup",
    "healthy lifestyle",
    "symptoms",
    "water",
    "dose",
    "blood pressure",
    "heart",
    "immune",
    "medication",
    "mental health",
    "therapy",
]


def is_medical_question(question):
    """Return True if the lower-cased question contains any medical keyword.

    The check is a substring test, not a whole-word test, so a keyword also
    matches inside a longer word (e.g. "pain" inside "Spain").
    """
    lowered = question.lower()
    for keyword in medical_keywords:
        if keyword in lowered:
            return True
    return False

def generate_answer(question):
    """Generate an answer for a healthcare question with the fine-tuned model.

    Questions that contain none of the medical keywords are refused with a
    fixed message. Otherwise the question is given the same
    "healthcare question: " prefix used for the training inputs, tokenized
    with the same input-length cap as in training, and decoded with beam
    search.

    Args:
        question: the user's question as a string.

    Returns:
        The decoded answer text, or the refusal message for questions that
        are not recognised as healthcare-related.
    """
    if not is_medical_question(question):
        return "❗ Sorry, I can only answer healthcare-related questions."
    input_text = "healthcare question: " + question
    # Truncate to the training-time input budget rather than the tokenizer's
    # default maximum, so inference sees inputs shaped like the training data.
    encoded = tokenizer(
        input_text,
        return_tensors="tf",
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    output = model.generate(
        encoded.input_ids,
        # Pass the mask explicitly instead of leaving it to be inferred.
        attention_mask=encoded.attention_mask,
        max_length=max_target_length,
        num_beams=4,
        early_stopping=True,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Smoke-test the inference helper on a few in-scope questions
test_questions = [
    "What are the symptoms of stroke?",
    "Can bipolar disorder be detected early?",
    "How is COVID-19 diagnosed?",
]

for q in test_questions:
    # The question is echoed before generation starts, then the answer follows.
    print(f"\n❓ Question: {q}")
    reply = generate_answer(q)
    print(f"💬 Answer: {reply}")



❓ Question: What are the symptoms of stroke?
💬 Answer: Symptoms of stroke include shortness of breath, fatigue. this information is helpful for understanding the condition better. understanding this response helps in gaining deeper insight into the medical condition and encourages timely medical consultation. If you have any concerns or symptoms, it's important to follow up with a healthcare provider for a personalized evaluation. If you have any concerns or symptoms, it's important to follow up with a healthcare provider for a personalized evaluation.

❓ Question: Can bipolar disorder be detected early?
💬 Answer: Yes, certainly, early detection of bipolar disorder is possible through questionnaires, ecg. this information is helpful for understanding the condition better. healthcare providers use behavioral assessments, interviews, and family history to evaluate and detect mood disorders at an early stage, potentially preventing more severe episodes. If you have any concerns or sympto