In [1]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
from openai import OpenAI
from tqdm.auto import tqdm
import torch
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from project_secrets import GPT_4o_API_KEY

In [3]:
# Chat client pointed at the OpenRouter endpoint; the API key is imported from
# project_secrets so it never appears in the notebook itself.
client_4o = OpenAI(
    api_key=GPT_4o_API_KEY,
    base_url="https://openrouter.ai/api/v1",
)

# Model identifier sent with every chat completion request.
GPT_4o_MODEL = "openai/gpt-4o-mini"

In [4]:
# Load the "ruslanmv/ai-medical-chatbot" dataset. Later cells read its "train"
# split and the "Description", "Patient" and "Doctor" fields of each entry.
ds = load_dataset("ruslanmv/ai-medical-chatbot")

In [5]:
# Peek at the "Description" field of the first training entry as a sanity check.
ds["train"][0]["Description"]

'Q. What does abutment of the nerve root mean?'

In [6]:
# Pair each patient message with the doctor's reply, in dataset order.
# The two columns are read directly and zipped instead of iterating entry by
# entry: row-wise iteration builds a dict containing every field (including the
# unused "Description") for every row, which is much slower on a large split.
train_split = ds["train"]
qa_pairs = list(zip(train_split["Patient"], train_split["Doctor"]))

In [7]:
# Unzip the (patient, doctor) pairs into two parallel tuples: position i in
# `questions` and position i in `answers` come from the same pair.
# Note: unpacking raises ValueError if qa_pairs is empty.
questions, answers = zip(*qa_pairs)

In [14]:
# Sentence-embedding model used both to encode the stored questions and to
# embed incoming queries in retrieve_context.
# NOTE(review): a cached question_embeddings.pkl is only meaningful if it was
# built with this same model -- confirm before swapping the model name.
embed_model = SentenceTransformer("paraphrase-MiniLM-L3-v2")  # Fast and effective

In [None]:
# ! Do not run this cell unless you want to rebuild question_embeddings
# Encode every stored question in fixed-size batches, then stack the per-batch
# arrays row-wise into a single matrix (one row per question, in the same
# order as `questions`).
batch_size = 32
question_embeddings = []

# The outer tqdm bar already tracks progress over batches, so the per-call bar
# inside encode() is switched off; leaving it on renders a separate progress
# bar for every single batch and floods the cell output.
for i in tqdm(range(0, len(questions), batch_size), desc="Embedding questions"):
    batch = questions[i:i + batch_size]
    batch_embeddings = embed_model.encode(batch, convert_to_numpy=True, show_progress_bar=False)
    question_embeddings.append(batch_embeddings)

question_embeddings = np.vstack(question_embeddings)

In [30]:
# ! Only run this cell when you want to save question_embeddings
with open("question_embeddings.pkl", "wb") as f:
    pickle.dump(question_embeddings, f)

In [8]:
# ! Only run this cell when you want to LOAD question_embeddings
with open("question_embeddings.pkl", "rb") as f:
    question_embeddings = pickle.load(f)

In [9]:
# Store embeddings in FAISS for fast retrieval
dimension = question_embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(question_embeddings)

In [11]:
def retrieve_context(user_query, top_k=3):
    """Find the stored question/answer pairs most similar to ``user_query``.

    The query is embedded with ``embed_model`` and searched against the FAISS
    ``index``; each hit is rendered as a "Patient: ..." line followed by a
    "Doctor: ..." line.

    Args:
        user_query: Free-text question to look up.
        top_k: Maximum number of stored pairs to return (default 3).

    Returns:
        The retrieved pairs as one string, separated by blank lines. Empty
        string when ``top_k`` is not positive or nothing is retrieved.
    """
    if top_k <= 0:
        return ""

    query_embedding = embed_model.encode([user_query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with -1 when it finds fewer than top_k neighbours
    # (e.g. the index holds fewer vectors than requested). Without this filter
    # questions[-1] would silently select the LAST stored pair as "context".
    hit_ids = [int(i) for i in indices[0] if i >= 0]

    return "\n\n".join(
        f"Patient: {questions[i]}\nDoctor: {answers[i]}" for i in hit_ids
    )

In [12]:
def medical_chatbot(user_query, top_k=3, temperature=0.7):
    """Answer a patient question using retrieved Q/A pairs as context.

    Similar stored conversations are fetched with ``retrieve_context`` and
    placed in the prompt, which is then sent to the chat model named by
    ``GPT_4o_MODEL`` through the ``client_4o`` client.

    Args:
        user_query: The patient's question.
        top_k: Number of stored conversations to retrieve as context
            (default 3, the retriever's own default).
        temperature: Sampling temperature passed to the chat completion
            request (default 0.7).

    Returns:
        The content of the first choice in the model's response.

    Raises:
        RuntimeError: If the response contains no choices.
    """
    retrieved_info = retrieve_context(user_query, top_k)
    prompt = f"""
    You are a helpful and professional medical chatbot. Below is past conversation data:

    {retrieved_info}

    Now answer the following question in a helpful and concise manner:
    Patient: {user_query}
    Doctor:
    """
    response = client_4o.chat.completions.create(
        model=GPT_4o_MODEL,
        messages=[
            {
                "role": "system", "content": "You are a medical chatbot."
            },
            {
                "role": "user", "content": prompt
            }
        ],
        temperature=temperature
    )
    # A response without choices would otherwise surface as an opaque
    # TypeError/IndexError on the subscript below; raise a clear error instead.
    if not response.choices:
        raise RuntimeError("Chat completion returned no choices")
    return response.choices[0].message.content

In [15]:
# Smoke test: run one sample patient question through retrieval + generation.
user_query = (
    "Last night, I bled from one nostril. I wasn't hit or anything, "
    "I was sleeping on my stomach. Should I be worried?"
)
ai_response = medical_chatbot(user_query)
print(f"Chatbot: {ai_response}")

Chatbot: Hi! Occasional nosebleeds can be quite common and might not be a cause for concern, especially if there was no prior trauma or injury. Sleeping on your stomach could potentially irritate the nasal passages. Factors such as dry air, allergies, or even nasal congestion can also contribute to nosebleeds. However, if you experience frequent nosebleeds, significant bleeding, or other concerning symptoms, it would be wise to consult a healthcare provider for further evaluation. Keeping your nasal passages moisturized with saline spray or a humidifier may help prevent future occurrences. Take care!
