In [None]:
!pip install langchain==0.1.1
!pip install langchain-community
!pip install sentence-transformers
!pip install chromadb
!pip install openai


In [None]:
import os
os._exit(00)

In [1]:
import os
import re
import json
from pathlib import Path
from typing import List
import gradio as gr

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain_community.llms import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [4]:
import os, json, re
from pathlib import Path
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.docstore.document import Document

BASE_DIR = "/kaggle/input/mimic-iv-ext-direct/mimic-iv-ext-direct-1.0.0"

def load_json_files_as_text(data_dir):
    """Load JSON files and preprocess text."""
    documents = []
    for path in Path(data_dir).rglob("*.json"):
        try:
            with open(path, "r", encoding="utf-8") as f:
                data = json.load(f)
                text = json.dumps(data)  
                if not text.strip(): 
                    continue

               
                if len(text) < 50:
                    continue

               
                text = re.sub(r"\s+", " ", text)

                
                text = re.sub(r"[^a-zA-Z0-9.,:;!?()\s]", "", text)

                documents.append(text)
        except:
            continue
    return documents

print("[INFO] Loading and preprocessing documents...")
all_texts = load_json_files_as_text(BASE_DIR)
print(f"[INFO] Total raw documents after preprocessing: {len(all_texts)}")


text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
docs = text_splitter.split_documents([Document(page_content=d) for d in all_texts])
print(f"[INFO] Total document chunks: {len(docs)}")


[INFO] Loading and preprocessing documents...
[INFO] Total raw documents after preprocessing: 1046
[INFO] Total document chunks: 7546


In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

CHROMA_DIR = "./chroma_db"
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [6]:
BATCH_SIZE = 5000

def build_or_load_vectorstore(docs, batch_size=BATCH_SIZE, force_rebuild=False):
    
    if force_rebuild or not os.path.exists(CHROMA_DIR):
        print("[INFO] Building vectorstore from documents in batches...")
        vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

        for i in range(0, len(docs), batch_size):
            batch_docs = docs[i:i + batch_size]
            vectorstore.add_documents(batch_docs)
            print(f"[INFO] Added batch {i // batch_size + 1} / {(len(docs) + batch_size - 1) // batch_size}")

        vectorstore.persist()
        print("[INFO] Vectorstore built and persisted!")
    else:
        print("[INFO] Loading existing vectorstore...")
        vectorstore = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

    return vectorstore


vectorstore = build_or_load_vectorstore(docs)


[INFO] Loading existing vectorstore...


In [13]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.llms import HuggingFacePipeline

tokenizer = AutoTokenizer.from_pretrained("mosaicml/mpt-7b-instruct")
model = AutoModelForCausalLM.from_pretrained("mosaicml/mpt-7b-instruct", device_map="auto")

generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=256,
    temperature=0.3
)

llm = HuggingFacePipeline(pipeline=generator)
 




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


In [14]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectorstore.as_retriever(),
    return_source_documents=True
)


In [16]:
from bert_score import score
import pandas as pd


queries = [
    "What are the common symptoms for patients with diabetes?",
    "What is the normal blood pressure range?",
    "List some common medications for hypertension.",
    "What is a normal fasting blood sugar level?",
    "What are the symptoms of anemia?",
    "List some common treatments for high cholesterol.",
    "What are the warning signs of a heart attack?",
    "List common preventive measures for stroke."
]


expected_answers = [
    "Common symptoms include frequent urination, excessive thirst, and fatigue.",
    "Normal blood pressure is typically around 120/80 mmHg.",
    "Common medications include ACE inhibitors, beta-blockers, and diuretics.",
    "A normal fasting blood sugar level is between 70–100 mg/dL.",
    "Symptoms include fatigue, weakness, pale skin, and shortness of breath.",
    "Treatments include statins, lifestyle changes, and dietary modifications.",
    "Warning signs include chest pain, shortness of breath, and nausea.",
    "Preventive measures include regular exercise, healthy diet, and controlling blood pressure."
]


In [26]:
generated_answers = []

for i, query in enumerate(queries):
    result = qa_chain({"query": query})  
    generated_answers.append(result["result"]) 
    print(f"Query {i+1}: {query}")
    print("Generated Answer:", result["result"])
    print("Expected Answer:", expected_answers[i])
    print("-" * 80)


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query 1: What are the common symptoms for patients with diabetes?
Generated Answer:  The most common symptoms of diabetes are frequent urination, increased thirst, and unexplained weight loss.
Expected Answer: Common symptoms include frequent urination, excessive thirst, and fatigue.
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query 2: What is the normal blood pressure range?
Generated Answer:  The normal blood pressure range is considered to be less than 120 over 80. Blood pressure is the force of the blood flowing through the arteries. High blood pressure, or hypertension, is when the blood pressure is higher than normal. It is a serious health condition and can lead to heart disease and stroke.
Expected Answer: Normal blood pressure is typically around 120/80 mmHg.
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query 3: List some common medications for hypertension.
Generated Answer:  Some common medications for hypertension include beta blockers, calcium channel blockers, ACE inhibitors, and ARBs.
Expected Answer: Common medications include ACE inhibitors, beta-blockers, and diuretics.
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query 4: What is a normal fasting blood sugar level?
Generated Answer:  A normal fasting blood sugar level is less than 100 mg/dL.
Expected Answer: A normal fasting blood sugar level is between 70–100 mg/dL.
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query 5: What are the symptoms of anemia?
Generated Answer:  Anemia is a condition in which the body does not have enough healthy red blood cells. The symptoms of anemia include fatigue, shortness of breath, weakness, and pale skin.
Expected Answer: Symptoms include fatigue, weakness, pale skin, and shortness of breath.
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Query 6: List some common treatments for high cholesterol.
Generated Answer:  Some common treatments for high cholesterol include statins, which reduce cholesterol production in the liver; and bile acid sequestrants, which reduce the absorption of cholesterol in the intestines. Other treatments include lifestyle changes, such as eating a low-cholesterol diet and exercising regularly.
Expected Answer: Treatments include statins, lifestyle changes, and dietary modifications.
--------------------------------------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


Generated Answer:  Chest pain or discomfort, especially when exerting yourself, is a common symptom. Other symptoms include pain or discomfort in other areas of the body, such as the back, arm, neck, or jaw, shortness of breath, and sweating. If you experience these symptoms, it is important to seek medical attention immediately.
--------------------------------------------------------------------------------
Query 8: List common preventive measures for stroke.
Generated Answer:  Common preventive measures for stroke include maintaining a healthy diet, exercising regularly, and not smoking.
Expected Answer: Preventive measures include regular exercise, healthy diet, and controlling blood pressure.
--------------------------------------------------------------------------------


In [28]:
evaluation_results = []

for i, gen in enumerate(generated_answers):
    expected = expected_answers[i]

    expected_words = expected.lower().split()
    accuracy = any(word in gen.lower() for word in expected_words[:3]) if expected_words else 0

    q_words = set(queries[i].lower().split())
    g_words = set(gen.lower().split())
    relevance = len(q_words.intersection(g_words))

    coherence = 1 if len(gen.split()) >= 10 else 0

    evaluation_results.append({
        "Query": queries[i],
        "Generated Answer": gen,
        "Expected Answer": expected,
        "Accuracy": int(accuracy),
        "Relevance Score": relevance,
        "Coherence Score": coherence
    })

print("\nEASY EVALUATION REPORT\n")
for r in evaluation_results:
    print("Query:", r["Query"])
    print("Accuracy:", r["Accuracy"])
    print("Relevance Score:", r["Relevance Score"])
    print("Coherence Score:", r["Coherence Score"])
    print("-" * 60)



EASY EVALUATION REPORT

Query: What are the common symptoms for patients with diabetes?
Accuracy: 1
Relevance Score: 4
Coherence Score: 1
------------------------------------------------------------
Query: What is the normal blood pressure range?
Accuracy: 1
Relevance Score: 5
Coherence Score: 1
------------------------------------------------------------
Query: List some common medications for hypertension.
Accuracy: 1
Relevance Score: 4
Coherence Score: 1
------------------------------------------------------------
Query: What is a normal fasting blood sugar level?
Accuracy: 1
Relevance Score: 6
Coherence Score: 1
------------------------------------------------------------
Query: What are the symptoms of anemia?
Accuracy: 1
Relevance Score: 3
Coherence Score: 1
------------------------------------------------------------
Query: List some common treatments for high cholesterol.
Accuracy: 1
Relevance Score: 5
Coherence Score: 1
--------------------------------------------------------

In [31]:
for i, query in enumerate(queries):
    generated = result["result"]
    expected = expected_answers[i]
    expected_words = expected.lower().split()
    accuracy = any(word in generated.lower() for word in expected_words) if expected_words else 0

    if accuracy == 0 :
        print(f"Error for Query {i+1}")
        print(f"Query: {query}")
        print(f"Generated Answer: {generated}")
        print(f"Expected Answer: {expected}")
        print(f"Issue: Low accuracy or low relevance")
        print("-" * 80)
    else:
        print(f"Query {i+1} passed")


Query 1 passed
Query 2 passed
Query 3 passed
Query 4 passed
Query 5 passed
Query 6 passed
Query 7 passed
Query 8 passed


In [32]:
import gradio as gr


def answer_query(user_query):
    try:
        result = qa_chain({"query": user_query})
        return result["result"]
    except Exception as e:
        return f"Error: {str(e)}"


iface = gr.Interface(
    fn=answer_query,
    inputs=gr.Textbox(lines=2, placeholder="Enter your clinical query here..."),
    outputs="text",
    title="HealthGenie: AI Healthcare Assistant",
    description="Ask clinical questions and get answers based on the RAG system.",
    examples=[
        "What are the common symptoms for patients with diabetes?",
        "List some common medications for hypertension.",
        "What are the warning signs of a heart attack?"
    ]
)


iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://f2fb7dd36a7e4dd783.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.
