# Required Libraries

### 1. Installing

In [1]:
# %pip install numpy==1.24.3 pandas==2.0.3
# %pip install -qU langchain-text-splitters langchain-community langchain-google-genai langchain-huggingface langchain-pinecone langgraph transformers sentence-transformers pinecone-client


### 2.Importing

In [2]:
# Step 1: Import necessary libraries
import os
import getpass
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from pinecone import Pinecone
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import PromptTemplate
from langchain_core.documents import Document
from typing_extensions import List, TypedDict
from langgraph.graph import START, StateGraph
import pandas as pd
import numpy as np
import pinecone
import re
import string

# Document Loding and Pre-processing:

In [3]:
# Load the dataset (use the correct path to your MedQuad dataset)
dataset_path = "medquad.csv"
loader = CSVLoader(file_path = dataset_path)

In [4]:
# Load the documents from the CSV
documents = loader.load()

In [5]:
# Preprocessing function for documents cleaning
def preprocess_text(text):
    # Step 1: Remove extra spaces and newlines
    text = text.strip()
    text = " ".join(text.split())  # remove extra spaces between words

    # Step 2: Remove non-informative content (e.g., footer, page numbers, unnecessary metadata)
    text = re.sub(r'\(.*\)', '', text)  # Removes text inside parentheses (e.g., page numbers or notes)

    # Step 3: Extract useful metadata (e.g., source, focus_area) and remove from text if needed
    source_match = re.search(r'source:\s*(.*)', text)
    focus_area_match = re.search(r'focus_area:\s*(.*)', text)
    metadata = {
        'source': source_match.group(1) if source_match else '',
        'focus_area': focus_area_match.group(1) if focus_area_match else ''
    }

    # Step 4: Remove the extracted metadata from text
    text = re.sub(r'source:.*', '', text)  # Remove the "source" part
    text = re.sub(r'focus_area:.*', '', text)  # Remove the "focus_area" part

    # Step 5: Remove punctuation (optional, depends on the task)
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Step 6: Convert text to lowercase
    text = text.lower()

    return text, metadata

# Loop through each document and apply preprocessing
for doc in documents:
    cleaned_text, doc_metadata = preprocess_text(doc.page_content)
    doc.page_content = cleaned_text
    doc.metadata.update(doc_metadata)

# Splitting Documents

In [6]:
# Initialize the RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=200,  # chunk overlap (characters)
    add_start_index=True,  # track index in original document
)

# Split the document into smaller chunks
split_docs = text_splitter.split_documents(documents)

# Preview the split documents
print(f"Number of split documents: {len(split_docs)}")
print("Sample split document:")
print(split_docs[0].page_content[:500])  # Show first 500 characters of the first chunk

Number of split documents: 22372
Sample split document:
question what is  in glaucoma for still unknown reasons the fluid drains too slowly out of the eye as the fluid builds up the pressure inside the eye rises unless this pressure is controlled it may cause damage to the optic nerve and other parts of the eye and result in loss of vision openangle glaucoma the most common type of glaucoma is called openangle glaucoma in the normal eye the clear fluid leaves the anterior chamber at the open angle where the cornea and iris meet when fluid reaches the


# Creating Embeddings and Upsert in Pinecone

In [13]:
# Initialize the embeddings model
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# Initialize Pinecone client with your API key
pc = Pinecone(api_key="Paste-Your-Pinecone-API-Key-Here")

# Connect to the specific index
index_name = "medical-rag-chatbot"
index = pc.Index(index_name)

# Set up the VectorStore
vector_store = PineconeVectorStore(embedding=embeddings, index=index)

# Add the document chunks to the vector store
document_ids = vector_store.add_documents(documents=split_docs)

# Print the first 3 document IDs to verify
print(f"First 3 Document IDs: {document_ids[:3]}")

First 3 Document IDs: ['db21ba52-d381-49e0-b3d1-ebb306e23301', '9315c60e-a8d6-4288-b6b8-0c9db4efd464', 'c3fa64c2-3876-4da2-820f-29ad63e13c80']


# Querying and Retrieval

In [26]:
# Define the function for retrieval
def retrieve(query: str, k: int = 3):  # Use 'k' for the number of results to return
    # Perform similarity search to retrieve the most relevant documents
    retrieved_docs = vector_store.similarity_search(query, k=k)

    # Return the retrieved documents
    return retrieved_docs

# Example user query
query = "What is glaucoma and its treatment?"

# Retrieve relevant documents for the query
retrieved_docs = retrieve(query)

# Display the retrieved documents
print(f"Retrieved Documents for the query '{query}':")
for doc in retrieved_docs:
    print(doc.page_content[:300])  # Print the first 300 characters of each retrieved document

Retrieved Documents for the query 'What is glaucoma and its treatment?':
question what is  glaucoma  answer glaucoma is a group of diseases that can damage the eyes optic nerve it is a leading cause of blindness in the united states it usually happens when the fluid pressure inside the eyes slowly rises damaging the optic nerve often there are no symptoms at first withou
to ways some patients are coping with glaucoma surgery laser surgery is another treatment for glaucoma during laser surgery a strong beam of light is focused on the part of the anterior chamber where the fluid leaves the eye this results in a series of small changes that makes it easier for fluid to
question what is  in glaucoma for still unknown reasons the fluid drains too slowly out of the eye as the fluid builds up the pressure inside the eye rises unless this pressure is controlled it may cause damage to the optic nerve and other parts of the eye and result in loss of vision openangle glau


# LLM Integration for Generating Answers

In [46]:
# Define the state of the application
class State(TypedDict):
    question: str
    context: List[Document]
    answer: str

# Initialize the Gemini API key for authentication
gemini_api_key = "Paste-Your-Gemini-API-Key-Here"

# Initialize the ChatGoogleGenerativeAI model with the Gemini API key
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", api_key=gemini_api_key, temperature=0)

# Define the retrieval step
def retrieve(state: State):
    # Perform similarity search to retrieve relevant documents
    retrieved_docs = vector_store.similarity_search(state["question"], k=5)

    # If no documents found, add a custom message
    if not retrieved_docs:
        print("No relevant documents found. Please refine your query.")

    return {"context": retrieved_docs}

# Define the generation step
def generate(state: State):
    # Check if there are retrieved documents
    if not state["context"]:
        return {"answer": "Sorry, no relevant documents found. Please refine your query to be more specific to medical topics."}

    # Extract the content of the retrieved documents
    docs_content = "\n\n".join(doc.page_content for doc in state["context"])

    # Define the improved prompt template
    template = """
    You are an assistant trained to answer medical questions. Use the following pieces of context to directly answer the user's question.
    If you don't know the answer or if the context does not seem relevant, respond with:
    "Sorry, I couldn't find relevant information. Please refine your query to be more related to medical topics."

    Question: {question}
    Context: {context}
    Answer:
    """

    # Create the prompt using the template
    custom_rag_prompt = PromptTemplate.from_template(template)

    # Format the prompt
    prompt = custom_rag_prompt.format(question=state["question"], context=docs_content)

    # Generate the answer using the LLM
    response = llm.invoke(prompt)

    # Directly access the content of the response
    answer_content = response.content.strip()  # Accessing 'content' directly

    # Return the cleaned answer
    return {"answer": answer_content}

# Create a sequence of steps (retrieval -> generation)
from langgraph.graph import START, StateGraph

graph_builder = StateGraph(State).add_sequence([retrieve, generate])
graph_builder.add_edge(START, "retrieve")
graph = graph_builder.compile()

# Test the application with a query
result = graph.invoke({"question": "What is the Lymphocytic Colitis?"})

# Print the generated answer
print(f"Answer: {result['answer']}")


Answer: Lymphocytic colitis is a type of microscopic colitis, an inflammation of the colon only visible under a microscope.  Symptoms may include chronic watery diarrhea, abdominal pain, cramping, bloating, weight loss, nausea, dehydration, and/or fecal incontinence. The underlying cause is unknown, but may involve autoimmune conditions, medications, infections, genetic factors, and/or bile acid malabsorption. Treatment depends on individual symptoms and may include medications, dietary changes, and rarely, surgery.
