In [1]:
# Prints a fixed string; presumably a quick check that the kernel executes cells.
print("OK!")

OK!


In [2]:
import os
import re
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain_community.embeddings import HuggingFaceEmbeddings

from langchain.vectorstores import Pinecone
from langchain.chains import create_retrieval_chain
from langchain.prompts import ChatPromptTemplate
from langchain_google_genai import ChatGoogleGenerativeAI

from dotenv import load_dotenv
import pinecone

In [3]:
# `os` is already imported in the imports cell; repeating it here is harmless.
import os
# Move the working directory one level up so that the relative paths used in
# later cells (e.g. "data/") resolve from there.
# NOTE: the target is relative, so each re-run of this cell climbs one more
# level. Run it once per kernel session.
os.chdir("../")

In [4]:
%pwd

'c:\\Users\\user\\OneDrive\\Desktop\\Medical-Chatbot'

In [5]:
# Load variables from a .env file into the process environment (python-dotenv)
# so the os.getenv lookups in the next cell can find the API keys.
load_dotenv()

True

In [6]:
# Read the service credentials from the environment instead of hard-coding them.
# os.getenv returns None for an unset variable and nothing in this cell checks
# for that, so a missing key is not reported here.
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

In [7]:
# NOTE: this import rebinds the name `Pinecone`, which the imports cell had
# previously bound to the class from langchain.vectorstores.
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key read from the environment.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Create the index only when it is not already listed; an existing index is
# left untouched.
existing_index_names = pc.list_indexes().names()
if "medical-chatbot" not in existing_index_names:
    # Serverless deployment target (cloud provider and region).
    serverless_spec = ServerlessSpec(cloud="aws", region="us-west-2")
    pc.create_index(
        name="medical-chatbot",
        dimension=384,  # vector size for all-MiniLM-L6-v2 embeddings
        metric="cosine",
        spec=serverless_spec,
    )


In [8]:
def load_pdf(data, skip_pages=14):
    """Load every PDF in a directory and return the pages as one flat list.

    Args:
        data: Path of the directory to scan (not recursive).
        skip_pages: Number of leading pages to drop from each PDF, e.g. to
            skip front matter. Defaults to 14, the value that used to be
            hard-coded.

    Returns:
        A list with the page documents produced by ``PyPDFLoader.load()`` for
        each PDF, minus the first ``skip_pages`` pages of every file. Empty
        when the directory contains no PDF files.

    Raises:
        ValueError: If ``skip_pages`` is negative (a negative slice start
            would silently keep only the last pages instead of skipping).
    """
    if skip_pages < 0:
        raise ValueError("skip_pages must be non-negative")

    all_docs = []
    # os.listdir returns entries in arbitrary order; sort so the resulting
    # document (and therefore chunk) order is reproducible across runs.
    for filename in sorted(os.listdir(data)):
        # Case-insensitive match so files named "*.PDF" are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pages = PyPDFLoader(os.path.join(data, filename)).load()
        all_docs.extend(pages[skip_pages:])
    return all_docs

In [9]:
def clean_text(text):
    """Strip encyclopedia boilerplate from one page of extracted PDF text.

    Removes, wherever they occur in the page:
      * lines starting with "GALE ENCYCLOPEDIA OF MEDICINE", "GEM - " or
        "Page <n>" (running headers/footers),
      * lines that consist solely of a 1-3 digit number (bare page numbers),
      * "Photograph..." and "Reproduced by permission..." credits, from the
        phrase to the end of its line.
    Runs of two or more whitespace characters are then collapsed to a single
    space and the result is stripped.

    Args:
        text: Raw page text; ``None`` or an empty string yields ``""``.

    Returns:
        The cleaned text.
    """
    if not text:
        return ""

    # These patterns are anchored with ^ and must match at the start of *any*
    # line, which requires re.MULTILINE; without it they only ever match at
    # the very beginning of the page string.
    line_anchored_patterns = (
        r"^GALE ENCYCLOPEDIA OF MEDICINE.*",
        r"^GEM - .+",
        r"^Page \d{1,3}.*",
        # Only a line that is nothing but a short number is treated as a page
        # number. An unanchored r"\d{1,3}.*" would delete the remainder of any
        # line from its first digit, destroying content such as dosages.
        r"^[ \t]*\d{1,3}[ \t]*$",
    )
    for pattern in line_anchored_patterns:
        text = re.sub(pattern, "", text, flags=re.MULTILINE)

    text = re.sub(r"Photograph.*", "", text)  # photo credit, to end of line
    text = re.sub(r"Reproduced by permission.*", "", text)  # permission notice, to end of line
    text = re.sub(r"\s{2,}", " ", text)  # collapse whitespace runs (incl. blank lines left above)
    return text.strip()

In [10]:
def remove_metadata_from_documents(documents):
    """Build new ``Document`` objects holding the cleaned text of each input.

    Each input's ``page_content`` is passed through ``clean_text``. The new
    documents are constructed from that text alone, so the ``metadata`` of the
    originals is not carried over.

    Args:
        documents: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A list of ``Document`` objects, one per input, in the same order.
    """
    return [
        Document(page_content=clean_text(original.page_content))
        for original in documents
    ]

In [11]:
def text_split(extracted_data):
    """Clean the loaded pages and cut them into chunks.

    The pages are first passed through ``remove_metadata_from_documents`` and
    then split with a ``RecursiveCharacterTextSplitter`` configured with
    ``chunk_size=500`` and ``chunk_overlap=20``.

    Args:
        extracted_data: Documents as returned by ``load_pdf``.

    Returns:
        The chunk documents produced by the splitter.
    """
    documents = remove_metadata_from_documents(extracted_data)
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(documents)

In [12]:
# Load the PDF pages from the "data/" directory (relative to the current
# working directory), then clean and chunk them for embedding.
extracted_data = load_pdf("data/")
text_chunks = text_split(extracted_data)

In [13]:
from langchain_pinecone import PineconeVectorStore

# Sentence-transformer model producing the vectors stored in the index.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# from_documents embeds and upserts every chunk each time it runs, so
# re-running this cell against an already-populated index would store all the
# chunks a second time. Upload only while the index is empty; otherwise just
# attach to the existing vectors.
# NOTE: an index left partially filled by an interrupted upload counts as
# populated; clear it manually before re-ingesting.
index_stats = pc.Index("medical-chatbot").describe_index_stats()
if index_stats.total_vector_count == 0:
    docsearch = PineconeVectorStore.from_documents(
        documents=text_chunks,
        embedding=embeddings,
        index_name="medical-chatbot"
    )
else:
    docsearch = PineconeVectorStore.from_existing_index(
        index_name="medical-chatbot",
        embedding=embeddings
    )


  embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


In [14]:
# Preview the first three chunks as a visual check of the cleaning and splitting.
for i, chunk in enumerate(text_chunks[:3]):
    # Header with a 1-based chunk number and a 30-dash rule.
    print(f"\nChunk {i+1}:\n{'-'*30}")
    print(chunk.page_content[:500])  # At most the first 500 characters of the chunk



Chunk 1:
------------------------------
Abdominal aorta ultrasound see Abdominal
ultrasound
Abdominal aortic aneurysm see Aortic
aneurysm
Abdominal hernia see Hernia
Abdominal thrust see Heimlich maneuver
Abdominal ultrasound
Definition
Ultrasound technology allows doctors to “see”
inside a patient without resorting to surgery. A transmit-
ter sends high frequency sound waves into the body,
where they bounce off the different tissues and organs to
produce a distinctive pattern of echoes. A receiver

Chunk 2:
------------------------------
“hears” the returning echo pattern and forwards it to a
computer, which translates the data into an image on a
television screen. Because ultrasound can distinguish
subtle variations between soft, fluid-filled tissues, it is
particularly useful in providing diagnostic images of the
abdomen. Ultrasound can also be used in treatment.
Purpose
The potential medical applications of ultrasound
were first recognized in the sonar technology developed to dete

In [15]:
# Expose the vector store as a retriever that performs a similarity search and
# returns the 3 best-matching chunks (k=3) per query.
retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})


In [16]:
# Gemini chat model used to generate the answers.
llm = ChatGoogleGenerativeAI(
    model="models/gemini-1.5-flash-latest",  # Model identifier; change here to switch model versions
    temperature=0.4,  # Sampling temperature
    max_output_tokens=500,  # Upper bound on the length of each generated answer
    google_api_key=GOOGLE_API_KEY,  # Key read from the environment in an earlier cell
)

In [17]:
# System message for the RAG prompt. The adjacent string literals are
# concatenated into a single string; "{context}" stays a template placeholder
# that is filled with the retrieved documents when the prompt is formatted.
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    # Ends with a period: a comma here ran this instruction into the next one.
    "the question. If you don't know the answer, say that you don't know. "
    "Use a maximum of three sentences and keep the answer concise."
    "\n\n"
    "{context}"
)

In [18]:
# Two-message chat template: the system message carries the instructions and
# the "{context}" placeholder; the human message carries the "{input}" placeholder.
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [19]:
# NOTE: create_retrieval_chain, HuggingFaceEmbeddings and ChatPromptTemplate
# were already imported in the imports cell; only the runnables,
# create_stuff_documents_chain and StrOutputParser are new here.
from langchain_core.runnables import RunnableParallel, RunnableSequence, RunnableLambda
from langchain.chains import create_retrieval_chain
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Map the incoming question onto the two variables the prompt expects.
# Both branches receive the same input value.
input_mapper = RunnableParallel({
    "context": retriever,  # Retrieved chunks for the question fill "{context}"
    "input": RunnableLambda(lambda x: x)  # Identity: the question itself fills "{input}"
})

In [20]:
# These three imports repeat the ones in the previous cell.
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser  

# Chain that inserts the documents passed as "context" into the prompt and
# sends the formatted prompt to the LLM.
question_answer_chain = create_stuff_documents_chain(llm, prompt)

In [21]:
# Full RAG pipeline: the input mapper (retrieval plus question pass-through)
# feeds its output into the question-answering chain.
rag_chain = RunnableSequence(input_mapper, question_answer_chain)

In [22]:
# Sample query: the chain is invoked with the bare question string.
response = rag_chain.invoke("What is Acne?")
print(response)


Acne is a common skin disease characterized by pimples on the face, chest, and back.  It's caused by clogged pores due to oil, dead skin cells, and bacteria.  Acne vulgaris is the medical term for common acne, and it's the most prevalent skin disease.


In [23]:
# Second sample query; `response` is overwritten with the new answer.
response = rag_chain.invoke("What is Achalasia?")
print(response)


Achalasia is a disorder of the esophagus that prevents normal swallowing.  It affects the esophagus, the tube carrying food from the throat to the stomach.  The disorder involves a malfunction of the lower esophageal muscle ring.
