<h2 align="center">Medical Chatbot Project Source by Devesh Singh</h2>

In [1]:
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_pdf(path):
    loader = DirectoryLoader(path = path, glob = "*.pdf", loader_cls = PyPDFLoader)
    documents = loader.load()
    return documents

In [3]:
pdf_data = load_pdf("../Resources")

In [12]:
type(pdf_data[4])

langchain_core.documents.base.Document

In [None]:
# split the pdf data into chunks of size 500 with an overlap of 20
def text_split(pdf_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size = 500, chunk_overlap = 20)
    chunks = text_splitter.split_documents(pdf_data)
    return chunks

In [14]:
text_chunks = text_split(pdf_data)
len(text_chunks)

40000

In [18]:
text_chunks[0].page_content

'The GALE\nENCYCLOPEDIA of\nMEDICINE\nTHIRD EDITION'

In [19]:
from langchain.embeddings import HuggingFaceEmbeddings

In [None]:
# function to get huggingface embeddings this model is small and fast and output embeddings of size 384
def get_huggingface_embeddings():
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    hf_embeddings = HuggingFaceEmbeddings(model_name = model_name)
    return hf_embeddings

In [21]:
embeddings = get_huggingface_embeddings()

  hf_embeddings = HuggingFaceEmbeddings(model_name = model_name)


In [47]:
from dotenv import load_dotenv
load_dotenv()

True

In [None]:
# using pinecone as vector database to store the embeddings
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec

pc = Pinecone()

index_name = "medical-chatbot-index"

if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        vector_type="dense",
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        ),
        deletion_protection="disabled",
        tags={
            "environment": "development"
        }
    )

In [35]:
from langchain_pinecone import PineconeVectorStore

In [37]:
# Took around 20 mins to upload 4000 chunks of 500 size each
PineconeVectorStore.from_documents(documents=text_chunks, embedding=embeddings, index_name=index_name)

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x1822c996490>

In [None]:
pc_index = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)
retriever = pc_index.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3}
)

In [87]:
value = retriever.invoke("What is Asthma")

In [88]:
value[0].page_content

'Asthma\nDefinition\nAsthma is a chronic (long-lasting) inflammatory\ndisease of the airways. In those susceptible to asthma,\nthis inflammation causes the airways to narrow peri-\nodically. This, in turn, produceswheezing and breath-\nlessness, sometimes to the point where the patient\ngasps for air. Obstruction to air flow either stops\nspontaneously or responds to a wide range of treat-\nments, but continuing inflammation makes the\n424 GALE ENCYCLOPEDIA OF MEDICINE\nAsthma'

In [89]:
from langchain_groq import ChatGroq

In [90]:
model = "openai/gpt-oss-120b"
llm = ChatGroq(model = model, temperature = 0.5, max_tokens = 500)

In [91]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [None]:
# prompt template data for the medical assistant
system_prompt = """You are helpful medical assistant. Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Don't answer anything outside of medical queries. Use three sentences maximum and keep the answer as concise as possible. Always answer in a professional tone and manner. \n\n {context}"""

In [None]:
# create the prompt template
prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("user", "{input}")
])

In [None]:
# create the retrieval chain
query_chain = create_stuff_documents_chain(llm = llm, prompt = prompt)
chain = create_retrieval_chain(retriever = retriever, combine_docs_chain = query_chain)

In [95]:
response = chain.invoke({"input": "What is Asthma?"})
response["answer"]

'Asthma is a chronic inflammatory disease of the airways that causes periodic narrowing, leading to wheezing, breathlessness, and sometimes severe respiratory distress. The airway obstruction may resolve spontaneously or respond to treatment, but ongoing inflammation persists. Effective management focuses on reducing inflammation and preventing triggers.'

In [96]:
response = chain.invoke({"input": "What is Retrival Augmented Generation?"})
response

{'input': 'What is Retrival Augmented Generation?',
 'context': [Document(id='d2d11799-113b-49e8-a42f-3c17885bae52', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 1608.0, 'page_label': '1579', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': '..\\Resources\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505.0}, page_content='inserted genes can be naturally-occurring genes that\nproduce the desired effect or may be genetically engi-\nneered (or altered) genes.\nScientists have known how to manipulate a gene’s\nstructure in the laboratory since the early 1970s through\na process called gene splicing. The process involves\nremoving a fragment of DNA containing the specific\ngenetic sequence desired, then inserting it into the DNA\nof another gene. The resultant product is called recom-'),
  Document(id='6b3c9755-2c90-45ed-ad7f-902601682e27', metadata={'creationdate': '2006-10-16T

In [97]:
docs_and_scores = retriever.vectorstore.similarity_search_with_score(
    "What is Retrival Augmented Generation?", 
    k=4
)

for doc, score in docs_and_scores:
    print(score, doc.page_content[:200])


0.374864131 inserted genes can be naturally-occurring genes that
produce the desired effect or may be genetically engi-
neered (or altered) genes.
Scientists have known how to manipulate a gene’s
structure in the
0.34870249 Transference is the name that psychoanalysts
use for the patient’s repetition of childlike ways of
relating that were learned in early life. If the ther-
apeutic alliance has been well established, th
0.343909293 Retinoids— A derivative of synthetic Vitamin A.
Sporadic— Isolated or appearing occasionally with
no apparent pattern.
X-linked dominant inheritance— The inheritance
of a trait by the presence of a si
0.342236549 Suite 2, Santa Cruz, CA 95060. (831) 457-1004, Fax:
(831) 426-4383. <http://www.trich.org> .
Liz Meszaros
Rebecca Frey, PhD
In vitro fertilization
Definition
In vitro fertilization (IVF) is a procedur


In [None]:
pc_index.similarity_search_with_score(query = "What is Retrival Augmented Generation?", k=3, )

[(Document(id='d2d11799-113b-49e8-a42f-3c17885bae52', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0', 'moddate': '2006-10-16T22:03:45+02:00', 'page': 1608.0, 'page_label': '1579', 'producer': 'PDFlib+PDI 6.0.3 (SunOS)', 'source': '..\\Resources\\The-Gale-Encyclopedia-of-Medicine-3rd-Edition.pdf', 'total_pages': 4505.0}, page_content='inserted genes can be naturally-occurring genes that\nproduce the desired effect or may be genetically engi-\nneered (or altered) genes.\nScientists have known how to manipulate a gene’s\nstructure in the laboratory since the early 1970s through\na process called gene splicing. The process involves\nremoving a fragment of DNA containing the specific\ngenetic sequence desired, then inserting it into the DNA\nof another gene. The resultant product is called recom-'),
  0.374864131),
 (Document(id='6b3c9755-2c90-45ed-ad7f-902601682e27', metadata={'creationdate': '2006-10-16T20:19:33+02:00', 'creator': 'Adobe Acrobat 6.0'