In [1]:
# ---------- IMPORTS ----------
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_pinecone import PineconeVectorStore
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate
from dotenv import load_dotenv
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
from langchain_community.llms import HuggingFaceHub
import os


  from .autonotebook import tqdm as notebook_tqdm

For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from langchain_pinecone.vectorstores import Pinecone, PineconeVectorStore


In [2]:
%cd ..

/Users/akashvs/Documents/GitHubProjects/ai-powered-medical-chatbot


In [3]:
# ---------- LOAD PDFs ----------
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

In [4]:
extracted_data = load_pdf_file(data='data/')

In [5]:
extracted_data[: :100]

[Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data/Medical_book.pdf', 'total_pages': 637, 'page': 0, 'page_label': '1'}, page_content=''),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data/Medical_book.pdf', 'total_pages': 637, 'page': 100, 'page_label': '101'}, page_content='• T cell lymphocytes: 644-2200/mm\n3\n, 60-88% of all lym-\nphocytes.\n• B cell lymphocytes: 82-392/mm\n3\n, 3-20% of all lympho-\ncytes.\n• CD4+ lymphocytes: 500-1200/mm\n3\n, 34-67% of all\nlymphocytes.\nAbnormal results\nThe following results in AIDS tests indicate progres-\nsion of the disease:\n• Percentage of CD4+ lymphocytes: less than 20% of all\nlymphocytes.\n• CD4+ lymphocyte count: less than 200 cells/mm\n3\n.\n• Viral load test: Levels more than 

In [6]:
# ---------- SPLIT INTO CHUNKS ----------
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

In [7]:
text_chunks = text_split(extracted_data)
print("Length of Text Chunks:", len(text_chunks))

Length of Text Chunks: 5859


In [8]:
text_chunks[: :1000]

[Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data/Medical_book.pdf', 'total_pages': 637, 'page': 1, 'page_label': '2'}, page_content='The GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITION'),
 Document(metadata={'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'creator': 'PyPDF', 'creationdate': '2004-12-18T17:00:02-05:00', 'moddate': '2004-12-18T16:15:31-06:00', 'source': 'data/Medical_book.pdf', 'total_pages': 637, 'page': 115, 'page_label': '116'}, page_content='Alemtuzumab\nDefinition\nAlemtuzumab is sold as Campath in the United\nStates. Alemtuzumab is a humanized monoclonal anti-\nbody that selectively binds to CD52, a protein found on\nthe surface of normal and malignant B and T cells, that is\nused to reduce the numbers of circulating malignant cells\nof patients who have B-cell chronic lymphocytic\nleukemia (B-CLL).\nPurpose\nAlemtuzumab is a monoclonal antibo

In [11]:
# ---------- EMBEDDINGS FROM HUGGING FACE ----------
embeddings = HuggingFaceEmbeddings(model_name='sentence-transformers/all-MiniLM-L6-v2')
query_result = embeddings.embed_query("Hello world")
print("Embedding vector length:", len(query_result))

Embedding vector length: 384


In [14]:
query_result[:10]

[-0.03447728604078293,
 0.031023193150758743,
 0.006734972819685936,
 0.026109015569090843,
 -0.03936203196644783,
 -0.16030248999595642,
 0.06692397594451904,
 -0.006441446952521801,
 -0.047450508922338486,
 0.014758873730897903]

In [None]:
# ---------- LOAD ENVIRONMENT VARIABLES ----------
load_dotenv()
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
GOOGLE_API_KEY = os.getenv('GOOGLE_API_KEY')

In [135]:
# ---------- INITIALIZE PINECONE ----------
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "medicalbot"

# Create index if not exists
if index_name not in [index.name for index in pc.list_indexes()]:
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1")
    )


In [22]:
# ---------- UPSERT EMBEDDINGS ----------
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
)

In [23]:
# ---------- LOAD EXISTING INDEX ----------
docsearch = PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings
)

retriever = docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [24]:
retriever_ex = retriever.invoke("Acne")
retriever_ex

[Document(id='44c73afc-192d-4078-8d89-ed3a51d87347', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 39.0, 'page_label': '40', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 226\nAcne\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 26'),
 Document(id='a3854d64-7383-46fe-9f9c-c528012409b9', metadata={'creationdate': '2004-12-18T17:00:02-05:00', 'creator': 'PyPDF', 'moddate': '2004-12-18T16:15:31-06:00', 'page': 38.0, 'page_label': '39', 'producer': 'PDFlib+PDI 5.0.0 (SunOS)', 'source': 'data/Medical_book.pdf', 'total_pages': 637.0}, page_content='GALE ENCYCLOPEDIA OF MEDICINE 2 25\nAcne\nAcne vulgaris affecting a woman’s face. Acne is the general\nname given to a skin disorder in which the sebaceous\nglands become inflamed. (Photograph by Biophoto Associ-\nates, Photo Researchers, Inc. Reproduced by permission.)\nGEM - 

In [140]:
from langchain_google_genai import ChatGoogleGenerativeAI


llm = ChatGoogleGenerativeAI(model="models/gemini-2.5-flash-lite-preview-06-17", temperature=0.4, max_output_tokens=100)


E0000 00:00:1759752976.974789 1189862 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


In [141]:
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)


In [142]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [None]:
response = rag_chain.invoke({"input": "what is Heart Attack and why it is occuring?"})
print(response["answer"])

Acromegaly and gigantism are disorders caused by the abnormal release of a chemical from the pituitary gland, leading to increased growth in bone and soft tissue. Gigantism occurs when this abnormality happens before bone growth stops, resulting in unusual height. Acromegaly is diagnosed when the abnormality occurs after bone growth has ceased.


In [148]:
import pkg_resources

packages = [
    "langchain", "flask", "pypdf", "python-dotenv",
    "pinecone-client", "langchain-pinecone", "langchain_community",
    "langchain_experimental", "sentence-transformers",
    "langchain-google-genai", "google-generativeai", "langchain-huggingface"
]

for pkg in packages:
    try:
        version = pkg_resources.get_distribution(pkg).version
        print(f"{pkg}: {version}")
    except pkg_resources.DistributionNotFound:
        print(f"{pkg}: Not installed")



langchain: 0.3.27
flask: 3.1.2
pypdf: 6.1.1
python-dotenv: 1.1.1
pinecone-client: Not installed
langchain-pinecone: 0.2.12
langchain_community: 0.3.30
langchain_experimental: 0.3.4
sentence-transformers: 5.1.1
langchain-google-genai: 2.0.10
google-generativeai: 0.8.5
langchain-huggingface: 0.3.1
