In [1]:
%pwd

'd:\\MedicalChatBot\\End-to-end-Medical-Chatbot\\reasearch'

In [2]:
import os
os.chdir("../")

In [3]:
%pwd

'd:\\MedicalChatBot\\End-to-end-Medical-Chatbot'

In [4]:
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
#Extract the data from PDF file
def load_pdf_file(file_path):
    loader=DirectoryLoader(file_path,glob="*.pdf",loader_cls=PyPDFLoader)
    documents=loader.load()
    return documents

In [6]:
extracted_documents = load_pdf_file(file_path="D:/MedicalChatBot/End-to-end-Medical-Chatbot/Data/")

In [7]:
extracted_documents[7].page_content

'MEDICAL ADVISORS\nA. Richard Adrouny, M.D.,\nF.A.C.P.\nClinical Assistant Professor of\nMedicine\nDivision of Oncology\nStanford University\nDirector of Medical Oncology\nCommunity Hospital of Los Gatos-\nSaratoga\nLos Gatos, CA\nLaurie Barclay, M.D.\nNeurological Consulting Services\nTampa, FL\nKenneth J. Berniker, M.D.\nAttending Physician\nEmergency Department\nKaiser Permanente Medical Center\nVallejo, CA\nRosalyn Carson-DeWitt, M.D.\nDurham, NC\nRobin Dipasquale, N.D.\nClinical Faculty\nBastyr University\nSeattle, W A\nFaye Fishman, D.O.\nRandolph, NJ\nJ. Gary Grant, M.D.\nPacific Grove, CA\nLaith F. Gulli, M.D.\nM.Sc., M.Sc.(MedSci), MSA,\nMsc.Psych., MRSNZ\nFRSH, FRIPHH, FAIC, FZS\nDAPA, DABFC, DABCI\nConsultant Psychotherapist in\nPrivate Practice\nLathrup Village, MI\nL. Anne Hirschel, D.D.S.\nSouthfield, MI\nLarry I. Lutwick M.D., F.A.C.P.\nDirector, Infectious Diseases\nV A Medical Center\nBrooklyn, NY\nIra Michelson, M.D., M.B.A.,\nF.A.C.O.G.\nPhysician and Clinical Instru

In [8]:
#Split Data into chunks
def text_splitter(extracted_documents):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=50)
    documents = text_splitter.split_documents(extracted_documents)
    return documents

In [9]:
documents = text_splitter(extracted_documents)
print("Length of text chunks ",len(documents))

Length of text chunks  3599


In [10]:
#Download the embeddings from HuggingFace
from langchain_huggingface import HuggingFaceEmbeddings
def download_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [11]:
embeddings = download_embeddings()

In [12]:
query_results=embeddings.embed_query("What is the treatment for diabetes?")
print("Length: ",len(query_results))

Length:  384


In [13]:
from dotenv import load_dotenv
load_dotenv()
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
print("Pinecone API Key: ", PINECONE_API_KEY)

Pinecone API Key:  pcsk_6Jmsfi_NtTantsY8DB7JUfTnpJTubiE3eXtooYBXxGUhhFVpWXdEX5NvR5ok5Y548LJouq


In [14]:
from pinecone.grpc import PineconeGRPC as Pinecone
from pinecone import ServerlessSpec
import os

pc=Pinecone(api_key="pcsk_6Jmsfi_NtTantsY8DB7JUfTnpJTubiE3eXtooYBXxGUhhFVpWXdEX5NvR5ok5Y548LJouq")

index_name = "medical-chatbot"

pc.create_index(
    name=index_name,
    dimension=384,  
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1",
))


{
    "name": "medical-chatbot",
    "metric": "cosine",
    "host": "medical-chatbot-zondwtd.svc.aped-4627-b74a.pinecone.io",
    "spec": {
        "serverless": {
            "cloud": "aws",
            "region": "us-east-1"
        }
    },
    "status": {
        "ready": true,
        "state": "Ready"
    },
    "vector_type": "dense",
    "dimension": 384,
    "deletion_protection": "disabled",
    "tags": null
}

In [15]:
import os
os.environ["PINECONE_API_KEY"] = "pcsk_6Jmsfi_NtTantsY8DB7JUfTnpJTubiE3eXtooYBXxGUhhFVpWXdEX5NvR5ok5Y548LJouq"

In [16]:
from langchain_pinecone import PineconeVectorStore

docsearch = PineconeVectorStore.from_documents(
    documents=documents,
    index_name=index_name,
    embedding=embeddings,
)

In [17]:
#load existing index
from langchain_pinecone import PineconeVectorStore

docsearch=PineconeVectorStore.from_existing_index(
    index_name=index_name,
    embedding=embeddings,
)

In [18]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x235c5428160>

In [19]:
retriever=docsearch.as_retriever(search_type="similarity", search_kwargs={"k": 3})

In [20]:
retriever_docs=retriever.invoke("What is Acne?")

In [21]:
retriever_docs

[Document(id='8992d5af-6c81-45e7-87a4-615af64002e3', metadata={'author': '', 'creationdate': '2017-05-01T10:37:35-07:00', 'creator': '', 'keywords': '', 'moddate': '2017-05-01T10:37:35-07:00', 'page': 298.0, 'page_label': '299', 'producer': 'GPL Ghostscript 9.10', 'source': 'D:\\MedicalChatBot\\End-to-end-Medical-Chatbot\\Data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'subject': '', 'title': '', 'total_pages': 759.0}, page_content='Corticosteroids —A group of anti-inflammatory\nsubstances often used to treat skin conditions.\nImmune response—The protective reaction by the\nimmune system against foreign antigens (sub-\nstances that the body perceives as potentially dan-\ngerous). The immune system combats disease by\nneutralizing or destroying antigens.\ncontact dermatitis becomes a chronic and disabling con-\ndition that can have a profound effect on employability\nand quality of life.\nPrevention\nAvoidance of known or suspected allergens or irritat-\ning substances is the best 

In [22]:
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
print("API Key Loaded:", GOOGLE_API_KEY is not None)

API Key Loaded: True


In [24]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Use the API key directly
GOOGLE_API_KEY = "AIzaSyDLLZsQkfyKctkESSajrJFGQqZlSHzBnBw"  # 🔐 Replace with your actual key

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.2,
)

response = llm.invoke("What is LangChain?")
print(response.content)


LangChain is a framework for developing applications powered by large language models (LLMs).  It's designed to make it easier to build applications that combine the power of LLMs with other sources of data and computation.  Instead of just interacting with an LLM directly, LangChain provides tools and abstractions to:

* **Connect LLMs to other sources of data:** This allows your application to access and process information from databases, APIs, documents, and more, enriching the LLM's responses.  Imagine an application that answers questions about your company's internal documents – LangChain helps connect the LLM to those documents.

* **Manage the flow of interactions with LLMs:**  LangChain provides tools to chain multiple LLM calls together, creating more complex and sophisticated applications.  For example, you might use one LLM call to summarize a document and then another to answer a question based on that summary.

* **Improve the memory and context management of LLMs:** LLM

In [30]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise.\n\n{context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [31]:
question_answer_chain = create_stuff_documents_chain(llm, prompt)
rag_chain = create_retrieval_chain(retriever, question_answer_chain)

In [36]:
response = rag_chain.invoke({"input": "What is Acromegaly and gigantism?"})
print(response["answer"])

I am sorry, but this document does not contain information on acromegaly and gigantism.


In [35]:
response = rag_chain.invoke({"input": "What is stats?"})
print(response["answer"])

I'm sorry, but this document does not define "stats".  The provided text mentions the use of statistics in predicting cancer outcomes, specifically five-year survival rates.  However, it does not offer a general definition of statistics.
