In [None]:
# Print the installed LangChain version so results can be tied to a specific release.
import langchain
print(langchain.__version__)

0.1.16


In [1]:
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [2]:
%pwd

'd:\\Medical-Chat-Bot\\research'

In [3]:
import os
from pathlib import Path


def move_to_project_root(notebook_dir="research"):
    """Step up to the parent of ``notebook_dir`` if, and only if, we are inside it.

    Later cells use a path relative to the project root ('Data/'), while the
    notebook is started from the ``research`` folder. An unconditional
    ``os.chdir("../")`` climbs one more level every time the cell is re-run;
    checking the current folder name first makes the cell safe to execute
    any number of times.

    Args:
        notebook_dir: Name of the folder the notebook is launched from.

    Returns:
        None. The process working directory is changed as a side effect.
    """
    here = Path.cwd()
    if here.name == notebook_dir:
        os.chdir(here.parent)


move_to_project_root()

In [4]:
%pwd

'd:\\Medical-Chat-Bot'

In [5]:
# Extract data from the PDF files
def load_pdf_file(data):
    """Load the files matching ``*.pdf`` in the ``data`` directory.

    Each match is read with ``PyPDFLoader`` through a ``DirectoryLoader``;
    the result of the loader's ``load()`` call is returned unchanged.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [6]:
# Read the PDFs under Data/ and report how many documents the loader returned.
extracted_data = load_pdf_file("Data/")
print(f'Total number of documents: {len(extracted_data)}')

Total number of documents: 637


In [None]:
# extracted_data 

In [7]:
# Split the data into text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split (as returned by ``load_pdf_file``).
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to the 500 previously hard-coded here.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to the
            20 previously hard-coded here.

    Returns:
        The result of ``split_documents`` on ``extracted_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [8]:
# Chunk the loaded documents with the default splitter settings and report the count.
text_chunks = text_split(extracted_data)
print("Length of Text Chunks", len(text_chunks))

Length of Text Chunks 5859


In [None]:
# text_chunks

In [9]:
# Download the embeddings from Hugging Face
# Imported from langchain_community (already used by this notebook's loaders);
# the `langchain.embeddings` path is a deprecated re-export.
from langchain_community.embeddings import HuggingFaceEmbeddings


def download_hugging_face_embeddings(model_name='sentence-transformers/all-MiniLM-L6-v2'):
    """Create the Hugging Face embedding model used throughout the notebook.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            The default is the model this notebook was built with; its query
            vectors have length 384 (see the "Hello World" check), which is
            also the dimension the Pinecone index is created with. A model
            with a different vector length needs a matching index dimension.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)


In [10]:
# Build the embedding model once; later cells reuse this object for the query check and the vector store.
embeddings = download_hugging_face_embeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# Sanity check: embed a short string and print the length of the resulting vector.
query_result = embeddings.embed_query("Hello World")
print("Length", len(query_result))

Length 384


In [None]:
# query_result

In [12]:
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
# Read here for visibility only: no cell in this notebook passes this variable
# to a client explicitly. NOTE(review): the Gemini client presumably picks the
# key up from the environment itself — confirm.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

# Fail at the point of configuration with an actionable message instead of
# handing a missing key to the Pinecone client.
if not PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; add it to your .env file or export it "
        "before running this cell."
    )

pc = Pinecone(api_key=PINECONE_API_KEY)


In [13]:
index_name = "medical-chatbot"

# Names of the indexes that currently exist for this Pinecone client.
existing_indexes = [idx["name"] for idx in pc.list_indexes()]

if index_name in existing_indexes:
    print("Index already exists")
else:
    # dimension=384 equals the vector length printed by the "Hello World"
    # embedding check earlier in the notebook.
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    print("Index created")


Index already exists


In [None]:
# Store chunks with embeddings into your Pinecone index.
import hashlib

from langchain_pinecone import PineconeVectorStore


def chunk_id(position, chunk):
    """Return a stable vector id for one chunk.

    The id is the SHA-256 hex digest of the chunk's ``source`` and ``page``
    metadata, its position in ``text_chunks`` and its text. The position keeps
    ids unique even if two chunks carry identical text and metadata.

    Args:
        position: Index of the chunk within the list being upserted.
        chunk: Document-like object exposing ``metadata`` (a dict) and
            ``page_content`` (a string).

    Returns:
        A 64-character hexadecimal string.
    """
    fingerprint = "|".join(
        (
            str(chunk.metadata.get("source", "")),
            str(chunk.metadata.get("page", "")),
            str(position),
            chunk.page_content,
        )
    )
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


# Without explicit ids, every run of this cell can add the whole corpus again
# under new ids; the duplicated hit in the retrieval output further down is
# consistent with that having happened. With deterministic ids a re-run
# overwrites the same vectors instead. Duplicates already stored in the index
# are not removed by this change.
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
    ids=[chunk_id(position, chunk) for position, chunk in enumerate(text_chunks)],
)

In [14]:
# Load the existing index
from langchain_pinecone import PineconeVectorStore

# Attach to the index by name, using the same embedding model for queries.
# Unlike the from_documents cell, this call is not given any documents to store.
docsearch = PineconeVectorStore.from_existing_index(index_name=index_name, embedding=embeddings)

In [15]:
docsearch

<langchain_pinecone.vectorstores.PineconeVectorStore at 0x2d3b0e4ca60>

In [16]:
# Similarity-search retriever configured with k=3.
retriever = docsearch.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

In [17]:
# Smoke-test the retriever with a sample question; the bare name displays the hits.
retrieved_docs = retriever.invoke("What is liver cancer")
retrieved_docs

[Document(page_content='Bile duct cancer\nDefinition\nBile duct cancer, or cholangiocarcinoma, is a malig-\nnant tumor of the bile ducts within the liver (intrahepatic),\nor leading from the liver to the small intestine (extrahepat-\nic). It is a rare tumor with poor outcome for most patients.\nDescription\nBile is a substance manufactured by the liver that\naids in the digestion of food. Bile ducts are channels that\ncarry the bile from the liver to the small intestine. Like', metadata={'page': 490.0, 'source': 'Data\\Medical_book.pdf'}),
 Document(page_content='Bile duct cancer\nDefinition\nBile duct cancer, or cholangiocarcinoma, is a malig-\nnant tumor of the bile ducts within the liver (intrahepatic),\nor leading from the liver to the small intestine (extrahepat-\nic). It is a rare tumor with poor outcome for most patients.\nDescription\nBile is a substance manufactured by the liver that\naids in the digestion of food. Bile ducts are channels that\ncarry the bile from the liver to

In [18]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Gemini chat model that generates the final answers.
llm = ChatGoogleGenerativeAI(model="models/gemini-flash-latest", temperature=0.4)




In [26]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

# System instructions for the model. The trailing "{context}" is a template
# placeholder; "{input}" in the human message is the key supplied at invoke time.
system_prompt = "".join(
    [
        "You are a medical information assistant. ",
        "Answer the user's question using the provided documents when available. ",
        "If the documents do not contain the answer, provide a general medical explanation ",
        "based on widely accepted medical knowledge. ",
        "Do NOT mention documents, context, or sources explicitly. ",
        "Always end your answer with a short note advising the user to consult an appropriate medical specialist. ",
        "Keep the answer clear, user-friendly, and under four sentences.\n\n",
        "{context}",
    ]
)

prompt = ChatPromptTemplate.from_messages([("system", system_prompt), ("human", "{input}")])


In [27]:
# First build the chain that combines documents with the prompt and the LLM,
# then wrap it together with the retriever into the final RAG chain.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [21]:
# Optional diagnostic (disabled): list the Gemini models and their supported generation methods.
# import google.generativeai as genai
# import os

# genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

# models = genai.list_models()
# for m in models:
#     print(m.name, m.supported_generation_methods)


In [28]:
# Ask a sample question and print the text stored under the "answer" key of the result.
response = rag_chain.invoke({"input": "what is fever"})
print(response["answer"])

Fever is an elevation of the body's core temperature above the normal range, typically considered above 100.4°F (38°C). It is often a sign that the body is fighting an infection or illness, and high fever can range between 38°C and 40°C. Symptoms accompanying a fever may include chills, headache, muscle aches, and a general ill feeling.

Please consult a healthcare provider for an accurate diagnosis and appropriate treatment.
