In [7]:
import os 

In [8]:
%pwd

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot\\research'

In [9]:
os.chdir('../')

In [10]:
%pwd 

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot'

In [11]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings

import pinecone 
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader,PyPDFDirectoryLoader,PyPDFLoader
from langchain.document_loaders import TextLoader
import warnings
warnings.filterwarnings("ignore")
import os

In [12]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents 

In [None]:
extracted_data = load_pdf("data/")

In [12]:
extracted_data[400]

Document(metadata={'source': 'data\\medical-book.pdf', 'page': 400, 'page_label': '401'}, page_content='• Movement education and bodywork, including mas-\nsage, myofacial release, and arthrokinetics, to help\nrelease tension and make new movement patterns easier.\n• Post-testing, when pre-testing movements are repeated,\nallowing the client to feel the changes that have taken\nplace and integrate them into daily life.\nAston-Patterning requires more participation from\nthe client than many bodywork techniques. The massage\naspect of Aston-Patterning is designed around a three-\ndimensional, non-compressive touch that releases pat-\nterns of tension in the body. It is gentler than Rolfing.\nMyokinetics uses touch to release tension in the face and\nneck. Arthrokinetics addresses tension at bones and\njoints. This massage is accompanied by education about\nhow new movement patterns may be established.\nIn addition to Aston-Patterning sessions, clients are\nalso helped to examine their en

### Create text chunks

In [20]:
def text_split(data_extracted):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(data_extracted)
    return text_chunks


In [21]:
text_chunks = text_split(extracted_data)
print(len(text_chunks))

5860


In [22]:
text_chunks[400].page_content

'Acupressure points to relieve hay fever, sore throat, and\nheartburn. (Illustration by Electronic Illustrators Group.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 37'

In [7]:
# text_chunks

### Embedding Model 

In [15]:
# import google.generativeai as genai
from dotenv import load_dotenv
load_dotenv()
GEMINI_API_KEY=os.getenv("GOOGLE_API_KEY")
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

In [None]:
# genai.configure(api_key=GEMINI_API_KEY)

# def get_gemini_embedding(text):
#     result = genai.embed_content(
#         model="models/embedding-001",
#         content=text
#     )
#     return result["embedding"]

In [24]:
embedding_model = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
# vector = embeddings.embed_query("hello, world!")

In [25]:
query_result = embedding_model.embed_query("hello world")
print(len(query_result))

768


In [16]:
import os
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = 'llama-chatbot'
index = pc.Index(index_name)

In [17]:
index

<pinecone.data.index.Index at 0x16e656f28c0>

In [None]:
upsert_data = [(str(i), embedding_model.embed_query(text_chunks[i].page_content),{"text": text_chunks[i].page_content}) for i in range(len(text_chunks))]



In [83]:
print(upsert_data[50])

('50', [0.03542203828692436, 0.040648289024829865, -0.043802086263895035, -0.009207949973642826, -0.004721476696431637, 0.054261982440948486, -0.005626289173960686, 0.07076804339885712, -0.011029995046555996, 0.04050498455762863, -0.0027835220098495483, 0.033860523253679276, 0.04517918825149536, -0.04999975115060806, -0.04413965716958046, -0.03926658257842064, 0.04494636505842209, 0.03869068622589111, -0.07037244737148285, -0.015249388292431831, 0.012086814269423485, -0.03178563341498375, 0.034802984446287155, -0.027212204411625862, -0.009783915244042873, -0.04238882660865784, -0.026876037940382957, -0.0675315111875534, 0.006882001645863056, -0.00985070038586855, 0.1248229369521141, 0.04491973668336868, 0.0009683624957688153, -0.03160779923200607, -0.003243088722229004, 0.03650408983230591, 0.02696242555975914, -0.036800406873226166, 0.05677996948361397, -0.03171490505337715, 0.011570662260055542, -0.03784432262182236, -0.00485506746917963, 0.00879188347607851, -0.02644726075232029, -0

In [84]:
namespace = "medical-chat"
batch_size = 100  # or any number that suits your data size
for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    index.upsert(vectors=batch,namespace=namespace)


In [85]:
query = "What are allergies?"
query_embedding = get_gemini_embedding(query)  # encoding query text into vector

In [86]:
len(query_embedding)

768

In [87]:

# Perform similarity search
results = index.query(
    vector=query_embedding,  # Query vector
    top_k=3,  # Retrieve top 3 most similar documents
    include_metadata=True,  # Include metadata if available,
    namespace=namespace
)




In [88]:
results

{'matches': [{'id': '3540',
              'metadata': {'text': 'portal vein that run through liver (portal '
                                   'hypertension).\n'
                                   'Portal hypertension is caused by the '
                                   'scarring that\n'
                                   'occurs in cirrhosis. Blood that cannot '
                                   'flow through the\n'
                                   'liver because of the increased pressure '
                                   'leaks into the\n'
                                   'abdomen and causes ascites.\n'
                                   'Other conditions that contribute to '
                                   'ascites develop-\n'
                                   'ment include:\n'
                                   '• hepatitis\n'
                                   '• heart or kidney failure\n'
                                   '• inflammation and fibrous hardening of '


In [None]:
# # for Pinecone.from_existing_index method only 
# from langchain.vectorstores import Pinecone  

In [None]:
# older (from langchain.vectorstores)

# vectorstore = Pinecone.from_existing_index(
#     index_name=index_name,  # Pinecone index name
#     embedding=get_gemini_embedding,  # Embedding function for queries
#     namespace="medical-chat"  # Correct namespace
# )

# # Now get a retriever
# retriever = vectorstore.as_retriever(search_kwargs={"k": 3})



In [89]:
# newer (from pinecone python SDK)
def retrieve_documents(query, top_k=3):
    query_embedding = get_gemini_embedding(query)
    results = index.query(vector=query_embedding, top_k=top_k, include_metadata=True, namespace="medical-chat")
    return [match['metadata']['text'] for match in results['matches']]


In [18]:
prompt_template = """ 

If you don't know the answer, interact according to your intelligence.

Context:{context}
Question: {question}

Return the helpful answer below and nothing else.
Helpful answer: 

"""

In [19]:
prompt = PromptTemplate(template=prompt_template,input_variables=['context','question'])
chain_type_kwargs = {"prompt":prompt}

In [20]:
# llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
#                     model_type="llama",
#                     config={'max_new_tokens':512,
#                             'temperature':0.8})



from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature = 0.8, max_tokens=512)

In [93]:
# older (from langchain.vectores module)

# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     retriever=retriever,
#     chain_type="stuff",
#     chain_type_kwargs={"prompt": prompt},
# )

In [94]:
def ask_question(question):
    context_docs = retrieve_documents(question)
    context = "\n".join(context_docs)
    final_prompt = prompt.format(context=context, question=question)
    return llm.invoke(final_prompt).content


In [95]:
# older (from langchain.vectores Pinecone method )

# question = "what are allergies "
# answer = qa_chain.run(question)
# print(f"Answer: {answer}")

In [96]:
question = "Tell me about Antiviral drugs"
answer = ask_question(question)
print(f"Answer: {answer}")


Answer: This document does not contain information about antiviral drugs.  Therefore, I cannot answer your question.


### with the help of langchain PineconeVectorStore 

In [21]:
from langchain_pinecone import PineconeVectorStore

In [100]:
docs = [
    {"id": str(i), "metadata": {"text": text_chunks[i].page_content}, "values": embedding_model.embed_query(text_chunks[i].page_content)}
    for i in range(len(text_chunks))
]

In [22]:
new_namespace="new-test"

In [26]:
vectorstore = PineconeVectorStore(
    index_name=index_name,
    pinecone_api_key=PINECONE_API_KEY,
    embedding=embedding_model,
    namespace=new_namespace
)

In [None]:
vectorstore.add_texts(
    texts=[doc["metadata"]["text"] for doc in docs],
    metadatas=[{"id": doc["id"]} for doc in docs]
)

In [27]:
retriever_new = vectorstore.as_retriever(search_kwargs={"k": 3})


In [28]:
# Cell 10: Define the prompt template
from langchain.prompts import PromptTemplate

prompt_template = """ 
If you don't know the answer, interact according to your intelligence.

Context: {context}
Question: {question}

Return the helpful answer below and nothing else.
Helpful answer: 
"""

prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])
chain_type_kwargs = {"prompt": prompt}


In [29]:
# Cell 11: Initialize LLM model
from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0.8, max_tokens=512)


In [30]:
# Cell 12: Create RetrievalQA chain
qa_chain_new = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever_new,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)


In [31]:
# Cell 13: Run a sample query
question = "what are allergies"
answer = qa_chain_new.run(question)
print(f"Answer: {answer}")


Answer: Allergies are immune reactions triggered by harmless, everyday substances (allergens) such as pollen, dust, and animal dander.  The body's immune system responds as if these substances are harmful, causing a series of reactions that can lead to various symptoms.


In [32]:
results = vectorstore.similarity_search("What are allergies", k=3)


In [33]:
documents = [doc.page_content for doc in results]
documents

['Description\nAllergies are among the most common of medical\ndisorders. It is estimated that 60 million Americans, or\nmore than one in every five people, suffer from some\nform of allergy, with similar proportions throughout\nmuch of the rest of the world. Allergy is the single largest\nreason for school absence and is a major source of lost\nproductivity in the workplace.\nAn allergy is a type of immune reaction. Normally,\nthe immune system responds to foreign microorganisms',
 'ganisms, or particles, like pollen or dust, by producing\nspecific proteins, called antibodies, that are capable of\nbinding to identifying molecules, or antigens, on the for-\neign particle. This reaction between antibody and antigen\nsets off a series of reactions designed to protect the body\nfrom infection. Sometimes, this same series of reactions\nis triggered by harmless, everyday substances. This is the\ncondition known as allergy, and the offending substance\nis called an allergen.',
 'or particles