In [4]:
import os 

In [5]:
%pwd

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot\\research'

In [6]:
# Step from research/ up to the project root so relative paths such as "data/"
# resolve. The guard makes the cell idempotent: re-running it after the move
# no longer climbs one more directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [7]:
%pwd 

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot'

In [None]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import pinecone 
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader,PyPDFDirectoryLoader,PyPDFLoader
from langchain.document_loaders import TextLoader
import warnings
warnings.filterwarnings("ignore")
import os

In [10]:
def load_pdf(data):
    """Load the PDF files found under a directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for files matching the
        ``*.pdf`` glob, each parsed with ``PyPDFLoader``.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

In [11]:
extracted_data = load_pdf("data/")

In [12]:
extracted_data[400]

Document(metadata={'source': 'data\\medical-book.pdf', 'page': 400, 'page_label': '401'}, page_content='• Movement education and bodywork, including mas-\nsage, myofacial release, and arthrokinetics, to help\nrelease tension and make new movement patterns easier.\n• Post-testing, when pre-testing movements are repeated,\nallowing the client to feel the changes that have taken\nplace and integrate them into daily life.\nAston-Patterning requires more participation from\nthe client than many bodywork techniques. The massage\naspect of Aston-Patterning is designed around a three-\ndimensional, non-compressive touch that releases pat-\nterns of tension in the body. It is gentler than Rolfing.\nMyokinetics uses touch to release tension in the face and\nneck. Arthrokinetics addresses tension at bones and\njoints. This massage is accompanied by education about\nhow new movement patterns may be established.\nIn addition to Aston-Patterning sessions, clients are\nalso helped to examine their en

### Create text chunks

In [20]:
def text_split(data_extracted, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks for embedding.

    Args:
        data_extracted: Documents to split, passed to ``split_documents``.
        chunk_size: Chunk size given to ``RecursiveCharacterTextSplitter``.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Overlap given to ``RecursiveCharacterTextSplitter``.
            Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks returned by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data_extracted)


In [21]:
text_chunks = text_split(extracted_data)
print(len(text_chunks))

5860


In [22]:
text_chunks[400].page_content

'Acupressure points to relieve hay fever, sore throat, and\nheartburn. (Illustration by Electronic Illustrators Group.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 37'

In [7]:
# text_chunks

### Embedding Model 

In [23]:
import google.generativeai as genai
from dotenv import load_dotenv
# Read credentials from a .env file / the process environment; nothing is
# hard-coded in the notebook.
load_dotenv()
GEMINI_API_KEY = os.getenv("GOOGLE_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")

# os.getenv returns None for an unset variable, which would otherwise only
# surface later as an error inside an SDK call. Fail here instead, reporting
# the variable names only (never the values).
_missing_keys = [
    env_name
    for env_name, env_value in (
        ("GOOGLE_API_KEY", GEMINI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
    )
    if not env_value
]
if _missing_keys:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_keys)
    )

In [31]:
genai.configure(api_key=GEMINI_API_KEY)

def get_gemini_embedding(text, model="models/embedding-001", task_type=None):
    """Embed ``text`` through ``genai.embed_content`` and return the vector.

    Args:
        text: Content to embed; forwarded unchanged as ``content``.
        model: Embedding model name. Defaults to ``"models/embedding-001"``,
            the value previously hard-coded here (this notebook's output
            shows 768 values per embedding for it).
        task_type: Optional task type forwarded to ``genai.embed_content``.
            It is only sent when not ``None``, so the default call is the
            same request as before.

    Returns:
        The value stored under the ``"embedding"`` key of the response.
    """
    request = {"model": model, "content": text}
    if task_type is not None:
        request["task_type"] = task_type
    return genai.embed_content(**request)["embedding"]

In [32]:
query_result = get_gemini_embedding("hello world")
print(len(query_result))

768


In [33]:
import os
from pinecone import Pinecone, ServerlessSpec

# Name of the Pinecone index this notebook works against.
index_name = 'llama-chatbot'

# Authenticated client first, then a handle onto the named index.
pc = Pinecone(api_key=PINECONE_API_KEY)
index = pc.Index(index_name)

In [34]:
index

<pinecone.data.index.Index at 0x2720288d8d0>

In [35]:
# One (id, vector, metadata) tuple per chunk: the id is the chunk's position
# as a string, the vector is its Gemini embedding, and the metadata keeps the
# raw chunk text under "text".
upsert_data = [
    (
        str(position),
        get_gemini_embedding(chunk.page_content),
        {"text": chunk.page_content},
    )
    for position, chunk in enumerate(text_chunks)
]



In [36]:
print(upsert_data[50])

('50', [-0.012584968, -0.030572569, -0.028516477, -0.021046735, 0.037231583, 0.021155966, 0.019479766, -0.009392349, -0.011548036, 0.034207847, 0.0076141176, 0.0100267, 0.0051081167, 0.043778062, -0.035404198, -0.009758398, 0.01739472, -0.037396666, -0.047487643, -0.0053381994, 0.011133258, 0.015440317, -0.024159044, -0.036466073, 0.048799496, -0.04439283, 0.0011796087, -0.06117664, -0.01434713, 0.017185783, -0.02975989, 0.015487354, -0.035122562, 0.007557198, -0.01722336, -0.075188555, 0.0012135081, -0.0145842545, 0.008320674, 0.02447066, 0.023432024, -0.0009399453, -0.023850376, 0.019282436, 0.037062332, -0.04469776, 0.029672721, 0.04595869, 0.023002071, -0.0631669, -0.0015914042, 0.026077038, 0.05958253, -0.008611812, -0.0020206477, -0.026356537, 0.0005900204, 0.0036020428, -0.048105907, -0.0023005153, -0.016143031, 0.037556536, -0.0787604, 0.020947218, -0.015711688, -0.08881835, -0.03685563, -0.029540565, 0.08769921, 0.018256087, 0.03302891, -0.034351207, 0.06453947, -0.022641018, 

In [37]:
namespace = "medical-chat"
batch_size = 100  # vectors sent per upsert call; tune to your data size

# Send the prepared tuples in consecutive slices of at most `batch_size`.
batch_starts = range(0, len(upsert_data), batch_size)
for start in batch_starts:
    index.upsert(
        namespace=namespace,
        vectors=upsert_data[start:start + batch_size],
    )


In [38]:
query = "What are allergies?"
query_embedding = get_gemini_embedding(query)  # encoding query text into vector

In [39]:
len(query_embedding)

768

In [40]:

# Similarity search: ask the index for the 3 closest vectors to the query
# embedding within `namespace`, returning their stored metadata as well.
results = index.query(
    namespace=namespace,
    vector=query_embedding,
    top_k=3,
    include_metadata=True,
)




In [41]:
results

{'matches': [{'id': '1158',
              'metadata': {'text': 'Allergies\n'
                                   'GEM - 0001 to 0432 - A  10/22/03 1:42 PM  '
                                   'Page 118'},
              'score': 0.814151049,
              'values': []},
             {'id': '1122',
              'metadata': {'text': 'Description\n'
                                   'Allergies are among the most common of '
                                   'medical\n'
                                   'disorders. It is estimated that 60 million '
                                   'Americans, or\n'
                                   'more than one in every five people, suffer '
                                   'from some\n'
                                   'form of allergy, with similar proportions '
                                   'throughout\n'
                                   'much of the rest of the world. Allergy is '
                                   'the single larges

In [None]:
# # for Pinecone.from_existing_index method only 
# from langchain.vectorstores import Pinecone  

In [None]:
# older (from langchain.vectorstores)

# vectorstore = Pinecone.from_existing_index(
#     index_name=index_name,  # Pinecone index name
#     embedding=get_gemini_embedding,  # Embedding function for queries
#     namespace="medical-chat"  # Correct namespace
# )

# # Now get a retriever
# retriever = vectorstore.as_retriever(search_kwargs={"k": 3})



In [56]:
# newer (from pinecone python SDK)
def retrieve_documents(query, top_k=3, namespace="medical-chat"):
    """Return the stored text of the chunks most similar to ``query``.

    Args:
        query: Question text; embedded with ``get_gemini_embedding``.
        top_k: Number of matches requested from the index. Defaults to 3.
        namespace: Pinecone namespace to search. Defaults to
            ``"medical-chat"``, the value previously hard-coded here.

    Returns:
        A list with the ``metadata["text"]`` value of each match, in the
        order the index returned them.
    """
    query_embedding = get_gemini_embedding(query)
    results = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        namespace=namespace,
    )
    return [match['metadata']['text'] for match in results['matches']]


In [57]:
prompt_template = """ 

If you don't know the answer, interact according to your intelligence.

Context:{context}
Question: {question}

Return the helpful answer below and nothing else.
Helpful answer: 

"""

In [58]:
# Wrap the raw template string so its {context} and {question} placeholders
# can be filled by name.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template,
)
# Keyword bundle holding the prompt under the "prompt" key.
chain_type_kwargs = dict(prompt=prompt)

In [59]:
# llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
#                     model_type="llama",
#                     config={'max_new_tokens':512,
#                             'temperature':0.8})



from langchain_google_genai import ChatGoogleGenerativeAI
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature = 0.8, max_tokens=512)

In [60]:
# older (from langchain.vectorstores module)

# qa_chain = RetrievalQA.from_chain_type(
#     llm=llm,
#     retriever=retriever,
#     chain_type="stuff",
#     chain_type_kwargs={"prompt": prompt},
# )

In [61]:
def ask_question(question):
    """Answer ``question`` using retrieved passages as context.

    The passages returned by ``retrieve_documents`` are joined with newlines,
    substituted into ``prompt`` together with the question, and the filled
    prompt is sent to ``llm``.

    Args:
        question: The user's question.

    Returns:
        The ``content`` attribute of the object returned by ``llm.invoke``.
    """
    retrieved_passages = retrieve_documents(question)
    filled_prompt = prompt.format(
        question=question,
        context="\n".join(retrieved_passages),
    )
    reply = llm.invoke(filled_prompt)
    return reply.content


In [62]:
# older (from langchain.vectorstores Pinecone method)

# question = "what are allergies "
# answer = qa_chain.run(question)
# print(f"Answer: {answer}")

In [63]:
question = "what are allergies"
answer = ask_question(question)
print(f"Answer: {answer}")


Answer: Allergies are a type of immune reaction where the body's immune system responds to harmless substances (allergens) as if they were harmful foreign invaders.  This triggers a series of reactions that can cause symptoms like sneezing, itching, rash, and more.  They are among the most common medical disorders, affecting a significant portion of the population.
