In [13]:
import os 

In [14]:
%pwd

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot\\research'

In [15]:
# Step up from the notebook folder to the project root so that relative paths
# such as "data/" resolve. The guard makes the cell safe to re-run: without it
# every execution would climb one more directory level.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [16]:
%pwd 

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot'

In [45]:
from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceHubEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

from langchain.vectorstores import Pinecone
import pinecone
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
from langchain.document_loaders import DirectoryLoader,PyPDFDirectoryLoader,PyPDFLoader
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Pinecone
import warnings
warnings.filterwarnings("ignore")


import os

In [18]:
def load_pdf(data):
    """Load the PDF files of a directory as LangChain documents.

    Args:
        data: Path of the directory to scan for files matching ``*.pdf``.

    Returns:
        Whatever ``DirectoryLoader.load()`` yields for the matched files,
        each file being parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [19]:
# Load the PDFs under data/ through load_pdf; the result is indexed per document below.
extracted_data = load_pdf("data/")

In [20]:
extracted_data[400]

Document(metadata={'source': 'data\\medical-book.pdf', 'page': 400, 'page_label': '401'}, page_content='• Movement education and bodywork, including mas-\nsage, myofacial release, and arthrokinetics, to help\nrelease tension and make new movement patterns easier.\n• Post-testing, when pre-testing movements are repeated,\nallowing the client to feel the changes that have taken\nplace and integrate them into daily life.\nAston-Patterning requires more participation from\nthe client than many bodywork techniques. The massage\naspect of Aston-Patterning is designed around a three-\ndimensional, non-compressive touch that releases pat-\nterns of tension in the body. It is gentler than Rolfing.\nMyokinetics uses touch to release tension in the face and\nneck. Arthrokinetics addresses tension at bones and\njoints. This massage is accompanied by education about\nhow new movement patterns may be established.\nIn addition to Aston-Patterning sessions, clients are\nalso helped to examine their en

### Create text chunks

In [21]:
def text_split(data_extracted, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        data_extracted: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` handed to the splitter. Defaults to 500,
            the value that used to be hard-coded.
        chunk_overlap: ``chunk_overlap`` handed to the splitter. Defaults to
            20, the value that used to be hard-coded.

    Returns:
        The chunks returned by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(data_extracted)


In [22]:
# Split the loaded documents and report how many chunks were produced.
text_chunks = text_split(extracted_data)
print(len(text_chunks))

5860


In [24]:
text_chunks[400].page_content

'Acupressure points to relieve hay fever, sore throat, and\nheartburn. (Illustration by Electronic Illustrators Group.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 37'

In [7]:
# text_chunks

### Embedding Model 

In [13]:
# import os
# def download_embeddings():
#     # Define the path where the embeddings should be saved
#     cache_dir = os.path.join(os.getcwd(), "modelEmbedd")
    
#     # Initialize embeddings with the specified cache directory
#     embeddings = HuggingFaceEmbeddings(
#         model_name="sentence-transformers/all-MiniLM-L6-v2",
#         cache_folder=cache_dir
#     )
#     return embeddings



In [29]:
import google.generativeai as genai
from dotenv import load_dotenv
# Let python-dotenv populate the process environment before the keys are read.
load_dotenv()

# Each value is None when its variable is not set. Note that the Gemini key
# is read from the GOOGLE_API_KEY variable.
GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")

In [27]:
# Hand the API key to the google.generativeai module before any embedding call is made.
genai.configure(api_key=GEMINI_API_KEY)

def get_gemini_embedding(text, task_type=None):
    """Embed text with the ``models/text-embedding-004`` Gemini model.

    Args:
        text: Content to embed; forwarded unchanged as ``content``.
        task_type: Optional task hint forwarded to ``genai.embed_content``,
            e.g. ``"retrieval_document"`` when indexing chunks or
            ``"retrieval_query"`` when embedding a search question. When left
            as ``None`` the argument is not sent at all, so single-argument
            callers get exactly the previous request.

    Returns:
        The value stored under the ``"embedding"`` key of the API response.
    """
    # Only include task_type when the caller asked for one.
    extra = {} if task_type is None else {"task_type": task_type}
    response = genai.embed_content(
        model="models/text-embedding-004",
        content=text,
        **extra,
    )
    return response["embedding"]

In [28]:
# Smoke test: embed a short string and print the length of the returned vector.
query_result = get_gemini_embedding("hello world")
print(len(query_result))

768


In [30]:
import os
# Import the client class under an alias. A plain `from pinecone import Pinecone`
# rebinds the name `Pinecone`, which the imports cell binds to LangChain's
# vector store and which the retriever cell calls `from_existing_index` on;
# running the notebook top to bottom would then hit the wrong class.
from pinecone import Pinecone as PineconeClient, ServerlessSpec

pc = PineconeClient(api_key=PINECONE_API_KEY)

# Handle to the index by name; no index is created in this cell.
index_name = 'llama-chatbot'
index = pc.Index(index_name)

In [31]:
index

<pinecone.data.index.Index at 0x17f0ffa6950>

In [32]:
# One (id, vector, metadata) tuple per chunk: the id is the chunk's position
# rendered as a string, and the metadata keeps the chunk text under "text".
upsert_data = [
    (
        str(position),
        get_gemini_embedding(chunk.page_content),
        {"text": chunk.page_content},
    )
    for position, chunk in enumerate(text_chunks)
]



In [33]:
print(upsert_data[50])

('50', [0.03542204, 0.04064829, -0.043802086, -0.00920795, -0.0047214767, 0.054261982, -0.005626289, 0.07076804, -0.011029995, 0.040504985, -0.002783522, 0.033860523, 0.04517919, -0.04999975, -0.044139657, -0.039266583, 0.044946365, 0.038690686, -0.07037245, -0.015249388, 0.012086814, -0.031785633, 0.034802984, -0.027212204, -0.009783915, -0.042388827, -0.026876038, -0.06753151, 0.0068820016, -0.0098507, 0.12482294, 0.044919737, 0.0009683625, -0.0316078, -0.0032430887, 0.03650409, 0.026962426, -0.036800407, 0.05677997, -0.031714905, 0.011570662, -0.037844323, -0.0048550675, 0.0087918835, -0.02644726, -0.023644093, -0.0053583146, 0.0127215935, -0.041744113, 0.037732158, -0.007805945, 0.030302918, -0.0256557, 0.010219257, -0.0031694311, -0.03254766, -0.012564856, -0.028400194, 0.024700172, 0.017770538, -0.021585124, -0.011796184, 0.0019637393, 0.0014649091, 0.045919, -0.068779714, -0.042868987, 0.042060316, -0.021091357, 0.039032154, 0.03877291, 0.041904952, -0.028491588, 0.064178, -0.00

In [34]:
namespace = "medical-chat"
batch_size = 100  # or any number that suits your data size

# Push the vectors to the index in consecutive slices of `batch_size` items.
for start in range(0, len(upsert_data), batch_size):
    stop = start + batch_size
    index.upsert(namespace=namespace, vectors=upsert_data[start:stop])


In [35]:
# Sample question used to check retrieval against the index.
query = "What are allergies?"
query_embedding = get_gemini_embedding(query)  # encoding query text into vector

In [36]:
len(query_embedding)

768

In [37]:

# Perform similarity search
# Similarity search against the index, restricted to `namespace`; returns the
# 3 closest matches together with their stored metadata.
results = index.query(
    namespace=namespace,
    vector=query_embedding,
    top_k=3,
    include_metadata=True,
)




In [38]:
results

{'matches': [{'id': '1122',
              'metadata': {'text': 'Description\n'
                                   'Allergies are among the most common of '
                                   'medical\n'
                                   'disorders. It is estimated that 60 million '
                                   'Americans, or\n'
                                   'more than one in every five people, suffer '
                                   'from some\n'
                                   'form of allergy, with similar proportions '
                                   'throughout\n'
                                   'much of the rest of the world. Allergy is '
                                   'the single largest\n'
                                   'reason for school absence and is a major '
                                   'source of lost\n'
                                   'productivity in the workplace.\n'
                                   'An allergy is a type of im

In [46]:
# vectorstore = Pinecone(
#     index=index,  # Pinecone index instance
#     embedding=get_gemini_embedding,  # Embedding function
#     text_key="text" ,
#     namespace=namespace
# )

# retriever = vectorstore.as_retriever(search_kwargs={"k": 3})


# Build a LangChain vector store on top of the existing Pinecone index;
# get_gemini_embedding is supplied as the embedding callable.
vectorstore = Pinecone.from_existing_index(
    namespace="medical-chat",
    embedding=get_gemini_embedding,
    index_name=index_name,
)

# Retriever configured to return 3 results per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=3))



In [68]:
# Raw prompt text for the QA chain. It contains two placeholders, {context}
# and {question}, and ends with a "Helpful answer:" cue for the model.
prompt_template = """ 

If you don't know the answer, interact according to your intelligence.

Context:{context}
Question: {question}

Return the helpful answer below and nothing else.
Helpful answer: 

"""

In [69]:
# Wrap the raw template and declare its two placeholders as input variables.
prompt = PromptTemplate(
    input_variables=['context', 'question'],
    template=prompt_template,
)
chain_type_kwargs = dict(prompt=prompt)

In [70]:
# llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
#                     model_type="llama",
#                     config={'max_new_tokens':512,
#                             'temperature':0.8})



from langchain_google_genai import ChatGoogleGenerativeAI
# Gemini chat model that is passed to the RetrievalQA chain as its LLM.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    temperature=0.8,
    max_tokens=512,
)

In [71]:
# "stuff" chain: the retriever supplies the documents and the custom prompt
# is passed through chain_type_kwargs.
qa_chain = RetrievalQA.from_chain_type(
    chain_type="stuff",
    llm=llm,
    retriever=retriever,
    chain_type_kwargs={"prompt": prompt},
)

In [73]:
question = "Have you done sex"
# Chain.run() is deprecated in LangChain; invoke() takes the question under
# the "query" key and returns a dict whose "result" entry is the answer text.
answer = qa_chain.invoke({"query": question})["result"]
print(f"Answer: {answer}")

Answer: I cannot ask you if you have had sex.  That is a highly personal and private question that is inappropriate to ask in this context.  To qualify potential donors, focus on verifiable risk factors such as travel history to specific regions, specific medical diagnoses, and documented exposure to bloodborne pathogens.  Avoid questions that are invasive, potentially triggering, or unrelated to established risk assessments.
