In [1]:
import os 

In [2]:
%pwd

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot\\research'

In [3]:
os.chdir('../')

In [4]:
%pwd 

'd:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot'

In [26]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
# from langchain.retrievers import PineconeRetriever

In [6]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents 

In [7]:
extracted_data = load_pdf("data/")

In [9]:
extracted_data[450]

Document(metadata={'source': 'data\\medical-book.pdf', 'page': 450, 'page_label': '451'}, page_content='OTHER\nThe Meck Page.13 Apr. 1998 <http://www.merck.com>.\nRichard Robinson\nBacterial meningitis see Meningitis\nBacterial vaginosis see Vulvovaginitis\nBacteroides infection see Anaerobic\ninfections\nBad breath\nDefinition\nBad breath, sometimes called halitosis, is an\nunpleasant odor of the breath.\nDescription\nBad breath is likely to be experienced by most adults\nat least occasionally. Bad breath, either real or imagined,\ncan have a significant impact on a person’s social and\nprofessional life.\nCauses and symptoms\nBad breath can be caused by a number of problems.\nOral diseases, fermentation of food particles in the\nmouth, sinus infections, and unclean dentures can all\ncontribute to mouth odor. Many non-oral diseases, such\nas lung infections, kidney failure, or severe liver disease,\ncan also cause bad breath, though rarely. Many people\nthink that bad breath can origi

### Create text chunks

In [10]:
def text_split(data_extracted):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(data_extracted)
    return text_chunks


In [11]:
text_chunks = text_split(extracted_data)
print(len(text_chunks))

5860


In [12]:
text_chunks[400]

Document(metadata={'source': 'data\\medical-book.pdf', 'page': 50, 'page_label': '51'}, page_content='Acupressure points to relieve hay fever, sore throat, and\nheartburn. (Illustration by Electronic Illustrators Group.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 37')

In [7]:
# text_chunks

### Embedding Model 

In [13]:
import os
def download_embeddings():
    # Define the path where the embeddings should be saved
    cache_dir = os.path.join(os.getcwd(), "modelEmbedd")
    
    # Initialize embeddings with the specified cache directory
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        cache_folder=cache_dir
    )
    return embeddings

In [14]:
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(





To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [15]:
embeddings 

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder='d:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot\\modelEmbedd', model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [16]:
query_result = embeddings.embed_query("hello world")
print(len(query_result))

384


In [17]:
from dotenv import load_dotenv
import os

load_dotenv()  # This will load the environment variables from the .env file
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV= os.getenv('PINECONE_API_ENV')


In [18]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY,environment=PINECONE_API_ENV)

# Connect to your existing index
index_name = "medical-chatbot"
index = pc.Index(index_name)  # Use the .index() method to retrieve the existing index


In [19]:
index

<pinecone.data.index.Index at 0x188cf75f6d0>

In [20]:
# Assuming you have a HuggingFaceEmbeddings instance called 'embeddings'
upsert_data = [(str(i), embeddings.embed_query(text_chunks[i].page_content),{"text": text_chunks[i].page_content}) for i in range(len(text_chunks))]

# Perform the upsert


In [24]:
print(upsert_data[50])

('50', [-0.023009084165096283, -0.024584908038377762, -0.05563613399863243, 0.05322492867708206, -0.06256762146949768, 0.004432018380612135, -0.030097471550107002, 0.0626663789153099, -0.025787582620978355, 0.0062860422767698765, 0.0010368417715653777, 0.06985747814178467, -0.06694687902927399, -0.02845495194196701, -0.06762570887804031, 0.023163527250289917, 0.0076363952830433846, 0.010384813882410526, 0.00993509590625763, 0.020671026781201363, -0.04169965907931328, 0.06432665884494781, 0.07907937467098236, 0.06253725290298462, 0.04383961111307144, 0.0022406226489692926, -0.016082728281617165, -0.02480974607169628, -0.03271229565143585, 0.04841664060950279, -0.02532374858856201, -0.004334788769483566, 0.054473504424095154, 0.03735858574509621, 0.03530607372522354, 0.023333081975579262, -0.0184114258736372, 0.08702198415994644, -0.0045023392885923386, 0.015262584201991558, 0.05669070780277252, -0.008907446637749672, 0.009143990464508533, 0.1178724616765976, 0.01925436407327652, -0.0852

In [54]:
print(pc.list_indexes())
description = pc.describe_index("medical-chatbot")
print(f"Description of index: \n {description}")

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'medical-chatbot-bc4fyf5.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'medical-chatbot',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}
Description of index: 
 {'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'medical-chatbot-bc4fyf5.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medical-chatbot',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [78]:
batch_size = 100  # or any number that suits your data size
for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    index.upsert(vectors=batch)


In [27]:
query = "What are allergies?"
query_embedding = embeddings.embed_query(query)  # encoding query text into vector

In [28]:
len(query_embedding)

384

In [29]:

# Perform similarity search
results = index.query(
    vector=query_embedding,  # Query vector
    top_k=3,  # Retrieve top 3 most similar documents
    include_metadata=True  # Include metadata if available
)




In [30]:
# Print the results
for match in results['matches']:
    print(f"Score: {match['score']}, Text: {match['metadata']['text']}")
    print("\n")

Score: 0.702849507, Text: reaction. Allergic rhinitis is characterized by an itchy,
runny nose, often with a scratchy or irritated throat due
to post-nasal drip. Inflammation of the thin membrane
covering the eye (allergic conjunctivitis) causes redness,
irritation, and increased tearing in the eyes. Asthma caus-
es wheezing, coughing, and shortness of breath. Symp-
toms of food allergies depend on the tissues most sensi-
tive to the allergen and whether the allergen spread sys-


Score: 0.686704159, Text: reactions is triggered by harmless, everyday substances.
This is the condition known as allergy, and the offend-
ing substance is called an allergen. Common inhaled
allergens include pollen, dust, and insect parts from tiny
house mites. Common food allergens include nuts, fish,
and milk.
Allergic reactions involve a special set of cells in
the immune system known as mast cells. Mast cells
serve as guards in the tissues where the body meets the


Score: 0.681826174, Text: Purpose
Alle

In [31]:
vectorstore = Pinecone(
    index=index,  # Pinecone index instance
    embedding=embeddings.embed_query,  # Embedding function
    text_key="text"  # Key in metadata containing the document text
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})



  vectorstore = Pinecone(


In [21]:
prompt_template = """ 
Use the following peices of information to answer the user's question 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:{context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer: 

"""

In [32]:
prompt = PromptTemplate(template=prompt_template,input_variables=['context','question'])
chain_type_kwargs = {"prompt":prompt}

In [41]:
llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={'max_new_tokens':128,
                            'temperature':0.8})

In [42]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)

In [43]:
question = "Tell me about some human diseases?"
answer = qa_chain.run(question)
print(f"Answer: {answer}")

Answer: Diseases caused by internal factors include emotions such as stress, anxiety, depression, and other mental health conditions. External factors like pollution, poor dietition, nutr quality air quality of living conditions, air quality food choices infection, sanitation, sanitation, living conditions, living conditions atm weather conditions, air quality of hyght, living conditions, climate, living conditions, dietiongutation, environment and unhygst, living conditions, living conditions, nutr climate, unpasteur water quality air quality air quality air quality air quality air quality air quality air quality of san
