In [229]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone

import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [49]:
PINECONE_API_KEY = "1bae0d8e-019e-4e87-8080-ecf523e5f25f"
PINECONE_API_ENV = "us-east1-aws"

In [13]:
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

In [15]:
extracted_data = load_pdf("data/")

In [24]:
# Data is extracted from the PDFs
# Now form chunks out of it

Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1})

In [82]:
def text_split(extracted_data):
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    chunks = splitter.split_documents(extracted_data)
    return chunks


In [83]:
chunks = text_split(extracted_data)

In [84]:
# len(chunks)
chunks[0]
# Chunks formation is done
# Now, convert the chunks into embeddings
# Then Store this embeddings to VectorDB pinecone

Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1})

In [85]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [86]:
embeddings = download_hugging_face_embeddings()

In [125]:

chunk_embeddings = []
for i in range(0, len(chunks)):
    chunk_embeddings.append(embeddings.embed_query(chunks[i].page_content))


In [128]:
len(chunk_embeddings)
chunks[0]

Document(page_content='TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION', metadata={'source': 'data\\Medical_book.pdf', 'page': 1})

In [148]:
chunk_content = [chunks[i].page_content for i in range(0 , len(chunks))]

In [149]:

chunk_ids = [str(i+1) for i in range(0, len(chunks))]


In [153]:
print(chunk_ids[0])
print(chunk_embeddings[0])
chunk_content[0]

1
[0.0017460489179939032, -0.033502884209156036, -0.03290388733148575, 0.007168094161897898, -0.01460327859967947, 0.010261928662657738, -0.01151528861373663, 0.22930213809013367, -0.023232396692037582, 0.004120402969419956, -0.036560822278261185, 0.08592110127210617, 0.012972140684723854, 0.05221788212656975, -0.10232618451118469, -0.003139043692499399, -0.012686969712376595, 0.000471863109851256, -0.02848585695028305, -0.050259195268154144, 0.01155101228505373, 0.0778065174818039, 0.09282823652029037, -0.0137972766533494, -0.016935130581259727, -0.025955867022275925, -0.04956510663032532, -0.046131301671266556, 0.00729052210226655, -0.013553328812122345, 0.038439445197582245, 0.06280472129583359, 0.018353812396526337, 0.008242843672633171, 0.0017155527602881193, -0.039861857891082764, -0.011638614349067211, 0.016446180641651154, 0.025595590472221375, 0.09104609489440918, 0.029672738164663315, -0.05416030064225197, -0.04576560854911804, -0.013853926211595535, 0.02577359229326248, 0.01

'TheGALE\nENCYCLOPEDIA\nofMEDICINE\nSECOND EDITION'

In [154]:
# It will automatically convert the chunks into embeddings and store it on pinecone Db instance
from langchain_pinecone import PineconeVectorStore
# Replace with your actual API key
# pinecone_api_key = "YO
pc = Pinecone(api_key=PINECONE_API_KEY)
# Replace with your index name
index_name = "medical-chatbot"
index = pc.Index(index_name)

In [158]:
upsert_vectors = [
    {
        "id": chunk_id,
        "values": embedding,
        "metadata": {"text": content}  # Replace with actual metadata if available
    }
    for chunk_id, embedding, content in zip(chunk_ids, chunk_embeddings, chunk_content)
]


In [159]:
upsert_vectors[0]

{'id': '1',
 'values': [0.0017460489179939032,
  -0.033502884209156036,
  -0.03290388733148575,
  0.007168094161897898,
  -0.01460327859967947,
  0.010261928662657738,
  -0.01151528861373663,
  0.22930213809013367,
  -0.023232396692037582,
  0.004120402969419956,
  -0.036560822278261185,
  0.08592110127210617,
  0.012972140684723854,
  0.05221788212656975,
  -0.10232618451118469,
  -0.003139043692499399,
  -0.012686969712376595,
  0.000471863109851256,
  -0.02848585695028305,
  -0.050259195268154144,
  0.01155101228505373,
  0.0778065174818039,
  0.09282823652029037,
  -0.0137972766533494,
  -0.016935130581259727,
  -0.025955867022275925,
  -0.04956510663032532,
  -0.046131301671266556,
  0.00729052210226655,
  -0.013553328812122345,
  0.038439445197582245,
  0.06280472129583359,
  0.018353812396526337,
  0.008242843672633171,
  0.0017155527602881193,
  -0.039861857891082764,
  -0.011638614349067211,
  0.016446180641651154,
  0.025595590472221375,
  0.09104609489440918,
  0.02967273816

In [161]:
# docsearch = Pinecone.from_texts([t.page_content for t in chunks], embeddings, index_name)

# Upsert the chunks into Pinecone
# index.upsert(vectors=upsert_vectors)
batch_size = 500  # Adjust as necessary based on your data size and Pinecone limits
for i in range(0, len(upsert_vectors), batch_size):
    batch_vectors = upsert_vectors[i:i + batch_size]
    index.upsert(vectors=batch_vectors)


In [205]:
query_embedding = embeddings.embed_query("What are allergies")
print(len(query_embedding))
# Perform query to retrieve similar vectors
results = index.query(vector=[query_embedding], top_k=3, include_values=True)


384


In [207]:
print(results)

{'matches': [{'id': '1373',
              'score': 0.682538807,
              'values': [0.0354718231,
                         -0.0110478504,
                         0.075132288,
                         0.00805769768,
                         0.113756657,
                         0.0377156585,
                         0.113712065,
                         0.0840440243,
                         -0.0305278692,
                         0.091364637,
                         0.0113536799,
                         -0.0614714548,
                         0.0239176769,
                         0.0503650531,
                         -0.0131362388,
                         0.0664992481,
                         -0.0492045134,
                         -0.0601874441,
                         -0.026664611,
                         -0.048428636,
                         0.00717463,
                         0.0825395435,
                         -0.0148052601,
                         -0.038249176

In [208]:
matched_ids = [match['id'] for match in results['matches']]
print(matched_ids)

['1373', '1356', '1306']


In [209]:
chunks[1306].page_content
# Now, based on these top results, I will send it to the llm and it will return the appropriate answer

'mous. Seasonal AR is most commonly caused by grassand tree pollens, since their pollen is produced in largeamounts and is dispersed by the wind. Showy flowers,like roses or lilacs, that attract insects produce a stickypollen which is less likely to become airborne. Differentplants release their pollen at different times of the year,so seasonal AR sufferers may be most affected in spring,summer, or fall, depending on which plants provoke aresponse. The amount of pollen in the air is reflected'

In [194]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below nothing else.
Helpful Answer: 
"""

In [195]:
PROMPT = PromptTemplate(template = prompt_template, input_variables=["context", "question"])
chain_type_kwargs = {"prompt":PROMPT}

In [252]:
llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin", model_type="llama", config={'max_new_tokens': 512, 'temperature': 1})

In [253]:
# Create Pinecone retriever
vector_store = Pinecone(index, embeddings, text_key="text")

In [254]:

qa = RetrievalQA.from_chain_type(llm, chain_type="stuff",retriever = vector_store.as_retriever(search_kwargs={"k": 2}), chain_type_kwargs=chain_type_kwargs)

In [261]:
# Example query
query = "How to cure AIDS?"
answer = qa.invoke(query)
print(answer)

{'query': 'How to cure AIDS?', 'result': "I don't know of any known cure for AIDS at this time. The current treatment options focus on managing symptoms, slowing or halting the progression of the disease, and improving quality of life. It is important to note that while there are antiretroviral therapies available, they do not eliminate the virus completely, but rather keep it under control, thereby delaying the progression of the disease. Therefore, there is an ongoing search for a cure or functional cure, but no definitive answers yet. "}
