In [98]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyMuPDFLoader, DirectoryLoader,PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
# from langchain.retrievers import PineconeRetriever

In [2]:
def load_pdf(data):
    loader = DirectoryLoader(data,
                    glob="*.pdf",
                    loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents 

In [3]:
extracted_data = load_pdf("data/")

In [4]:
# extracted_data

### Create text chunks

In [5]:
def text_split(data_extracted):

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(data_extracted)
    return text_chunks


In [6]:
text_chunks = text_split(extracted_data)
print(len(text_chunks))

5860


In [74]:
text_chunks[400]

Document(metadata={'source': 'data\\medical-book.pdf', 'page': 50, 'page_label': '51'}, page_content='Acupressure points to relieve hay fever, sore throat, and\nheartburn. (Illustration by Electronic Illustrators Group.)\nGEM - 0001 to 0432 - A  10/22/03 1:41 PM  Page 37')

In [7]:
# text_chunks

### Embedding Model 

In [8]:
import os
def download_embeddings():
    # Define the path where the embeddings should be saved
    cache_dir = os.path.join(os.getcwd(), "models")
    
    # Initialize embeddings with the specified cache directory
    embeddings = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2",
        cache_folder=cache_dir
    )
    return embeddings


In [9]:
embeddings = download_embeddings()

  embeddings = HuggingFaceEmbeddings(





In [10]:
embeddings 

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder='d:\\Tutorial\\GenAI\\Projects\\Medical-Chatbot\\models', model_kwargs={}, encode_kwargs={}, multi_process=False, show_progress=False)

In [11]:
query_result = embeddings.embed_query("hello world")
print(len(query_result))

384


In [42]:
from dotenv import load_dotenv
import os

load_dotenv()  # This will load the environment variables from the .env file
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
PINECONE_API_ENV= os.getenv('PINECONE_API_ENV')


In [44]:
print(PINECONE_API_KEY)
print(PINECONE_API_ENV)

pcsk_2oVXis_TBuupQJDi3qHCGHL5BQifhBAMfCiWfU69WtbNnQdQfi21KSMeCUqMsrWZuKVLXu
aped-4627-b74a


In [45]:
from pinecone import Pinecone

pc = Pinecone(api_key=PINECONE_API_KEY,environment=PINECONE_API_ENV)

# Connect to your existing index
index_name = "medical-chatbot"
index = pc.Index(index_name)  # Use the .index() method to retrieve the existing index


In [46]:
index

<pinecone.data.index.Index at 0x2ba2746f670>

In [76]:
# Assuming you have a HuggingFaceEmbeddings instance called 'embeddings'
upsert_data = [(str(i), embeddings.embed_query(text_chunks[i].page_content),{"text": text_chunks[i].page_content}) for i in range(len(text_chunks))]

# Perform the upsert


In [77]:
print(upsert_data[0])

('0', [0.021459683775901794, -0.008097134530544281, -0.026178136467933655, 0.016104133799672127, -0.03194974735379219, 0.00957584474235773, 0.00321900169365108, 0.19288143515586853, -0.032423410564661026, -0.041329674422740936, 0.005839129909873009, 0.08297932147979736, 0.04542427137494087, 0.02660774625837803, -0.11355927586555481, 0.006353229284286499, -0.03204401955008507, -0.030400892719626427, -0.007295815274119377, -0.02290460467338562, -0.050757940858602524, 0.08314482122659683, 0.05323915556073189, 0.0234315637499094, -0.08658876270055771, 0.0478459969162941, -0.05883761867880821, -0.05568983778357506, -0.001716391183435917, -0.01642111875116825, -0.0034194032195955515, 0.09850358217954636, 0.055149514228105545, -0.013974909670650959, -0.00471839401870966, -0.04058890789747238, 0.018936995416879654, 0.028466319665312767, -0.04568350687623024, 0.10384681075811386, 0.0267010610550642, -0.06810127198696136, -0.0347861684858799, -0.0038320529274642467, 0.06335132569074631, 0.051609

In [54]:
print(pc.list_indexes())
description = pc.describe_index("medical-chatbot")
print(f"Description of index: \n {description}")

{'indexes': [{'deletion_protection': 'disabled',
              'dimension': 384,
              'host': 'medical-chatbot-bc4fyf5.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'medical-chatbot',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}
Description of index: 
 {'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'medical-chatbot-bc4fyf5.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'medical-chatbot',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}


In [78]:
batch_size = 100  # or any number that suits your data size
for i in range(0, len(upsert_data), batch_size):
    batch = upsert_data[i:i + batch_size]
    index.upsert(vectors=batch)


In [79]:
query = "What are allergies?"
query_embedding = embeddings.embed_query(query)  # encoding query text into vector

In [80]:
len(query_embedding)

384

In [90]:

# Perform similarity search
results = index.query(
    vector=query_embedding,  # Query vector
    top_k=3,  # Retrieve top 3 most similar documents
    include_metadata=True  # Include metadata if available
)

# Print the results
# for match in results['matches']:
#     print(f"Score: {match['score']}, Text: {match['metadata']['text']}")
#     print("\n")


In [101]:
vectorstore = Pinecone(
    index=index,  # Pinecone index instance
    embedding=embeddings.embed_query,  # Embedding function
    text_key="text"  # Key in metadata containing the document text
)

retriever = vectorstore.as_retriever(search_kwargs={"k": 3})





In [89]:
prompt_template = """ 
Use the following peices of information to answer the user's question 
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context:{context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer: 

"""

In [91]:
prompt = PromptTemplate(template=prompt_template,input_variables=['context','question'])
chain_type_kwargs = {"prompt":prompt}

In [92]:
llm = CTransformers(model="model/llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={'max_new_tokens':512,
                            'temperature':0.6})

In [102]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    chain_type_kwargs={"prompt": prompt},
)

In [103]:
question = "What are allergies?"
answer = qa_chain.run(question)
print(f"Answer: {answer}")

  answer = qa_chain.run(question)


Answer: Allergies are a reaction of the immune system where the body mistakenly identifies harmless substances as threats, triggering an inappropriate and exaggerated response. This can result in a range of symptoms including itchy eyes, runny nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, noses, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, and cong, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, stuffy nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, and cong nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, and stuffy nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, and cong noses, nose, nose, nose, nose, nose, nose, nose, nose, nose, nose, 