In [17]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers

In [18]:
def load_pdf(data):
    """Load the PDF files of a directory as documents.

    Args:
        data: Path of the directory handed to ``DirectoryLoader``; only
            entries matching the ``*.pdf`` glob are read, each one through
            ``PyPDFLoader``.

    Returns:
        The result of ``DirectoryLoader.load()`` for that directory.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

In [19]:
# Read every PDF under the relative "Data/" directory into memory.
ext_data = load_pdf("Data/")

In [40]:
def text_split(ext_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller overlapping chunks.

    Args:
        ext_data: Documents to split (as returned by ``load_pdf``).
        chunk_size: Value forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to 500, the value previously hard-coded.
        chunk_overlap: Value forwarded as ``chunk_overlap``. Defaults to 20,
            the value previously hard-coded.

    Returns:
        The chunks produced by ``split_documents`` for ``ext_data``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(ext_data)

In [41]:
# Chunk the loaded documents; `text_chunks` feeds the embedding/upsert step.
text_chunks = text_split(ext_data)

In [42]:
# Sanity check: how many chunks the splitter produced.
print(len(text_chunks))

7020


In [31]:
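# Superseded LangChain-based embedding loader, kept commented out for reference;
# the active download_hugging_face_embeddings (defined further down) uses SentenceTransformer directly.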
# def download_hugging_face_embeddings():
#     embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
#     return embeddings
# embeddings = download_hugging_face_embeddings()

In [43]:
import os

from dotenv import load_dotenv

# Pull variables from a local .env file into the process environment
# before any of them are read.
load_dotenv()

In [44]:
# Pinecone API key taken from the environment (None when the variable is unset).
PINECONE1_API_KEY = os.getenv("PINECONE1_API_KEY")


In [45]:
# NOTE: this import rebinds the name `Pinecone`, which the first cell imported
# from langchain.vectorstores; from this cell on, `Pinecone` is the class
# exported by the `pinecone` package.
from pinecone import Pinecone

# Build the client from the API key read earlier, then bind `index` to the
# index named "medicalchatbot" (used below for upsert and query).
pc = Pinecone(api_key=PINECONE1_API_KEY)
index = pc.Index("medicalchatbot")

In [49]:
from sentence_transformers import SentenceTransformer

# Factory for the sentence-transformers embedding model
def download_hugging_face_embeddings():
    """Return a ``SentenceTransformer`` built for
    "sentence-transformers/all-MiniLM-L6-v2"."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Instantiate the embedding model once; `model` is reused below to encode
# both the text chunks and the query text.
model = download_hugging_face_embeddings()

### Upserting data with metadata

# Window of text_chunks handled by this run. Both the slice and the record IDs
# are derived from these two constants so that "text-<n>" always refers to
# text_chunks[n].
# NOTE(review): the previous version sliced [2000:3000] but numbered the IDs
# from 3000, so IDs and chunk positions disagreed. Confirm which IDs already
# exist in the index before re-running this cell.
BATCH_START, BATCH_END = 2000, 3000

# Extract the raw text of each chunk in the window
cleaned_data = [t.page_content for t in text_chunks[BATCH_START:BATCH_END]]

# Generate one embedding per text chunk (same order as cleaned_data)
embeddings = model.encode(cleaned_data)

# Format the records for upserting: ID, vector as a plain list, and the
# source text stored under the "context" metadata key.
entries = [
    {
        "id": f"text-{BATCH_START + i}",
        "values": embedding.tolist(),
        "metadata": {"context": text},
    }
    for i, (embedding, text) in enumerate(zip(embeddings, cleaned_data))
]

# Last expression of the cell, so the upsert response is displayed
index.upsert(vectors=entries)

{'upserted_count': 1000}

 Querying

In [50]:
# Example question used to exercise the retrieval step below.
query_text = "What is allergy"


In [51]:
# numpy is imported here because this cell references `np`, and no earlier
# cell in the notebook imports it (a fresh kernel would raise NameError).
import numpy as np
import torch

# Embed the query text with the same model used for the stored chunks.
# encode() is given a one-element list, so take the first (only) result.
query_embedding = model.encode([query_text])[0]

# The query vector is sent to the index as a plain list of floats; both
# numpy arrays and torch tensors convert via .tolist(). Anything else is
# rejected explicitly rather than passed through.
if isinstance(query_embedding, (np.ndarray, torch.Tensor)):
    query_embedding = query_embedding.tolist()
else:
    raise ValueError(f"Unexpected embedding type: {type(query_embedding)}")



In [56]:
# Retrieve the 10 closest stored vectors, metadata included
results = index.query(
    vector=query_embedding,
    top_k=10,
    include_metadata=True,
)



In [57]:
# Display the raw query response.
results

{'matches': [{'id': 'text-1340',
              'metadata': {'context': 'When thisoccurs, an allergy develops '
                                      'against the offending sub-stance (an '
                                      'allergen.)'},
              'score': 0.705211341,
              'values': []},
             {'id': 'text-1430',
              'metadata': {'context': 'Purpose\n'
                                      'Allergy is a reaction of the immune '
                                      'system. Nor-'},
              'score': 0.69418323,
              'values': []},
             {'id': 'text-1372',
              'metadata': {'context': 'GALE ENCYCLOPEDIA OF MEDICINE 2 '
                                      '117Allergies\n'
                                      'Allergic rhinitis is commonly triggered '
                                      'by\n'
                                      'exposure to household dust, animal '
                                      'fur,or polle

In [None]:
# Same query restricted to the 5 closest matches; rebinds `results`
results = index.query(
    vector=query_embedding,
    top_k=5,
    include_metadata=True,
)

In [62]:
# Keep only the stored passage text (the "context" metadata) of each match
final_result = [match['metadata']['context'] for match in results['matches']]

In [63]:
# Display the retrieved passages.
final_result

['When thisoccurs, an allergy develops against the offending sub-stance (an allergen.)',
 'Purpose\nAllergy is a reaction of the immune system. Nor-',
 "GALE ENCYCLOPEDIA OF MEDICINE 2 117Allergies\nAllergic rhinitis is commonly triggered by\nexposure to household dust, animal fur,or pollen. The foreign substance thattriggers an allergic reaction is calledan allergen.\nThe presence of an allergen causes the\nbody's lymphocytes to begin producingIgE antibodies. The lymphocytes of an allergy sufferer produce an unusuallylarge amount of IgE.\nIgE molecules attach to mast\ncells, which contain histamine.HistaminePollen grains\nLymphocyte\nFIRST EXPOSURE",
 'Allergen —A substance that provokes an allergic\nresponse.\nAllergic rhinitis —Inflammation of the mucous\nmembranes of the nose and eyes in response to anallergen.\nAnaphylaxis —Increased sensitivity caused by previ-\nous exposure to an allergen that can result in bloodvessel dilation and smooth muscle contraction.Anaphylaxis can resul