In [16]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from tqdm import tqdm

# Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

extracted_data = load_pdf("D:/Medical_chatbot/data/")
print("Documents extracted:", len(extracted_data))

# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)
print("Length of my chunks:", len(text_chunks))

# Download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

# Initialize Pinecone
api_key = "b0b29cdd-c3d5-40a9-8c81-7f75f4a19bf3"
pc = pinecone.Pinecone(api_key=api_key)

index_name = "medical"

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Generate embeddings for text chunks and prepare for upsert
def generate_embeddings(text_chunks, embeddings):
    embedded_texts = []
    for i, chunk in enumerate(tqdm(text_chunks, desc="Generating embeddings")):
        vector = embeddings.embed_query(chunk.page_content)
        embedded_texts.append({
            "id": f"chunk_{i}",
            "values": vector,
            "metadata": {"text": chunk.page_content}
        })
    return embedded_texts

embedded_texts = generate_embeddings(text_chunks, embeddings)

# Upsert embeddings to Pinecone index
for i in tqdm(range(0, len(embedded_texts), 100), desc="Upserting embeddings"):
    batch = embedded_texts[i:i+100]
    index.upsert(vectors=batch)

print("Upserted text chunks into Pinecone index successfully.")

# Query the Pinecone index
query_results = index.query(
    vector=embedded_texts[0]['values'],
    top_k=3,
    include_values=True
)

print("Query results:", query_results)


Documents extracted: 637
Length of my chunks: 7020


Generating embeddings: 100%|██████████| 7020/7020 [28:48<00:00,  4.06it/s]     
Upserting embeddings: 100%|██████████| 71/71 [03:56<00:00,  3.33s/it]


Upserted text chunks into Pinecone index successfully.
Query results: {'matches': [{'id': 'chunk_0',
              'score': 1.0,
              'values': [0.00174607965,
                         -0.0335028544,
                         -0.0329039358,
                         0.00716804,
                         -0.0146032888,
                         0.0102619026,
                         -0.0115152597,
                         0.229302093,
                         -0.0232323837,
                         0.00412041647,
                         -0.0365608521,
                         0.0859211087,
                         0.0129721984,
                         0.0522178747,
                         -0.102326214,
                         -0.00313904765,
                         -0.0126869297,
                         0.000471863983,
                         -0.028485857,
                         -0.0502591804,
                         0.0115509806,
                         0.0778065324,
  

In [22]:
query = "What are Allergies"

# Generate the embedding for the query
query_embedding = embeddings.embed_query(query)

# Perform the similarity search using the generated query embedding
query_results = index.query(
    vector=query_embedding,
    top_k=3,
    include_values=True
)

print("Query results:", query_results)


Query results: {'matches': [{'id': 'chunk_1372',
              'score': 0.682538807,
              'values': [0.0354718342,
                         -0.0110478457,
                         0.0751323476,
                         0.00805771817,
                         0.113756694,
                         0.0377156436,
                         0.113712087,
                         0.0840440318,
                         -0.0305278748,
                         0.0913646817,
                         0.0113536678,
                         -0.061471466,
                         0.0239176396,
                         0.0503650643,
                         -0.0131362136,
                         0.0664993078,
                         -0.0492045097,
                         -0.0601874553,
                         -0.0266646501,
                         -0.0484286509,
                         0.00717465766,
                         0.0825395286,
                         -0.0148052443,
          

In [25]:
# Define your question
question = "What causes asthma?"

# Generate the embedding for the question
question_embedding = embeddings.embed_query(question)

# Perform the similarity search using the generated question embedding
query_results = index.query(vector=question_embedding,top_k=3,include_values=True)

# Create a dictionary to map chunk IDs to their text content
id_to_text = {doc['id']: doc['metadata']['text'] for doc in embedded_texts}

# Extract and print the text content of the top matches
for match in query_results['matches']:
    chunk_id = match['id']
    score = match['score']
    text_content = id_to_text.get(chunk_id, "Text not found")
    print(f"Chunk ID: {chunk_id}")
    print(f"Score: {score}")
    print(f"Text Content: {text_content}\n")


Chunk ID: chunk_4329
Score: 0.740090549
Text Content: or nasal polyps , or they may be sensitive to aspirin and
related drugs. Another major source of adult asthma isexposure at work to animal products, certain forms ofplastic, wood dust, or metals.
Causes and symptoms
In most cases, asthma is caused by inhaling an aller-

Chunk ID: chunk_2732
Score: 0.715492904
Text Content: Asthma attacks can be caused by allergies to pollen,
dust, pets or other things, but people without knownallergies may also have asthma. Exercise ,stress , intense
emotions, exposure to cold, certain medicines and somemedical conditions also can bring on attacks.
The two main approaches to dealing with asthma are

Chunk ID: chunk_4319
Score: 0.691601098
Text Content: inflammatory disease of the airways. In those susceptibleto asthma, this inflammation causes the airways to nar-row periodically. This, in turn, produces wheezing and
breathlessness, sometimes to the point where the patient
GALE ENCYCLOPEDIA OF MEDICI