In [1]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Pinecone
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from tqdm import tqdm

# Extract data from the PDF
def load_pdf(data):
    loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    documents = loader.load()
    return documents

extracted_data = load_pdf("D:/Medical_chatbot/data/")
print("Documents extracted:", len(extracted_data))

# Create text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

text_chunks = text_split(extracted_data)
print("Length of my chunks:", len(text_chunks))

# Download embedding model
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

embeddings = download_hugging_face_embeddings()

# Initialize Pinecone
api_key = "b0b29cdd-c3d5-40a9-8c81-7f75f4a19bf3"
pc = pinecone.Pinecone(api_key=api_key)

index_name = "medical"

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Generate embeddings for text chunks and prepare for upsert
def generate_embeddings(text_chunks, embeddings):
    embedded_texts = []
    for i, chunk in enumerate(tqdm(text_chunks, desc="Generating embeddings")):
        vector = embeddings.embed_query(chunk.page_content)
        embedded_texts.append({
            "id": f"chunk_{i}",
            "values": vector,
            "metadata": {"text": chunk.page_content}
        })
    return embedded_texts

embedded_texts = generate_embeddings(text_chunks, embeddings)

# Upsert embeddings to Pinecone index
for i in tqdm(range(0, len(embedded_texts), 100), desc="Upserting embeddings"):
    batch = embedded_texts[i:i+100]
    index.upsert(vectors=batch)

print("Upserted text chunks into Pinecone index successfully.")

# Query the Pinecone index
query_results = index.query(
    vector=embedded_texts[0]['values'],
    top_k=3,
    include_values=True
)

print("Query results:", query_results)


  from tqdm.autonotebook import tqdm


Documents extracted: 637
Length of my chunks: 7020


Generating embeddings: 100%|██████████| 7020/7020 [40:52<00:00,  2.86it/s]
Upserting embeddings: 100%|██████████| 71/71 [02:40<00:00,  2.26s/it]


Upserted text chunks into Pinecone index successfully.
Query results: {'matches': [{'id': 'chunk_0',
              'score': 1.0,
              'values': [0.00174607965,
                         -0.0335028544,
                         -0.0329039358,
                         0.00716804,
                         -0.0146032888,
                         0.0102619026,
                         -0.0115152597,
                         0.229302093,
                         -0.0232323837,
                         0.00412041647,
                         -0.0365608521,
                         0.0859211087,
                         0.0129721984,
                         0.0522178747,
                         -0.102326214,
                         -0.00313904765,
                         -0.0126869297,
                         0.000471863983,
                         -0.028485857,
                         -0.0502591804,
                         0.0115509806,
                         0.0778065324,
  

In [18]:
# Define your question
question = "What causes asthma?"

# Generate the embedding for the question and perform the similarity search
query_results = index.query(vector=embeddings.embed_query(question), top_k=3, include_values=True, include_metadata=True)

# Extract and print the text content of the top matches
for match in query_results['matches']:
    #print(f"Chunk ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Text Content: {match['metadata'].get('text', 'Text not found')}\n")


Score: 0.740090549
Text Content: or nasal polyps , or they may be sensitive to aspirin and
related drugs. Another major source of adult asthma isexposure at work to animal products, certain forms ofplastic, wood dust, or metals.
Causes and symptoms
In most cases, asthma is caused by inhaling an aller-

Score: 0.715492904
Text Content: Asthma attacks can be caused by allergies to pollen,
dust, pets or other things, but people without knownallergies may also have asthma. Exercise ,stress , intense
emotions, exposure to cold, certain medicines and somemedical conditions also can bring on attacks.
The two main approaches to dealing with asthma are

Score: 0.691601098
Text Content: inflammatory disease of the airways. In those susceptibleto asthma, this inflammation causes the airways to nar-row periodically. This, in turn, produces wheezing and
breathlessness, sometimes to the point where the patient
GALE ENCYCLOPEDIA OF MEDICINE 2 379AsthmaGEM - 0001 to 0432 - A  10/22/03 1:43 PM  Page 37

In [20]:
import os
from dotenv import load_dotenv
from groq import Groq



# Initialize the Groq client with the Gemma-7b-it model
client = Groq(api_key="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")

# Define your question
question = "What causes asthma?"

# Create a chat completion using the model
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": question,
        }
    ],
    model="Gemma-7b-it",
)

# Get the response content
response_content = chat_completion.choices[0].message.content

query_results = index.query(vector=embeddings.embed_query(response_content), top_k=3, include_values=True, include_metadata=True)

# Extract and print the text content of the top matches
for match in query_results['matches']:
    print(f"Chunk ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Text Content: {match['metadata'].get('text', 'Text not found')}\n")


Chunk ID: chunk_2732
Score: 0.75076
Text Content: Asthma attacks can be caused by allergies to pollen,
dust, pets or other things, but people without knownallergies may also have asthma. Exercise ,stress , intense
emotions, exposure to cold, certain medicines and somemedical conditions also can bring on attacks.
The two main approaches to dealing with asthma are

Chunk ID: chunk_4329
Score: 0.717816591
Text Content: or nasal polyps , or they may be sensitive to aspirin and
related drugs. Another major source of adult asthma isexposure at work to animal products, certain forms ofplastic, wood dust, or metals.
Causes and symptoms
In most cases, asthma is caused by inhaling an aller-

Chunk ID: chunk_4333
Score: 0.716788173
Text Content: near people who are smoking, can irritate the airwaysand trigger an asthmatic attack. Air pollutants can have asimilar effect. In addition, there are three important fac-tors that regularly produce attacks in certain asthmaticpatients, and they may someti

In [21]:
import os
from dotenv import load_dotenv
from groq import Groq

client = Groq(api_key="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")

# Define your question and prompt template
question = "What causes asthma?"
prompt_template = "Explain the following topic in detail: {question}"

# Format the prompt template with the question
formatted_prompt = prompt_template.format(question=question)

# Create a chat completion using the formatted prompt
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": formatted_prompt,
        }
    ],
    model="Gemma-7b-it",
)

# Get the response content
response_content = chat_completion.choices[0].message.content


question_embedding = embeddings.embed_query(response_content)  # Placeholder for actual embedding method

# Perform the similarity search using the generated question embedding
query_results = index.query(vector=question_embedding, top_k=3, include_values=True, include_metadata=True)

# Extract and print the text content of the top matches
for match in query_results['matches']:
    print(f"Chunk ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Text Content: {match['metadata'].get('text', 'Text not found')}\n")


Chunk ID: chunk_2732
Score: 0.781488121
Text Content: Asthma attacks can be caused by allergies to pollen,
dust, pets or other things, but people without knownallergies may also have asthma. Exercise ,stress , intense
emotions, exposure to cold, certain medicines and somemedical conditions also can bring on attacks.
The two main approaches to dealing with asthma are

Chunk ID: chunk_4329
Score: 0.713857055
Text Content: or nasal polyps , or they may be sensitive to aspirin and
related drugs. Another major source of adult asthma isexposure at work to animal products, certain forms ofplastic, wood dust, or metals.
Causes and symptoms
In most cases, asthma is caused by inhaling an aller-

Chunk ID: chunk_4333
Score: 0.692672491
Text Content: near people who are smoking, can irritate the airwaysand trigger an asthmatic attack. Air pollutants can have asimilar effect. In addition, there are three important fac-tors that regularly produce attacks in certain asthmaticpatients, and they may so

In [None]:
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
import os
from dotenv import load_dotenv
from groq import Groq

# Load environment variables from a .env file
load_dotenv()
groq_api_key = os.getenv('GROQ_API_KEY')

# Initialize the Groq client with the Gemma-7b-it model
client = Groq(api_key=groq_api_key)

# Define your context and question
context = "Asthma is a common respiratory condition characterized by inflammation and narrowing of the airways."
question = "What causes asthma?"

# Define your prompt template
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

# Format the prompt template with the context and question
formatted_prompt = prompt_template.format(context=context, question=question)

# Create a chat completion using the formatted prompt
chat_completion = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": formatted_prompt,
        }
    ],
    model="Gemma-7b-it",
)

# Get the response content
response_content = chat_completion.choices[0].message.content

# Assuming you have a method to convert text to embeddings, replace `embeddings.embed_query`
# with the actual method you're using to generate embeddings.
question_embedding = embeddings.embed_query(response_content)  # Placeholder for actual embedding method

# Perform the similarity search using the generated question embedding
query_results = index.query(vector=question_embedding, top_k=3, include_values=True, include_metadata=True)

# Extract and print the text content of the top matches
for match in query_results['matches']:
    print(f"Chunk ID: {match['id']}")
    print(f"Score: {match['score']}")
    print(f"Text Content: {match['metadata'].get('text', 'Text not found')}\n")


In [None]:
PROMPT=PromptTemplate(template=prompt_template, input_variables=["context", "question"])
chain_type_kwargs={"prompt": PROMPT}