In [None]:
import os

import pinecone
from dotenv import load_dotenv
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from tqdm import tqdm

# Extract data from the PDF
def load_pdf(data):
    """Load the PDF files matched by ``*.pdf`` under the ``data`` directory.

    Args:
        data: Directory path handed to ``DirectoryLoader``.

    Returns:
        The result of ``DirectoryLoader.load()``, with each matched file
        read through ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()

# Directory holding the source PDFs. Set MEDICAL_CHATBOT_DATA_DIR to run the
# notebook on another machine without editing this cell; the default keeps
# the original location.
DATA_DIR = os.environ.get("MEDICAL_CHATBOT_DATA_DIR", "D:/Medical_chatbot/data/")

extracted_data = load_pdf(DATA_DIR)
print("Documents extracted:", len(extracted_data))

# Create text chunks
def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into smaller chunks.

    Args:
        extracted_data: Documents to split, passed unchanged to
            ``RecursiveCharacterTextSplitter.split_documents``.
        chunk_size: ``chunk_size`` given to the splitter. Defaults to 500,
            the value this notebook has always used.
        chunk_overlap: ``chunk_overlap`` given to the splitter. Defaults
            to 20.

    Returns:
        The chunks produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(extracted_data)

# Chunk the loaded documents and report how many chunks came out.
text_chunks = text_split(extracted_data)
chunk_total = len(text_chunks)
print("Length of my chunks:", chunk_total)

# Download embedding model
def download_hugging_face_embeddings():
    """Create the embedding wrapper used throughout this notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return HuggingFaceEmbeddings(model_name=model_name)

embeddings = download_hugging_face_embeddings()

# Initialize Pinecone
# The API key is read from the environment (or a local .env file) so the
# credential never lives in the notebook. A key that was ever committed in
# plain text should be revoked and replaced.
load_dotenv()
api_key = os.getenv("PINECONE_API_KEY")
if not api_key:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it or add it to a .env file."
    )
pc = pinecone.Pinecone(api_key=api_key)

index_name = "medical"

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # NOTE(review): presumably the output size of the embedding model
        # loaded above -- confirm before switching models.
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

# Connect to the index
index = pc.Index(index_name)

# Generate embeddings for text chunks and prepare for upsert
def generate_embeddings(text_chunks, embeddings):
    """Embed every chunk and shape it as a record ready for upsert.

    Args:
        text_chunks: Iterable of objects exposing a ``page_content`` string.
        embeddings: Object exposing ``embed_query(text)``; it is called once
            per chunk.

    Returns:
        A list with one dict per chunk, in input order, holding ``id``
        (``chunk_<position>``), ``values`` (the embedding) and ``metadata``
        (``{"text": <chunk text>}``).
    """
    progress = tqdm(text_chunks, desc="Generating embeddings")
    return [
        {
            "id": f"chunk_{position}",
            "values": embeddings.embed_query(chunk.page_content),
            "metadata": {"text": chunk.page_content},
        }
        for position, chunk in enumerate(progress)
    ]

embedded_texts = generate_embeddings(text_chunks, embeddings)

# Upsert embeddings to Pinecone index
UPSERT_BATCH_SIZE = 100  # number of vectors sent per upsert request
for start in tqdm(
    range(0, len(embedded_texts), UPSERT_BATCH_SIZE),
    desc="Upserting embeddings",
):
    index.upsert(vectors=embedded_texts[start:start + UPSERT_BATCH_SIZE])

print("Upserted text chunks into Pinecone index successfully.")

# Query the Pinecone index
# Sanity check: search with the first chunk's own vector. Guarded because an
# empty data directory yields no chunks and indexing [0] would raise.
if embedded_texts:
    query_results = index.query(
        vector=embedded_texts[0]['values'],
        top_k=3,
        include_values=True
    )
    print("Query results:", query_results)
else:
    print("No text chunks were embedded; skipping the sanity-check query.")


In [None]:
# Define your question
question = "What causes asthma?"

# Generate the embedding for the question
question_embedding = embeddings.embed_query(question)

# Perform the similarity search using the generated question embedding.
# Request the stored metadata (it carries the chunk text) instead of the raw
# vector values, which this cell never reads.
query_results = index.query(vector=question_embedding, top_k=3, include_metadata=True)

# Local ID -> text map, used only when a match comes back without usable metadata
id_to_text = {doc['id']: doc['metadata']['text'] for doc in embedded_texts}

# Extract and print the text content of the top matches
for match in query_results['matches']:
    chunk_id = match['id']
    score = match['score']
    try:
        # Prefer the text stored in Pinecone next to the vector.
        text_content = match['metadata']['text']
    except (KeyError, TypeError, AttributeError):
        text_content = id_to_text.get(chunk_id, "Text not found")
    print(f"Chunk ID: {chunk_id}")
    print(f"Score: {score}")
    print(f"Text Content: {text_content}\n")


In [None]:
# Prompt text with two placeholders, {context} and {question}. It instructs
# the model to answer only from the supplied context and to say it does not
# know rather than invent an answer.
prompt_template="""
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [None]:
# Wrap the raw template text in a PromptTemplate that expects the
# "context" and "question" variables.
PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=prompt_template,
)
# Keyword arguments carrying the prompt; presumably handed to a chain
# constructor later -- confirm against the cell that builds the chain.
chain_type_kwargs = dict(prompt=PROMPT)

In [34]:
from dotenv import load_dotenv
from langchain_groq import ChatGroq
import os
# Load variables from a .env file (if present) into the process environment,
# then read the Groq credential from there.
load_dotenv()
groq_api_key = os.environ.get("GROQ_API_KEY")

# Groq-hosted chat model used to generate the answers.
llm = ChatGroq(model_name="Gemma-7b-it", groq_api_key=groq_api_key)