In [1]:
import getpass
import os

import pinecone
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Pinecone
from tqdm import tqdm

  from tqdm.autonotebook import tqdm


In [2]:
# Extract data from the PDF
def load_pdf(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only files matching the glob
            ``*.pdf`` are picked up, and each one is read with ``PyPDFLoader``.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns for that directory.
    """
    return DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader).load()

# Directory holding the source PDFs. An absolute local path only works on one
# machine, so allow overriding it through the MEDICAL_CHATBOT_DATA_DIR
# environment variable; when the variable is unset the original location is used.
data_dir = os.environ.get("MEDICAL_CHATBOT_DATA_DIR", "D:/Medical_chatbot/data/")
extracted_data = load_pdf(data_dir)
print("Documents extracted:", len(extracted_data))

Documents extracted: 77


In [3]:
# Create text chunks
def text_split(extracted_data):
    """Break the loaded documents into smaller chunks.

    Args:
        extracted_data: The documents to split, passed straight to
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by a splitter configured with
        ``chunk_size=500`` and ``chunk_overlap=20``.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=20)
    return splitter.split_documents(extracted_data)

# Split the loaded documents and print the chunk count as a quick sanity check.
# `text_chunks` is the input to the embedding step further down.
text_chunks = text_split(extracted_data)
print("Length of my chunks:", len(text_chunks))

Length of my chunks: 233


In [4]:
# Download embedding model
def download_hugging_face_embeddings():
    """Build the sentence-embedding model wrapper used throughout the notebook.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured for
        ``sentence-transformers/all-MiniLM-L6-v2``.
        NOTE(review): presumably the weights are fetched on first use if they
        are not cached locally -- confirm against the library's behaviour.
    """
    return HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Instantiate the embedding model once; the same `embeddings` object is reused
# below both to embed the chunks for indexing and to embed incoming queries.
embeddings = download_hugging_face_embeddings()

In [5]:
# Initialize Pinecone
# The API key is a secret and must not live in the notebook source: read it from
# the PINECONE_API_KEY environment variable, and fall back to a hidden prompt
# when the variable is unset or empty.
# NOTE: a key was previously hard-coded in this cell; treat it as leaked and
# rotate it in the Pinecone console.
api_key = os.environ.get("PINECONE_API_KEY") or getpass.getpass("Enter your Pinecone API key: ")
pc = pinecone.Pinecone(api_key=api_key)

index_name = "medical"

# Create the index if it doesn't exist
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        # Must equal the length of the vectors produced by the embedding model
        # (384 for sentence-transformers/all-MiniLM-L6-v2).
        dimension=384,
        metric="cosine",
        spec=pinecone.ServerlessSpec(
            cloud='aws',
            region='us-east-1'
        )
    )

In [6]:
# Connect to the index
# `index` is the handle used further down for upserting vectors and for
# running similarity queries.
index = pc.Index(index_name)

# Generate embeddings for text chunks and prepare for upsert
def generate_embeddings(text_chunks, embeddings):
    """Embed each chunk and wrap it in an upsert-ready record.

    Args:
        text_chunks: Iterable of objects exposing a ``page_content`` attribute.
        embeddings: Object exposing ``embed_query(text)``, called once per chunk.

    Returns:
        A list with one dict per chunk, in input order, holding:
        ``"id"`` (``"chunk_<position>"``), ``"values"`` (the embedding) and
        ``"metadata"`` (``{"text": <chunk text>}``).
    """
    progress = tqdm(text_chunks, desc="Generating embeddings")
    return [
        {
            "id": f"chunk_{position}",
            "values": embeddings.embed_query(chunk.page_content),
            # Keep the raw text next to the vector so query hits can return it.
            "metadata": {"text": chunk.page_content},
        }
        for position, chunk in enumerate(progress)
    ]

# Embed every chunk up front; the resulting records are what gets upserted
# into the Pinecone index below.
embedded_texts = generate_embeddings(text_chunks, embeddings)

Generating embeddings: 100%|██████████| 233/233 [00:51<00:00,  4.55it/s]


In [7]:

# Upsert embeddings to Pinecone index
batch_size = 128  # Define your batch size

# Send the records in windows of `batch_size` so each request stays bounded.
for start in tqdm(range(0, len(embedded_texts), batch_size), desc="Upserting embeddings"):
    batch = embedded_texts[start:start + batch_size]
    # One (id, values, metadata) tuple per record.
    to_upsert = [(item["id"], item["values"], item["metadata"]) for item in batch]
    index.upsert(vectors=to_upsert)

print("Upserted text chunks into Pinecone index successfully.")

Upserting embeddings: 100%|██████████| 2/2 [00:07<00:00,  3.72s/it]

Upserted text chunks into Pinecone index successfully.





In [8]:
# def query_pinecone(query: str, top_k: int, index, embeddings):

#     # Generate the embedding for the question
#     query_vector = embeddings.embed_query(query)
    
#     # Perform the similarity search
#     query_results = index.query(vector=query_vector, top_k=top_k, include_values=True, include_metadata=True)
    
#     # Extract and print the text content of the top matches
#     for match in query_results['matches']:
#         print(f"Score: {match['score']}")
#         print(f"Text Content: {match['metadata'].get('text', 'Text not found')}\n")




In [9]:
# # Example usage
# query = "What is COMMON LLM ELIGIBILITY CRITERIA"
# docs=query_pinecone(query, top_k=3, index=index, embeddings=embeddings)

In [10]:
from groq import Groq
import getpass
import os
from dotenv import load_dotenv

# Copy variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Initialize Groq client
# Prefer GROQ_API_KEY from the environment / .env so the notebook can run
# unattended ("Restart & Run All"); only fall back to the hidden interactive
# prompt when the variable is unset or empty.
groq_api_key = os.environ.get("GROQ_API_KEY") or getpass.getpass("Enter your Groq API key: ")
groq_client = Groq(api_key=groq_api_key)


def query_pinecone(query: str, top_k: int, index, embeddings):
    """Return the stored text of the ``top_k`` chunks most similar to ``query``.

    Args:
        query: Natural-language question to search for.
        top_k: Number of nearest matches to request from the index.
        index: Object exposing ``query(vector=..., top_k=..., include_values=...,
            include_metadata=...)`` whose result has a ``'matches'`` sequence.
        embeddings: Object exposing ``embed_query(text)``.

    Returns:
        list[str]: ``metadata['text']`` of each match, in the order the index
        returned them. A match without a ``'text'`` entry contributes the
        placeholder ``'Text not found'``.

    Side effects:
        Prints the score and text of every match.
    """
    # Generate the embedding for the question
    query_vector = embeddings.embed_query(query)

    # Perform the similarity search. Only the score and the metadata text are
    # consumed below, so the stored vectors are not requested -- this keeps the
    # response small without changing the returned documents.
    query_results = index.query(
        vector=query_vector,
        top_k=top_k,
        include_values=False,
        include_metadata=True,
    )

    # Extract and print the text content of the top matches
    docs = []
    for match in query_results['matches']:
        print(f"Score: {match['score']}")
        text_content = match['metadata'].get('text', 'Text not found')
        print(f"Text Content: {text_content}\n")
        docs.append(text_content)
    return docs

In [11]:

def generate(query: str, docs: list[str]):
    """Answer ``query`` with the chat model, grounded on the retrieved ``docs``.

    Args:
        query: The user's question; sent as the ``user`` message.
        docs: Context passages. They are joined with a ``---`` separator line
            and embedded in the ``system`` message.

    Returns:
        The text content of the first choice in the chat completion response.

    Uses the module-level ``groq_client``.
    """
    # NOTE(review): the prompt scopes the assistant to "questions about AI";
    # confirm that this matches the documents that were indexed.
    preamble = "You are a helpful assistant that answers questions about AI using the context provided below.\n\n"
    context = "\n\n---\n".join(docs)
    system_message = preamble + "CONTEXT:\n" + context + "\n\n"

    completion = groq_client.chat.completions.create(
        model="llama3-70b-8192",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": query},
        ],
    )
    return completion.choices[0].message.content


In [12]:
# Example usage
# Step 1: retrieve the two chunks closest to the question (query_pinecone also
# prints each match's score and text).
query = "What is COMMON LLM ELIGIBILITY CRITERIA"
docs = query_pinecone(query, top_k=2, index=index, embeddings=embeddings)

# Step 2: let the chat model answer the question using those chunks as context.
out = generate(query=query, docs=docs)
print(out)


Score: 0.796412706
Text Content: COMMON LLM ELIGIBILITY CRITERIA  
Candidates are eligible to apply for admission in an LLM course if they have completed their 
graduation in law, that is, if the aspirant has secured his/ her LLB/ Bachelor of Law/ Five -
year integrated LLB course from a recognised university/ college.  
Apart from this, some colleges also fix a minimum percentage requirement for the LLM 
course offered by them. As part of the eligibility criteria shared by such colleges, candidates

Score: 0.616188467
Text Content: LLM EXAMINATION  
LLM is a post -graduate degree of law of two -year course that is pursued after completion of 
law degree to become knowledgeable in any specialisation. Admission is done on basis of 
performance in law entrance exams like CLAT, LSAT, and other university - level 
examinations. A law degree of three or five years with 50 -60% total scores are required.  
LLM admissions are taken through examinations. Plenty of private law schools provide L