### Import Libraries & Set API Key

In [3]:

import os
import fitz  # PyMuPDF
import numpy as np
from groq import Groq
from dotenv import load_dotenv

# Pull any variables declared in a local .env file into the process environment.
load_dotenv()

# The Groq key is read from the environment only; it is never hardcoded here.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY")

# Fail fast when the key is absent or empty, before any API call is attempted.
if GROQ_API_KEY is None or GROQ_API_KEY == "":
    raise ValueError("Missing GROQ_API_KEY in .env file or environment variables.")

# Shared Groq client used by the chat helpers defined further down.
client = Groq(api_key=GROQ_API_KEY)

### Extract Text from PDF

In [4]:
def extract_text_from_pdf(pdf_path):
    """
    Reads a PDF and returns the text of all its pages concatenated in page order.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The text of every page joined together with no separator.
    """
    # Open the document as a context manager so the file handle is released
    # even if text extraction fails part-way through.
    with fitz.open(pdf_path) as doc:
        # join() avoids building a new intermediate string for every page.
        return "".join(page.get_text("text") for page in doc)

### Chunk the Extracted Text

In [5]:
def chunk_text(text, chunk_size=1000, overlap=200):
    """
    Splits text into fixed-size character windows that overlap their neighbours.

    Args:
        text (str): Text to split.
        chunk_size (int): Maximum number of characters per chunk. Must be > 0.
        overlap (int): Number of characters shared by consecutive chunks.
            Must satisfy 0 <= overlap < chunk_size.

    Returns:
        List[str]: Chunks in document order. The last chunk may be shorter
        than chunk_size. Empty text yields an empty list.

    Raises:
        ValueError: If chunk_size or overlap would make the window fail to
            advance (previously this either raised an opaque range() error
            or silently returned no chunks at all).
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap < 0 or overlap >= chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    # Each window starts `step` characters after the previous one.
    step = chunk_size - overlap
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

### Simple Vector Store

In [6]:
class SimpleVectorStore:
    """Minimal in-memory vector store that ranks stored texts by cosine similarity."""

    def __init__(self):
        # Parallel lists: vectors[i] is the embedding stored for texts[i].
        self.vectors = []
        self.texts = []

    def add_item(self, text, embedding):
        """
        Stores a text together with its embedding.

        Args:
            text (str): The text to store.
            embedding: Array-like embedding; converted with np.array.
        """
        self.texts.append(text)
        self.vectors.append(np.array(embedding))

    def similarity_search(self, query_embedding, k=5):
        """
        Returns the stored texts most similar to the query embedding.

        Args:
            query_embedding: Array-like query vector.
            k (int): Maximum number of texts to return.

        Returns:
            List[str]: Up to k texts ordered by descending cosine similarity.
            Ties keep insertion order. Empty when the store is empty or k <= 0.
        """
        if not self.vectors or k <= 0:
            return []

        query_vec = np.array(query_embedding)
        # The query norm is the same for every stored vector, so compute it once.
        query_norm = np.linalg.norm(query_vec)

        scored = []
        for idx, vec in enumerate(self.vectors):
            denom = query_norm * np.linalg.norm(vec)
            # A zero-length vector has no direction; score it 0.0 instead of
            # dividing by zero, which would yield NaN and corrupt the sort.
            score = float(np.dot(query_vec, vec) / denom) if denom > 0 else 0.0
            scored.append((idx, score))

        # list.sort is stable (also with reverse=True), so ties keep insertion order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return [self.texts[idx] for idx, _ in scored[:k]]

### Generate Embeddings

In [8]:
def get_embeddings(text_list):
    """
    Embeds a list of text chunks in batches via the module-level `embedding_model`.

    `embedding_model` is not created in this cell; it must already exist in
    the kernel when this function is called.
    NOTE(review): the keyword arguments look like the sentence-transformers
    `encode` API -- confirm against the cell that builds `embedding_model`.

    Args:
        text_list (List[str]): A list of text chunks.

    Returns:
        Whatever `embedding_model.encode` returns for the given list; callers
        in this notebook iterate over it and index it, one entry per input
        text (presumably a NumPy array because of convert_to_numpy=True --
        TODO confirm).
    """
    encode_options = {
        "convert_to_numpy": True,
        "batch_size": 16,
        "show_progress_bar": True,
    }
    vectors = embedding_model.encode(text_list, **encode_options)
    return vectors


### Groq Chat Wrapper

In [9]:
def groq_chat(system_prompt, user_prompt, model="llama3-70b-8192", temperature=0.3):
    """
    Sends a single system + user exchange to Groq and returns the reply text.

    Args:
        system_prompt (str): Instructions placed in the system message.
        user_prompt (str): Content of the user message.
        model (str): Groq model to use.
        temperature (float): Sampling temperature passed to the API.

    Returns:
        str: The first choice's message content with surrounding whitespace
        removed, or an empty string when the API returns no content.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
    )
    # The message content can be None; normalise that to "" so callers always
    # receive a string instead of hitting AttributeError on .strip().
    content = response.choices[0].message.content
    return (content or "").strip()


### Document Processing with Batch Embeddings

In [10]:
def process_document(pdf_path, chunk_size=1000, chunk_overlap=200):
    """
    Runs the ingestion pipeline: PDF text -> chunks -> embeddings -> vector store.

    Progress is reported with print() at every stage.

    Args:
        pdf_path (str): Path to the PDF file.
        chunk_size (int): Number of characters per chunk.
        chunk_overlap (int): Overlap between chunks.

    Returns:
        Tuple[List[str], SimpleVectorStore]: The chunk list and a store
        holding each chunk paired with its embedding.
    """
    print("📄 Extracting text from PDF...")
    raw_text = extract_text_from_pdf(pdf_path)

    print("✂️ Chunking text...")
    pieces = chunk_text(raw_text, chunk_size, chunk_overlap)
    print(f"✅ Total Chunks: {len(pieces)}")

    print("🔗 Generating embeddings (batched)...")
    # All chunks are embedded in a single batched call.
    piece_vectors = get_embeddings(pieces)

    print("📦 Storing embeddings in vector store...")
    index = SimpleVectorStore()
    # Pair every chunk with the embedding at the same position.
    for pair in zip(pieces, piece_vectors):
        index.add_item(*pair)

    stored_count = len(index.texts)
    print(f"✅ Added {stored_count} chunks to vector store")
    return pieces, index


### Factual Retrieval Strategy

In [11]:
def factual_retrieval_strategy(query, vector_store, k=4):
    """
    Uses Groq to enhance a factual query, embeds the enhanced query,
    and retrieves the most similar document chunks.

    Args:
        query (str): Original user query.
        vector_store (SimpleVectorStore): Chunk store with embeddings.
        k (int): Number of results to request from the store.

    Returns:
        List[str]: The chunks returned by vector_store.similarity_search
        for the enhanced query (at most k).
    """
    print(f"\n🔍 Executing Factual Retrieval for: \"{query}\"")

    # Enhance query using Groq
    system_prompt = """You are an expert at refining factual search queries.
Make the user's question more specific, clear, and optimized for information retrieval.
Do NOT add opinions or unrelated info. Return only the enhanced query."""

    # If the model returns nothing, search with the user's original wording
    # rather than embedding an empty string.
    enhanced_query = groq_chat(system_prompt, query) or query
    print(f"✅ Enhanced Query: {enhanced_query}")

    # Embed through the batch helper defined in this notebook. The previous
    # code called `get_embedding`, which no cell in this notebook defines.
    query_embedding = get_embeddings([enhanced_query])[0]

    # Perform similarity search
    top_chunks = vector_store.similarity_search(query_embedding, k=k)

    print(f"📄 Retrieved Top {k} Relevant Chunks.")
    return top_chunks


### Analytical Retrieval Strategy

In [12]:
def analytical_retrieval_strategy(query, vector_store, k=4):
    """
    Retrieves diverse results by asking Groq for sub-questions and searching
    the store once per sub-question.

    Args:
        query (str): Original analytical question.
        vector_store (SimpleVectorStore): Vector store with document chunks.
        k (int): Number of final documents to return.

    Returns:
        List[str]: Up to k unique chunks, in the order they were found
        (sub-question hits first, then hits for the main query if fewer
        than k unique chunks were collected).
    """
    print(f"\n📊 Executing Analytical Retrieval for: \"{query}\"")

    # Step 1: Generate sub-questions using Groq
    system_prompt = """You are an expert at breaking down complex analytical queries.
Generate exactly 3 sub-questions that explore different dimensions of the main query.
Return each sub-question on a new line without numbering or explanation."""

    response = groq_chat(system_prompt, query)
    sub_questions = [q.strip() for q in response.split("\n") if q.strip()]

    print("✅ Sub-questions generated:")
    for i, sq in enumerate(sub_questions, 1):
        print(f"  {i}. {sq}")

    # Step 2: Embed all sub-questions in one batch, then search per sub-question.
    # The batch helper `get_embeddings` is used because `get_embedding`
    # (singular) is not defined by any cell in this notebook.
    results = []
    seen_texts = set()

    sub_embeddings = get_embeddings(sub_questions) if sub_questions else []
    for sub_embedding in sub_embeddings:
        for chunk in vector_store.similarity_search(sub_embedding, k=2):
            if chunk not in seen_texts:
                seen_texts.add(chunk)
                results.append(chunk)

    # Step 3: Fallback if less than k results
    if len(results) < k:
        print("⚠️ Not enough unique chunks from sub-questions. Fetching directly from main query.")
        main_embedding = get_embeddings([query])[0]
        for chunk in vector_store.similarity_search(main_embedding, k=k):
            if chunk not in seen_texts and len(results) < k:
                seen_texts.add(chunk)
                results.append(chunk)

    # Truncate before logging so the reported count matches what is returned.
    results = results[:k]
    print(f"📚 Returning {len(results)} analytical chunks.")
    return results


### Opinion Retrieval Strategy

In [13]:
def opinion_retrieval_strategy(query, vector_store, k=4):
    """
    Retrieves chunks for several perspectives on an opinion-based query.

    Groq proposes perspective angles; each is appended to the query,
    embedded, and searched separately so the results are not dominated by
    a single viewpoint.

    Args:
        query (str): Original opinion-based query.
        vector_store (SimpleVectorStore): Vector store with document chunks.
        k (int): Number of final documents to return.

    Returns:
        List[str]: Up to k unique chunks, in the order they were found
        (perspective hits first, then hits for the plain query if fewer
        than k unique chunks were collected).
    """
    print(f"\n🗣️ Executing Opinion Retrieval for: \"{query}\"")

    # Step 1: Generate perspective angles using Groq
    system_prompt = """You are an expert at identifying diverse viewpoints.
For the given opinion-based query, generate exactly 3 different perspective angles 
people might have. Each one should reflect a distinct viewpoint.

Return one viewpoint per line with NO explanation or numbering."""

    response = groq_chat(system_prompt, query)
    perspectives = [p.strip() for p in response.split("\n") if p.strip()]

    print("✅ Identified perspectives:")
    for i, p in enumerate(perspectives, 1):
        print(f"  {i}. {p}")

    # Step 2: Combine the query with each perspective, embed them in one
    # batch, and search per perspective. The batch helper `get_embeddings`
    # is used because `get_embedding` (singular) is not defined by any cell
    # in this notebook.
    results = []
    seen_texts = set()

    perspective_queries = [f"{query} | {perspective}" for perspective in perspectives]
    perspective_embeddings = get_embeddings(perspective_queries) if perspective_queries else []
    for emb in perspective_embeddings:
        for chunk in vector_store.similarity_search(emb, k=2):
            if chunk not in seen_texts:
                seen_texts.add(chunk)
                results.append(chunk)

    # Step 3: Fallback to generic similarity search if needed
    if len(results) < k:
        print("⚠️ Not enough unique perspective chunks. Adding fallback...")
        base_emb = get_embeddings([query])[0]
        for chunk in vector_store.similarity_search(base_emb, k=k):
            if chunk not in seen_texts and len(results) < k:
                seen_texts.add(chunk)
                results.append(chunk)

    # Truncate before logging so the reported count matches what is returned.
    results = results[:k]
    print(f"📚 Returning {len(results)} opinion chunks.")
    return results


### Dispatcher

In [14]:
def adaptive_retrieval(query, vector_store, k=4, user_context=None):
    """
    Classifies the query with Groq and routes it to the matching retrieval strategy.

    The model's label is normalised before routing (first line only,
    surrounding quotes/markdown/punctuation removed, case-insensitive), so
    replies such as "factual", "Factual." or "**Analytical**" still reach the
    intended strategy.

    Args:
        query (str): The user's input query.
        vector_store (SimpleVectorStore): Precomputed document chunks and embeddings.
        k (int): Number of results to retrieve.
        user_context (str, optional): Accepted for interface compatibility;
            not used yet because no contextual strategy is implemented.

    Returns:
        Tuple[str, List[str]]: The query type and the retrieved chunks. The
        type is the canonical name ("Factual", "Analytical", "Opinion",
        "Contextual") when recognised; otherwise the model's raw reply.
    """
    print(f"\n🧠 Classifying query: \"{query}\"")

    # Step 1: Classify query type via Groq
    system_prompt = """You are an expert at query classification.
Classify the input into one of the following categories:
- Factual
- Analytical
- Opinion
- Contextual

Return ONLY the category name with no explanation."""

    raw_type = groq_chat(system_prompt, query) or ""

    # Normalise the label: keep the first line, drop decoration, fix the case.
    stripped = raw_type.strip()
    first_line = stripped.splitlines()[0] if stripped else ""
    label = first_line.strip(" \t\"'`*.:!-").capitalize()

    strategies = {
        "Factual": factual_retrieval_strategy,
        "Analytical": analytical_retrieval_strategy,
        "Opinion": opinion_retrieval_strategy,
    }

    # Step 2: Route to the correct strategy
    if label in strategies:
        query_type = label
        print(f"✅ Query classified as: {query_type}")
        results = strategies[label](query, vector_store, k)
    elif label == "Contextual":
        query_type = label
        print(f"✅ Query classified as: {query_type}")
        print("⚠️ Contextual strategy not yet implemented. Falling back to factual.")
        results = factual_retrieval_strategy(query, vector_store, k)
    else:
        # Unrecognised label: hand the raw reply back unchanged, as before.
        query_type = raw_type
        print(f"✅ Query classified as: {query_type}")
        print("⚠️ Unknown type. Defaulting to factual.")
        results = factual_retrieval_strategy(query, vector_store, k)

    return query_type, results


### Response Generation using Groq

In [15]:
def generate_response(query, retrieved_chunks, query_type="Factual", model="llama3-70b-8192"):
    """
    Generates a final response based on the query, retrieved documents, and query type.

    Args:
        query (str): The original user query.
        retrieved_chunks (List[str]): Top-k relevant document chunks.
        query_type (str): Type of the query (Factual, Analytical, Opinion,
            Contextual). Matched case-insensitively; unrecognised values use
            the Factual instructions.
        model (str): Groq model to use.

    Returns:
        str: Final natural language response.
    """
    # Combine the top chunks into a single context string
    context = "\n\n---\n\n".join(retrieved_chunks)

    # Use different instructions based on query type
    system_prompts = {
        "Factual": """You are a helpful assistant. Provide accurate, clear, and precise answers.
Use only the context provided below. If the answer isn't in the context, say so.""",
        "Analytical": """You are a helpful assistant. Provide a detailed, well-structured analysis of the topic
using the provided context. Consider multiple dimensions where appropriate.""",
        "Opinion": """You are a helpful assistant. Summarize multiple viewpoints on the topic using the context.
Present diverse opinions neutrally and fairly.""",
        "Contextual": """You are a helpful assistant. Answer the question based on both the query and the user's context
as reflected in the provided information.""",
    }

    # Normalise the label so "analytical" or " Opinion " still select the
    # intended instructions instead of silently falling back to Factual.
    prompt_key = str(query_type or "").strip().capitalize()
    system_prompt = system_prompts.get(prompt_key, system_prompts["Factual"])

    # User message combining context and query
    user_prompt = f"""Context:\n{context}\n\nQuestion: {query}\n\nAnswer:"""

    # Reuse the shared chat wrapper rather than duplicating the API call.
    return groq_chat(system_prompt, user_prompt, model=model)


In [23]:
# Step 1: ingest the PDF (extract text, chunk, embed, index).
# Point pdf_path at the document you want to query.
pdf_path = "A Comprehensive Analysis of Liver Disease Detection Using Advanced Machine Learning Algorithms.pdf"
chunks, vector_store = process_document(pdf_path)

# Step 2: the question to ask about the document (edit as needed).
query = "What is the file about?"

# Step 3: classify the question and retrieve chunks with the matching strategy.
query_type, top_chunks = adaptive_retrieval(query, vector_store, k=4)

# Step 4: have Groq write the answer from the retrieved chunks.
final_answer = generate_response(query, top_chunks, query_type=query_type)

# Step 5: show the detected query type followed by the answer.
print(
    f"\n🧠 Query Type: {query_type}",
    f"\n💬 Final Answer:\n{final_answer}",
    sep="\n",
)


📄 Extracting text from PDF...
✂️ Chunking text...
✅ Total Chunks: 37
🔗 Generating embeddings (batched)...


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

📦 Storing embeddings in vector store...
✅ Added 37 chunks to vector store

🧠 Classifying query: "What is the file about?"
✅ Query classified as: Contextual
⚠️ Contextual strategy not yet implemented. Falling back to factual.

🔍 Executing Factual Retrieval for: "What is the file about?"
✅ Enhanced Query: Which specific file are you referring to? Please provide more context or details. Here's a refined query:

"What is the content or purpose of [file name/file type/file location]?"
📄 Retrieved Top 4 Relevant Chunks.

🧠 Query Type: Contextual

💬 Final Answer:
The file appears to be a research paper or article about liver disease prediction and classification using machine learning techniques. It discusses the use of different classification algorithms, such as Decision Tree, Random Forest, k-Nearest Neighbors, and Support Vector Machine, to improve the accuracy of liver disease prediction models.
