In [1]:
# Import the Kaggle-specific library to access our secrets
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()

# Retrieve our GitHub username and token
# Make sure the labels 'GITHUB_USER' and 'GITHUB_TOKEN' match what you created
# in the Kaggle Secrets menu.
GITHUB_USER = user_secrets.get_secret("GITHUB_USER")
GITHUB_TOKEN = user_secrets.get_secret("GITHUB_TOKEN")

# This is the URL to your repository
# Make sure to replace 'YourUsername' with your actual GitHub username
# and 'bajaj-hackrx-llm-agent' with your repository name if it's different.
GIT_REPO_URL = f"https://{GITHUB_USER}:{GITHUB_TOKEN}@github.com/{GITHUB_USER}/bajaj-hackrx-project.git"

# --- Clone the repository ---
# The '!' tells the notebook to run this as a command line command
# We are cloning the repository into a directory with the same name
!git clone {GIT_REPO_URL}

# --- Verify the setup ---
# Change our current directory to be inside our new project folder
%cd bajaj-hackrx-project

print("✅ Setup Complete! Your GitHub repository is now connected.")
print("\nCurrent files in your project:")
!ls -F

Cloning into 'bajaj-hackrx-project'...
remote: Enumerating objects: 12, done.[K
remote: Counting objects: 100% (12/12), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 12 (delta 4), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (12/12), 19.67 KiB | 2.81 MiB/s, done.
Resolving deltas: 100% (4/4), done.
/kaggle/working/bajaj-hackrx-project
✅ Setup Complete! Your GitHub repository is now connected.

Current files in your project:
Bajaj-Hackrx.ipynb  LICENSE  README.md


In [2]:
# --- Step 1: Install the required library for PDF processing ---
# We only need to run this once.

import os
import pandas as pd
from pypdf import PdfReader

# --- Step 2: Define a function to extract text from a PDF ---
def extract_text_from_pdf(pdf_path):
    """
    Opens a PDF file and extracts all text content from it.
    
    Args:
        pdf_path (str): The full path to the PDF file.
        
    Returns:
        str: The concatenated text from all pages of the PDF.
    """
    try:
        reader = PdfReader(pdf_path)
        text = ""
        for page in reader.pages:
            text += page.extract_text() + "\n" # Add a newline between pages
        return text
    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return None

# --- Step 3: Process all documents in our dataset ---
# IMPORTANT: Make sure your Kaggle dataset is named 'bajaj-policy-docs'
# If you named it something else, change the path here.
docs_path = "/kaggle/input/bajaj-policy-docs/"
processed_docs = []

# Check if the directory exists
if os.path.exists(docs_path):
    print(f"Found document directory: {docs_path}")
    # Loop through all files in the directory
    for filename in os.listdir(docs_path):
        if filename.lower().endswith(".pdf"):
            file_path = os.path.join(docs_path, filename)
            print(f"Processing document: {filename}...")
            
            # Extract the text
            document_text = extract_text_from_pdf(file_path)
            
            if document_text:
                processed_docs.append({
                    "document_name": filename,
                    "text_content": document_text
                })
else:
    print(f"Error: Directory not found at {docs_path}")
    print("Please ensure you have uploaded the sample documents as a Kaggle dataset.")

# --- Step 4: Display the results ---
# Create a pandas DataFrame to neatly display our extracted text
docs_df = pd.DataFrame(processed_docs)

if not docs_df.empty:
    print("\n✅ Document processing complete.")
    print(f"Successfully processed {len(docs_df)} documents.")
    
    # Print the first few lines of the first document to verify
    print("\n--- Sample content from the first document ---")
    print(docs_df.iloc[0]['text_content'][:500])
    print("-------------------------------------------")
else:
    print("\n⚠️ No documents were processed. Please check the file paths and formats.")

Found document directory: /kaggle/input/bajaj-policy-docs/
Processing document: BAJHLIP23020V012223.pdf...
Processing document: EDLHLGA23009V012223.pdf...
Processing document: ICIHLIP22012V012223.pdf...
Processing document: CHOTGDP23004V012223.pdf...
Processing document: HDFHLIP23024V072223.pdf...

✅ Document processing complete.
Successfully processed 5 documents.

--- Sample content from the first document ---
    
 
   
 
UIN- BAJHLIP23020V012223                                 Global Health Care/ Policy Wordings/Page 1 
 
 
Bajaj Allianz General Insurance Co. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 
For more details, log on to: www.bajajallianz.com | E-mail: bagichelp@bajajallianz.co.in or 
Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (Toll Free No.) 
Issuing Office: 
 
GLOBAL HEALTH CARE 
 
 
Policy Wordings 
 
UIN- BAJHLIP2
-------------------------------------------


In [3]:
# Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

# --- Prerequisite: Assumes 'docs_df' from the previous step is in memory ---

if 'docs_df' in locals() and not docs_df.empty:
    # --- Step 2: Define our text splitter ---
    # This splitter will try to break text by paragraphs ("\n\n"), then by lines ("\n"),
    # and finally by sentences or words to respect the document's structure.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,  # The maximum size of each chunk (in characters)
        chunk_overlap=100, # The number of characters to overlap between chunks
        length_function=len,
    )

    # --- Step 3: Process all documents and create chunks ---
    all_chunks = []

    print("Starting to chunk documents...")
    # Iterate through each document in our DataFrame
    for index, row in docs_df.iterrows():
        document_name = row['document_name']
        document_text = row['text_content']
        
        # Use the splitter to create chunks from the document's text
        chunks = text_splitter.split_text(document_text)
        
        # Add each chunk to our list with its source document
        for i, chunk_text in enumerate(chunks):
            all_chunks.append({
                "document_name": document_name,
                "chunk_id": f"{document_name}_chunk_{i}",
                "chunk_text": chunk_text
            })
    
    # --- Step 4: Display the results ---
    # Create a new DataFrame from our list of chunks
    chunks_df = pd.DataFrame(all_chunks)

    if not chunks_df.empty:
        print("\n✅ Document chunking complete.")
        print(f"Created {len(chunks_df)} chunks from {len(docs_df)} documents.")
        
        # Print a sample chunk to verify
        print("\n--- Sample Chunk ---")
        print(f"Source Document: {chunks_df.iloc[0]['document_name']}")
        print("--------------------")
        print(chunks_df.iloc[0]['chunk_text'])
        print("--------------------")
    else:
        print("\n⚠️ No chunks were created.")

else:
    print("Error: 'docs_df' not found or is empty. Please run the previous document processing step first.")

Starting to chunk documents...

✅ Document chunking complete.
Created 911 chunks from 5 documents.

--- Sample Chunk ---
Source Document: BAJHLIP23020V012223.pdf
--------------------
UIN- BAJHLIP23020V012223                                 Global Health Care/ Policy Wordings/Page 1 
 
 
Bajaj Allianz General Insurance Co. Ltd.                       
Bajaj Allianz House, Airport Road, Yerawada, Pune - 411 006. Reg. No.: 113 
For more details, log on to: www.bajajallianz.com | E-mail: bagichelp@bajajallianz.co.in or 
Call at: Sales - 1800 209 0144 / Service - 1800 209 5858 (Toll Free No.) 
Issuing Office: 
 
GLOBAL HEALTH CARE 
 
 
Policy Wordings 
 
UIN- BAJHLIP23020V012223 
SECTION A) PREAMBLE 
 
Whereas the Insured described in the Policy Schedule hereto (hereinafter called the ‘Insured’  or “Policyholder” or 
“Insured Person”) has made to Bajaj Allianz General Insurance Company Limited (hereinafter called the “Company” 
or “Insurer” or “Insurance Company”) a proposal or Proposal as m

In [4]:
#Creating Embeddings(Vector Data)
# --- Step 1: Install the required library for creating embeddings ---
# sentence-transformers is a powerful library from HuggingFace


from sentence_transformers import SentenceTransformer
import pandas as pd

# --- Prerequisite: Assumes 'chunks_df' from the previous step is in memory ---

if 'chunks_df' in locals() and not chunks_df.empty:
    # --- Step 2: Load a pre-trained embedding model ---
    # 'all-MiniLM-L6-v2' is a popular, fast, and high-quality model.
    # The first time you run this, it will download the model (a few hundred MB).
    print("Loading the embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Model loaded successfully.")

    # --- Step 3: Create the embeddings for each chunk ---
    # We will process the text in batches for efficiency.
    # Get the list of all chunk texts
    chunk_texts = chunks_df['chunk_text'].tolist()

    print(f"\nCreating embeddings for {len(chunk_texts)} chunks... (This may take a few minutes)")
    # The model.encode() function takes a list of texts and returns a list of embeddings
    embeddings = model.encode(chunk_texts, show_progress_bar=True)
    
    # Add the embeddings to our DataFrame
    chunks_df['embedding'] = list(embeddings)

    # --- Step 4: Display the results ---
    print("\n✅ Embeddings created successfully.")
    print("Sample of the chunks DataFrame with embeddings:")
    
    # Show the first few rows, including the new 'embedding' column
    print(chunks_df.head())
    
    # Let's inspect one of the embeddings
    print("\n--- Sample Embedding ---")
    sample_embedding = chunks_df.iloc[0]['embedding']
    print(f"Embedding Type: {type(sample_embedding)}")
    print(f"Embedding Length (Dimensions): {len(sample_embedding)}")
    print(f"First 5 values: {sample_embedding[:5]}")
    print("----------------------")

else:
    print("Error: 'chunks_df' not found or is empty. Please run the previous document chunking step first.")

2025-07-21 16:09:57.204556: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753114197.437571      13 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753114197.500103      13 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Loading the embedding model...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✅ Model loaded successfully.

Creating embeddings for 911 chunks... (This may take a few minutes)


Batches:   0%|          | 0/29 [00:00<?, ?it/s]


✅ Embeddings created successfully.
Sample of the chunks DataFrame with embeddings:
             document_name                         chunk_id  \
0  BAJHLIP23020V012223.pdf  BAJHLIP23020V012223.pdf_chunk_0   
1  BAJHLIP23020V012223.pdf  BAJHLIP23020V012223.pdf_chunk_1   
2  BAJHLIP23020V012223.pdf  BAJHLIP23020V012223.pdf_chunk_2   
3  BAJHLIP23020V012223.pdf  BAJHLIP23020V012223.pdf_chunk_3   
4  BAJHLIP23020V012223.pdf  BAJHLIP23020V012223.pdf_chunk_4   

                                          chunk_text  \
0  UIN- BAJHLIP23020V012223                      ...   
1  declarations, information/particulars and stat...   
2  permits. If any word starts with Capital alpha...   
3  Indian Medicine/Central Council for Homeopathy...   
4  applicable and having facilities for carrying ...   

                                           embedding  
0  [-0.05861414, 0.046383865, -0.013737356, -0.00...  
1  [-0.046450872, 0.06120499, -0.009025839, -0.02...  
2  [0.012981687, -0.023778314, 0.00

In [5]:
# --- Step 1: Install the required library for the vector database ---
!pip install faiss-cpu

import faiss
import numpy as np
import pandas as pd

# --- Prerequisite: Assumes 'chunks_df' from the previous step is in memory ---

if 'chunks_df' in locals() and not chunks_df.empty and 'embedding' in chunks_df.columns:
    # --- Step 2: Prepare the embeddings for FAISS ---
    # FAISS requires the embeddings to be in a specific format (a 2D numpy array of type float32).
    print("Converting embeddings to a FAISS-compatible format...")
    embeddings = np.array(chunks_df['embedding'].tolist()).astype('float32')
    
    # Check the shape of our embeddings matrix
    print(f"Embeddings matrix shape: {embeddings.shape}") # Should be (num_chunks, embedding_dimension)

    # --- Step 3: Create the FAISS Index ---
    # Get the dimension of our embeddings (e.g., 384 for the 'all-MiniLM-L6-v2' model)
    d = embeddings.shape[1]
    
    # We will use a simple, exact-search index called 'IndexFlatL2'.
    # This index performs an exhaustive search, which is perfect for our scale.
    print(f"Creating a FAISS index with dimension {d}...")
    index = faiss.IndexFlatL2(d)
    
    # --- Step 4: Add the embeddings to the index ---
    print(f"Adding {len(embeddings)} embeddings to the index...")
    index.add(embeddings)
    
    print(f"✅ FAISS index created successfully. Total vectors in index: {index.ntotal}")

    # --- Step 5: Save the index and the chunk data for later use ---
    # We need to save both the index itself and the dataframe that maps an index ID
    # back to its original text content.
    
    # Save the FAISS index
    faiss.write_index(index, "policy_document_index.faiss")
    
    # Save the chunks dataframe (our mapping file)
    # We drop the embedding column before saving to make the file smaller.
    chunks_df.drop(columns=['embedding']).to_csv("policy_document_chunks.csv", index=False)
    
    print("\n✅ Knowledge base created and saved successfully.")
    print("Saved files: 'policy_document_index.faiss' and 'policy_document_chunks.csv'")

else:
    print("Error: 'chunks_df' with embeddings not found. Please run the previous steps first.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.0 kB)
Downloading faiss_cpu-1.11.0.post1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0.post1
Converting embeddings to a FAISS-compatible format...
Embeddings matrix shape: (911, 384)
Creating a FAISS index with dimension 384...
Adding 911 embeddings to the index...
✅ FAISS index created successfully. Total vectors in index: 911

✅ Knowledge base created and saved successfully.
Saved files: 'policy_document_index.faiss' and 'policy_document_chunks.csv'


In [6]:
#Retriever

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# --- Step 1: Load all our saved components ---
print("Loading the knowledge base and embedding model...")

try:
    # Load the FAISS index
    index = faiss.read_index("policy_document_index.faiss")
    
    # Load the chunk data that maps index IDs to text
    chunks_df = pd.read_csv("policy_document_chunks.csv")
    
    # Load the sentence transformer model
    model = SentenceTransformer('all-MiniLM-L6-v2')
    
    print("✅ Knowledge base and model loaded successfully.")
    print(f"Index contains {index.ntotal} vectors.")

except Exception as e:
    print(f"Error loading files: {e}")
    print("Please ensure 'policy_document_index.faiss' and 'policy_document_chunks.csv' are in the correct directory.")

# --- Step 2: Define the search function ---
def search_documents(query, top_k=3):
    """
    Performs a semantic search for a given query against the FAISS index.
    
    Args:
        query (str): The user's search query.
        top_k (int): The number of top results to return.
        
    Returns:
        pd.DataFrame: A DataFrame containing the top_k most relevant chunks.
    """
    if 'index' not in globals():
        print("Error: FAISS index not loaded.")
        return None
        
    print(f"\nSearching for: '{query}'...")
    # 1. Create an embedding for the user's query
    query_embedding = model.encode([query], convert_to_tensor=False).astype('float32')
    
    # 2. Perform the search in our FAISS index
    # The search function returns distances and the indices (IDs) of the top_k results
    distances, indices = index.search(query_embedding, top_k)
    
    # 3. Retrieve the results
    # The 'indices' is a 2D array, so we take the first row
    results_indices = indices[0]
    
    # Get the corresponding chunks from our dataframe
    results_df = chunks_df.iloc[results_indices].copy()
    results_df['similarity_score'] = 1 - distances[0] # Convert L2 distance to a similarity score
    
    return results_df

# --- Step 3: Test our search function ---
if 'index' in globals():
    # Let's test with a sample query from the problem statement
    sample_query = "Is knee surgery covered?"
    
    search_results = search_documents(sample_query)
    
    if search_results is not None:
        print("\n--- Top Search Results ---")
        # Display the results
        for index, row in search_results.iterrows():
            print(f"Document: {row['document_name']}")
            print(f"Similarity: {row['similarity_score']:.4f}")
            print("---")
            print(row['chunk_text'])
            print("\n--------------------------\n")

Loading the knowledge base and embedding model...
✅ Knowledge base and model loaded successfully.
Index contains 911 vectors.

Searching for: 'Is knee surgery covered?'...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


--- Top Search Results ---
Document: HDFHLIP23024V072223.pdf
Similarity: -0.0402
---
fractures (excluding hairline fractures) and dislocations of the 
mandible and extremities. 
16) Dental treatment and surgery of any kind, unless requiring 
Hospitalisation  
17) Any non medical expenses mentioned in List 1 of Annexure I  
18) Treatment rendered by a Medical Practitioner which is outside 
his discipline or the discipline for which he is licensed. 
19) Treatments rendered by a Medical Practitioner who is a 
member of the Insured Person’s family or stays with him, however 
proven material costs are eligible for reimbursement in accordance 
with the applicable cover. 
20) Any treatment or part of a treatment that is not of a 
reasonable charge and not Medically Necessary.  
21) Drugs or treatments which are not supported by a 
prescription. 
22) Any specific time bound or lifetime exclusion(s) applied by Us 
and specified in the Schedule and accepted by the insured. 
23) Admission for ad

In [7]:
import json
import pandas as pd

# --- Prerequisite: This code assumes you are in an environment with access to the Gemini API ---
# In a local setup, you would use libraries like `google-generativeai`.
# In this environment, we will simulate the API call structure.

# --- Step 1: Define the LLM-powered parser function ---
def parse_query_with_llm(raw_query):
    """
    Uses a Large Language Model to parse a raw query and extract key entities into a JSON format.
    
    Args:
        raw_query (str): The user's unstructured query.
        
    Returns:
        dict: A dictionary containing the extracted entities.
    """
    print(f"\nParsing query with LLM: '{raw_query}'...")
    
    # This is the core of the function: a carefully designed prompt.
    # It tells the LLM its role, the task, the desired output format, and provides examples.
    prompt = f"""
    You are an expert data extraction agent for an insurance company.
    Your task is to parse a user's query and extract the key details into a structured JSON object.

    The possible fields to extract are:
    - "age" (integer)
    - "gender" (string, either "male", "female", or "unspecified")
    - "procedure" (string, the medical procedure mentioned)
    - "location" (string, the city or location mentioned)
    - "policy_duration_months" (integer, the age of the policy in months)
    - "core_question" (string, a concise summary of the user's main question for semantic search)

    If a piece of information is not present in the query, do not include its key in the JSON output.

    Here are some examples:

    Query: "46-year-old male, knee surgery in Pune, 3-month-old insurance policy"
    Output:
    {{
      "age": 46,
      "gender": "male",
      "procedure": "knee surgery",
      "location": "Pune",
      "policy_duration_months": 3,
      "core_question": "coverage for knee surgery"
    }}

    Query: "Is maternity care covered in Mumbai for my 6 month old policy?"
    Output:
    {{
      "procedure": "maternity care",
      "location": "Mumbai",
      "policy_duration_months": 6,
      "core_question": "coverage for maternity care"
    }}

    Now, parse the following query. Only return the JSON object, with no other text or explanations.

    Query: "{raw_query}"
    Output:
    """

    # --- This section simulates making a call to the Gemini API ---
    # In a real application, you would replace this with your API call code.
    # For this example, we will simulate the response for the sample query.
    
    # This is a placeholder for the actual API call.
    # response_text = call_gemini_api(prompt) 
    
    # Let's simulate the expected response for our sample query
    if "46M" in raw_query and "knee surgery" in raw_query:
        response_text = """
        {
          "age": 46,
          "gender": "male",
          "procedure": "knee surgery",
          "location": "Pune",
          "policy_duration_months": 3,
          "core_question": "coverage for knee surgery"
        }
        """
    else:
        response_text = '{"error": "Could not parse query. Please provide a more detailed query."}'

    # --- End of simulation ---

    try:
        # Attempt to parse the LLM's response string into a Python dictionary
        structured_response = json.loads(response_text)
        return structured_response
    except json.JSONDecodeError:
        print("Error: The LLM returned an invalid JSON response.")
        return {"error": "Failed to parse LLM output."}

# --- Step 2: Test our parser function ---
sample_query_from_problem = "46M, knee surgery, Pune, 3-month policy"
parsed_data = parse_query_with_llm(sample_query_from_problem)

print("\n--- Parsed Query Data ---")
print(json.dumps(parsed_data, indent=2))
print("-------------------------")

# We can now use the 'core_question' for our semantic search
if 'core_question' in parsed_data:
    core_question = parsed_data['core_question']
    print(f"\nExtracted core question for semantic search: '{core_question}'")


Parsing query with LLM: '46M, knee surgery, Pune, 3-month policy'...

--- Parsed Query Data ---
{
  "age": 46,
  "gender": "male",
  "procedure": "knee surgery",
  "location": "Pune",
  "policy_duration_months": 3,
  "core_question": "coverage for knee surgery"
}
-------------------------

Extracted core question for semantic search: 'coverage for knee surgery'


In [8]:
import json
import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

# ==============================================================================
# PART 1: LOAD ALL KNOWLEDGE BASE COMPONENTS
# ==============================================================================
print("--- Loading Knowledge Base and Models ---")
try:
    # Load the FAISS index into a specific, non-conflicting variable name
    faiss_index = faiss.read_index("policy_document_index.faiss")
    
    # Load the chunk data that maps index IDs to text
    chunks_df_map = pd.read_csv("policy_document_chunks.csv")
    
    # Load the sentence transformer model
    embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    
    print("✅ Knowledge base and models loaded successfully.")
    print(f"Index contains {faiss_index.ntotal} vectors.")
except Exception as e:
    print(f"Error loading files: {e}")
    # Define placeholder variables to prevent subsequent errors
    faiss_index, chunks_df_map, embedding_model = None, None, None

# ==============================================================================
# PART 2: DEFINE CORE FUNCTIONS (PARSER, RETRIEVER, JUDGE)
# ==============================================================================

# --- Function 2a: LLM-Powered Query Parser ---
def parse_query_with_llm(raw_query):
    print(f"\nParsing query with LLM: '{raw_query}'...")
    prompt = f"""
    You are an expert data extraction agent...
    Query: "{raw_query}"
    Output:
    """
    # This section simulates the API call
    if "46M" in raw_query and "knee surgery" in raw_query:
        response_text = """
        {"age": 46, "gender": "male", "procedure": "knee surgery", "location": "Pune", "policy_duration_months": 3, "core_question": "coverage for knee surgery"}
        """
    else:
        response_text = '{"error": "Could not parse query."}'
    
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        return {"error": "Failed to parse LLM output."}

# --- Function 2b: Semantic Search Retriever (Corrected) ---
# This function now explicitly accepts the objects it needs to work.
def search_documents(query, index_obj, model_obj, mapping_df, top_k=3):
    if index_obj is None:
        print("Error: FAISS index not loaded.")
        return None
    print(f"\nSearching for: '{query}'...")
    query_embedding = model_obj.encode([query]).astype('float32')
    # This will now work because we are using the passed-in 'index_obj'
    distances, indices = index_obj.search(query_embedding, top_k)
    results_df = mapping_df.iloc[indices[0]].copy()
    results_df['similarity_score'] = 1 - distances[0]
    return results_df

# --- Function 2c: LLM-Powered Adjudicator (The Judge) ---
def get_final_decision_with_llm(parsed_query, retrieved_chunks):
    print("\nAdjudicating claim with LLM...")
    context = ""
    # Using '_' is a safe convention for a loop variable you don't need.
    for _, row in retrieved_chunks.iterrows():
        context += f"--- Relevant Clause from {row['document_name']} ---\n{row['chunk_text']}\n-----------------------------------------\n\n"
    prompt = f"""
    You are an expert insurance claims adjudicator...
    User's Situation:
    {json.dumps(parsed_query, indent=2)}
    Relevant Policy Clauses:
    {context}
    Output:
    """
    # This section simulates the API call
    response_text = """
    {"decision": "Approved", "amount": 50000.0, "justification": "The user's request for knee surgery is approved...", "cited_clauses": "Section 4.2..."}
    """
    try:
        return json.loads(response_text)
    except json.JSONDecodeError:
        return {"error": "Failed to parse LLM output."}

# ==============================================================================
# PART 3: EXECUTE THE FULL END-TO-END PIPELINE
# ==============================================================================
if faiss_index is not None:
    print("\n--- Starting Full End-to-End Test ---")
    user_query = "46M, knee surgery, Pune, 3-month policy"

    # 1. PARSE the user's query
    parsed_data = parse_query_with_llm(user_query)

    if "error" not in parsed_data:
        # 2. RETRIEVE relevant documents by passing the required objects as arguments
        core_question = parsed_data.get('core_question', user_query)
        # We now pass our loaded objects directly to the function
        search_results = search_documents(core_question, faiss_index, embedding_model, chunks_df_map)
        
        if search_results is not None:
            # 3. JUDGE the case using the parsed data and retrieved chunks
            final_decision = get_final_decision_with_llm(parsed_data, search_results)
            
            print("\n\n========================================")
            print("      FINAL DECISION REPORT")
            print("========================================")
            print(json.dumps(final_decision, indent=2))
            print("========================================")
else:
    print("\nPipeline execution skipped due to errors in loading the knowledge base.")

--- Loading Knowledge Base and Models ---
✅ Knowledge base and models loaded successfully.
Index contains 911 vectors.

--- Starting Full End-to-End Test ---

Parsing query with LLM: '46M, knee surgery, Pune, 3-month policy'...

Searching for: 'coverage for knee surgery'...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]


Adjudicating claim with LLM...


      FINAL DECISION REPORT
{
  "decision": "Approved",
  "amount": 50000.0,
  "justification": "The user's request for knee surgery is approved...",
  "cited_clauses": "Section 4.2..."
}
