In [None]:
# --- Step 1: Import the libraries required for PDF processing ---
# The `pypdf` package must be installed in the environment; installing it only needs to happen once.

import os
import pandas as pd
from pypdf import PdfReader

# --- Step 2: Define a function to extract text from a PDF ---
def extract_text_from_pdf(pdf_path):
    """
    Opens a PDF file and extracts all text content from it.

    Args:
        pdf_path (str): The full path to the PDF file.

    Returns:
        str | None: The text of every page, each followed by a newline, or
        None if the file could not be opened or parsed (the error is printed).
    """
    try:
        reader = PdfReader(pdf_path)
        # Collect the pages and join once instead of growing a string with `+=`.
        # A page that yields no text (None) is treated as empty so that a single
        # such page does not raise and throw away the whole document.
        page_texts = [(page.extract_text() or "") + "\n" for page in reader.pages]
        return "".join(page_texts)
    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip this file.
        print(f"Error reading {pdf_path}: {e}")
        return None

# --- Step 3: Process all documents in our dataset ---
# IMPORTANT: Make sure your Kaggle dataset is named 'bajaj-policy-docs'
# If you named it something else, change the path here.
docs_path = "/kaggle/input/bajaj-policy-docs/"
processed_docs = []

# Require an actual directory: a plain file at this path would make os.listdir() raise.
if os.path.isdir(docs_path):
    print(f"Found document directory: {docs_path}")
    # os.listdir() returns entries in arbitrary order; sorting keeps the order of
    # the collected documents (and everything derived from it) reproducible.
    for filename in sorted(os.listdir(docs_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(docs_path, filename)
        print(f"Processing document: {filename}...")

        # Extract the text (None means the PDF could not be read; the error was printed)
        document_text = extract_text_from_pdf(file_path)

        # Keep only documents that produced real (non-whitespace) text
        if document_text and document_text.strip():
            processed_docs.append({
                "document_name": filename,
                "text_content": document_text
            })
        elif document_text is not None:
            # Readable PDF without a text layer: say so instead of dropping it silently.
            print(f"Warning: no extractable text found in {filename}; skipping.")
else:
    print(f"Error: Directory not found at {docs_path}")
    print("Please ensure you have uploaded the sample documents as a Kaggle dataset.")

# --- Step 4: Display the results ---
# Create a pandas DataFrame to neatly display our extracted text
docs_df = pd.DataFrame(processed_docs)

if docs_df.empty:
    # Nothing was collected by the processing step
    print("\n⚠️ No documents were processed. Please check the file paths and formats.")
else:
    document_count = len(docs_df)
    print("\n✅ Document processing complete.")
    print(f"Successfully processed {document_count} documents.")

    # Preview the first 500 characters of the first document as a sanity check
    first_document_text = docs_df.iloc[0]['text_content']
    print("\n--- Sample content from the first document ---")
    print(first_document_text[:500])
    print("-------------------------------------------")

In [None]:
# Chunking
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd

# --- Prerequisite: Assumes 'docs_df' from the previous step is in memory ---

if 'docs_df' not in locals() or docs_df.empty:
    print("Error: 'docs_df' not found or is empty. Please run the previous document processing step first.")
else:
    # --- Define the text splitter ---
    # Chunks hold at most 1000 characters (measured with len) and consecutive
    # chunks share 100 characters of overlap.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=100,
        length_function=len,
    )

    # --- Split every document and record each chunk with its source ---
    print("Starting to chunk documents...")
    all_chunks = [
        {
            "document_name": source_name,
            "chunk_id": f"{source_name}_chunk_{position}",
            "chunk_text": piece,
        }
        for source_name, source_text in zip(docs_df['document_name'], docs_df['text_content'])
        for position, piece in enumerate(text_splitter.split_text(source_text))
    ]

    # --- Collect the chunks in a DataFrame and report ---
    chunks_df = pd.DataFrame(all_chunks)

    if chunks_df.empty:
        print("\n⚠️ No chunks were created.")
    else:
        print("\n✅ Document chunking complete.")
        print(f"Created {len(chunks_df)} chunks from {len(docs_df)} documents.")

        # Show the first chunk so the result can be eyeballed
        first_chunk = chunks_df.iloc[0]
        print("\n--- Sample Chunk ---")
        print(f"Source Document: {first_chunk['document_name']}")
        print("--------------------")
        print(first_chunk['chunk_text'])
        print("--------------------")

In [None]:
#Creating Embeddings(Vector Data)
# --- Step 1: Import the library used for creating embeddings ---
# sentence-transformers is a powerful library from HuggingFace (it must be installed in the environment)


from sentence_transformers import SentenceTransformer
import pandas as pd

# --- Prerequisite: Assumes 'chunks_df' from the previous step is in memory ---

if 'chunks_df' not in locals() or chunks_df.empty:
    print("Error: 'chunks_df' not found or is empty. Please run the previous document chunking step first.")
else:
    # --- Load the pre-trained embedding model ---
    # The model is downloaded the first time this runs.
    print("Loading the embedding model...")
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("✅ Model loaded successfully.")

    # --- Embed every chunk in one call ---
    chunk_texts = chunks_df['chunk_text'].to_list()
    print(f"\nCreating embeddings for {len(chunk_texts)} chunks... (This may take a few minutes)")
    embeddings = model.encode(chunk_texts, show_progress_bar=True)

    # Store one embedding per row, next to the chunk it was computed from
    chunks_df['embedding'] = list(embeddings)

    # --- Report ---
    print("\n✅ Embeddings created successfully.")
    print("Sample of the chunks DataFrame with embeddings:")
    print(chunks_df.head())

    # Inspect the first embedding: type, dimensionality and leading values
    sample_embedding = chunks_df['embedding'].iloc[0]
    print("\n--- Sample Embedding ---")
    print(f"Embedding Type: {type(sample_embedding)}")
    print(f"Embedding Length (Dimensions): {len(sample_embedding)}")
    print(f"First 5 values: {sample_embedding[:5]}")
    print("----------------------")

In [None]:
# --- Step 1: Install the required library for the vector database ---
!pip install faiss-cpu

import faiss
import numpy as np
import pandas as pd

# --- Prerequisite: Assumes 'chunks_df' from the previous step is in memory ---

if 'chunks_df' not in locals() or chunks_df.empty or 'embedding' not in chunks_df.columns:
    print("Error: 'chunks_df' with embeddings not found. Please run the previous steps first.")
else:
    # --- Stack the per-row embeddings into one float32 matrix for FAISS ---
    print("Converting embeddings to a FAISS-compatible format...")
    embeddings = np.asarray(chunks_df['embedding'].tolist(), dtype='float32')
    print(f"Embeddings matrix shape: {embeddings.shape}")  # (num_chunks, embedding_dimension)

    # --- Build an exact (exhaustive) L2 index of the matching dimension ---
    d = embeddings.shape[1]
    print(f"Creating a FAISS index with dimension {d}...")
    index = faiss.IndexFlatL2(d)

    # --- Insert every vector; vector i corresponds to row i of chunks_df ---
    print(f"Adding {len(embeddings)} embeddings to the index...")
    index.add(embeddings)
    print(f"✅ FAISS index created successfully. Total vectors in index: {index.ntotal}")

    # --- Persist the index and the id -> text mapping ---
    faiss.write_index(index, "policy_document_index.faiss")

    # The embedding column is dropped from the CSV to keep the file small
    chunk_mapping_df = chunks_df.drop(columns=['embedding'])
    chunk_mapping_df.to_csv("policy_document_chunks.csv", index=False)

    print("\n✅ Knowledge base created and saved successfully.")
    print("Saved files: 'policy_document_index.faiss' and 'policy_document_chunks.csv'")

In [None]:
#Retriever

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# --- Step 1: Load all our saved components ---
print("Loading the knowledge base and embedding model...")

try:
    # Vector index written by the index-building step
    index = faiss.read_index("policy_document_index.faiss")

    # Mapping table: row i holds the text behind vector i of the index
    chunks_df = pd.read_csv("policy_document_chunks.csv")

    # Same embedding model that produced the indexed vectors
    model = SentenceTransformer('all-MiniLM-L6-v2')

    print(
        "✅ Knowledge base and model loaded successfully.",
        f"Index contains {index.ntotal} vectors.",
        sep="\n",
    )

except Exception as load_error:
    # Report and continue; later code checks whether `index` exists before searching.
    print(
        f"Error loading files: {load_error}",
        "Please ensure 'policy_document_index.faiss' and 'policy_document_chunks.csv' are in the correct directory.",
        sep="\n",
    )

# --- Step 2: Define the search function ---
def search_documents(query, top_k=3):
    """
    Performs a semantic search for a given query against the FAISS index.

    Relies on the module-level `index`, `model` and `chunks_df` objects.

    Args:
        query (str): The user's search query.
        top_k (int): The maximum number of results to return.

    Returns:
        pd.DataFrame | None: The most relevant chunks (at most top_k rows) with
        an added 'similarity_score' column, or None if no index is loaded.
    """
    if 'index' not in globals():
        print("Error: FAISS index not loaded.")
        return None

    print(f"\nSearching for: '{query}'...")
    # 1. Create an embedding for the user's query
    query_embedding = model.encode([query], convert_to_tensor=False).astype('float32')

    # 2. Perform the search; both results are 2D arrays with one row per query
    distances, indices = index.search(query_embedding, top_k)

    # 3. Keep only real hits. When the index holds fewer than top_k vectors,
    #    FAISS pads the id row with -1, and iloc[-1] would silently return the
    #    LAST chunk as if it were a match.
    found = indices[0] >= 0
    results_indices = indices[0][found]

    # Get the corresponding chunks from our dataframe
    results_df = chunks_df.iloc[results_indices].copy()
    # NOTE(review): this is `1 - distance` as reported by the index, so it is not
    # bounded to [0, 1] and can be negative; treat it as a ranking score only.
    results_df['similarity_score'] = 1 - distances[0][found]

    return results_df

# --- Step 3: Test our search function ---
if 'index' in globals():
    # Let's test with a sample query from the problem statement
    sample_query = "Is knee surgery covered?"

    search_results = search_documents(sample_query)

    if search_results is not None:
        print("\n--- Top Search Results ---")
        # The row label is deliberately NOT called `index`: this loop runs at
        # cell level, so that name would overwrite the global FAISS `index`
        # and break every later search_documents() call.
        for _row_label, row in search_results.iterrows():
            print(f"Document: {row['document_name']}")
            print(f"Similarity: {row['similarity_score']:.4f}")
            print("---")
            print(row['chunk_text'])
            print("\n--------------------------\n")

In [None]:
import json
import aiohttp
import asyncio

# Progress banner printed when this cell starts executing.
print("\n--- Loading all components for the RAG pipeline ---")

# --- Shared helpers for the LLM calls ---
_GEMINI_GENERATE_URL = (
    "https://generativelanguage.googleapis.com/v1beta/models/"
    "gemini-2.0-flash:generateContent"
)


def _parse_llm_json(response_text):
    """Decode the JSON value contained in an LLM reply, tolerating code fences and surrounding prose."""
    text = response_text.strip()
    try:
        return json.loads(text)
    except ValueError:
        pass  # not bare JSON: fall through and look for an embedded object
    # Take the outermost {...} span. This copes with ```json fences in any case,
    # a missing closing fence, and prose before or after the object, none of
    # which character-set based lstrip()/rstrip() handles reliably.
    start, end = text.find("{"), text.rfind("}")
    if start == -1 or end < start:
        raise ValueError("no JSON object found in the LLM response")
    return json.loads(text[start:end + 1])


async def _generate_json(prompt, session, api_key):
    """Send one prompt to the Gemini generateContent endpoint and return the decoded JSON object.

    Never raises: any failure is reported as a dict with a single "error" key.
    """
    try:
        payload = {"contents": [{"role": "user", "parts": [{"text": prompt}]}]}
        # The key travels in a header rather than in the URL so that it cannot
        # end up in exception messages (which may embed the request URL).
        headers = {"Content-Type": "application/json", "x-goog-api-key": api_key}
        async with session.post(_GEMINI_GENERATE_URL, headers=headers, json=payload) as response:
            result = await response.json()
        if isinstance(result, dict) and result.get('candidates'):
            response_text = result['candidates'][0]['content']['parts'][0]['text']
            parsed = _parse_llm_json(response_text)
            if not isinstance(parsed, dict):
                # Callers use dict methods on the result, so reject other JSON values.
                return {"error": "LLM response was not a JSON object."}
            return parsed
        print("Error: Invalid API response structure.", result)
        return {"error": "Invalid response structure from API."}
    except Exception as e:
        return {"error": f"Failed to get a valid response from the LLM: {e}"}


# --- Function 2a: LLM-Powered Query Parser ---
async def parse_query_with_llm(raw_query, session, api_key):
    """
    Asks the LLM to turn a free-text query into a structured dict.

    Args:
        raw_query (str): The user's query in natural language.
        session: An HTTP client session exposing an aiohttp-style `post()`.
        api_key (str): The Gemini API key.

    Returns:
        dict: The extracted fields, or {"error": ...} if the call or the
        decoding of the reply failed.
    """
    print(f"\nParsing query with LLM: '{raw_query}'...")
    prompt = f"""
    You are an expert data extraction agent for an insurance company.
    Your task is to parse a user's query and extract the key details into a structured JSON object.
    The possible fields to extract are: "age", "gender", "procedure", "location", "policy_duration_months", "core_question".
    If a piece of information is not present in the query, do not include its key in the JSON output.
    Only return the JSON object, with no other text or explanations.

    Query: "{raw_query}"
    Output:
    """
    return await _generate_json(prompt, session, api_key)

# --- Function 2b: Semantic Search Retriever ---
def search_documents(query, index_obj, model_obj, mapping_df, top_k=3):
    """
    Returns the chunks most similar to `query`.

    Args:
        query (str): The search text.
        index_obj: Index exposing `search(vectors, k)` -> (distances, ids).
        model_obj: Embedding model exposing `encode(list_of_texts)`.
        mapping_df (pd.DataFrame): Table whose row i is the chunk behind vector i.
        top_k (int): The maximum number of results to return.

    Returns:
        pd.DataFrame: Matching rows of `mapping_df` (at most top_k) with an
        added 'similarity_score' column (1 - reported distance).
    """
    print(f"\nSearching for: '{query}'...")
    query_embedding = model_obj.encode([query]).astype('float32')
    distances, indices = index_obj.search(query_embedding, top_k)
    # FAISS pads the id row with -1 when fewer than top_k vectors exist; drop
    # those slots, otherwise iloc[-1] would return the last chunk as a fake hit.
    found = indices[0] >= 0
    results_df = mapping_df.iloc[indices[0][found]].copy()
    results_df['similarity_score'] = 1 - distances[0][found]
    return results_df

# --- Function 2c: LLM-Powered Adjudicator (The Judge) ---
async def get_final_decision_with_llm(parsed_query, retrieved_chunks, session, api_key):
    """
    Asks the LLM to decide on the user's situation using only the retrieved clauses.

    Args:
        parsed_query (dict): The structured query (must be JSON-serialisable).
        retrieved_chunks (pd.DataFrame): Rows with 'document_name' and 'chunk_text'.
        session: An HTTP client session exposing an aiohttp-style `post()`.
        api_key (str): The Gemini API key.

    Returns:
        dict: The decision object returned by the LLM, or {"error": ...} if the
        call or the decoding of the reply failed.
    """
    print("\nAdjudicating claim with LLM...")
    # One labelled section per retrieved clause, joined once
    context = "".join(
        f"--- Relevant Clause from {row['document_name']} ---\n{row['chunk_text']}\n-----------------------------------------\n\n"
        for _, row in retrieved_chunks.iterrows()
    )
    prompt = f"""
    You are an expert insurance claims adjudicator. Evaluate the user's situation based on the provided policy clauses and return a structured JSON object with fields: "decision", "amount", "justification", "cited_clauses".

    User's Situation:
    {json.dumps(parsed_query, indent=2)}

    Relevant Policy Clauses:
    {context}
    
    Now, evaluate the user's situation based *only* on the provided clauses and return the final JSON object.
    Do not use any external knowledge. Only return the JSON object, with no other text or explanations.

    Output:
    """
    return await _generate_json(prompt, session, api_key)

# --- Main Pipeline Execution Function ---
async def run_pipeline(user_query="I am a 55 year old male having bike accident and my policy is 4 years old"):
    """
    Runs the full parse -> retrieve -> judge pipeline for one query and prints the decision.

    Args:
        user_query (str): The free-text situation to evaluate. Defaults to the
            original end-to-end test query.

    Returns:
        None. Progress, errors and the final decision are printed.
    """
    # Load the knowledge base components created in the previous cell
    try:
        faiss_index_loaded = faiss.read_index("policy_document_index.faiss")
        chunks_df_map_loaded = pd.read_csv("policy_document_chunks.csv")
        embedding_model_loaded = SentenceTransformer('all-MiniLM-L6-v2')
        print("✅ Knowledge base components loaded successfully for pipeline execution.")
    except Exception as e:
        print(f"Error loading knowledge base files for pipeline: {e}")
        return

    # Retrieve the Gemini API key from Kaggle Secrets. The client is imported
    # here because nothing else in the notebook imports it; outside Kaggle the
    # GEMINI_API_KEY environment variable is used instead.
    try:
        from kaggle_secrets import UserSecretsClient
        api_key = UserSecretsClient().get_secret("GEMINI_API_KEY")
    except ImportError:
        import os
        api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        print("Error: GEMINI_API_KEY is not configured.")
        return

    # Create a single aiohttp session to be reused for all API calls
    async with aiohttp.ClientSession() as session:
        print("\n--- Starting Full End-to-End Test ---")

        # 1. PARSE
        parsed_data = await parse_query_with_llm(user_query, session, api_key)
        if "error" in parsed_data:
            print(parsed_data)
            return

        # 2. RETRIEVE
        # Fall back to the raw query when the parser gave no (or an empty) core question
        core_question = parsed_data.get('core_question') or user_query
        search_results = search_documents(core_question, faiss_index_loaded, embedding_model_loaded, chunks_df_map_loaded)

        # 3. JUDGE
        final_decision = await get_final_decision_with_llm(parsed_data, search_results, session, api_key)

        print("\n\n========================================")
        print("      FINAL DECISION REPORT")
        print("========================================")
        print(json.dumps(final_decision, indent=2))
        print("========================================")

# --- Run the pipeline ---
# Top-level `await` is accepted here only because the notebook kernel (IPython)
# executes cells with an active event loop; in a plain Python script call
# asyncio.run(run_pipeline()) instead.
await run_pipeline()