In [1]:
import os
import PyPDF2
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain_openai import ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

In [2]:

os.environ["GOOGLE_API_KEY"] = "AIzaSyCdv_P2R4sjaVy_Cd5fQ3GROjYzZJfD4FI"



In [None]:
#do not run again if trained embeddings are downloaded 
# -------------------- LAYER 1: Compute and Save --------------------
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import PyPDF2

# Load embedding model
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Read and extract text from PDF
pdf_path = "7.%20Brochure%20-NMP-merged.pdf"
reader = PyPDF2.PdfReader(pdf_path)
all_text = ""
for page in reader.pages:
    all_text += page.extract_text() + "\n"

# Split text into chunks
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", ".", " ", ""]
)
docs = splitter.create_documents([all_text])

# Create and save FAISS vector store
db = FAISS.from_documents(docs, embedding)
db.save_local("faiss_layer1_db")
print("✅ Layer 1 vector store saved to 'faiss_layer1_db'.")


✅ Layer 1 vector store saved to 'faiss_layer1_db'.


In [3]:
# -------------------- LAYER 1: Load and Create Hybrid Retriever --------------------
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever

# Load embedding model
embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Load saved FAISS index (Layer 1)
db = FAISS.load_local("faiss_layer1_db", embedding, allow_dangerous_deserialization=True)
print("✅ Layer 1 vector store loaded.")

# Create FAISS retriever
faiss_retriever = db.as_retriever(search_type="mmr", search_kwargs={
    "k": 6,            # number of final results
    "fetch_k": 20,     # candidate pool before reranking
    "lambda_mult": 0.7 # balance between relevance (1.0) and diversity (0.0)
})

# Create BM25 retriever using stored documents
bm25_retriever = BM25Retriever.from_documents(list(db.docstore._dict.values()))
bm25_retriever.k = 6

# Create hybrid ensemble retriever combining FAISS + BM25
layer1_hybrid_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, faiss_retriever],
    weights=[0.5, 0.5]
)

print("✅ Layer 1 Hybrid Retriever initialized.")


  embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
  from .autonotebook import tqdm as notebook_tqdm


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'


✅ Layer 1 vector store loaded.
✅ Layer 1 Hybrid Retriever initialized.


In [4]:
# This will be our Layer 2 vector store
policy_db = None
try:
    # Load the new policy document (Now Layer 2)
    policy_pdf_path = "testfile.pdf"  # Make sure this file is uploaded
    policy_reader = PyPDF2.PdfReader(policy_pdf_path)
    policy_text = ""
    for page in policy_reader.pages:
        page_text = page.extract_text()
        if page_text:
            policy_text += page_text + "\n"

    # Split the new document into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100,separators=["\n\n", "\n", ".", " ", ""])
    print("✅ Successfully created the text splitter for 'policy.pdf'.")
    policy_docs = text_splitter.create_documents([policy_text])

    # Create the FAISS vector store for Layer 2
    policy_db = FAISS.from_documents(policy_docs, embedding)
    print("✅ Successfully created the Layer 2 vector store for 'policy.pdf'.")

except FileNotFoundError:
    print("❌ Error: 'policy.pdf' not found. Please upload the file and try again.")
except Exception as e:
    print(f"❌ An error occurred while processing 'policy.pdf': {e}")

✅ Successfully created the text splitter for 'policy.pdf'.


  return forward_call(*args, **kwargs)


✅ Successfully created the Layer 2 vector store for 'policy.pdf'.


In [5]:
if policy_db:
    # Initialize the Language Model
    from langchain_google_genai import ChatGoogleGenerativeAI

    llm = ChatGoogleGenerativeAI(
        model="models/gemini-1.5-flash",  # ✅ use full model path
        temperature=0.2
    )

    # Create a simple retriever for the new document (Layer 2)
    layer2_policy_retriever = policy_db.as_retriever(search_type="mmr", search_kwargs={"k": 5})

    print("✅ LLM and Layer 2 retriever are initialized.")
else:
    print("Skipping initialization because the 'policy.pdf' vector store was not created.")

✅ LLM and Layer 2 retriever are initialized.


In [6]:
if policy_db:
    # The refinement chain now uses context from the main database (Layer 1)
    refine_query_template = """
    Based on the initial context from a general insurance database, refine the original question to be more specific.
    This refined question will be used to find precise details in a new, specific policy document.
    Only output the new, refined question.

    Original Question: {question}

    Initial Context from General Database:
    ---
    {context}
    ---

    Refined Question for the specific policy document:
    """
    refine_prompt = PromptTemplate.from_template(refine_query_template)

    # This function is needed because the hybrid retriever returns a list of docs
    def format_docs(docs):
        return "\n\n".join(doc.page_content for doc in docs)

    # The chain now uses the Layer 1 hybrid retriever for context
    refine_query_chain = (
        {"context": layer1_hybrid_retriever | format_docs, "question": RunnablePassthrough()}
        | refine_prompt
        | llm
        | StrOutputParser()
    )
    print("✅ Query refinement chain created (using Layer 1 context).")

✅ Query refinement chain created (using Layer 1 context).


In [7]:
if policy_db:
    final_prompt_template = """
    You are an expert insurance assistant. Answer the user's question based ONLY on the final context provided from the specific policy document.
    Be concise and clear. If the context is insufficient, state that the information is not available in the provided document.

    Final Context:
    ---
    {context}
    ---

    Question: {question}

    Answer:
    """
    final_prompt = PromptTemplate.from_template(final_prompt_template)

    # The main chain now uses the refined query to retrieve from the Layer 2 policy retriever
    final_rag_chain = (
        {
            "context": refine_query_chain | layer2_policy_retriever | format_docs,
            "question": refine_query_chain # Pass the refined question to the final prompt
        }
        | final_prompt
        | llm
        | StrOutputParser()
    )
    print("✅ Final answering chain created (using Layer 2 context).")

✅ Final answering chain created (using Layer 2 context).


In [12]:
if policy_db:
    # Your initial question goes here
    original_question = "What are the expenses covered under AYUSH treatment in the Arogya Sanjeevani Policy - National?,What is the waiting period for coverage of joint replacement surgery under the Arogya Sanjeevani Policy?,What are the co-payment terms under this policy for insured persons above and below 75 years of age?,4. What documents are required to file a reimbursement claim under the Arogya Sanjeevani Policy?, 5. What are the exclusions applicable under the policy related to cosmetic surgery or weight control treatment?"


    print("\n--- 🚀 Executing Reversed RAG Pipeline ---")
    print(f"\nOriginal Question: {original_question}")

    # Invoke the full chain to get the final answer.
    final_answer = final_rag_chain.invoke(original_question)

    print("\n--- ✅ Final Answer ---")
    print(final_answer)
else:
    print("\nSkipping RAG execution because 'policy.pdf' could not be processed.")


--- 🚀 Executing Reversed RAG Pipeline ---

Original Question: What are the expenses covered under AYUSH treatment in the Arogya Sanjeevani Policy - National?,What is the waiting period for coverage of joint replacement surgery under the Arogya Sanjeevani Policy?,What are the co-payment terms under this policy for insured persons above and below 75 years of age?,4. What documents are required to file a reimbursement claim under the Arogya Sanjeevani Policy?, 5. What are the exclusions applicable under the policy related to cosmetic surgery or weight control treatment?


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)



--- ✅ Final Answer ---
AYUSH treatments (Ayurveda, Unani, Siddha, and Homeopathy) are covered up to the sum insured during each policy year.  Specific coverage limits for each system are not detailed.  The reimbursement process for inpatient AYUSH care is not specified beyond this.
