In [2]:
!pip install -q pypdf faiss-cpu sentence-transformers transformers


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m329.6/329.6 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m71.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [15]:
from google.colab import files

uploaded = files.upload()
pdf_path = list(uploaded.keys())[0]
print("Uploaded file:", pdf_path)


Saving hdfc maxima.pdf to hdfc maxima.pdf
Uploaded file: hdfc maxima.pdf


In [28]:
from pypdf import PdfReader

def load_pdf(path):
    reader = PdfReader(path)
    text = ""
    for page in reader.pages:
        if page.extract_text():
            text += page.extract_text()
    return text

policy_text = load_pdf(pdf_path)
print("Total characters:", len(policy_text))


Total characters: 24401


In [29]:
def split_text(text, chunk_size=500, overlap=40):
    chunks = []
    start = 0
    text_length = len(text)

    while start < text_length:
        end = start + chunk_size
        chunk = text[start:end]
        chunks.append(chunk)
        start += chunk_size - overlap

    return chunks

chunks = split_text(policy_text)

print("Total chunks:", len(chunks))
print("Sample chunk:\n", chunks[0][:300])


Total chunks: 54
Sample chunk:
 1
HDFC ERGO General Insurance Company Limited. IRDAI Reg. No.146. CIN: U66030MH2007PLC177117. Registered & Corporate Office: 
1st Floor, HDFC House, 165-166 Backbay Reclamation, H. T. Parekh Marg, Churchgate, Mumbai – 400 020. Trade Logo displayed above 
belongs to HDFC Bank Ltd and ERGO Internation


In [30]:
from sentence_transformers import SentenceTransformer

embedder = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = embedder.encode(chunks)

print("Embedding shape:", embeddings.shape)


Embedding shape: (54, 384)


In [31]:
import faiss
import numpy as np

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

print("Vectors stored in FAISS:", index.ntotal)


Vectors stored in FAISS: 54


In [32]:
def retrieve_chunks(question, k=3):
    q_embedding = embedder.encode([question])
    distances, indices = index.search(q_embedding, k)
    return [chunks[i] for i in indices[0]]


In [33]:
from transformers import pipeline

qa_model = pipeline(
    "text2text-generation",
    model="google/flan-t5-base"
)


Device set to use cpu


In [34]:
def answer_question(question):
    retrieved_text = retrieve_chunks(question)

    context = "\n\n".join(retrieved_text)

    prompt = f"""
You are an insurance policy assistant. Answer the user’s question strictly and only using the information provided in the supplied policy documents.
Do not make assumptions. Do not use external knowledge or general insurance expertise.
Do not infer or extrapolate beyond what is explicitly stated in the documents.

If the requested information is not explicitly found in the provided policy context, respond exactly with:
“The provided insurance documents do not contain this information.”

Context:
{context}

Question:
{question}
"""

    result = qa_model(prompt, max_length=200)
    return result[0]["generated_text"]


In [40]:
question = " Newborn baby cover?"
answer = answer_question(question)

print("Question:", question)
print("Answer:", answer)


Both `max_new_tokens` (=256) and `max_length`(=200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Question:  Newborn baby cover?
Answer: Optional Coverage for newborn from birth (day 1-90) for In-patient Treatment benefit, subject to acceptance of proposal and premium payment in full B.B.1.k 13. Critical Illness (Optional Benefit) for listed Critical Illness, subject to first diagnosed during the policy period and the Insured Person survives 30 days after such diagnosis B.B.1.l 6 Exclusions (what the policy does not cover) 1. Investigation & Evaluation: Code Excl04: i. Expenses rela rean Delivery- 25K (Including Pre/Post Natal limit of 1.5K and Infant baby limit of 2K) Optional Covers : a. Critical Illness : 3 L b. Newborn baby cover: Normal Delivery- 15K; Caesarean Delivery- 25K (Including Pre/Post Natal limit of 1.5K and Infant baby limit of 2K) B.A.1.a B.A.1 B.B.1.f B.B.1.i B.B.1.h B.B.1.j B.B.3 9 Claims/Claims
