In [43]:
import re
import faiss
import numpy as np
import pandas as pd

from sentence_transformers import SentenceTransformer, util
from transformers import pipeline


In [44]:
# SBERT model (USED FOR RETRIEVAL + SUMMARIZATION)
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")

# BERT QA model
qa_reader = pipeline(
    "question-answering",
    model="deepset/bert-base-cased-squad2"
)



Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


In [45]:
df = pd.read_csv(
    "../data/processed/government_qa_clean.csv",
    encoding="latin1"
)

embeddings = np.load("../data/processed/question_embeddings.npy")
faiss.normalize_L2(embeddings)

index = faiss.read_index("../data/processed/faiss_index.index")



In [46]:
RETRIEVAL_MIN_SCORE = 0.15
RETRIEVAL_STRONG_SCORE = 0.45
QA_MIN_CONFIDENCE = 0.20


In [47]:
def clean_runtime_answer(text):
    text = re.sub(r"THE MINISTER.*?:", "", text, flags=re.I)
    text = re.sub(r"\[.*?\]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text


In [48]:
def answer_question(query, top_k=3):

    # Step 1: Embed query
    query_embedding = sbert_model.encode([query])
    faiss.normalize_L2(query_embedding)

    # Step 2: FAISS search
    scores, indices = index.search(query_embedding, top_k)
    max_score = scores[0][0]

    # Step 3: Retrieval confidence gating
    if max_score < RETRIEVAL_MIN_SCORE:
        return {
            "type": "no_answer",
            "message": "No relevant answer found."
        }

    # Step 4: Collect contexts
    contexts = []
    for idx in indices[0]:
        ans = str(df.iloc[idx]["answer"])
        ans = clean_runtime_answer(ans)
        contexts.append(ans[:800])

    combined_context = " ".join(contexts)

    # Step 5: QA extraction
    qa_result = qa_reader(
        question=query,
        context=combined_context
    )

    # Step 6: QA confidence fallback
    if qa_result["score"] < QA_MIN_CONFIDENCE:
        fallback_answers = []

        for ans in contexts:
            sentences = re.split(r'(?<=[.!?])\s+', ans)
            fallback_answers.append(" ".join(sentences[:2]))

        return {
            "type": "partial",
            "message": "Relevant information found. Showing closest answers.",
            "confidence": round(max_score, 3),
            "answers": fallback_answers
        }

    # Step 7: Clean & shorten final answer
    final_answer = clean_runtime_answer(qa_result["answer"])

    sentences = re.split(r'(?<=[.!?])\s+', final_answer)
    final_answer = " ".join(sentences[:3])

    return {
        "type": "answer",
        "answer": final_answer,
        "confidence": round(qa_result["score"], 3)
    }



In [49]:
response = answer_question("How can I apply for passport?")

if response["type"] == "no_answer":
    print("❌", response["message"])

elif response["type"] == "partial":
    print("⚠️", response["message"])
    print("Retrieval confidence:", response["confidence"])
    print("\nClosest answers:\n")

    for i, ans in enumerate(response["answers"], 1):
        print(f"{i}. {ans}")

elif response["type"] == "answer":
    print("✅ Answer:", response["answer"])
    print("Confidence:", response["confidence"])


⚠️ Relevant information found. Showing closest answers.
Retrieval confidence: 0.574

Closest answers:

1. (i) Tatkal Scheme to issue a passport within 1-7 days from the date of submission of application on the basis of submission of verification certificate or three identity documents and re-issue passports within three working days, subject to no adverse information being found in the system during the processing of the applications; (ii) expansion of the list of officials authorized to issue verification certificates; (iii) expansion of the category of applicants who can get passports on the basis of post issuance police verification or on post-verification basis and (iv) issue of passports without police verification or on post-verification basis to those applicants (issue of passports to government employees and minors). 2.
2. -i. Passport Seva Project has been implemented which inter alia includes setting up and operationalisation of 77 Passport Seva Kendras (PSKs) acting as exten