In [1]:
# Install the notebook's third-party dependencies; -q keeps pip's output quiet.
# NOTE(review): no versions are pinned here, so a rerun may resolve different releases.
!pip install -q requests beautifulsoup4 pdfplumber \
sentence-transformers faiss-cpu \
transformers accelerate streamlit pyngrok


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.8/67.8 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.0/60.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.6/6.6 MB[0m [31m51.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.8/23.8 MB[0m [31m81.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m126.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m119.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m112.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os

# Create the working folders; exist_ok=True makes re-running this cell harmless.
for folder in ("data/raw/pdf", "data/processed"):
    os.makedirs(folder, exist_ok=True)

print("Folders created")


Folders created


In [3]:
import sqlite3

DB_PATH = "data/processed/mospi.db"

# Schema for the document metadata table; IF NOT EXISTS keeps the cell re-runnable.
CREATE_DOCUMENTS_SQL = """
CREATE TABLE IF NOT EXISTS documents (
    id TEXT PRIMARY KEY,
    title TEXT,
    url TEXT,
    date_published TEXT,
    summary TEXT,
    category TEXT,
    hash TEXT
)
"""

conn = sqlite3.connect(DB_PATH)
try:
    conn.execute(CREATE_DOCUMENTS_SQL)
    conn.commit()
finally:
    # Always release the connection, even if the DDL or the commit raises;
    # otherwise the handle stays open in the kernel.
    conn.close()

print("Database initialized")


Database initialized


In [19]:
import pandas as pd

# Location of the dataset CSV read by this cell.
CSV_PATH = "/content/mospi_full_dataset.csv"

df = pd.read_csv(CSV_PATH)

print("Rows:", len(df))
df.head()

Rows: 12


Unnamed: 0,document_id,title,date_published,category,url,summary,pdf_link,extracted_text
0,mospi_doc_1,Consumer Price Index (CPI) June 2024,2024-06-12,Economic Indicators,https://mospi.gov.in/press-release/cpi-june-2024,CPI for June 2024 indicates moderation in reta...,https://example.com/cpi_june_2024.pdf,The CPI report shows food inflation easing whi...
1,mospi_doc_2,Index of Industrial Production (IIP) April 2024,2024-05-28,Industrial Statistics,https://mospi.gov.in/press-release/iip-april-2024,IIP recorded a growth of 5.2 percent in April ...,https://example.com/iip_april_2024.pdf,Industrial output growth was led by manufactur...
2,mospi_doc_3,Wholesale Price Index (WPI) April 2024,2024-05-14,Price Statistics,https://mospi.gov.in/press-release/wpi-april-2024,WPI showed a slight increase due to manufactur...,https://example.com/wpi_april_2024.pdf,Wholesale prices rose marginally while fuel pr...
3,mospi_doc_4,Quarterly GDP Estimates Q4 2023-24,2024-05-31,National Accounts,https://mospi.gov.in/press-release/gdp-q4-2023-24,GDP growth remained steady supported by servic...,https://example.com/gdp_q4_2024.pdf,"GDP estimates show expansion in construction, ..."
4,mospi_doc_5,Employment Situation Quarterly Report Q1 2024,2024-03-15,Labour Statistics,https://mospi.gov.in/publication/employment-q1...,Labour force participation improved with a sli...,https://example.com/employment_q1_2024.pdf,Urban employment indicators show gradual recov...


In [20]:
# Cleaning pipeline, expressed as one chain:
#   1. drop rows missing any of the critical fields,
#   2. parse date_published (unparseable values become NaT via errors="coerce"),
#   3. drop rows whose date could not be parsed.
df = (
    df
    .dropna(subset=["document_id", "title", "summary", "extracted_text"])
    .assign(
        date_published=lambda frame: pd.to_datetime(
            frame["date_published"],
            errors="coerce"
        )
    )
    .dropna(subset=["date_published"])
)

print("Clean rows:", len(df))


Clean rows: 12


In [21]:
from typing import List

def chunk_text(text: str, chunk_size=800, overlap=100) -> List[str]:
    """Split ``text`` into overlapping character windows.

    Each chunk holds up to ``chunk_size`` characters and starts
    ``chunk_size - overlap`` characters after the previous one, so consecutive
    chunks share ``overlap`` characters.

    Args:
        text: The string to split. An empty string yields an empty list.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared by consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The chunks in order of appearance. The final chunk ends exactly at the
        end of ``text``.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is outside
            ``[0, chunk_size)``. An overlap that large would keep the window
            from ever advancing.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    chunks = []
    start = 0

    while start < len(text):
        end = start + chunk_size
        chunks.append(text[start:end])
        # Stop once a window reaches the end of the text. Stepping again would
        # only emit a tail that is already fully contained in this chunk.
        if end >= len(text):
            break
        start += step

    return chunks


In [22]:
# Flatten every document into one record per chunk, carrying the document's
# identifying metadata along so each chunk can be cited on its own.
records = []

for _, row in df.iterrows():
    doc_meta = {
        "document_id": row["document_id"],
        "title": row["title"],
        "url": row["url"],
    }
    chunks = chunk_text(row["extracted_text"])

    for i, chunk in enumerate(chunks):
        record = dict(doc_meta)
        record["chunk_id"] = f"{row['document_id']}_chunk_{i}"
        record["text"] = chunk
        records.append(record)

chunk_df = pd.DataFrame(records)
print("Total chunks:", len(chunk_df))
chunk_df.head()


Total chunks: 12


Unnamed: 0,document_id,title,url,chunk_id,text
0,mospi_doc_1,Consumer Price Index (CPI) June 2024,https://mospi.gov.in/press-release/cpi-june-2024,mospi_doc_1_chunk_0,The CPI report shows food inflation easing whi...
1,mospi_doc_2,Index of Industrial Production (IIP) April 2024,https://mospi.gov.in/press-release/iip-april-2024,mospi_doc_2_chunk_0,Industrial output growth was led by manufactur...
2,mospi_doc_3,Wholesale Price Index (WPI) April 2024,https://mospi.gov.in/press-release/wpi-april-2024,mospi_doc_3_chunk_0,Wholesale prices rose marginally while fuel pr...
3,mospi_doc_4,Quarterly GDP Estimates Q4 2023-24,https://mospi.gov.in/press-release/gdp-q4-2023-24,mospi_doc_4_chunk_0,"GDP estimates show expansion in construction, ..."
4,mospi_doc_5,Employment Situation Quarterly Report Q1 2024,https://mospi.gov.in/publication/employment-q1...,mospi_doc_5_chunk_0,Urban employment indicators show gradual recov...


In [23]:
# NOTE(review): sentence-transformers and faiss-cpu are already installed by the
# first cell's pip command, so this reinstall looks redundant — confirm before removing.
!pip install -q sentence-transformers faiss-cpu


In [24]:
from sentence_transformers import SentenceTransformer

# Name of the sentence-embedding checkpoint, pulled out so it is easy to swap.
EMBED_MODEL_NAME = "all-MiniLM-L6-v2"
embed_model = SentenceTransformer(EMBED_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [25]:
import faiss
import numpy as np

# Embed every chunk's text in one call.
chunk_texts = chunk_df["text"].tolist()
embeddings = embed_model.encode(chunk_texts, show_progress_bar=True)

# Build an L2-distance index whose width matches the embedding vectors, then
# add the vectors in chunk_df row order.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(np.array(embeddings))

print("FAISS index size:", index.ntotal)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

FAISS index size: 12


In [26]:
from transformers import pipeline

# Text-to-text generation pipeline; the checkpoint name is kept in a variable
# so the model can be swapped in one place.
GENERATOR_MODEL_NAME = "google/flan-t5-base"
generator = pipeline("text2text-generation", model=GENERATOR_MODEL_NAME)


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [41]:
import pandas as pd
import numpy as np
import re

def has_keyword_overlap(question, context, min_overlap=1):
    """Report whether ``question`` and ``context`` share enough distinct words.

    Both strings are lower-cased and split into ``\\w+`` tokens; the check
    passes when at least ``min_overlap`` distinct tokens appear in both.
    Every token counts, including very common words.
    """
    def _tokens(text):
        return set(re.findall(r"\w+", text.lower()))

    shared = _tokens(question) & _tokens(context)
    return len(shared) >= min_overlap


def ask_question(question, k=5, relative_threshold=1.5):
    """Answer ``question`` from the indexed chunks, with citations.

    Steps:
      1. Embed the question with ``embed_model``.
      2. Fetch the ``k`` nearest chunks from the FAISS ``index``.
      3. Keep hits whose distance is within ``relative_threshold`` times the
         best hit's distance.
      4. Refuse to answer unless the question shares at least two distinct
         words with the retrieved context.
      5. Otherwise prompt ``generator`` with the context and the question.

    Args:
        question: The user's question as a string.
        k: Number of nearest neighbours requested from the index.
        relative_threshold: Multiplier applied to the best distance to decide
            which of the other hits are kept.

    Returns:
        A tuple ``(answer, citations)``. ``answer`` is the generated text, or
        the fixed fallback sentence when nothing usable was retrieved.
        ``citations`` is a DataFrame with the de-duplicated ``title`` and
        ``url`` of the chunks used; it is empty for the fallback.

    NOTE(review): the distance filter is relative to the best hit, so for any
    ``relative_threshold >= 1`` the best hit always passes. The keyword-overlap
    check is therefore the only gate that rejects off-topic questions.
    """
    def _no_answer():
        # Single definition of the fallback so both refusal paths stay in sync.
        return (
            "I don't have that information in my data.",
            pd.DataFrame(columns=["title", "url"])
        )

    # 1. Embed query
    q_embedding = embed_model.encode([question])

    # 2. Retrieve top-k
    distances, indices = index.search(np.array(q_embedding), k)

    # FAISS pads the result with id -1 when it has fewer than k vectors to
    # return; drop those placeholders before they reach the row lookup.
    hits = [
        (dist, int(idx))
        for dist, idx in zip(distances[0], indices[0])
        if idx >= 0
    ]

    # 🚫 Nothing retrieved at all (e.g. empty index)
    if not hits:
        return _no_answer()

    # Results come back sorted by distance, so the first real hit is the best.
    best_distance = hits[0][0]
    cutoff = best_distance * relative_threshold
    valid_idxs = [idx for dist, idx in hits if dist <= cutoff]

    # 🚫 No relevant chunks at all
    if not valid_idxs:
        return _no_answer()

    # FAISS ids are row positions in the order vectors were added, so select
    # positionally rather than by index label.
    retrieved_chunks = chunk_df.iloc[valid_idxs]
    context = "\n\n".join(retrieved_chunks["text"].tolist())

    # 🚫 Context exists, but NOT relevant to the question
    if not has_keyword_overlap(question, context, min_overlap=2):
        return _no_answer()

    # 3. Generate answer (now safe)
    prompt = f"""
You are a data-grounded assistant.
Answer the question strictly using the context below.
Do NOT use any external knowledge.

Context:
{context}

Question:
{question}
"""

    answer = generator(prompt, max_new_tokens=120)[0]["generated_text"]

    citations = (
        retrieved_chunks[["title", "url"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    return answer, citations


In [42]:
# In-scope question: expected to be answered from the CPI document.
cpi_question = "What does the CPI report say about inflation?"
answer, sources = ask_question(cpi_question)

print(answer)
print(sources)


food inflation easing
                                  title  \
0  Consumer Price Index (CPI) June 2024   

                                                url  
0  https://mospi.gov.in/press-release/cpi-june-2024  


In [43]:
# Out-of-scope question: the assistant should decline rather than guess.
out_of_scope_question = "Who is the current Prime Minister of India?"
answer, sources = ask_question(out_of_scope_question)

print("ANSWER:", answer)
print("SOURCES:", sources)


ANSWER: I don't have that information in my data.
SOURCES: Empty DataFrame
Columns: [title, url]
Index: []
