## Preprocessing

Cell 1 — install dependencies

In [1]:
# Colab cell 1 - install libs
!pip install -q requests beautifulsoup4 sentence_transformers scikit-learn pdfplumber Flask pyngrok apscheduler tqdm nltk
# (optional) if you want Playwright for JS-rendered pages:
# !pip install -q playwright
# !playwright install --with-deps


Cell 2 — imports & helper utilities

In [2]:
# Colab cell 2 - imports and helpers
import os, csv, json, time, hashlib, sqlite3
from urllib.parse import urljoin, urlparse
import requests
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm

# small helper
def ensure_dir(path):
    if not os.path.exists(path):
        os.makedirs(path)

ensure_dir("raw_html")
ensure_dir("pdfs")
ensure_dir("extracted_text")
ensure_dir("wiki_pages")


In [11]:
# NEW, CONSOLIDATED HELPER FUNCTIONS CELL (Place this after Cell 2)

# --- Core Helper Functions ---

import re, nltk
from nltk.tokenize import sent_tokenize
import pdfplumber

# Download NLTK data once
nltk.download('punkt')
nltk.download('punkt_tab') # Added download for punkt_tab

# --- Main Crawler Function (from original Cell 5) ---
def crawl_with_heuristics(seed_url, max_depth=3, max_pages=200):
    parsed = urlparse(seed_url)
    base_domain = parsed.netloc
    visited = set()
    to_visit = [(seed_url, 0)]
    scheme_urls = set()

    # Keywords to identify relevant pages
    include_keywords = ["scheme", "yojana", "benefit", "notification", "gr",
                        "department", "program", "service", "guidelines", "circular"]
    # Patterns to exclude irrelevant links
    exclude_patterns = ["login", "logout", "print", "signin", "signup", "register",
                        "/admin", "/dashboard", "?session", "reset", "#", "?lang", ".jpg", ".png"]

    # Use a shared session for efficiency
    with requests.Session() as session:
        session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        pbar = tqdm(total=max_pages, desc=f"Crawling {base_domain}", leave=False)
        while to_visit and len(visited) < max_pages:
            url, depth = to_visit.pop(0)

            if url in visited or depth > max_depth:
                continue

            visited.add(url)
            pbar.update(1)

            try:
                resp = session.get(url, timeout=10)
                if resp.status_code != 200:
                    continue
                html = resp.text
            except requests.RequestException:
                continue

            soup = BeautifulSoup(html, "html.parser")

            # Check if current page is relevant
            lowered_text = (url + " " + soup.get_text()).lower()
            if any(kw in lowered_text for kw in include_keywords):
                scheme_urls.add(url)

            # Find new links to visit
            for a in soup.find_all("a", href=True):
                href = a["href"].strip()
                if not href or any(ep in href.lower() for ep in exclude_patterns):
                    continue

                full_url = urljoin(url, href)

                if urlparse(full_url).netloc != base_domain or full_url in visited:
                    continue

                # Add to queue if the link text or URL seems relevant
                link_text = (href + " " + a.get_text()).lower()
                if any(kw in link_text for kw in include_keywords):
                    if len(to_visit) < max_pages * 2: # Prevent the queue from getting too big
                        to_visit.append((full_url, depth + 1))

            time.sleep(0.2) # Be polite to the server
        pbar.close()

    return scheme_urls, visited


# --- Text Processing Functions ---
def clean_text(t):
    t = re.sub(r'\\s+', ' ', t)
    t = t.replace('\\xa0', ' ')
    return t.strip()

def chunk_text(text, max_chars=1500):
    sents = sent_tokenize(text)
    chunks = []
    current_chunk = ""
    for s in sents:
        if len(current_chunk) + len(s) + 1 > max_chars:
            if current_chunk.strip():
                chunks.append(current_chunk.strip())
            current_chunk = s + " "
        else:
            current_chunk += s + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

# --- Content Extraction Functions ---
def extract_text_from_html(html):
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "nav", "footer", "header", "form", "noscript"]):
        tag.decompose()
    return " ".join(soup.stripped_strings)

def download_and_extract_pdf(url, session, out_dir="pdfs"):
    try:
        r = session.get(url, timeout=30)
        if r.status_code == 200 and 'application/pdf' in r.headers.get('Content-Type', ''):
            fname = os.path.join(out_dir, hashlib.sha1(url.encode()).hexdigest()[:12] + ".pdf")
            with open(fname, "wb") as f:
                f.write(r.content)

            text = ""
            with pdfplumber.open(fname) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\\n"
            return fname, text
    except Exception as e:
        print(f"  -> PDF download/processing error for {url}: {e}")
    return None, None

print("All helper functions defined and ready.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


All helper functions defined and ready.


Cell 3 — seeds (the 12 states) + save as CSV

In [4]:
seeds = {
 "Maharashtra":"https://mahadbt.maharashtra.gov.in/",
 "UttarPradesh":"https://urbanschemes.up.in/",
 "TamilNadu":"https://www.tn.gov.in/scheme_wise.php",
 "Karnataka":"https://sevasindhu.karnataka.gov.in/Sevasindhu/English",
 "Gujarat":"https://mariyojana.gujarat.gov.in/",
 "WestBengal":"https://www.wb.gov.in/government-schemes.aspx",
 "JammuKashmir": "https://jk.gov.in/jammukashmir-government-schemes/", # <-- REPLACED Kerala
 "Rajasthan":"https://jansoochna.rajasthan.gov.in/Scheme",
 "AndhraPradesh":"https://housing.ap.gov.in/",
 "Telangana":"https://www.telangana.gov.in/government-initiatives/",
 "Bihar":"https://dbt.bihar.gov.in/wp/SchemeList.aspx",
 "Odisha":"https://rd.odisha.gov.in/scheme"
}

# The rest of the cell, which saves to CSV, remains the same
with open("seeds.csv","w",newline="",encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["state","seed_url"])
    for s,u in seeds.items():
        writer.writerow([s,u])
print("Saved seeds.csv with", len(seeds), "entries. (J&K included, Kerala removed)")

Saved seeds.csv with 12 entries. (J&K included, Kerala removed)


Cell 4 — Phase 1: discovery — fetch robots.txt and sitemap

In [5]:
# Colab cell 4 - fetch robots.txt + try sitemap.xml
def fetch_robots_and_sitemap(seed_url):
    parsed = urlparse(seed_url)
    base = f"{parsed.scheme}://{parsed.netloc}"
    out = {"base": base, "robots": None, "sitemap": None}
    try:
        r = requests.get(base + "/robots.txt", timeout=10)
        if r.status_code==200:
            out["robots"] = r.text
            # try to detect sitemap
            for line in r.text.splitlines():
                if line.lower().startswith("sitemap:"):
                    out["sitemap"] = line.split(":",1)[1].strip()
    except Exception as e:
        out["robots"] = None
    # attempt /sitemap.xml
    if not out["sitemap"]:
        try:
            s = requests.get(base + "/sitemap.xml", timeout=8)
            if s.status_code==200 and "<urlset" in s.text.lower():
                out["sitemap"] = base + "/sitemap.xml"
        except:
            pass
    return out

discovery = {}
for name, url in seeds.items():
    info = fetch_robots_and_sitemap(url)
    discovery[name] = info
    print(name, info["base"], " sitemap:", info["sitemap"] is not None)
# Save results
with open("discovery.json","w",encoding="utf-8") as f:
    json.dump(discovery,f,ensure_ascii=False,indent=2)


Maharashtra https://mahadbt.maharashtra.gov.in  sitemap: False
UttarPradesh https://urbanschemes.up.in  sitemap: False
TamilNadu https://www.tn.gov.in  sitemap: False
Karnataka https://sevasindhu.karnataka.gov.in  sitemap: False
Gujarat https://mariyojana.gujarat.gov.in  sitemap: False
WestBengal https://www.wb.gov.in  sitemap: False
JammuKashmir https://jk.gov.in  sitemap: False
Rajasthan https://jansoochna.rajasthan.gov.in  sitemap: False
AndhraPradesh https://housing.ap.gov.in  sitemap: False
Telangana https://www.telangana.gov.in  sitemap: False
Bihar https://dbt.bihar.gov.in  sitemap: False
Odisha https://rd.odisha.gov.in  sitemap: False


Cell 5, 6, 7 — Phase 2: generic scraper (static pages)

In [9]:
# REPLACEMENT FOR CELLS 5, 6, and 7: Full-Scale Crawling and Data Consolidation

import requests
from requests.exceptions import Timeout, ConnectionError

print("Starting full-scale, robust crawl for all states...")

# Define a header to mimic a browser
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
}

# This will store text from all sources (HTML, PDF)
all_documents = []

# Loop through each state from your seeds dictionary
for state, seed_url in tqdm(seeds.items(), desc="Processing States"):
    print(f"\\n--- Processing {state} ---")

    # 1. Crawl to find all scheme-related links for the current state
    scheme_links, _ = crawl_with_heuristics(seed_url, max_depth=3, max_pages=150)
    print(f"Found {len(scheme_links)} candidate pages for {state}.")

    if not scheme_links:
        print(f"No candidate links found for {state}, moving to the next one.")
        continue

    pdf_links_for_state = set()

    # 2. Process each HTML link found for this state
    for link in tqdm(scheme_links, desc=f"Scraping {state} HTMLs"):
        try:
            print(f"  Fetching HTML: {link}")
            r = requests.get(link, headers=HEADERS, timeout=(5, 15)) # 5s to connect, 15s to read

            if r.status_code != 200:
                print(f"  -> Skipped with status code: {r.status_code}")
                continue

            # Extract text and title from the HTML
            html_text = extract_text_from_html(r.text)
            soup = BeautifulSoup(r.text, 'html.parser')
            title_tag = soup.find('h1') or soup.find('title')
            doc_title = title_tag.get_text(strip=True) if title_tag else "Untitled Scheme"

            # Store the extracted content
            all_documents.append({
                "state": state,
                "source_url": link,
                "type": "html",
                "title": doc_title,
                "content": clean_text(html_text)
            })

            # Discover any PDF links on the page
            for a in soup.find_all("a", href=True):
                href = a.get("href", "")
                if href.lower().endswith(".pdf"):
                    full_pdf_url = urljoin(link, href)
                    pdf_links_for_state.add(full_pdf_url)

        except Timeout:
            print(f"  -> Timeout error on URL: {link}")
        except ConnectionError:
            print(f"  -> Connection error on URL: {link}")
        except Exception as e:
            print(f"  -> An unexpected error occurred for {link}: {e}")

    print(f"Found {len(pdf_links_for_state)} unique PDF links for {state}.")

    # 3. Process the discovered PDF links for this state
    for pdf_link in tqdm(list(pdf_links_for_state)[:20], desc=f"Downloading {state} PDFs"): # Limit PDFs
        try:
            print(f"  Fetching PDF: {pdf_link}")
            pdf_filename, pdf_text = download_and_extract_pdf(pdf_link)
            if pdf_filename and pdf_text:
                all_documents.append({
                    "state": state,
                    "source_url": pdf_link,
                    "type": "pdf",
                    "title": os.path.basename(urlparse(pdf_link).path),
                    "content": clean_text(pdf_text)
                })
        except Exception as e:
            print(f"  -> Error processing PDF {pdf_link}: {e}")

# Finally, save all the consolidated data
with open("all_extracted_content.json", "w", encoding="utf-8") as f:
    json.dump(all_documents, f, ensure_ascii=False, indent=2)

print(f"\\nFinished processing all states. Total documents extracted: {len(all_documents)}")

Starting full-scale, robust crawl for all states...


Processing States:   0%|          | 0/12 [00:00<?, ?it/s]

\n--- Processing Maharashtra ---


Crawling mahadbt.maharashtra.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 62 candidate pages for Maharashtra.


Scraping Maharashtra HTMLs:   0%|          | 0/62 [00:00<?, ?it/s]

  Fetching HTML: https://mahadbt.maharashtra.gov.in/SchemeData/SchemeData?str=E9DDFA703C38E51A19A7691F3B40AD4EE0F3DDA5DE324AC54819922BB3D36B63
  Fetching HTML: https://mahadbt.maharashtra.gov.in/SchemeData/SchemeData?str=E9DDFA703C38E51A081772910D526AF0EDDF60A95235BCA1E7C80233A30F725A
  Fetching HTML: https://mahadbt.maharashtra.gov.in/PDF/RTGU.pdf
  Fetching HTML: https://mahadbt.maharashtra.gov.in/SchemeData/SchemeData?str=E9DDFA703C38E51A7909EC2FB46546FC0ADC6852AEB8D3556C79D437D47E3A55
  Fetching HTML: https://mahadbt.maharashtra.gov.in/SchemeData/SchemeData?str=E9DDFA703C38E51A91E82A5FCFBAAB0483ADB704B47ED161F43EE2545EC67678
  Fetching HTML: https://mahadbt.maharashtra.gov.in/SchemeData/SchemeData?str=E9DDFA703C38E51AED33CA69606C0CC2EF25388FB8EC7046E5E9B1E993657CEB
  Fetching HTML: https://mahadbt.maharashtra.gov.in/SchemeData/SchemeData?str=E9DDFA703C38E51AB02E984835E89FEFDB316E301CE6A991F41C5D42B01A7D7E
  Fetching HTML: https://mahadbt.maharashtra.gov.in/Grievance/Grievance
  Fet

Downloading Maharashtra PDFs:   0%|          | 0/20 [00:00<?, ?it/s]

  Fetching PDF: https://mahadbt.maharashtra.gov.in/PDF/30.pdf
  -> Error processing PDF https://mahadbt.maharashtra.gov.in/PDF/30.pdf: download_and_extract_pdf() missing 1 required positional argument: 'session'
  Fetching PDF: https://mahadbt.maharashtra.gov.in/PDF/7.pdf
  -> Error processing PDF https://mahadbt.maharashtra.gov.in/PDF/7.pdf: download_and_extract_pdf() missing 1 required positional argument: 'session'
  Fetching PDF: https://mahadbt.maharashtra.gov.in/PDF/RTGU.pdf
  -> Error processing PDF https://mahadbt.maharashtra.gov.in/PDF/RTGU.pdf: download_and_extract_pdf() missing 1 required positional argument: 'session'
  Fetching PDF: https://mahadbt.maharashtra.gov.in/PDF/PhotoSign.pdf
  -> Error processing PDF https://mahadbt.maharashtra.gov.in/PDF/PhotoSign.pdf: download_and_extract_pdf() missing 1 required positional argument: 'session'
  Fetching PDF: https://mahadbt.maharashtra.gov.in/PDF/13.pdf
  -> Error processing PDF https://mahadbt.maharashtra.gov.in/PDF/13.pdf: d

Crawling urbanschemes.up.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 5 candidate pages for UttarPradesh.


Scraping UttarPradesh HTMLs:   0%|          | 0/5 [00:00<?, ?it/s]

  Fetching HTML: https://urbanschemes.up.in/Home/chief_minister_nagar_srijan_yojana
  Fetching HTML: https://urbanschemes.up.in/
  Fetching HTML: https://urbanschemes.up.in/Home/mukhyamantri_nagriya_alp_viksit_va_malin_basti_vikas
  Fetching HTML: https://urbanschemes.up.in/Home/nagriya_peyjal_yojna_1_lac_population
  Fetching HTML: https://urbanschemes.up.in/Home/APJ_abdul_kalam_kagriya_kaurya_punj
Found 0 unique PDF links for UttarPradesh.


Downloading UttarPradesh PDFs: 0it [00:00, ?it/s]

\n--- Processing TamilNadu ---


Crawling www.tn.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for TamilNadu.
No candidate links found for TamilNadu, moving to the next one.
\n--- Processing Karnataka ---


Crawling sevasindhu.karnataka.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for Karnataka.
No candidate links found for Karnataka, moving to the next one.
\n--- Processing Gujarat ---


Crawling mariyojana.gujarat.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for Gujarat.
No candidate links found for Gujarat, moving to the next one.
\n--- Processing WestBengal ---


Crawling www.wb.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for WestBengal.
No candidate links found for WestBengal, moving to the next one.
\n--- Processing JammuKashmir ---


Crawling jk.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for JammuKashmir.
No candidate links found for JammuKashmir, moving to the next one.
\n--- Processing Rajasthan ---


Crawling jansoochna.rajasthan.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 150 candidate pages for Rajasthan.


Scraping Rajasthan HTMLs:   0%|          | 0/150 [00:00<?, ?it/s]

  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+mz1DFk+uTDZkkpNOlKd2QLqZWwV7BoM0ZnGkmMVkyUhQEf7wJhG/4I68V5/yYfkMOyZArFy1Tkoo4guoatgGwBjt8j8ECoO3IWevUAMMClrDMToX7aYxwgM
  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+myivDsjQAMrH2a5R4suc37g6OPS952Hs5kyMvr12NdA7oVa/R4K0xzO
  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+myr95ieSxTU7ayAyog0yXNIt70d42otUGn90I3iigG9AxayQqHcb1wa1JU59P14C8ccUHpCZ1xI6Pe+FQaJYOpd98ZCHbimdcM=
  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+myyPwrLfVqZ0Xt4N1D29sLT7iXmuV+YZGm7Y+jXcAQ6HUyFYUn4UEImN9EBoJLgwsY=
  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+mxJ8W8JVBS2w4fNkumK7YYCBFPFNhSenWuxpU1fLOEUMkFJ9AtFWffXDqvEd5k89uELLdtm7qGPGU7A/UUAfUVY
  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+mwtsSGexkpB9w/nDNWt5qFHYIWvorDJoOAnSY3/00UjWA==
  Fetching HTML: https://jansoochna.rajasthan.gov.in/Services?q=l7aag+q8+myIHVjuyj

Downloading Rajasthan PDFs:   0%|          | 0/20 [00:00<?, ?it/s]

  Fetching PDF: https://jankalyanfile.rajasthan.gov.in//Content/UploadFolder/OrderEntry/Gopalan/2020/Annual%20Progress%20Report/O_200820_36f621e4-27cc-4abc-aaae-a4873200163e.pdf
  -> Error processing PDF https://jankalyanfile.rajasthan.gov.in//Content/UploadFolder/OrderEntry/Gopalan/2020/Annual%20Progress%20Report/O_200820_36f621e4-27cc-4abc-aaae-a4873200163e.pdf: download_and_extract_pdf() missing 1 required positional argument: 'session'
  Fetching PDF: https://jankalyanfile.rajasthan.gov.in//Content/UploadFolder/OrderEntry/Gopalan/2020/Annual%20Progress%20Report/O_200820_03eae4e9-a205-4758-80c4-c92f498bc688.pdf
  -> Error processing PDF https://jankalyanfile.rajasthan.gov.in//Content/UploadFolder/OrderEntry/Gopalan/2020/Annual%20Progress%20Report/O_200820_03eae4e9-a205-4758-80c4-c92f498bc688.pdf: download_and_extract_pdf() missing 1 required positional argument: 'session'
  Fetching PDF: http://rmsc.health.rajasthan.gov.in/content/dam/doitassets/Medical-and-Health-Portal/national-ay

Crawling housing.ap.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for AndhraPradesh.
No candidate links found for AndhraPradesh, moving to the next one.
\n--- Processing Telangana ---


Crawling www.telangana.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 0 candidate pages for Telangana.
No candidate links found for Telangana, moving to the next one.
\n--- Processing Bihar ---


Crawling dbt.bihar.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 4 candidate pages for Bihar.


Scraping Bihar HTMLs:   0%|          | 0/4 [00:00<?, ?it/s]

  Fetching HTML: https://dbt.bihar.gov.in/wp/SchemeList.aspx
  Fetching HTML: https://dbt.bihar.gov.in/wp/DepartmentSchemeWiseReport.aspx
  Fetching HTML: https://dbt.bihar.gov.in/wp/LastUpdate_Scheme.aspx
  Fetching HTML: https://dbt.bihar.gov.in/wp/DepartmentList.aspx
Found 0 unique PDF links for Bihar.


Downloading Bihar PDFs: 0it [00:00, ?it/s]

\n--- Processing Odisha ---


Crawling rd.odisha.gov.in:   0%|          | 0/150 [00:00<?, ?it/s]

Found 31 candidate pages for Odisha.


Scraping Odisha HTMLs:   0%|          | 0/31 [00:00<?, ?it/s]

  Fetching HTML: https://rd.odisha.gov.in/quicktabs/nojs/rural_department_organisation_ch/0
  -> Timeout error on URL: https://rd.odisha.gov.in/quicktabs/nojs/rural_department_organisation_ch/0
  Fetching HTML: https://rd.odisha.gov.in/scheme/mmsy-cuvda
  -> Timeout error on URL: https://rd.odisha.gov.in/scheme/mmsy-cuvda
  Fetching HTML: https://rd.odisha.gov.in/or/about-us/about-department
  -> Timeout error on URL: https://rd.odisha.gov.in/or/about-us/about-department
  Fetching HTML: https://rd.odisha.gov.in/scheme/sby
  -> Timeout error on URL: https://rd.odisha.gov.in/scheme/sby
  Fetching HTML: https://rd.odisha.gov.in/or/guidelines
  -> Timeout error on URL: https://rd.odisha.gov.in/or/guidelines
  Fetching HTML: https://rd.odisha.gov.in/departmentwise-citizen-services-list
  -> Timeout error on URL: https://rd.odisha.gov.in/departmentwise-citizen-services-list
  Fetching HTML: https://rd.odisha.gov.in/or/scheme
  -> Timeout error on URL: https://rd.odisha.gov.in/or/scheme
  Fe

Downloading Odisha PDFs: 0it [00:00, ?it/s]

\nFinished processing all states. Total documents extracted: 221


Cell 8 — Phase 3: preprocessing & chunking

Cell 9 — Phase 4: embeddings + vector store (sentence-transformers + sklearn NN)

In [13]:
# Colab cell 9 (Corrected) - model init and indexing chunks from all_documents
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.neighbors import NearestNeighbors

# This assumes model is loaded, if not, uncomment the line below
model = SentenceTransformer('all-MiniLM-L6-v2')

# Gather chunks from the single, consolidated 'all_documents' list
corpus_chunks = []
metadata = []

for doc in tqdm(all_documents, desc="Chunking all documents"):
    content = doc.get("content", "")
    if not content:
        continue

    chunks = chunk_text(content)
    for i, chunk in enumerate(chunks):
        corpus_chunks.append(chunk)
        # Add rich metadata for each chunk
        metadata.append({
            "state": doc["state"],
            "source_url": doc["source_url"],
            "scheme_title": doc["title"],
            "type": doc["type"],
            "chunk_num": i + 1
        })

print(f"Total chunks to be indexed: {len(corpus_chunks)}")

if not corpus_chunks:
    print("No chunks to embed — the crawl might not have found any content.")
else:
    embeddings = model.encode(corpus_chunks, show_progress_bar=True, convert_to_numpy=True)

    # This is your vector store.
    vector_store = {
        "embeddings": embeddings,
        "metadata": metadata,
        "chunks": corpus_chunks
    }

    # Build the nearest-neighbor index for fast retrieval
    nn_index = NearestNeighbors(n_neighbors=5, metric='cosine').fit(vector_store["embeddings"])

    print("Vector database and NearestNeighbors index are ready.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Chunking all documents:   0%|          | 0/221 [00:00<?, ?it/s]

Total chunks to be indexed: 3332


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

Vector database and NearestNeighbors index are ready.


In [18]:
# NEW CELL: Save the Vector Database, Index, and Model to disk

import pickle

print("Saving the vector database and index to files...")

# 1. Save the vector_store dictionary
with open("vector_store.pkl", "wb") as f:
    pickle.dump(vector_store, f)

# 2. Save the NearestNeighbors index object
with open("nn_index.pkl", "wb") as f:
    pickle.dump(nn_index, f)

# 3. Save the model itself
# The sentence-transformer library has a built-in save method
model.save("my_embedding_model")

print("Successfully saved:")
print("- vector_store.pkl (your data and embeddings)")
print("- nn_index.pkl (the search index)")
print("- my_embedding_model/ (the AI model)")

Saving the vector database and index to files...
Successfully saved:
- vector_store.pkl (your data and embeddings)
- nn_index.pkl (the search index)
- my_embedding_model/ (the AI model)


## Starting

In [19]:
# to load and run model from here

# NEW CELL: Load the saved database, index, and model from disk

import pickle
from sentence_transformers import SentenceTransformer
from sklearn.neighbors import NearestNeighbors

print("Loading vector database, index, and model from files...")

# 1. Load the vector_store
with open("vector_store.pkl", "rb") as f:
    vector_store = pickle.load(f)

# 2. Load the nn_index
with open("nn_index.pkl", "rb") as f:
    nn_index = pickle.load(f)

# 3. Load the SentenceTransformer model
model = SentenceTransformer("my_embedding_model")

# You can now access the data just like before
print(f"Successfully loaded. The database contains {len(vector_store['chunks'])} chunks.")

# Verify that the rest of the notebook works
print("\\nRunning a test query to confirm everything is loaded correctly...")
test_query = "scholarship for students"
test_results = retrieve(test_query, top_k=2)
for r in test_results:
    print(f"- Found chunk from: {r['meta']['source_url']}")

Cell 10 — Phase 4b: simple RAG-style retrieval + extractive summary

In [15]:
# Colab cell 10 - query the vector store and produce an extractive summary
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def retrieve(query, top_k=4):
    qv = model.encode([query])
    dists, idxs = nn_index.kneighbors(qv, n_neighbors=top_k)
    res = []
    for i in idxs[0]:
        res.append({"chunk":corpus_chunks[i], "meta": metadata[i]})
    return res

def extractive_summary_from_chunks(chunks, n_sent=6):
    # naive extractive: combine chunks and select top tf-idf sentences
    bigtext = " ".join(chunks)
    sents = sent_tokenize(bigtext)
    if len(sents)<=n_sent:
        return bigtext
    vect = TfidfVectorizer().fit_transform(sents)
    scores = np.array(vect.sum(axis=1)).ravel()
    top_idx = scores.argsort()[::-1][:n_sent]
    top_idx_sorted = sorted(top_idx)
    return " ".join([sents[i] for i in top_idx_sorted])

# demo query
q = "scholarship amount eligibility"
results = retrieve(q, top_k=5)
print("Top context snippets:")
for r in results:
    print("-", r["meta"].get("source",""), " len:", len(r["chunk"]))
summary = extractive_summary_from_chunks([r["chunk"] for r in results], n_sent=5)
print("\nExtractive summary:\n", summary[:800])

Top context snippets:
-   len: 1270
-   len: 1294
-   len: 1469
-   len: 1455
-   len: 1448

Extractive summary:
 5) If Applicant fails in particular year then he will get the Tuition Fees, Exam Fees and Maintenance allowance of that particular academic year but he/she will not get the benefit until he/she gets promoted to next higher class. 6) If Applicant fails in particular year then he will get the Tuition Fees, Exam Fees and Maintenance allowance of that particular academic year but he/she will not get the benefit until he/she gets promoted to next higher class. Guidelines On “'Right to Give Up'” Guidelines and Rules Download Colleges list Right To Give Up GR Guidelines for SEBC Eligibility Click here for Help Help Videos × DBT overview × Registration with UID × Login and Logout Process × User Profile Update Process × Apply Schemes Process × DBT Grievance and Suggestions Merit Scholarships for Ec


Cell 11 — Phase 4c: Generate a Markdown "wiki" page from retrieved content

In [16]:
# Colab cell 11 - produce a simple wiki/markdown page for a scheme
def make_wiki_md(title, retrieved_chunks, source_urls):
    summary = extractive_summary_from_chunks([c for c in retrieved_chunks], n_sent=6)
    md = f"# {title}\n\n"
    md += "## Summary\n\n" + summary + "\n\n"
    md += "## Sources\n"
    for u in source_urls:
        md += f"- {u}\n"
    return md

# demo: title from query
title = "Auto-generated Scheme summary — demo"
md = make_wiki_md(title, [r["chunk"] for r in results], list({r["meta"].get("source") for r in results}))
with open("wiki_pages/demo_scheme.md","w",encoding="utf-8") as f:
    f.write(md)
print("Saved wiki markdown:", "wiki_pages/demo_scheme.md")
print(md[:800])


Saved wiki markdown: wiki_pages/demo_scheme.md
# Auto-generated Scheme summary — demo

## Summary

5) If Applicant fails in particular year then he will get the Tuition Fees, Exam Fees and Maintenance allowance of that particular academic year but he/she will not get the benefit until he/she gets promoted to next higher class. 6) If Applicant fails in particular year then he will get the Tuition Fees, Exam Fees and Maintenance allowance of that particular academic year but he/she will not get the benefit until he/she gets promoted to next higher class. Guidelines On “'Right to Give Up'” Guidelines and Rules Download Colleges list Right To Give Up GR Guidelines for SEBC Eligibility Click here for Help Help Videos × DBT overview × Registration with UID × Login and Logout Process × User Profile Update Process × Apply Schemes Process × DBT


In [23]:
# FINAL VERSION FOR CELL 9: Load Model, Build Vector DB, and Save Everything

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pickle

# --- 1. Load the AI Model ---
# This is fast if it has been downloaded before.
print("Loading the Sentence Transformer model...")
try:
    # Check if a local copy exists to load faster
    model = SentenceTransformer("./my_embedding_model")
    print("Loaded model from local files.")
except Exception:
    model = SentenceTransformer('all-MiniLM-L6-v2')
    print("Downloaded and loaded new model instance.")


# --- 2. Gather Chunks from the NEW 'all_documents' list ---
# This step uses the fresh data from your complete crawl.
corpus_chunks = []
metadata = []

if 'all_documents' not in globals() or not all_documents:
    print("Error: 'all_documents' is empty. Please run the main crawling cell first.")
else:
    for doc in tqdm(all_documents, desc="Chunking all documents"):
        content = doc.get("content", "")
        if not content: continue

        chunks = chunk_text(content)
        for i, chunk in enumerate(chunks):
            corpus_chunks.append(chunk)
            metadata.append({
                "state": doc["state"],
                "source_url": doc["source_url"],
                "scheme_title": doc["title"],
                "type": doc["type"],
                "chunk_num": i + 1
            })

print(f"Total chunks to be indexed from all states: {len(corpus_chunks)}")


# --- 3. Create NEW Embeddings and Build the Index ---
if not corpus_chunks:
    print("No chunks to embed. Cannot build or save the vector database.")
else:
    print("Creating new embeddings for the complete dataset... (This may take some time)")
    embeddings = model.encode(corpus_chunks, show_progress_bar=True, convert_to_numpy=True)

    vector_store = {
        "embeddings": embeddings,
        "metadata": metadata,
        "chunks": corpus_chunks
    }

    nn_index = NearestNeighbors(n_neighbors=5, metric='cosine').fit(vector_store["embeddings"])

    print("New vector database and index are ready.")

    # --- 4. OVERWRITE Old Saved Files with the New, Complete Versions ---
    print("Saving the new, complete vector database and index to files...")

    with open("vector_store.pkl", "wb") as f:
        pickle.dump(vector_store, f)

    with open("nn_index.pkl", "wb") as f:
        pickle.dump(nn_index, f)

    model.save("my_embedding_model")

    print("Successfully updated and saved all database files.")

Loading the Sentence Transformer model...
Loaded model from local files.


Chunking all documents:   0%|          | 0/221 [00:00<?, ?it/s]

Total chunks to be indexed from all states: 3332
Creating new embeddings for the complete dataset... (This may take some time)


Batches:   0%|          | 0/105 [00:00<?, ?it/s]

New vector database and index are ready.
Saving the new, complete vector database and index to files...
Successfully updated and saved all database files.


Cell 12 — Phase 5: Simple publish — local Flask app + ngrok to serve generated wiki pages (and embed official site links)

In [21]:
# COMPLETE CODE FOR WIKI GENERATION

print("Generating state-wise wiki pages from the scraped content...")

# Ensure the output directory exists
ensure_dir("state_wikis")

# Check if the source data exists before starting
if 'all_documents' not in globals() or not all_documents:
    print("Error: The 'all_documents' list is empty. Please run the main crawling and extraction cell first.")
else:
    # Loop through each state defined in your 'seeds' dictionary
    for state in seeds.keys():

        # Filter to get all documents scraped for the current state
        state_docs = [doc for doc in all_documents if doc.get('state') == state]

        # If no documents were found for this state, skip it
        if not state_docs:
            print(f"- No content found for {state}. Skipping wiki generation.")
            continue

        # Start building the Markdown string for this state's wiki page
        md_content = f"# Government Schemes and Information for {state}\\n\\n"
        md_content += "This document is an auto-generated compilation of information scraped from official government portals.\\n\\n---\\n\\n"

        # Loop through each document (HTML page or PDF) for the state
        for doc in state_docs:
            # Add a section for this document
            md_content += f"## {doc.get('title', 'Untitled Document')}\\n\\n"
            md_content += f"**Source Type:** `{doc.get('type', 'N/A')}`\\n"
            md_content += f"**Source URL:** <{doc.get('source_url', '#')}>\\n\\n"
            md_content += "### Extracted Content\\n"
            # Add the cleaned content, ensuring it's not empty
            md_content += f"{doc.get('content', 'No content was extracted.')}\\n\\n---\\n\\n"

        # Define the output filename
        wiki_filename = f"state_wikis/{state}_schemes.md"

        # Write the compiled Markdown content to a file
        try:
            with open(wiki_filename, "w", encoding="utf-8") as f:
                f.write(md_content)
            print(f"✔ Successfully created wiki for {state} at: {wiki_filename}")
        except Exception as e:
            print(f"✘ Failed to write wiki file for {state}. Error: {e}")

print("\nWiki page generation process complete.")

Generating state-wise wiki pages from the scraped content...
✔ Successfully created wiki for Maharashtra at: state_wikis/Maharashtra_schemes.md
✔ Successfully created wiki for UttarPradesh at: state_wikis/UttarPradesh_schemes.md
- No content found for TamilNadu. Skipping wiki generation.
- No content found for Karnataka. Skipping wiki generation.
- No content found for Gujarat. Skipping wiki generation.
- No content found for WestBengal. Skipping wiki generation.
- No content found for JammuKashmir. Skipping wiki generation.
✔ Successfully created wiki for Rajasthan at: state_wikis/Rajasthan_schemes.md
- No content found for AndhraPradesh. Skipping wiki generation.
- No content found for Telangana. Skipping wiki generation.
✔ Successfully created wiki for Bihar at: state_wikis/Bihar_schemes.md
- No content found for Odisha. Skipping wiki generation.

Wiki page generation process complete.


In [22]:
# Colab Cell 12 (Corrected) - Flask server with ngrok

!pip install -q pyngrok

from flask import Flask, send_from_directory, render_template_string
from pyngrok import ngrok
import os

# 🔑 Your ngrok authtoken setup
!ngrok config add-authtoken 33Mekt30HdxOkZuN1c2KWUYTGUF_6pGi4PxV4HvFopXqgEhKA

app = Flask(__name__)

# MODIFIED: Template now points to the correct wiki files
CARD_TPL = """
<!doctype html><html><head><title>Gov Schemes Wiki</title>
<style> body { font-family: sans-serif; } .card { border:1px solid #ddd;padding:12px;margin:8px;border-radius:8px;} .grid{display:grid;grid-template-columns:repeat(auto-fill, minmax(350px, 1fr));gap:12px;} a { color: #007bff; text-decoration: none; } a:hover { text-decoration: underline; }</style></head><body>
<h1>Government Schemes Knowledge Base</h1>
<div class="grid">
{% for s,u in seeds.items() %}
  <div class="card">
    <h3>{{s}}</h3>
    <p><b><a href="{{u}}" target="_blank">Official Portal &rarr;</a></b></p>
    {% set wiki_file = s + "_schemes.md" %}
    {% if os.path.exists("state_wikis/" + wiki_file) %}
      <p><a href="/wiki/{{wiki_file}}">View Generated Wiki Page</a></p>
    {% else %}
      <p><em>No wiki file generated for this state.</em></p>
    {% endif %}
  </div>
{% endfor %}
</div>
</body></html>
"""

@app.route("/")
def index():
    return render_template_string(CARD_TPL, seeds=seeds, os=os)

# MODIFIED: Route now serves from the 'state_wikis' directory
@app.route("/wiki/<path:fn>")
def wiki_file(fn):
    path = os.path.join(os.getcwd(), "state_wikis") # <-- Corrected directory
    if os.path.exists(os.path.join(path, fn)):
        return send_from_directory(path, fn)
    else:
        return "Wiki file not found.", 404

# Open a public ngrok tunnel
port = 5000
public_url = ngrok.connect(port).public_url
print(f"🚀 Your public web app is live at: {public_url}")

# Run the Flask app
app.run(port=port)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml
🚀 Your public web app is live at: https://rose-nonelaborate-alexandria.ngrok-free.dev
 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug:127.0.0.1 - - [03/Oct/2025 06:49:31] "GET / HTTP/1.1" 200 -
INFO:werkzeug:127.0.0.1 - - [03/Oct/2025 06:49:35] "GET /wiki/Maharashtra_schemes.md HTTP/1.1" 200 -


Cell 13 — Phase 6: change detection + versioning (SQLite)

In [None]:
# Colab cell 13 (Corrected) - compute content hash and store diffs

# ... (database setup code is correct) ...
conn = sqlite3.connect("versions.db")
c = conn.cursor()
c.execute("""CREATE TABLE IF NOT EXISTS versions (
    id INTEGER PRIMARY KEY,
    source_url TEXT,
    content_hash TEXT,
    snippet TEXT,
    timestamp DATETIME DEFAULT CURRENT_TIMESTAMP
)""")
conn.commit()

def store_version(source_url, text_snippet):
    h = hashlib.sha256(text_snippet.encode("utf-8")).hexdigest()
    c.execute("SELECT content_hash FROM versions WHERE source_url=? ORDER BY id DESC LIMIT 1",(source_url,))
    row = c.fetchone()
    if row and row[0]==h:
        return False
    c.execute("INSERT INTO versions (source_url, content_hash, snippet) VALUES (?,?,?)",(source_url,h,text_snippet[:2000]))
    conn.commit()
    return True

# MODIFIED: Test store using the correct 'all_documents' variable
print("Checking for new/changed content to store in version database...")
changes_found = 0
for p in all_documents:
    changed = store_version(p["source_url"], p["content"])
    if changed:
        changes_found += 1
        print(f"  -> Stored new version for: {p['source_url']}")
print(f"Finished. Found and stored {changes_found} new document versions.")

Cell 14 — Phase 7: schedule periodic crawling (APScheduler)

In [None]:
# Colab cell 14 (Corrected) - schedule a periodic job
from apscheduler.schedulers.background import BackgroundScheduler
import atexit

scheduler = BackgroundScheduler()
def periodic_job():
    print("\\n--- Running Scheduled Job ---")
    sname, surl = list(seeds.items())[0] # Just checking the first seed as a demo

    # MODIFIED: Using the correct function name 'crawl_with_heuristics'
    links, _ = crawl_with_heuristics(surl, max_pages=20) # Limit pages for a quick check

    print(f"Scheduled check found {len(links)} links on {sname}. You would process these here.")

scheduler.add_job(periodic_job, 'interval', minutes=60)
scheduler.start()
atexit.register(lambda: scheduler.shutdown(wait=False))
print("Scheduler started. It will run a sample check every 60 minutes.")

Cell 15 — Tests (very small unit test example)

In [None]:
# Colab cell 15 - basic sanity checks
def test_chunking():
    t = "This is sentence one. Sentence two. Sentence three. Sentence four."
    ch = chunk_text(t, max_chars=30)
    assert len(ch) >= 2
    print("chunking test passed")

test_chunking()


## Chatbot Implementation

Step 1: Full-Scale Crawling and Data Consolidation

In [None]:
# FINAL CELL: The RAG Chatbot Implementation

import google.generativeai as genai
from google.colab import userdata
import textwrap

# --- Configure the LLM ---
# This safely gets your API key from Colab's secret manager.
try:
    GEMINI_API_KEY = userdata.get('GEMINI_API_KEY')
    genai.configure(api_key=GEMINI_API_KEY)
    llm = genai.GenerativeModel('gemini-pro')
    print("Gemini Pro model configured successfully.")
except Exception as e:
    llm = None
    print(f"Could not configure the LLM. Please follow the instructions to set up your GEMINI_API_KEY. Error: {e}")
# -------------------------

def get_rag_response(query: str, top_k: int = 5):
    """
    Performs the full RAG pipeline: retrieves context, augments prompt, and generates a response.
    """
    if not llm:
        return "LLM is not configured. Cannot generate a response."
    if 'nn_index' not in globals() or 'vector_store' not in globals():
        return "Vector database is not ready. Please ensure the previous cells have been run successfully."

    # 1. RETRIEVE: Find the most relevant chunks from your vector database
    print(f"Retrieving context for query: '{query}'")
    query_embedding = model.encode([query])
    distances, indices = nn_index.kneighbors(query_embedding, n_neighbors=top_k)

    # Consolidate context and gather unique sources for citation
    context_str = ""
    source_citations = {} # Use a dict to store unique URLs with their titles
    for idx in indices[0]:
        context_str += vector_store["chunks"][idx] + "\\n---\\n"
        meta = vector_store["metadata"][idx]
        source_url = meta['source_url']
        if source_url not in source_citations:
            source_citations[source_url] = meta.get('scheme_title', 'Source Link')

    # 2. AUGMENT: Create the prompt for the LLM
    prompt_template = f"""
You are an expert assistant for Indian government schemes. Your task is to answer the user's question based ONLY on the provided context below.
- Be precise and directly answer the question.
- If the context contains eligibility criteria, benefits, or application steps, list them clearly using bullet points.
- If the information to answer the question is not in the context, you MUST state: "Based on the provided information, I cannot answer this question."
- Do not use any information outside of the provided context.

**CONTEXT:**
---
{context_str}
---

**USER'S QUESTION:**
{query}

**ANSWER:**
"""

    # 3. GENERATE: Get the response from the LLM
    print("Generating response with LLM...")
    try:
        response = llm.generate_content(prompt)

        # Format the final output nicely with sources
        final_answer = response.text
        final_answer += "\\n\\n--- \\n"
        final_answer += "**Sources from the Knowledge Base:**\\n"
        for url, title in source_citations.items():
            final_answer += f"- [{title}]({url})\\n"

        return textwrap.fill(final_answer, width=100)

    except Exception as e:
        return f"An error occurred while generating the LLM response: {e}"


# --- Let's test the chatbot! ---
# Make sure all previous cells, including the vector DB creation, have been run.

print("\\n" + "="*50)
print("              RUNNING CHATBOT QUERIES")
print("="*50 + "\\n")


# Query 1: A specific, multi-faceted question
user_query_1 = "Tell me about education scholarships in Maharashtra for students whose parents' income is low."
chatbot_answer_1 = get_rag_response(user_query_1)
print(f"💬 QUERY 1: {user_query_1}")
print(f"🤖 RESPONSE:\\n{chatbot_answer_1}")
print("\\n" + "-"*50 + "\\n")


# Query 2: A different state and domain
user_query_2 = "I am a farmer in Rajasthan. What are the benefits of the Pradhan Mantri Kisan Samman Nidhi scheme?"
chatbot_answer_2 = get_rag_response(user_query_2)
print(f"💬 QUERY 2: {user_query_2}")
print(f"🤖 RESPONSE:\\n{chatbot_answer_2}")
print("\\n" + "-"*50 + "\\n")

# Query 3: A question that may not have context
user_query_3 = "What are the new space exploration grants in Gujarat?"
chatbot_answer_3 = get_rag_response(user_query_3)
print(f"💬 QUERY 3: {user_query_3}")
print(f"🤖 RESPONSE:\\n{chatbot_answer_3}")