In [2]:
import re
import json
import requests
import tiktoken

BASE_URL = "https://tds.s-anand.net/"

def fetch_sidebar():
    """Download the sidebar listing all course pages."""
    resp = requests.get(BASE_URL + "_sidebar.md")
    resp.raise_for_status()
    return resp.text

def parse_links(markdown_text):
    """Extract (title, filename) pairs from Markdown links."""
    pattern = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    return pattern.findall(markdown_text)

def fetch_markdown(filename):
    """Download a Markdown page by filename."""
    url = BASE_URL + filename
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.text

def chunk_text(text, max_tokens=500, model="gpt-3.5-turbo"):
    """
    Split `text` into chunks of roughly max_tokens tokens using tiktoken.
    """
    enc = tiktoken.encoding_for_model(model)
    token_ids = enc.encode(text)
    chunks = []
    for i in range(0, len(token_ids), max_tokens):
        chunk_ids = token_ids[i : i + max_tokens]
        chunks.append(enc.decode(chunk_ids))
    return chunks

def main():
    sidebar = fetch_sidebar()
    links = parse_links(sidebar)

    all_chunks = []
    for title, filename in links:
        md_content = fetch_markdown(filename)
        for idx, chunk in enumerate(chunk_text(md_content)):
            all_chunks.append({
                "source_url": BASE_URL + filename,
                "title": title,
                "chunk_index": idx,
                "chunk_text": chunk
            })

    # Write out to a JSON file
    with open("tds_chunks.json", "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    main()


In [3]:
import requests
import json
import tiktoken  # optional, for token-based chunking

BASE_URL = "https://discourse.onlinedegree.iitm.ac.in/"

def get_category_id(slug="tds-kb"):
    """Fetch categories and return the one matching slug."""
    resp = requests.get(f"{BASE_URL}categories.json")
    resp.raise_for_status()
    cats = resp.json()["category_list"]["categories"]
    for cat in cats:
        if cat["slug"] == slug:
            return cat["id"]
    raise ValueError(f"Category '{slug}' not found.")

def list_topics(category_slug, category_id, page=0):
    """
    Retrieve all topics for a category by paging through 
    `/c/{slug}/{id}.json` (20 topics per page).
    """
    url = f"{BASE_URL}c/{category_slug}/{category_id}.json?page={page}"
    resp = requests.get(url)
    resp.raise_for_status()
    data = resp.json()
    topics = data["topic_list"]["topics"]
    # if more pages exist, recurse
    if data["topic_list"].get("more_topics_url"):
        topics += list_topics(category_slug, category_id, page + 1)
    return topics

def fetch_topic_posts(slug, topic_id):
    """
    Fetch full topic JSON, including post_stream.posts array.
    """
    url = f"{BASE_URL}t/{slug}/{topic_id}.json"
    resp = requests.get(url)
    resp.raise_for_status()
    return resp.json()["post_stream"]["posts"]

def chunk_text(text, max_tokens=500, model="gpt-3.5-turbo"):
    """
    Split text into chunks of ~max_tokens using tiktoken.
    """
    enc = tiktoken.encoding_for_model(model)
    tokens = enc.encode(text)
    return [enc.decode(tokens[i : i + max_tokens]) 
            for i in range(0, len(tokens), max_tokens)]

def main():
    cat_slug = "tds-kb"
    cat_id = get_category_id(cat_slug)
    topics = list_topics(cat_slug, cat_id)

    all_chunks = []
    for topic in topics:
        t_id = topic["id"]
        t_slug = topic["slug"]
        t_title = topic["title"]
        t_url   = f"{BASE_URL}t/{t_slug}/{t_id}"
        posts = fetch_topic_posts(t_slug, t_id)

        for post in posts:
            raw = post.get("raw", "")
            # choose either raw or cooked
            # Optionally split into token chunks:
            chunks = chunk_text(raw) if raw else [""]
            for idx, text_chunk in enumerate(chunks):
                all_chunks.append({
                    "topic_id":      t_id,
                    "topic_slug":    t_slug,
                    "topic_title":   t_title,
                    "topic_url":     t_url,
                    "post_id":       post["id"],
                    "post_number":   post["post_number"],
                    "username":      post["username"],
                    "created_at":    post["created_at"],
                    "chunk_index":   idx,
                    "chunk_text":    text_chunk
                })

    # Persist to disk for ingestion
    with open("tds_discourse_chunks.json", "w", encoding="utf-8") as f:
        json.dump(all_chunks, f, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    main()


HTTPError: 403 Client Error: Forbidden for url: https://discourse.onlinedegree.iitm.ac.in/categories.json

In [9]:
import requests
import os
import json
from datetime import datetime, timezone # Ensure timezone is imported
from urllib.parse import urljoin, urlencode

# ========== CONFIGURATION ==========

DISCOURSE_BASE_URL = "https://discourse.onlinedegree.iitm.ac.in/"
CATEGORY_SLUG = "courses/tds-kb"
CATEGORY_ID = 34
START_DATE = "2025-01-01" # Inclusive
END_DATE = "2025-04-15"   # Inclusive

RAW_COOKIE_STRING = """LgekGp1zML0MGFLOfs2Jz3beYsAIU1YEFoa+yG0VJgb75S4nQlH72Pt4b5AqurCE4IevvK3RTt0iUJmL2bGUzlY4SyrIOC0AIboJtu6hw6geSVSIjRTV2CDlIXajnBC2HOMCBdTfmYKk0iR8nDoPqU+VDx04zcKF1dOQUFfqcUGrJ/H0r0D467clgxhs6eCCHtu1dueg8y7rSFHKHpCBLd8hPeYkBEQXsHeF3Ogucq3Fd2rv4HeZIPA2VYLw6uFoFhp2HuCxRwsAHxrvTWn2Fd7KB5eBrl+N/X9uANlTX7zJ1/hA64hTCV/8ZmsGX2XN--oObAbVGGnGt2bOK9--XOplwD43YQYFhtJMsw8r+g=="""
OUTPUT_DIR = "discourse_json"
POST_ID_BATCH_SIZE = 50
MAX_CONSECUTIVE_PAGES_WITHOUT_NEW_TOPICS = 5 # New configuration for breaking loop

# ====================================

def parse_cookie_string(raw_cookie_string):
    """Parses a raw cookie string into a dictionary."""
    cookies = {}
    if not raw_cookie_string.strip():
        print("Warning: RAW_COOKIE_STRING is empty. Requests might fail if authentication is needed.")
        return cookies
    for cookie_part in raw_cookie_string.strip().split(";"):
        if "=" in cookie_part:
            key, value = cookie_part.strip().split("=", 1)
            cookies[key] = value
    return cookies


def get_topic_ids(base_url, category_slug, category_id, start_date_str, end_date_str, cookies):
    """Fetches topic IDs from a specific category within a date range."""
    url = urljoin(base_url, f"c/{category_slug}/{category_id}.json")
    topic_ids = []
    page = 0

    start_dt_naive = datetime.fromisoformat(start_date_str + "T00:00:00")
    start_dt = start_dt_naive.replace(tzinfo=timezone.utc)
    end_dt_naive = datetime.fromisoformat(end_date_str + "T23:59:59.999999")
    end_dt = end_dt_naive.replace(tzinfo=timezone.utc)

    print(f"Fetching topic IDs from category between {start_dt} and {end_dt}...")

    # Variables for the new loop break condition
    consecutive_pages_with_no_new_unique_topics = 0
    last_known_unique_topic_count = 0

    while True:
        paginated_url = f"{url}?page={page}"
        try:
            response = requests.get(paginated_url, cookies=cookies, timeout=30)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch page {page}: {e}")
            break

        try:
            data = response.json()
        except json.JSONDecodeError:
            print(f"Failed to decode JSON from page {page}. Content: {response.text[:200]}...")
            break

        topics_on_page = data.get("topic_list", {}).get("topics", [])

        if not topics_on_page:
            print(f"No more topics found on page {page} (API returned empty list).")
            break # Primary stop condition: API says no more topics on this page

        # Store current number of unique topics before processing this page
        # This helps check if *this specific page fetch* added anything new
        count_before_processing_page = len(set(topic_ids))

        for topic in topics_on_page:
            created_at_str = topic.get("created_at")
            if created_at_str:
                try:
                    created_date = datetime.fromisoformat(created_at_str.replace("Z", "+00:00"))
                except ValueError:
                    print(f"Warning: Could not parse date '{created_at_str}' for topic ID {topic.get('id')}")
                    continue

                if start_dt <= created_date <= end_dt:
                    topic_ids.append(topic["id"]) # Add ID, will be deduped later for count

        current_unique_topic_count = len(set(topic_ids))

        if topics_on_page and current_unique_topic_count == count_before_processing_page :
            # This means the current page had topics, but none of them were new *and* within the date range,
            # or all topics fetched from this page were duplicates of ones already in topic_ids from *previous pages*.
            # For the staleness check, we care if the overall unique set isn't growing.
             pass # Handled by the check below using last_known_unique_topic_count

        # Staleness check: Has the *total* number of unique topics found stopped growing?
        if current_unique_topic_count == last_known_unique_topic_count and topics_on_page:
            # topics_on_page is checked to ensure we don't increment if an empty page was returned (which is a valid end)
            consecutive_pages_with_no_new_unique_topics += 1
            print(f"Page {page} did not yield any new unique topics. Consecutive stale pages: {consecutive_pages_with_no_new_unique_topics}.")
        else:
            consecutive_pages_with_no_new_unique_topics = 0 # Reset if new unique topics were found

        last_known_unique_topic_count = current_unique_topic_count

        if consecutive_pages_with_no_new_unique_topics >= MAX_CONSECUTIVE_PAGES_WITHOUT_NEW_TOPICS:
            print(f"No new unique topics found for {MAX_CONSECUTIVE_PAGES_WITHOUT_NEW_TOPICS} consecutive pages. Assuming end of relevant category listing.")
            break

        # Original secondary stop condition (heuristic)
        more_topics_url = data.get("topic_list", {}).get("more_topics_url")
        if not more_topics_url:
            # This typically means it's the last page.
            # The condition `len(topics_on_page) < 30` was a heuristic for when more_topics_url might be missing
            # but the page wasn't full. If more_topics_url is definitively gone, it's a strong signal.
            print(f"No 'more_topics_url' indicated on page {page}. Assuming this is the last page of topics.")
            break
        
        print(f"Fetched page {page}, {len(topics_on_page)} topics on page. Total unique topics found so far: {current_unique_topic_count}. Continuing...")
        page += 1


    final_unique_topic_ids = list(set(topic_ids)) # Deduplicate
    print(f"Total unique topics found in timeframe: {len(final_unique_topic_ids)}")
    return final_unique_topic_ids


def get_full_topic_json(base_url, topic_id, cookies):
    """Fetches the full topic JSON, including all posts by handling pagination."""
    initial_topic_url = urljoin(base_url, f"t/{topic_id}.json")
    print(f"Fetching initial data for topic {topic_id} from {initial_topic_url}")

    try:
        response = requests.get(initial_topic_url, cookies=cookies, timeout=30)
        response.raise_for_status()
        topic_data = response.json()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch initial topic data for {topic_id}: {e}")
        return None
    except json.JSONDecodeError:
        print(f"Failed to decode initial JSON for topic {topic_id}. Content: {response.text[:200]}...")
        return None

    post_stream = topic_data.get("post_stream")
    if not post_stream or "stream" not in post_stream or "posts" not in post_stream:
        print(f"Error: 'post_stream' not found or incomplete in topic {topic_id}. Skipping post fetching.")
        return topic_data

    all_post_ids_in_stream = post_stream.get("stream", [])
    loaded_post_ids = {post["id"] for post in post_stream.get("posts", [])}

    all_post_ids_in_stream = [pid for pid in all_post_ids_in_stream if pid is not None]

    missing_post_ids = [pid for pid in all_post_ids_in_stream if pid not in loaded_post_ids]

    print(f"Topic {topic_id}: Total posts in stream: {len(all_post_ids_in_stream)}, Initially loaded: {len(loaded_post_ids)}, Missing: {len(missing_post_ids)}")

    if not missing_post_ids:
        print(f"All posts for topic {topic_id} already loaded in initial fetch.")
        return topic_data

    fetched_additional_posts = []
    for i in range(0, len(missing_post_ids), POST_ID_BATCH_SIZE):
        batch_ids = missing_post_ids[i:i + POST_ID_BATCH_SIZE]

        query_params = [("post_ids[]", pid) for pid in batch_ids]
        posts_url = urljoin(base_url, f"t/{topic_id}/posts.json")

        print(f"Fetching batch of {len(batch_ids)} posts for topic {topic_id} (IDs: {batch_ids[0]}...{batch_ids[-1]})")

        try:
            batch_response = requests.get(posts_url, params=query_params, cookies=cookies, timeout=60)
            batch_response.raise_for_status()
            batch_data = batch_response.json()

            if isinstance(batch_data, list):
                 fetched_additional_posts.extend(batch_data)
            elif "post_stream" in batch_data and "posts" in batch_data["post_stream"]:
                fetched_additional_posts.extend(batch_data["post_stream"]["posts"])
            elif "posts" in batch_data and isinstance(batch_data["posts"], list):
                 fetched_additional_posts.extend(batch_data["posts"])
            else:
                print(f"Warning: Unexpected JSON structure for post batch in topic {topic_id}. Data: {str(batch_data)[:200]}...")

        except requests.exceptions.RequestException as e:
            print(f"Failed to fetch post batch for topic {topic_id} (IDs: {batch_ids}): {e}")
        except json.JSONDecodeError:
            print(f"Failed to decode JSON for post batch in topic {topic_id}. Response: {batch_response.text[:200]}...")

    if fetched_additional_posts:
        print(f"Successfully fetched {len(fetched_additional_posts)} additional posts for topic {topic_id}.")
        existing_posts_in_topic_data = {post['id']: post for post in topic_data["post_stream"]["posts"]}
        for post in fetched_additional_posts:
            if post['id'] not in existing_posts_in_topic_data:
                topic_data["post_stream"]["posts"].append(post)
                existing_posts_in_topic_data[post['id']] = post

        post_id_to_post_map = {post['id']: post for post in topic_data["post_stream"]["posts"]}

        sorted_posts = []
        for post_id_val in all_post_ids_in_stream: # Renamed post_id to post_id_val to avoid conflict
            if post_id_val in post_id_to_post_map:
                sorted_posts.append(post_id_to_post_map[post_id_val])

        topic_data["post_stream"]["posts"] = sorted_posts
        print(f"Topic {topic_id}: Final post count in JSON: {len(topic_data['post_stream']['posts'])}")

    return topic_data


def save_topic_json(topic_id, json_data, output_dir):
    """Saves the topic JSON data to a file."""
    os.makedirs(output_dir, exist_ok=True)
    filepath = os.path.join(output_dir, f"topic_{topic_id}.json")
    try:
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(json_data, f, indent=2, ensure_ascii=False)
        # print(f"Successfully saved topic {topic_id} to {filepath}") # Reduced verbosity
    except IOError as e:
        print(f"Error saving topic {topic_id} to {filepath}: {e}")


def main():
    """Main function to orchestrate the downloading process."""
    print("Script started.")
    cookies = parse_cookie_string(RAW_COOKIE_STRING)
    if not cookies and DISCOURSE_BASE_URL != "https://meta.discourse.org/":
        print("Warning: Running without cookies. This may fail for private forums or specific content.")

    topic_ids = get_topic_ids(
        DISCOURSE_BASE_URL,
        CATEGORY_SLUG,
        CATEGORY_ID,
        START_DATE,
        END_DATE,
        cookies
    )

    if not topic_ids:
        print("No topic IDs found for the given criteria. Exiting.")
        return

    total_topics = len(topic_ids)
    success_downloads = 0
    failed_topic_ids = []

    print(f"\nStarting download of {total_topics} topics...\n")

    for i, topic_id in enumerate(topic_ids, 1):
        print(f"--- [{i}/{total_topics}] Processing topic ID: {topic_id} ---")
        topic_json_data = get_full_topic_json(DISCOURSE_BASE_URL, topic_id, cookies)
        if topic_json_data:
            save_topic_json(topic_id, topic_json_data, OUTPUT_DIR)
            success_downloads += 1
        else:
            print(f"Failed to get complete data for topic {topic_id}.")
            failed_topic_ids.append(topic_id)
        # print(f"--- Finished processing topic ID: {topic_id} ---\n") # Reduced verbosity

    print("\n========= SUMMARY =========")
    print(f"Total topics identified: {total_topics}")
    print(f"Successfully downloaded full data for: {success_downloads} topics")
    print(f"Failed to download/process: {len(failed_topic_ids)} topics")
    if failed_topic_ids:
        print("Failed topic IDs:", failed_topic_ids)
    print(f"Downloaded files are in: {os.path.abspath(OUTPUT_DIR)}")
    print("Script finished.")

if __name__ == "__main__":
    main()

Script started.
Fetching topic IDs from category between 2025-01-01 00:00:00+00:00 and 2025-04-15 23:59:59.999999+00:00...
Failed to fetch page 0: 403 Client Error: Forbidden for url: https://discourse.onlinedegree.iitm.ac.in/c/courses/tds-kb/34.json?page=0
Total unique topics found in timeframe: 0
No topic IDs found for the given criteria. Exiting.


In [17]:
import json
from langchain.schema import Document

def load_and_normalize():
    docs = []
    # 1. Course‐content chunks
    with open("tds_chunks.json", "r", encoding="utf-8") as f:
        chunks = json.load(f)
    for c in chunks:
        docs.append(
            Document(
                page_content=c["chunk_text"],
                metadata={
                    "source_url": c["source_url"],
                    "title":      c["title"],
                    "chunk_index":c["chunk_index"],
                },
            )
        )

    # 2. Discourse‐content chunks
    with open("tds_discourse_chunks.json", "r", encoding="utf-8") as f:
        posts = json.load(f)
    for p in posts:
        # raw content lives in `content`
        docs.append(
            Document(
                page_content=p["content"],
                metadata={
                    "topic_id":    p["topic_id"],
                    "topic_title": p["topic_title"],
                    "post_id":     p["post_id"],
                    "created_at":  p["created_at"],
                    "url":         p["url"],
                    "chunk_index": p.get("chunk_index", 0),
                },
            )
        )

    return docs

docs = load_and_normalize()
print(f"Loaded {len(docs)} Document objects")

# Convert to JSON-serializable dict format
normalized_docs = [
    {
        "page_content": doc.page_content,
        "metadata": doc.metadata
    }
    for doc in docs
]

# Save to disk
with open("normalized_docs.json", "w", encoding="utf-8") as f:
    json.dump(normalized_docs, f, ensure_ascii=False, indent=2)

print("Saved normalized_docs.json")


Loaded 1001 Document objects
Saved normalized_docs.json


In [31]:
import re
import json
import os
from langchain.schema import Document
from bs4 import BeautifulSoup  # pip install beautifulsoup4

# Regex patterns
MD_IMAGE_RE = re.compile(r'!\[[^\]]*\]\((https?://[^\)]+)\)')
MD_LINK_RE  = re.compile(r'\[[^\]]*\]\((https?://[^\)]+)\)')
URL_RE      = re.compile(r'(https?://[^\s)]+)')

def clean_text(text: str) -> tuple[str, list[str]]:
    """
    1. Pull out all URLs from markdown images, links, and bare URLs.
    2. Remove them from the text.
    3. Strip HTML tags, inline code ticks, headings, bullets, and garbage lines.
    4. Return (cleaned_text, unique_links_list).
    """
    links: list[str] = []

    # 1. Extract markdown image URLs
    text = MD_IMAGE_RE.sub(lambda m: links.append(m.group(1)) or '', text)
    # 2. Extract markdown link URLs
    text = MD_LINK_RE.sub(lambda m: links.append(m.group(1)) or '', text)
    # 3. Extract bare URLs
    text = URL_RE.sub(lambda m: links.append(m.group(1)) or '', text)

    # 4. Strip any remaining HTML
    text = BeautifulSoup(text, "html.parser").get_text()

    # 5. Remove inline code fences/backticks
    text = re.sub(r'`([^`]*)`', r'\1', text)

    # 6. Clean line-by-line
    cleaned_lines = []
    for line in text.splitlines():
        # trim whitespace
        line = line.strip()
        # drop headings
        line = re.sub(r'^#+\s*', '', line)
        # drop bullets/quotes
        line = re.sub(r'^[\-\*\>\s]+', '', line)
        # skip very short or non-alphanumeric lines
        if len(line) < 3 or not re.search(r'\w', line):
            continue
        cleaned_lines.append(line)

    # 7. Re-join & collapse multiple blanks
    cleaned = "\n".join(cleaned_lines)
    cleaned = re.sub(r'\n{2,}', "\n\n", cleaned)

    # 8. Deduplicate links (preserve order)
    seen = set()
    unique_links = []
    for url in links:
        if url not in seen:
            seen.add(url)
            unique_links.append(url)

    return cleaned, unique_links

def load_and_normalize(base_path: str) -> list[Document]:
    docs: list[Document] = []

    # Course-content chunks
    with open(os.path.join(base_path, "tds_chunks.json"), "r", encoding="utf-8") as f:
        course_chunks = json.load(f)
    for c in course_chunks:
        clean, links = clean_text(c["chunk_text"])
        meta = {
            "source_url":  c["source_url"],
            "title":       c["title"],
            "chunk_index": c["chunk_index"],
            "links":       links,
        }
        docs.append(Document(page_content=clean, metadata=meta))

    # Discourse-content chunks
    with open(os.path.join(base_path, "tds_discourse_chunks.json"), "r", encoding="utf-8") as f:
        forum_posts = json.load(f)
    for p in forum_posts:
        raw = p.get("content", "")
        clean, links = clean_text(raw)
        meta = {
            "topic_id":    p["topic_id"],
            "topic_title": p["topic_title"],
            "post_id":     p["post_id"],
            "created_at":  p["created_at"],
            "url":         p["url"],
            "chunk_index": p.get("chunk_index", 0),
            "links":       links,
        }
        docs.append(Document(page_content=clean, metadata=meta))

    return docs

if __name__ == "__main__":
    BASE = r"C:\Users\adavy\Desktop\TDS_P1\dataset_test"
    docs = load_and_normalize(BASE)
    print(f"Loaded & cleaned {len(docs)} documents.")

    # Write out a simple JSON list of dicts
    normalized = [
        {"page_content": d.page_content, "metadata": d.metadata}
        for d in docs
    ]
    with open(os.path.join(BASE, "normalized_docs.json"), "w", encoding="utf-8") as f:
        json.dump(normalized, f, ensure_ascii=False, indent=2)
    print("Saved cleaned normalized_docs.json")


✅ Saved 243 chunks to C:\Users\adavy\Desktop\TDS_P1\dataset_test\tds_discourse_threads.json


In [34]:
import os
import re
import json
from collections import defaultdict

import tiktoken            # pip install tiktoken
from bs4 import BeautifulSoup  # pip install beautifulsoup4

# ──────────────────────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────────────────────
BASE_DIR    = r"C:\Users\adavy\Desktop\TDS_P1\dataset_test"  # adjust as needed
IN_COURSE   = os.path.join(BASE_DIR, "tds_chunks.json")
IN_FORUM    = os.path.join(BASE_DIR, "tds_discourse_chunks.json")
OUT_NORMAL  = os.path.join(BASE_DIR, "normalized_docs2.json")

MAX_TOKENS  = 500
ENC_MODEL   = "gpt-3.5-turbo"

# ──────────────────────────────────────────────────────────────────────────────
# PATTERNS
# ──────────────────────────────────────────────────────────────────────────────
MD_IMAGE_RE = re.compile(r'!\[[^\]]*\]\((https?://[^\)]+)\)')
MD_LINK_RE  = re.compile(r'\[[^\]]*\]\((https?://[^\)]+)\)')
URL_RE      = re.compile(r'(https://[^\s)]+)')
AT_RE       = re.compile(r'@[\w\-]+')
MULTI_SPACE = re.compile(r' {2,}')

# ──────────────────────────────────────────────────────────────────────────────
# CLEAN & EXTRACT LINKS
# ──────────────────────────────────────────────────────────────────────────────
def clean_and_extract(text: str) -> (str, list[str]):
    links = []
    # extract and remove markdown images
    text = MD_IMAGE_RE.sub(lambda m: links.append(m.group(1)) or '', text)
    # extract and remove markdown links
    text = MD_LINK_RE.sub(lambda m: links.append(m.group(1)) or '', text)
    # extract and remove bare URLs
    text = URL_RE.sub(lambda m: links.append(m.group(1)) or '', text)
    # remove @mentions
    text = AT_RE.sub('', text)
    # strip HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    # remove inline code ticks
    text = re.sub(r'`([^`]*)`', r'\1', text)
    # collapse multi-space to single
    text = MULTI_SPACE.sub(' ', text)
    # split lines and clean
    lines = []
    for L in text.splitlines():
        L = L.strip()
        # drop empty or too short lines
        if len(L) < 4:
            continue
        # drop markdown headings/bullets
        L = re.sub(r'^[#>\-\*\s]+', '', L)
        # collapse within-line multi-spaces again
        L = MULTI_SPACE.sub(' ', L)
        if L:
            lines.append(L)
    cleaned = "\n".join(lines)
    # dedupe links
    seen = set(); uniq = []
    for u in links:
        if u not in seen:
            seen.add(u); uniq.append(u)
    return cleaned, uniq

# ──────────────────────────────────────────────────────────────────────────────
# CHUNKING
# ──────────────────────────────────────────────────────────────────────────────
enc = tiktoken.encoding_for_model(ENC_MODEL)
def chunk(text: str) -> list[str]:
    ids = enc.encode(text)
    return [enc.decode(ids[i:i+MAX_TOKENS]) for i in range(0, len(ids), MAX_TOKENS)]

# ──────────────────────────────────────────────────────────────────────────────
# MAIN NORMALIZATION
# ──────────────────────────────────────────────────────────────────────────────
def main():
    out = []
    # process course chunks
    for c in json.load(open(IN_COURSE, "r", encoding="utf-8")):
        txt, links = clean_and_extract(c["chunk_text"])
        md = {
            "source_url": c["source_url"],
            "title":      c["title"],
            "chunks":     len(chunk(txt)),
            "links":      links
        }
        for idx, ch in enumerate(chunk(txt)):
            out.append({
                "type":         "course",
                "chunk_index":  idx,
                "page_content": ch,
                "metadata":     md
            })
    # group forum posts by topic
    threads = defaultdict(list)
    for p in json.load(open(IN_FORUM, "r", encoding="utf-8")):
        threads[p["topic_id"]].append(p)
    # merge and chunk each thread
    for tid, posts in threads.items():
        posts.sort(key=lambda x: x["post_number"])
        parts = []
        for p in posts:
            tag = "Q" if p["post_number"] == 1 else f"R{p['post_number']-1}"
            parts.append(f"{tag}: {p.get('content','')}")
        full = "\n\n".join(parts)
        txt, links = clean_and_extract(full)
        md = {
            "topic_id":    tid,
            "topic_title": posts[0]["topic_title"],
            "url":         posts[0]["url"],
            "chunks":      len(chunk(txt)),
            "links":       links
        }
        for idx, ch in enumerate(chunk(txt)):
            out.append({
                "type":         "forum",
                "chunk_index":  idx,
                "page_content": ch,
                "metadata":     md
            })
    # save normalized
    with open(OUT_NORMAL, "w", encoding="utf-8") as f:
        json.dump(out, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(out)} normalized chunks to {OUT_NORMAL}")

if __name__ == "__main__":
    main()


Saved 424 normalized chunks to C:\Users\adavy\Desktop\TDS_P1\dataset_test\normalized_docs2.json


# embedding testing


In [38]:
import json

with open(r"C:\Users\adavy\Desktop\TDS_P1\dataset_test\normalized_docs2.json", encoding="utf-8") as f:
    docs = json.load(f)

total_chars = sum(len(doc["page_content"]) for doc in docs)
avg_chars_per_token = 4  # Approximate average
total_tokens = total_chars // avg_chars_per_token
print(f"Total tokens: {total_tokens}")

Total tokens: 123617


In [49]:
import genai

client = genai.Client(api_key="GEMINI_API_KEY")

result = client.models.embed_content(
        model="gemini-embedding-exp-03-07",
        contents="What is the meaning of life?")

print(result.embeddings)

ModuleNotFoundError: No module named 'genai'

In [None]:
import os
import google.generativeai as genai



model = genai.GenerativeModel('gemini-1.5-flash')

prompt = "Write a story about a magic backpack."
response = model.generate_content(prompt)

print(response.text) 

Elara wasn't your typical twelve-year-old. While others obsessed over pop stars and social media, Elara craved adventure.  Her escape was the dusty attic of her grandmother's rambling Victorian house, a place filled with forgotten treasures and the scent of mothballs and old paper.  It was there, nestled amongst chipped teacups and faded photographs, that she found it – a worn leather backpack, its stitching intricate and oddly shimmering.

The backpack was unremarkable except for a small, tarnished silver clasp shaped like a hummingbird.  Curiosity piqued, Elara opened the clasp.  Instead of the expected emptiness, she found it brimming with… well, everything.  A perfectly ripe mango, a first edition copy of "Alice's Adventures in Wonderland," a tiny, singing frog, a map drawn on parchment that seemed to glow faintly, and a compass that spun wildly, pointing in all directions at once.

Over the next few days, Elara’s life transformed.  The backpack, she discovered, was magical.  It pr

In [None]:
# prompt: give me google gemini code to embed text


import google.generativeai as genai

# Assuming you have a secret named 'GEMINI_API_KEY' set in Colab secrets
# Replace with your actual secret name if different
GEMINI_API_KEY = ''

if not GEMINI_API_KEY:
  print("GEMINI_API_KEY not found in Colab secrets. Please set it.")
else:
  genai.configure(api_key=GEMINI_API_KEY)

  # Example of embedding text
  text_to_embed = "This is a test sentence for embedding."
  result = genai.embed_content(model="models/embedding-001", content=text_to_embed)

  # The embedding is in result['embedding']
  embedding = result['embedding']
  print("Embedding:", embedding)



Embedding: [0.053849667, -0.040725958, -0.0046300436, -0.03277522, 0.015930427, 0.026341947, 0.03961317, -0.015965927, -0.007771914, 0.038305927, 0.053404994, -0.01101266, -0.0064206608, -0.0338029, -0.017255515, 0.0022145999, -0.0009908334, -0.008556645, 0.008218227, -0.022017727, 0.0023225478, 0.0049427766, -0.0041448046, -0.0005308099, 0.0027797983, -0.024324711, 0.04580033, -0.03871397, -0.019353203, 0.015879262, -0.061900564, 0.050104205, -0.067114435, -0.013838072, 0.008133515, -0.045278054, -0.02982182, -0.009732122, 0.026702441, 0.022631288, -0.0059094853, -0.009791319, 0.008383838, -0.0486036, 0.047498666, -0.01581222, -0.0112596, 0.037022218, -0.0045725727, -0.056783926, 0.054819044, -0.020024724, 0.03964844, -0.008953915, -0.012323955, -0.038888805, 0.057950422, 0.008822417, -0.049521614, -0.008155741, -0.0062932796, -0.032998957, -0.025061931, 0.062351584, -0.03447409, -0.07040939, -0.051851302, 0.01860223, 0.04383988, -0.041436013, 0.010627225, -0.027357712, 0.050579593, 0

In [None]:
import os
import json
import time
import numpy as np
import tiktoken             # pip install tiktoken
import google.generativeai as genai  # pip install google-generativeai
from google.api_core.exceptions import ResourceExhausted, GoogleAPICallError

# ──────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ──────────────────────────────────────────────────────────────────────────────
BASE_DIR      = r"C:\Users\adavy\Desktop\TDS_P1\dataset_test"
NORMAL_FILE   = os.path.join(BASE_DIR, "normalized_docs2.json")
EMBED_FILE    = os.path.join(BASE_DIR, "embeddings.npz")
META_FILE     = os.path.join(BASE_DIR, "metadata.json")

GEN_API_KEY   = os.getenv("GOOGLE_API_KEY") 
if not GEN_API_KEY:
    raise RuntimeError("Please set the GOOGLE_API_KEY environment variable")

MODEL         = "models/embedding-001"
BATCH_SIZE    = 50
MAX_RPM       = 1500
SLEEP_BETWEEN = 60.0 / MAX_RPM   # ~0.04s between calls

# ──────────────────────────────────────────────────────────────────────────────
# SETUP
# ──────────────────────────────────────────────────────────────────────────────
genai.configure(api_key=GEN_API_KEY)

with open(NORMAL_FILE, "r", encoding="utf-8") as f:
    docs = json.load(f)
texts    = [d["page_content"] for d in docs]
metadata = [d["metadata"]     for d in docs]
total    = len(texts)

enc = tiktoken.encoding_for_model("gpt-3.5-turbo")

# Resume if possible
if os.path.exists(EMBED_FILE) and os.path.exists(META_FILE):
    data      = np.load(EMBED_FILE)
    embs      = data["arr_0"].tolist()
    start_idx = len(embs)
    print(f"Resuming from {start_idx}/{total} embedded chunks")
else:
    embs      = []
    start_idx = 0
    with open(META_FILE, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False)
    print(f"Starting fresh: 0/{total} embedded chunks")

# ──────────────────────────────────────────────────────────────────────────────
# EMBEDDING LOOP
# ──────────────────────────────────────────────────────────────────────────────
for i in range(start_idx, total, BATCH_SIZE):
    raw_batch = texts[i : i + BATCH_SIZE]
    # strip any bad UTF-8 surrogates
    batch_texts = [t.encode('utf-8', 'ignore').decode('utf-8') for t in raw_batch]
    tries = 0

    while True:
        try:
            resp = genai.embed_content(model=MODEL, content=batch_texts)
            # fix: handle whichever key is present
            if "embeddings" in resp:
                vectors = resp["embeddings"]
            elif "data" in resp and isinstance(resp["data"], list):
                vectors = [item["embedding"] for item in resp["data"]]
            elif "embedding" in resp:
                vectors = resp["embedding"]
            else:
                raise KeyError(f"No embeddings in response keys={list(resp.keys())}")
            embs.extend(vectors)
            break

        except ResourceExhausted:
            print("429 rate limit hit; sleeping 60s…")
            time.sleep(60)

        except GoogleAPICallError as e:
            tries += 1
            if tries <= 3:
                wait = 5 * tries
                print(f"API error, retry {tries}/3 in {wait}s…", e)
                time.sleep(wait)
            else:
                print(f"Skipping batch at index {i} after 3 failures.")
                break

        except Exception as e:
            print(f"Unexpected error at batch {i}: {e}")
            raise

    # incremental save
    arr = np.array(embs, dtype="float32")
    np.savez_compressed(EMBED_FILE, arr)
    done = len(embs)
    left = total - done
    print(f"Embedded {done}/{total} chunks. {left} remaining.")
    time.sleep(SLEEP_BETWEEN)

print("✅ All done. Embeddings saved to", EMBED_FILE)


Starting fresh: 0/424 embedded chunks
Embedded 50/424 chunks. 374 remaining.
Embedded 100/424 chunks. 324 remaining.
Embedded 150/424 chunks. 274 remaining.
Embedded 200/424 chunks. 224 remaining.
Embedded 250/424 chunks. 174 remaining.
Embedded 300/424 chunks. 124 remaining.
Embedded 350/424 chunks. 74 remaining.
Embedded 400/424 chunks. 24 remaining.
Embedded 424/424 chunks. 0 remaining.
✅ All done. Embeddings saved to C:\Users\adavy\Desktop\TDS_P1\dataset_test\embeddings.npz


# retrieval 

In [72]:
import numpy as np
import faiss
import json

# 1. Load embeddings and metadata
emb_data = np.load(r"C:\Users\adavy\Desktop\TDS_P1\dataset_test\embeddings.npz")
embeddings = emb_data["arr_0"]               # shape (N, D)
with open(r"C:\Users\adavy\Desktop\TDS_P1\dataset_test\metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)                   # list of length N

# 2. Build FAISS index (L2 distance)
D = embeddings.shape[1]                       # embedding dimension
index = faiss.IndexFlatL2(D)                  # exact L2
index.add(embeddings)                         # add all N vectors
print(f"FAISS index contains {index.ntotal} vectors of dimension {D}")

# 3. (Optional) Normalize for cosine similarity:
# faiss.normalize_L2(embeddings)
# index = faiss.IndexFlatIP(D)
# index.add(embeddings)

# 4. Retrieval helper
def retrieve(query_emb: np.ndarray, k: int = 5):
    """
    query_emb: 1-D array of length D
    returns: list of dicts {metadata, distance}
    """
    # ensure shape (1, D)
    q = query_emb.reshape(1, -1).astype("float32")
    distances, indices = index.search(q, k)
    
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        entry = metadata[idx].copy()
        entry["score"] = float(dist)
        results.append(entry)
    return results

# 5. Example usage
if __name__ == "__main__":
    # Suppose you have a query embedding already:
    # query_emb = <your embedding pipeline>
    # hits = retrieve(query_emb, k=5)
    # for h in hits:
    #     print(h["source_url"], "→", h["score"])
    pass


FAISS index contains 424 vectors of dimension 768


In [None]:
import os
import json
import numpy as np
import faiss
import google.generativeai as genai

# ─── CONFIG ───────────────────────────────────────────────────────────────────
BASE_DIR       = r"C:\Users\adavy\Desktop\TDS_P1\dataset_test"
NORMAL_FILE    = os.path.join(BASE_DIR, "normalized_docs2.json")
EMBED_FILE     = os.path.join(BASE_DIR, "embeddings.npz")
META_FILE      = os.path.join(BASE_DIR, "metadata.json")

GEN_API_KEY    = os.getenv("GOOGLE_API_KEY")
MODEL          = "models/embedding-001"
TOP_K          = 5

# ─── INITIALIZE API ───────────────────────────────────────────────────────────
genai.configure(api_key=GEN_API_KEY)

# ─── LOAD DATA ────────────────────────────────────────────────────────────────
# Load chunk texts + metadata
with open(NORMAL_FILE, "r", encoding="utf-8") as f:
    docs = json.load(f)
texts    = [d["page_content"] for d in docs]
metas    = [d["metadata"]     for d in docs]

# Load embeddings
emb_data  = np.load(EMBED_FILE)
embeddings = emb_data["arr_0"].astype("float32")

# ─── BUILD FAISS INDEX ─────────────────────────────────────────────────────────
dim   = embeddings.shape[1]
index = faiss.IndexFlatL2(dim)
index.add(embeddings)
print(f"Loaded {index.ntotal} embeddings of dimension {dim}")

# ─── QUERY EMBEDDING ──────────────────────────────────────────────────────────
def embed_query(text: str) -> np.ndarray:
    resp = genai.embed_content(model=MODEL, content=[text])
    # handle possible fields
    if "embeddings" in resp:
        vec = resp["embeddings"][0]
    elif "data" in resp:
        vec = resp["data"][0]["embedding"]
    elif "embedding" in resp:
        vec = resp["embedding"]
    else:
        raise KeyError(f"No embedding in response keys={resp.keys()}")
    return np.array(vec, dtype="float32").reshape(1, -1)

# ─── RETRIEVE FUNCTION ────────────────────────────────────────────────────────
def retrieve(question: str, k: int = TOP_K):
    # 1) embed the question
    q_vec = embed_query(question)
    # 2) search index
    dists, idxs = index.search(q_vec, k)
    # 3) assemble results
    results = []
    for dist, idx in zip(dists[0], idxs[0]):
        results.append({
            "chunk_text": texts[idx],
            **metas[idx],
            "score": float(dist)
        })
    return results

# ─── DEMO ─────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    question = input("Enter your question: ").strip()
    hits = retrieve(question)
    print(f"\nTop {len(hits)} results:")
    for i, h in enumerate(hits, 1):
        url = h.get("source_url") or h.get("url") or "<no-url>"
        print(f"{i}. (score: {h['score']:.2f}) {url}\n   {h['chunk_text'][:200].replace(chr(10),' ')}...\n")


Loaded 424 embeddings of dimension 768

Top 5 results:
1. (score: 0.47) https://tds.s-anand.net/llm-website-scraping.md
   LLM Website Scraping...

2. (score: 0.49) https://tds.s-anand.net/data-sourcing.md
   Data Sourcing Before you do any kind of data science, you obviously have to get the data to be able to analyze it, visualize it, narrate it, and deploy it. And what we are going to cover in this modul...

3. (score: 0.50) https://tds.s-anand.net/scraping-live-sessions.md
   Scraping: Live Sessions Fundamentals of web scraping with urllib and BeautifulSoup Intermediate web scraping use of cookies XML intro and scraping...

4. (score: 0.54) https://tds.s-anand.net/scraping-emarketer.md
   Scraping emarketer In this live scraping session, we explore a real-life scenario where Straive had to scrape data from emarketer.com for a demo. This is a fairly realistic and representative way of h...

5. (score: 0.58) https://tds.s-anand.net/scraping-with-google-sheets.md
   Scraping with Googl

# prompting and llm calling

In [None]:
import os
import json
import numpy as np
import faiss
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted, GoogleAPICallError



# ──────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ──────────────────────────────────────────────────────────────────────────────
BASE_DIR       = r"C:\Users\adavy\Desktop\TDS_P1\dataset_test"
NORMAL_FILE    = os.path.join(BASE_DIR, "normalized_docs2.json")
EMBED_FILE     = os.path.join(BASE_DIR, "embeddings.npz")
META_FILE      = os.path.join(BASE_DIR, "metadata.json")

QUESTION_MODEL = "models/embedding-001"
CHAT_MODEL     = "gemini-1.5-flash"  # alias for the latest Gemini 1.5 Flash
TOP_K          = 5

# API Key
GEN_API_KEY    = ""
if not GEN_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable")
genai.configure(api_key=GEN_API_KEY)

# ──────────────────────────────────────────────────────────────────────────────
# LOAD DATA & INDEX
# ──────────────────────────────────────────────────────────────────────────────
# 1) normalized docs
with open(NORMAL_FILE, "r", encoding="utf-8") as f:
    docs = json.load(f)
texts    = [d["page_content"] for d in docs]
metas    = [d["metadata"]     for d in docs]

# 2) embeddings
arr       = np.load(EMBED_FILE)["arr_0"].astype("float32")
dim       = arr.shape[1]

# 3) FAISS
index     = faiss.IndexFlatL2(dim)
index.add(arr)
with open(META_FILE, "r", encoding="utf-8") as f:
    metadata = json.load(f)

# ──────────────────────────────────────────────────────────────────────────────
# QUERY EMBEDDING & RETRIEVAL
# ──────────────────────────────────────────────────────────────────────────────
def embed_query(text: str) -> np.ndarray:
    resp = genai.embed_content(model=QUESTION_MODEL, content=[text])
    # extract embedding from whichever key is present
    if "embeddings" in resp:
        vec = resp["embeddings"][0]
    elif "data" in resp:
        vec = resp["data"][0]["embedding"]
    elif "embedding" in resp:
        vec = resp["embedding"]
    else:
        raise KeyError(f"No embedding in response: {resp.keys()}")
    return np.array(vec, dtype="float32").reshape(1, -1)

def retrieve(question: str, k: int = TOP_K):
    q_vec = embed_query(question)
    dists, idxs = index.search(q_vec, k)
    hits = []
    for dist, idx in zip(dists[0], idxs[0]):
        hit = {
            "chunk_text": texts[idx],
            **(metas[idx]),
            "score": float(dist)
        }
        hits.append(hit)
    return hits

# ──────────────────────────────────────────────────────────────────────────────
# PROMPT CONSTRUCTION & GENERATION
# ──────────────────────────────────────────────────────────────────────────────
def generate_answer(question: str):
    # 1) Retrieve top-k
    hits = retrieve(question)

    # 2) Build context block
    ctx_lines = []
    for i, h in enumerate(hits, 1):
        url = h.get("source_url") or h.get("url")
        snippet = h["chunk_text"].replace("\n", " ")
        snippet = snippet if len(snippet) < 200 else snippet[:197] + "..."
        ctx_lines.append(f"[{i}] \"{snippet}\" (Source: {url})")
    context = "\n".join(ctx_lines)

    # 3) Build chat messages
    system = (
        "You are the IITM TDS Virtual TA. "
        "Use the provided context snippets and their exact source URLs to answer "
        "the student’s question. Cite all sources by URL."
    )
    user = (
        f"Context:\n{context}\n\n"
        f"Question: {question}\n\n"
        "Produce a JSON object with two fields:\n"
        "  \"answer\": A concise answer.\n"
        "  \"links\": A list of exactly these source URLs in the order you used them.\n"
        "Do not include any extra keys."
    )

    # 4) Call Gemini 1.5 Flash
    gen_model = genai.GenerativeModel( CHAT_MODEL,system_instruction=(system))
    
    try:
        response = gen_model.generate_content(user)
    except ResourceExhausted:
        # rate-limited—back off and retry once
        time.sleep(60)
        response = gen_model.generate_content(user)

    text = response.text.strip()
    # 5) Parse JSON out
    return json.loads(text)



# ──────────────────────────────────────────────────────────────────────────────
# DEMO
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    q = ("i want to learn docker and podman what to choose ").strip()
    result = generate_answer(q)
    print("\nAnswer:\n", result["answer"])
    print("\nLinks:")
    for link in result["links"]:
        print(" -", link)


JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
import os
import json
import time
import numpy as np
import faiss
import google.generativeai as genai
from google.api_core.exceptions import ResourceExhausted, GoogleAPICallError

# ──────────────────────────────────────────────────────────────────────────────
# CONFIGURATION
# ──────────────────────────────────────────────────────────────────────────────
BASE_DIR       = r"C:\Users\adavy\Desktop\TDS_P1\dataset_test"
NORMAL_FILE    = os.path.join(BASE_DIR, "normalized_docs2.json")
EMBED_FILE     = os.path.join(BASE_DIR, "embeddings.npz")
META_FILE      = os.path.join(BASE_DIR, "metadata.json")

QUESTION_MODEL = "models/embedding-001"
CHAT_MODEL     = "gemini-1.5-flash"
TOP_K          = 5

# Load your API key from env or hard-code (for testing)
GEN_API_KEY    = os.getenv("GOOGLE_API_KEY") or ""
if not GEN_API_KEY:
    raise RuntimeError("Set the GOOGLE_API_KEY env var")
genai.configure(api_key=GEN_API_KEY)

# ──────────────────────────────────────────────────────────────────────────────
# LOAD & BUILD INDEX
# ──────────────────────────────────────────────────────────────────────────────
# Load normalized docs
with open(NORMAL_FILE, "r", encoding="utf-8") as f:
    docs = json.load(f)
texts = [d["page_content"] for d in docs]
metas = [d["metadata"]     for d in docs]

# Load embeddings
arr = np.load(EMBED_FILE)["arr_0"].astype("float32")
dim = arr.shape[1]

# Build FAISS
index = faiss.IndexFlatL2(dim)
index.add(arr)

# ──────────────────────────────────────────────────────────────────────────────
# EMBEDDING & RETRIEVE FUNCTIONS
# ──────────────────────────────────────────────────────────────────────────────
def embed_query(text: str) -> np.ndarray:
    resp = genai.embed_content(model=QUESTION_MODEL, content=[text])
    if "embeddings" in resp:
        vec = resp["embeddings"][0]
    elif "data" in resp:
        vec = resp["data"][0]["embedding"]
    elif "embedding" in resp:
        vec = resp["embedding"]
    else:
        raise KeyError(f"No embedding in response: {resp.keys()}")
    return np.array(vec, dtype="float32").reshape(1, -1)

def retrieve(question: str, k: int = TOP_K):
    q_vec = embed_query(question)
    dists, idxs = index.search(q_vec, k)
    hits = []
    for dist, idx in zip(dists[0], idxs[0]):
        hits.append({
            "chunk_text": texts[idx],
            **metas[idx],
            "score": float(dist)
        })
    return hits

# ──────────────────────────────────────────────────────────────────────────────
# PROMPT + CHAT + PARSE
# ──────────────────────────────────────────────────────────────────────────────
def generate_answer(question: str):
    # 1) retrieve top-K
    hits = retrieve(question)
    
    # 2) assemble context snippets
    context_lines = []
    for i, h in enumerate(hits, 1):
        url = h.get("source_url") or h.get("url", "")
        snippet = h["chunk_text"].replace("\n", " ")
        snippet = snippet if len(snippet) <= 200 else snippet[:197] + "..."
        context_lines.append(f"[{i}] \"{snippet}\" (Source: {url})")
    context_block = "\n".join(context_lines)

    # 3) strict system + user prompts
    system = (
        "You are an amazing professor of applications of data science tools with experience of 20+ years. You are replying to students on the Discourse forum to solve their problems using the context. "
        "You have also been provided with context containing FAQs and course content to use as reference. "
        "add all the links provided in context as it is return all the links you can make the content short but mention all links and you can leave text blank"
        "You must respond **only** with a valid JSON object, NO EXTRA TEXT — only JSON format, nothing extra (not even ```json or any prefix/suffix).\n"
        "Schema:\n"
        "{\n"
        '  "answer": string,\n'
        '  "links": [\n'
        '    {\n'
        '      "url": "https://........",\n'
        '      "text": "description for link."\n'
        '    },\n'
        '    {\n'
        '      "url": "https://discourse......." ,\n'
        '      "text": "description for link"\n'
        '    }\n'
        '  ]\n'
        "}"
    )
    user = (
        f"Context:\n{context_block}\n\n"
        f"Question: {question}\n\n"
        "Always output exactly one JSON object following the schema above."
        
    )

    # 4) start chat session with system instruction

    gen_model = genai.GenerativeModel( CHAT_MODEL,system_instruction=(system))
    
    try:
        response = gen_model.generate_content(user)
    except ResourceExhausted:
        time.sleep(60)
        response = gen_model.generate_content(user)

    text = response.text.strip()
    # 6) parse and return JSON
    return (text)

# ──────────────────────────────────────────────────────────────────────────────
# DEMO
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    q = ("when is graded assignment ga 2 deadline ").strip()
    out = generate_answer(q)

    print("adsdasaff")
    print(out)


adsdasaff
```json
{
  "answer": "The deadline for GA2 has been changed multiple times.  Initially it was January 26th, then February 2nd, and finally it's been changed to May 25th, 2025.  Please refer to the most recent announcements for the most up-to-date information.",
  "links": [
    {
      "url": "https://tds.s-anand.net/README.md",
      "text": "README file containing initial GA2 date and other updates."
    },
    {
      "url": "https://discourse.onlinedegree.iitm.ac.in/t/revised-dates-tds-jan-2025/168506/1",
      "text": "Discourse forum thread with revised dates."
    },
    {
      "url": "https://discourse.onlinedegree.iitm.ac.in/t/ga2-deadline/165142/1",
      "text": "Discourse forum thread discussing conflicting GA2 deadlines."
    },
    {
      "url": "https://tds.s-anand.net/README.md",
      "text": "README file containing project and assignment deadlines."
    },
    {
      "url": "https://discourse.onlinedegree.iitm.ac.in/t/project-2-and-week-6-assignment/1683