In [2]:
import requests
import logging
import json
from typing import Dict, List
from datetime import datetime

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

BASE_URL = "https://discourse.onlinedegree.iitm.ac.in"
LOGIN_ENDPOINT = "/session"
CSRF_ENDPOINT = "/session/csrf"
TOPIC_LIST_ENDPOINT = "/c/courses/tds-kb/34/l/latest.json?filter=default&page={}"

HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "X-Requested-With": "XMLHttpRequest",
    "Accept-Language": "en-GB,en;q=0.9"
}

START_DATE = datetime(2025, 1, 1)
END_DATE = datetime(2025, 4, 14)

def is_within_range(date_str: str) -> bool:
    try:
        date_obj = datetime.fromisoformat(date_str.rstrip("Z"))
        return START_DATE <= date_obj <= END_DATE
    except Exception:
        return False

def get_csrf_token(session: requests.Session) -> str:
    url = BASE_URL + CSRF_ENDPOINT
    resp = session.get(url, headers=HEADERS)
    resp.raise_for_status()
    return resp.json().get("csrf")

def login(session: requests.Session, username: str, password: str, csrf_token: str) -> str:
    url = BASE_URL + LOGIN_ENDPOINT
    data = {
        "login": username,
        "password": password,
        "second_factor_method": 1,
        "timezone": "Asia/Calcutta"
    }
    headers = HEADERS.copy()
    headers.update({
        "Content-Type": "application/x-www-form-urlencoded",
        "X-Csrf-Token": csrf_token,
        "Origin": BASE_URL,
        "Referer": f"{BASE_URL}/login"
    })
    resp = session.post(url, headers=headers, data=data)
    resp.raise_for_status()
    auth_token = session.cookies.get("_t")
    if not auth_token:
        raise Exception("_t token not found after login.")
    return auth_token

def fetch_topics(session: requests.Session, auth_token: str) -> List[Dict]:
    headers = HEADERS.copy()
    headers["Discourse-Logged-In"] = "true"
    session.cookies.set("_t", auth_token, domain="discourse.onlinedegree.iitm.ac.in")
    topics = []
    page = 1
    while True:
        url = BASE_URL + TOPIC_LIST_ENDPOINT.format(page)
        resp = session.get(url, headers=headers)
        resp.raise_for_status()
        data = resp.json().get("topic_list", {}).get("topics", [])
        if not data:
            break
        topics.extend(data)
        page += 1
    return topics

def fetch_topic_content(session: requests.Session, topic_id: int) -> Dict:
    url = f"{BASE_URL}/t/{topic_id}.json"
    resp = session.get(url, headers=HEADERS)
    resp.raise_for_status()
    return resp.json()

def extract_post_contents(topic_data: Dict) -> List[str]:
    posts = topic_data.get("post_stream", {}).get("posts", [])
    return [post.get("cooked", "") for post in posts]

def main():
    session = requests.Session()
    username = "keyasinha"
    password = "singhisking"

    csrf_token = get_csrf_token(session)
    auth_token = login(session, username, password, csrf_token)
    topics = fetch_topics(session, auth_token)

    all_data = []  # List to collect everything

    for topic in topics:
        topic_id = topic.get("id")
        title = topic.get("title")
        created_at = topic.get("created_at") or topic.get("last_posted_at")

        if not is_within_range(created_at):
            continue

        print(f"\n🔍 {title} (ID: {topic_id}) - {created_at}")

        topic_data = fetch_topic_content(session, topic_id)
        posts = extract_post_contents(topic_data)

        topic_entry = {
            "id": topic_id,
            "title": title,
            "created_at": created_at,
            "posts": posts
        }

        all_data.append(topic_entry)

    # ✅ Save to JSON file
    with open("tds_discourse_posts.json", "w", encoding="utf-8") as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print("\n✅ All data saved to tds_discourse_posts.json")

if __name__ == "__main__":
    main()




🔍 Which subject to choose in jan term (ID: 161071) - 2025-01-01T16:04:48.718Z

🔍 GA3 - Large Language Models - Discussion Thread [TDS Jan 2025] (ID: 163247) - 2025-01-14T13:00:03.125Z

🔍 Tds: assignment is not submitting (ID: 166189) - 2025-02-03T06:54:27.692Z

🔍 GA1 - Development Tools - Discussion Thread [TDS Jan 2025] (ID: 161083) - 2025-01-02T02:30:03.518Z

🔍 Project 1 Evaluation second mail is not correct and reports files missing while they are present (ID: 171477) - 2025-04-01T03:38:04.214Z

🔍 Project 2 - TDS Solver - Discussion Thread (ID: 169029) - 2025-03-03T03:42:18.928Z

🔍 Tds-official-Project1-discrepencies (ID: 171141) - 2025-03-28T18:34:40.927Z

🔍 Bonus Marks in TDS for Jan 25 (ID: 172246) - 2025-04-09T13:03:21.636Z

🔍 Repeat course (ID: 171798) - 2025-04-05T06:12:14.503Z

🔍 END TERM MOCK [TDS Jan 25] (ID: 172333) - 2025-04-10T07:52:12.506Z

🔍 PYQ Haversine (ID: 172546) - 2025-04-12T12:32:53.503Z

🔍 What to do if peer has not allowed access and the deadline is over for 

In [None]:
import json

# Replace this with your actual variable name if different
with open("tds_discourse_posts.json", "w", encoding="utf-8") as f:
    json.dump(all_posts, f, ensure_ascii=False, indent=2)

print("\n✅ All data saved to tds_discourse_posts.json")
