In [None]:
!pip install markdownify

Collecting markdownify
  Downloading markdownify-1.1.0-py3-none-any.whl.metadata (9.1 kB)
Downloading markdownify-1.1.0-py3-none-any.whl (13 kB)
Installing collected packages: markdownify
Successfully installed markdownify-1.1.0


In [None]:
pip install playwright markdownify


Collecting playwright
  Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl.metadata (3.5 kB)
Collecting pyee<14,>=13 (from playwright)
  Downloading pyee-13.0.0-py3-none-any.whl.metadata (2.9 kB)
Downloading playwright-1.52.0-py3-none-manylinux1_x86_64.whl (45.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.1/45.1 MB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyee-13.0.0-py3-none-any.whl (15 kB)
Installing collected packages: pyee, playwright
Successfully installed playwright-1.52.0 pyee-13.0.0


In [None]:
!playwright install

Downloading Chromium 136.0.7103.25 (playwright build v1169)[2m from https://cdn.playwright.dev/dbazure/download/playwright/builds/chromium/1169/chromium-linux.zip[22m
[1G167.7 MiB [] 0% 10.7s[0K[1G167.7 MiB [] 0% 18.2s[0K[1G167.7 MiB [] 0% 11.7s[0K[1G167.7 MiB [] 0% 7.3s[0K[1G167.7 MiB [] 1% 5.6s[0K[1G167.7 MiB [] 1% 4.7s[0K[1G167.7 MiB [] 2% 4.2s[0K[1G167.7 MiB [] 2% 4.0s[0K[1G167.7 MiB [] 3% 3.7s[0K[1G167.7 MiB [] 3% 3.6s[0K[1G167.7 MiB [] 4% 3.4s[0K[1G167.7 MiB [] 5% 3.3s[0K[1G167.7 MiB [] 5% 3.2s[0K[1G167.7 MiB [] 6% 3.4s[0K[1G167.7 MiB [] 6% 3.3s[0K[1G167.7 MiB [] 7% 3.4s[0K[1G167.7 MiB [] 7% 3.6s[0K[1G167.7 MiB [] 7% 3.5s[0K[1G167.7 MiB [] 8% 3.4s[0K[1G167.7 MiB [] 9% 3.3s[0K[1G167.7 MiB [] 9% 3.4s[0K[1G167.7 MiB [] 10% 3.3s[0K[1G167.7 MiB [] 10% 3.2s[0K[1G167.7 MiB [] 11% 3.0s[0K[1G167.7 MiB [] 12% 2.8s[0K[1G167.7 MiB [] 13% 2.7s[0K[1G167.7 MiB [] 14% 2.7s[0K[1G167.7 MiB [] 15% 2.6s[0K[1G167.7 MiB [] 16% 2.6s[0K[1G167.

In [None]:
import os
import json
import re
from datetime import datetime
from urllib.parse import urljoin
from markdownify import markdownify as md
from playwright.async_api import async_playwright
import asyncio

# Entry point of the crawl; main() passes this URL to crawl_page().
BASE_URL = "https://tds.s-anand.net/#/2025-01/"
# Links are only followed when they contain this origin (see extract_all_internal_links).
BASE_ORIGIN = "https://tds.s-anand.net"
# Directory that receives one Markdown file per crawled page.
OUTPUT_DIR = "tds_pages_md"
# JSON index of all saved pages, written by main() to the current working directory.
METADATA_FILE = "metadata.json"

# Module-level crawl state shared by crawl_page() and main().
# Re-running this cell resets both, so a fresh crawl starts from scratch.
visited = set()   # URLs already attempted (successfully or not)
metadata = []     # one dict per saved page: title, filename, original_url, downloaded_at

def sanitize_filename(title):
    return re.sub(r'[\\/*?:"<>|]', "_", title).strip().replace(" ", "_")

async def extract_all_internal_links(page):
    """Return the unique in-site hash-route links found on the current page.

    Args:
        page: Playwright page whose DOM is queried for ``a[href]`` elements.

    Returns:
        list[str]: The ``href`` property of every matching anchor that starts
        with ``BASE_ORIGIN`` and contains a ``/#/`` route. Duplicates are
        removed and first-seen DOM order is preserved.
    """
    hrefs = await page.eval_on_selector_all("a[href]", "els => els.map(el => el.href)")

    # Anchor the origin check at the start of the URL. A plain substring test
    # would also accept external URLs that merely embed the site address
    # (archive mirrors, redirectors, look-alike hosts such as
    # "https://tds.s-anand.net.example.com/").
    origin_prefix = BASE_ORIGIN.rstrip("/") + "/"

    # dict.fromkeys de-duplicates while keeping order, so the crawl order is
    # the same on every run (a set would iterate in arbitrary order).
    return list(dict.fromkeys(
        href for href in hrefs
        if href.startswith(origin_prefix) and "/#/" in href
    ))

async def wait_for_article_and_get_html(page):
    """Wait for the main article element to appear and return its inner HTML.

    Args:
        page: Playwright page that has already been navigated.

    Returns:
        The inner HTML of ``article.markdown-section#main``.

    Raises:
        Whatever ``page.wait_for_selector`` raises when the element does not
        show up within the 10000 timeout passed to it.
    """
    article_selector = "article.markdown-section#main"
    await page.wait_for_selector(article_selector, timeout=10000)
    article_html = await page.inner_html(article_selector)
    return article_html

def _unique_markdown_filename(base):
    """Return ``<base>.md``, or ``<base>_N.md`` if that name was already used in this crawl."""
    taken = {entry["filename"] for entry in metadata}
    candidate = f"{base}.md"
    counter = 2
    while candidate in taken:
        candidate = f"{base}_{counter}.md"
        counter += 1
    return candidate


async def crawl_page(page, url):
    """Crawl ``url`` and every internal page reachable from it, saving each as Markdown.

    Each page that loads successfully is converted with markdownify and written
    to ``OUTPUT_DIR`` with a small YAML front matter block (title, original URL,
    download time). A matching record is appended to the module-level
    ``metadata`` list. URLs are recorded in the module-level ``visited`` set so
    that no URL is attempted twice.

    Pages are visited depth-first, in the order links are returned by
    ``extract_all_internal_links``.

    Args:
        page: Playwright page used for all navigation (it is reused, so it ends
            up on the last page visited).
        url: URL to start crawling from.

    Returns:
        None. Pages that fail to load are reported with ``print`` and skipped.
    """
    # Explicit stack instead of recursion: one Python frame per page would hit
    # the interpreter's recursion limit on long chains of linked pages.
    pending = [url]
    while pending:
        current = pending.pop()
        if current in visited:
            continue
        visited.add(current)

        print(f"📄 Visiting: {current}")
        try:
            await page.goto(current, wait_until="domcontentloaded")
            await page.wait_for_timeout(1000)
            html = await wait_for_article_and_get_html(page)
        except Exception as e:
            # Best effort: report the failure and carry on with the other pages.
            print(f"❌ Error loading page: {current}\n{e}")
            continue

        # Extract title and save markdown
        title = (await page.title()).split(" - ")[0].strip() or f"page_{len(visited)}"
        # Two pages can share a title; keep both instead of overwriting the first.
        filename = _unique_markdown_filename(sanitize_filename(title))
        filepath = os.path.join(OUTPUT_DIR, filename)
        # Single timestamp so the file's front matter and the metadata entry agree.
        downloaded_at = datetime.now().isoformat()

        markdown = md(html)
        with open(filepath, "w", encoding="utf-8") as f:
            f.write("---\n")
            # json.dumps yields a double-quoted string with quotes, backslashes
            # and newlines escaped, which keeps the YAML front matter valid.
            f.write(f"title: {json.dumps(title, ensure_ascii=False)}\n")
            f.write(f"original_url: {json.dumps(current, ensure_ascii=False)}\n")
            f.write(f"downloaded_at: \"{downloaded_at}\"\n")
            f.write("---\n\n")
            f.write(markdown)

        metadata.append({
            "title": title,
            "filename": filename,
            "original_url": current,
            "downloaded_at": downloaded_at
        })

        # Collect links before the page navigates away. Pushing them reversed
        # makes the stack pop them in their original order.
        links = await extract_all_internal_links(page)
        pending.extend(link for link in reversed(links) if link not in visited)

async def main():
    """Run the full crawl from ``BASE_URL`` and write the metadata index.

    Creates ``OUTPUT_DIR`` if needed, launches headless Chromium, crawls from
    ``BASE_URL`` via ``crawl_page`` and dumps the collected ``metadata`` list to
    ``METADATA_FILE`` as JSON.

    The metadata file is written and the browser is closed even when the crawl
    is interrupted by an exception, so pages already saved to disk remain
    indexed. The exception itself still propagates to the caller.
    """
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context()
            page = await context.new_page()
            await crawl_page(page, BASE_URL)
        finally:
            try:
                # Written in `finally` so a partial crawl still gets an index.
                with open(METADATA_FILE, "w", encoding="utf-8") as f:
                    json.dump(metadata, f, indent=2)
            finally:
                await browser.close()

        # Only reached when the crawl finished without raising.
        print(f"\n✅ Completed. {len(metadata)} pages saved.")

# For Jupyter/Colab: the notebook kernel allows top-level `await`, so the
# coroutine is awaited directly. In a plain Python script this line would
# instead be `asyncio.run(main())`.
await main()


📄 Visiting: https://tds.s-anand.net/#/2025-01/
📄 Visiting: https://tds.s-anand.net/#/../data-analysis-with-sql
📄 Visiting: https://tds.s-anand.net/#/colab
📄 Visiting: https://tds.s-anand.net/#/llm-video-screen-scraping
📄 Visiting: https://tds.s-anand.net/#/profiling-data-with-python
📄 Visiting: https://tds.s-anand.net/#/bbc-weather-api-with-python
📄 Visiting: https://tds.s-anand.net/#/data-analysis-with-python
📄 Visiting: https://tds.s-anand.net/#/multimodal-embeddings
📄 Visiting: https://tds.s-anand.net/#/sqlite
📄 Visiting: https://tds.s-anand.net/#/correlation-with-excel
📄 Visiting: https://tds.s-anand.net/#/fastapi
📄 Visiting: https://tds.s-anand.net/#/visualizing-network-data-with-kumu
📄 Visiting: https://tds.s-anand.net/#/development-tools
📄 Visiting: https://tds.s-anand.net/#/data-visualization-with-chatgpt
📄 Visiting: https://tds.s-anand.net/#/parsing-json
📄 Visiting: https://tds.s-anand.net/#/data-sourcing
📄 Visiting: https://tds.s-anand.net/#/narratives-with-llms
📄 Visiting: h

In [None]:
import shutil

# Zip the crawl output for download. Reusing OUTPUT_DIR keeps the archive in
# sync with the folder the crawler actually wrote to. Note that METADATA_FILE
# lives outside that folder and is therefore not part of the archive.
shutil.make_archive(OUTPUT_DIR, "zip", root_dir=OUTPUT_DIR)

'/content/tds_pages_md.zip'