# Retrieve ToS;DR Markdown Content

*This notebook scrapes privacy policy documents whose links were extracted from ToS;DR services and converts them to Markdown format using Playwright browser automation.*

## 1. Load Libraries

*Import required packages for async web scraping with Playwright, JSON handling, and progress visualization.*

In [1]:
import asyncio
import json
import random
from pathlib import Path
from playwright.async_api import async_playwright

from rich.console import Console
from rich.progress import (
    Progress, SpinnerColumn, BarColumn, TextColumn, 
    TimeRemainingColumn, MofNCompleteColumn
)

## 2. Configuration

*Define file paths for input service data, markdown output, content extraction script, and concurrency settings for parallel processing.*

In [8]:
# Repository root, expressed relative to the notebook's working directory.
ROOT = Path('../..')

# Input (service list) and output (scraped markdown) live side by side.
DATA_DIR = ROOT.joinpath("data-generated", "TOSDR")
DATA_FILE = DATA_DIR.joinpath("tosdr_data.jsonl")
MARKDOWN_OUTPUT = DATA_DIR.joinpath("policies_md.jsonl")

# JavaScript source that is evaluated inside each loaded page.
EXTENSION_FILE = ROOT.joinpath("chrome-extension", "content.js")

# Upper bound on the number of pages processed at the same time.
CONCURRENCY_LIMIT = 20

# Shared rich console used for status messages and the progress bar.
console = Console()

## 3. Scraping Functions

*Async functions to process documents in parallel using Playwright browser automation.*

### 3.1 Document Processing Worker

*Processes a single document by loading the page, executing JavaScript extraction logic, and saving the markdown result.*

In [None]:
async def process_document(context, document_task, extraction_script, semaphore, progress, progress_task, output_file):
    """Scrape one document and append its result to ``output_file``.

    The page at ``document_task['url']`` is opened in a new tab of
    ``context``, ``extraction_script`` is evaluated in it, and one JSON line
    is written per outcome:

    * non-empty ``result['text']`` -> the task fields plus ``"markdown"`` and
      ``"status": "success"``;
    * any exception (including failure to open the tab) -> the task fields
      plus ``"status": "error"`` and the error message;
    * empty result -> nothing is written, only a warning is printed.

    Args:
        context: Browser context providing ``new_page()``.
        document_task: Dict with at least ``url`` and ``service_name``.
        extraction_script: JavaScript passed to ``page.evaluate``.
        semaphore: Async semaphore bounding concurrent pages.
        progress: Progress object; ``update`` and ``console.print`` are used.
        progress_task: Task id advanced by one when this document is done.
        output_file: Writable text file receiving JSONL records.
    """
    async with semaphore:
        url = document_task['url']
        page = None

        try:
            # Created inside the try so a failure to open a tab is recorded
            # as an error instead of escaping and aborting sibling workers.
            page = await context.new_page()

            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Randomized pause after load before running the extraction.
            await asyncio.sleep(random.uniform(1, 3))

            result = await page.evaluate(extraction_script)

            if result and result.get('text'):
                output_data = {
                    **document_task,
                    "markdown": result['text'],
                    "status": "success"
                }
                output_file.write(json.dumps(output_data, ensure_ascii=False) + "\n")
                # Flush per record so an interrupted run keeps finished work.
                output_file.flush()
            else:
                progress.console.print(f"[yellow]⚠ Empty: {document_task['service_name']} ({url})[/yellow]")

        except Exception as e:
            error_data = {**document_task, "status": "error", "error": str(e)}
            output_file.write(json.dumps(error_data, ensure_ascii=False) + "\n")
            output_file.flush()
        finally:
            try:
                if page is not None:
                    await page.close()
            except Exception:
                # Best-effort cleanup: the outcome is already recorded, and a
                # failed close must not take down the other workers.
                pass
            finally:
                # Always advance, so the bar reaches its total.
                progress.update(progress_task, advance=1)

### 3.2 Main Scraping Pipeline

*Orchestrates the full scraping process: loading tasks, launching browser, and coordinating parallel document processing with progress tracking.*

In [None]:
async def main_scraper():
    """Scrape every pending document and append the results to MARKDOWN_OUTPUT.

    Steps:
      1. Abort with a message if DATA_FILE does not exist.
      2. Read the extraction script from EXTENSION_FILE.
      3. Collect the URLs that already have a record in MARKDOWN_OUTPUT.
         Records of any status count, so failed URLs are not retried.
      4. Build one task per document of every service in DATA_FILE whose URL
         is not yet recorded.
      5. Launch headless Chromium and run ``process_document`` for all tasks
         concurrently, bounded by CONCURRENCY_LIMIT, with a progress bar.
    """
    if not DATA_FILE.exists():
        console.print(f"[red]File missing: {DATA_FILE}[/red]")
        return

    extraction_script = EXTENSION_FILE.read_text(encoding='utf-8')

    processed_urls = set()
    if MARKDOWN_OUTPUT.exists():
        with open(MARKDOWN_OUTPUT, "r", encoding="utf-8") as f:
            for line in f:
                try:
                    processed_urls.add(json.loads(line)['url'])
                except (json.JSONDecodeError, KeyError, TypeError):
                    # Skip blank, truncated or malformed records (e.g. from an
                    # interrupted run) without hiding KeyboardInterrupt.
                    continue

    # Blank lines are skipped so a trailing newline or spacer line in the
    # JSONL input does not crash json.loads.
    all_services = [
        json.loads(line)
        for line in DATA_FILE.read_text(encoding='utf-8').splitlines()
        if line.strip()
    ]
    pending_tasks = [
        {
            "service_id": service['service_id'],
            "service_name": service['name'],
            "doc_name": doc['name'],
            "url": doc['url']
        }
        for service in all_services
        for doc in service.get('documents', [])
        if doc['url'] not in processed_urls
    ]

    if not pending_tasks:
        console.print("[bold green]✔ All markdowns are already extracted![/bold green]")
        return

    console.print(f"[bold blue]Starting parallel scraping ({CONCURRENCY_LIMIT} workers)...[/bold blue]")

    async with async_playwright() as playwright:
        browser = await playwright.chromium.launch(headless=True)
        try:
            browser_context = await browser.new_context(
                viewport={'width': 1280, 'height': 800},
                locale="en-US",
                extra_http_headers={
                    "Accept-Language": "en-US,en;q=0.9"
                },
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )

            semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeRemainingColumn(),
                console=console
            ) as progress:

                scraping_task = progress.add_task("[cyan]Extracting...", total=len(pending_tasks))

                # Append mode: earlier results are kept and used for resuming.
                with open(MARKDOWN_OUTPUT, "a", encoding="utf-8") as output_file:
                    async_tasks = [
                        process_document(browser_context, task, extraction_script, semaphore, progress, scraping_task, output_file)
                        for task in pending_tasks
                    ]
                    await asyncio.gather(*async_tasks)
        finally:
            # Close the browser even if context creation or scraping fails.
            await browser.close()

        console.print(f"[bold green]✔ Completed! File: {MARKDOWN_OUTPUT}[/bold green]")

In [None]:
# Run the pipeline. Top-level `await` relies on the notebook kernel's running
# event loop; in a plain script, use asyncio.run(main_scraper()) instead.
await main_scraper()