# Retrieve ToS;DR data

## Load libraries

In [1]:
import asyncio
import json
import random
from pathlib import Path
from playwright.async_api import async_playwright

from rich.console import Console
from rich.progress import (
    Progress, SpinnerColumn, BarColumn, TextColumn, 
    TimeRemainingColumn, MofNCompleteColumn
)

## Global variables

In [2]:
# Project root, given as a relative path two levels above the working directory.
ROOT = Path("../..")

# Dataset directory, the JSONL input listing services and their documents,
# and the JSONL output that receives one record per scraped document.
DATA_DIR = ROOT.joinpath("data", "TOSDR")
DATA_FILE = DATA_DIR.joinpath("tosdr_data.jsonl")
MARKDOWN_OUTPUT = DATA_DIR.joinpath("tosdr_markdowns_en.jsonl")

# JavaScript source that is evaluated inside every loaded page.
EXTENSION_FILE = ROOT.joinpath("EULAI-extension", "content.js")

# Upper bound on the number of documents processed at the same time.
CONCURRENCY_LIMIT = 20

# Shared Rich console used for status messages and the progress bar.
console = Console()

## Utility functions

In [3]:
def _write_record(f_out, record):
    """Append ``record`` to ``f_out`` as one JSON line and flush immediately."""
    f_out.write(json.dumps(record, ensure_ascii=False) + "\n")
    # Flush per record so an interrupted run keeps everything written so far.
    f_out.flush()


async def process_document(context, task, js_logic, semaphore, progress, main_task, f_out):
    """Scrape one document and append its outcome to the JSONL output.

    The page at ``task['url']`` is opened in a new page of ``context``,
    ``js_logic`` is evaluated in it, and the ``'text'`` entry of the result
    is stored under ``"markdown"`` in a ``"status": "success"`` record.
    An empty result only prints a warning and writes nothing. Any exception
    (including a failure to create the page) is written as a
    ``"status": "error"`` record instead of being raised, so one bad
    document cannot abort the whole batch.

    Note: ``main_scraper`` treats every URL present in the output file as
    already processed, so error records are not retried on a later run.

    Args:
        context: Browser context used to create the page.
        task: Dict describing the document; must contain ``'url'`` and
            ``'service_name'``. All of its keys are copied into the record.
        js_logic: JavaScript source passed to ``page.evaluate``.
        semaphore: Limits how many documents are processed concurrently.
        progress: Rich progress instance to advance and to print warnings on.
        main_task: Id of the progress task to advance by one.
        f_out: Writable text file receiving the JSONL records.
    """
    async with semaphore:
        url = task['url']
        page = None

        try:
            # Created inside the ``try`` so that a failure here is recorded
            # like any other scraping error.
            page = await context.new_page()
            await page.goto(url, wait_until="domcontentloaded", timeout=60000)
            # Random 1-3 s pause after load; presumably lets late content
            # render and staggers requests -- TODO confirm intent.
            await asyncio.sleep(random.uniform(1, 3))

            result = await page.evaluate(js_logic)

            if result and result.get('text'):
                _write_record(f_out, {
                    **task,
                    "markdown": result['text'],
                    "status": "success"
                })
            else:
                progress.console.print(f"[yellow]⚠ Empty: {task['service_name']} ({url})[/yellow]")

        except Exception as e:
            _write_record(f_out, {**task, "status": "error", "error": str(e)})
        finally:
            try:
                if page is not None:
                    await page.close()
            except Exception:
                # Best-effort cleanup: the document's outcome is already
                # recorded, and a page that cannot be closed (e.g. because
                # the browser is gone) must not fail the task.
                pass
            finally:
                progress.update(main_task, advance=1)

def _load_processed_urls(output_path):
    """Return the set of URLs that already have a record in ``output_path``."""
    processed_urls = set()
    if not output_path.exists():
        return processed_urls

    with open(output_path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                processed_urls.add(json.loads(line)['url'])
            except (json.JSONDecodeError, KeyError, TypeError):
                # Skip blank, truncated or malformed records (e.g. left by an
                # interrupted run) without hiding unrelated errors.
                continue
    return processed_urls


def _collect_pending_tasks(data_path, processed_urls):
    """Build one task dict per document in ``data_path`` not yet processed."""
    pending = []
    # Iterate the file instead of using ``str.splitlines()``: the latter also
    # splits on characters such as U+2028 that may appear unescaped inside
    # JSON strings, which would cut a record in two.
    with open(data_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            service = json.loads(line)
            for doc in service.get('documents') or []:
                if doc['url'] not in processed_urls:
                    pending.append({
                        "service_id": service['service_id'],
                        "service_name": service['name'],
                        "doc_name": doc['name'],
                        "url": doc['url']
                    })
    return pending


async def main_scraper():
    """Scrape every not-yet-processed document listed in ``DATA_FILE``.

    Reads the services from ``DATA_FILE`` (one JSON object per line), skips
    documents whose URL already appears in ``MARKDOWN_OUTPUT``, and runs
    ``process_document`` for the rest in a headless Chromium context, with
    at most ``CONCURRENCY_LIMIT`` documents in flight. Records are appended
    to ``MARKDOWN_OUTPUT``, so the function can be re-run to resume.

    Returns early, after printing a message, when a required input file is
    missing or when nothing is left to do.

    Raises:
        json.JSONDecodeError: If a non-blank line of ``DATA_FILE`` is not
            valid JSON.
        KeyError: If a service or document lacks an expected key.
    """
    for required in (DATA_FILE, EXTENSION_FILE):
        if not required.exists():
            console.print(f"[red]File missing: {required}[/red]")
            return

    js_logic = EXTENSION_FILE.read_text(encoding='utf-8')

    processed_urls = _load_processed_urls(MARKDOWN_OUTPUT)
    tasks_to_do = _collect_pending_tasks(DATA_FILE, processed_urls)

    if not tasks_to_do:
        console.print("[bold green]✔ All markdowns are already extracted![/bold green]")
        return

    console.print(f"[bold blue]Starting parallel scraping ({CONCURRENCY_LIMIT} workers)...[/bold blue]")

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            context = await browser.new_context(
                viewport={'width': 1280, 'height': 800},
                locale="en-US",
                extra_http_headers={
                    "Accept-Language": "en-US,en;q=0.9"
                },
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
            )

            semaphore = asyncio.Semaphore(CONCURRENCY_LIMIT)

            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                MofNCompleteColumn(),
                TimeRemainingColumn(),
                console=console
            ) as progress:

                main_task = progress.add_task("[cyan]Extraction...", total=len(tasks_to_do))

                with open(MARKDOWN_OUTPUT, "a", encoding="utf-8") as f_out:
                    futures = [
                        process_document(context, task, js_logic, semaphore, progress, main_task, f_out)
                        for task in tasks_to_do
                    ]
                    # Wait for *all* tasks even if one raises; otherwise the
                    # output file and browser would be closed while sibling
                    # tasks are still running and writing.
                    results = await asyncio.gather(*futures, return_exceptions=True)
        finally:
            await browser.close()

        crashed = [r for r in results if isinstance(r, BaseException)]
        if crashed:
            console.print(f"[red]✖ {len(crashed)} document task(s) raised unexpectedly; first error:[/red]")
            # markup=False: the exception text may contain square brackets.
            console.print(repr(crashed[0]), markup=False)

        console.print(f"[bold green]✔ Completed! File: {MARKDOWN_OUTPUT}[/bold green]")

In [None]:
# Run the scraper. The bare top-level ``await`` relies on the notebook
# kernel's support for awaiting directly inside a cell.
await main_scraper()

Output()