# Retrieve ToS;DR data

## Load libraries

In [None]:
import requests
import json
import time
import random
import concurrent.futures
import threading
from pathlib import Path

from rich import print as rprint
from rich.console import Console
from rich.progress import (
    Progress, SpinnerColumn, BarColumn, TextColumn, 
    TimeRemainingColumn, MofNCompleteColumn
)


## Global variables

In [None]:
# Shared rich console used for all status and error output in this notebook.
console = Console()
# Base URL of the ToS;DR API; endpoint paths are appended to it.
API_BASE = "https://api.tosdr.org"

# Pool of browser User-Agent strings; one is picked at random when building
# request headers.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36'
]

# Relative path: it is resolved against the current working directory.
ROOT = Path('../..')
DATA_DIR = ROOT / "data" / "TOSDR"
# Side effect at definition time: the data directory is created if missing.
DATA_DIR.mkdir(parents=True, exist_ok=True)

# JSON Lines output; main() appends to it and re-reads it as a checkpoint.
DATA_FILE = DATA_DIR / "tosdr_data.jsonl"
# Text file with one service ID per line, read by main().
ID_FILE = DATA_DIR / "tosdr_ids.txt"

# Lock held while writing to the output file.
file_lock = threading.Lock()

## Utility functions

In [None]:
def safe_get(url, retries=3, backoff=5):
    """Perform a throttled GET request with basic retry handling.

    A random User-Agent is chosen once per call. Every attempt is preceded by
    a short random pause. An HTTP 429 answer triggers a linearly growing wait
    (``backoff * attempt_number`` seconds) before the next attempt, and a
    network-level failure triggers a one-second pause. Any other non-200
    status is simply retried on the next attempt.

    Args:
        url: Address to fetch.
        retries: Maximum number of attempts.
        backoff: Base delay, in seconds, applied after a 429 response.

    Returns:
        The ``requests.Response`` of the first attempt that answered 200, or
        ``None`` when every attempt failed.
    """
    headers = {'User-Agent': random.choice(USER_AGENTS)}
    for attempt in range(retries):
        # Small jitter so consecutive calls do not hammer the API.
        time.sleep(random.uniform(0.1, 0.3))
        try:
            resp = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException:
            # Only network/HTTP-layer failures are retried. A bare ``except``
            # would also swallow KeyboardInterrupt and hide programming errors.
            time.sleep(1)
            continue

        if resp.status_code == 429:
            wait = backoff * (attempt + 1)
            # Only announce the longer pauses to keep the output readable.
            if wait > 10:
                console.print(f"[red]⚡ Pause 429 ({wait}s)...[/red]")
            time.sleep(wait)
            continue

        if resp.status_code == 200:
            return resp
    return None

In [None]:
def fetch_service_index():
    """Collect every service ID exposed by the paginated V3 service listing.

    Pages are requested one after another through ``safe_get``. The payload
    may be a bare list of services, or a dict carrying the list under
    ``parameters.services`` or ``services``. Scanning stops when a request
    fails, when a response is not valid JSON, or after three consecutive
    pages without any service.

    Returns:
        The sorted list of unique service IDs found.
    """
    valid_ids = []
    page = 1
    empty_count = 0

    console.print("[cyan]Récupération de la liste des services...[/cyan]")

    with console.status("Scan de l'index...") as status:
        while True:
            resp = safe_get(f'{API_BASE}/service/v3/?page={page}')
            if not resp:
                break

            try:
                data = resp.json()
            except ValueError:
                # Every JSON decoding error used by requests derives from
                # ValueError. Report it instead of silently truncating.
                console.print(
                    f"[red]Réponse JSON invalide à la page {page}, arrêt du scan.[/red]"
                )
                break

            services = None
            if isinstance(data, list):
                services = data
            elif isinstance(data, dict):
                params = data.get('parameters')
                if isinstance(params, dict):
                    services = params.get('services')
                if not services:
                    services = data.get('services')

            # Anything that is not a list counts as an empty page, so a
            # malformed payload cannot keep the loop running forever.
            if not isinstance(services, list):
                services = []

            if not services:
                empty_count += 1
                if empty_count >= 3:
                    break
            else:
                empty_count = 0
                valid_ids.extend(
                    s['id'] for s in services
                    if isinstance(s, dict) and s.get('id')
                )

            status.update(f"Page {page} - {len(valid_ids)} services trouvés")
            page += 1

    unique_ids = sorted(set(valid_ids))
    console.print(f"[green]✔ Index terminé : {len(unique_ids)} services uniques.[/green]")
    return unique_ids

In [None]:
def _extract_service_info(raw_data):
    """Return the service dict found in a decoded API payload, or ``None``.

    Recognised layouts, tried in order:
      1. the payload itself is the service (has a name and a ``documents`` key);
      2. the service is the ``parameters`` dict (it has a ``name`` key);
      3. the service is the first entry of ``parameters.services``;
      4. the service is the first entry of the top-level ``services`` list.
    """
    if not isinstance(raw_data, dict):
        return None

    if raw_data.get("name") and raw_data.get("documents") is not None:
        return raw_data

    candidate_lists = []
    params = raw_data.get("parameters")
    if isinstance(params, dict):
        if "name" in params:
            return params
        candidate_lists.append(params.get("services"))
    candidate_lists.append(raw_data.get("services"))

    for services in candidate_lists:
        if isinstance(services, list) and services and isinstance(services[0], dict):
            return services[0]
    return None


def _extract_documents(service_info):
    """Return ``{"name", "url"}`` dicts for every document that has a URL."""
    raw_docs = service_info.get("documents")
    if not isinstance(raw_docs, list):
        return []
    return [
        {"name": doc.get("name"), "url": doc.get("url")}
        for doc in raw_docs
        if isinstance(doc, dict) and doc.get("url")
    ]


def fetch_service_worker(session, service_id):
    """Download one service and normalise the API's inconsistent JSON layouts.

    Up to three attempts are made. Each attempt starts with a random pause;
    HTTP 429 answers and network errors are retried, everything else is
    resolved immediately.

    Args:
        session: Object exposing ``get(url, timeout=...)`` and returning a
            response with ``status_code`` and ``json()`` (the caller passes a
            ``requests.Session``).
        service_id: Identifier of the service to fetch.

    Returns:
        On success ``{"success": True, "data": {"service_id", "name",
        "documents"}}``. On failure ``{"error": <message>, "id": service_id}``,
        with ``"skippable": True`` added for non-200 statuses and for payloads
        with no usable service.
    """
    url = f"{API_BASE}/service/v3/?id={service_id}"
    max_attempts = 3

    for attempt in range(max_attempts):
        is_last_attempt = attempt == max_attempts - 1

        # Jitter between requests to stay under the API rate limit.
        time.sleep(random.uniform(0.5, 1.5))

        try:
            resp = session.get(url, timeout=15)
        except requests.RequestException as exc:
            # Only transport errors are worth retrying. Parsing happens below,
            # outside this handler, so malformed payloads are not re-downloaded.
            if is_last_attempt:
                return {"error": f"Exception: {exc}", "id": service_id}
            time.sleep(1)
            continue

        if resp.status_code == 429:
            if is_last_attempt:
                return {"error": "Rate Limited (429)", "id": service_id}
            time.sleep(5 * (attempt + 1))
            continue

        if resp.status_code != 200:
            return {"error": f"HTTP {resp.status_code}", "id": service_id, "skippable": True}

        try:
            raw_data = resp.json()
        except ValueError:
            # ValueError is the common base of json, simplejson and requests
            # JSON decoding errors.
            return {"error": "Invalid JSON", "id": service_id}

        service_info = _extract_service_info(raw_data)
        if not service_info:
            return {"error": "JSON vide ou structure inconnue", "id": service_id, "skippable": True}

        if not service_info.get("name"):
            return {"error": "Nom du service manquant", "id": service_id, "skippable": True}

        return {
            "success": True,
            "data": {
                "service_id": service_id,
                "name": service_info.get("name"),
                "documents": _extract_documents(service_info),
            },
        }

    # Defensive fallback; every path of the final attempt returns above.
    return {"error": "Max retries", "id": service_id}


def main():
    """Fetch every pending service and append the results to the JSONL file.

    Steps:
      1. Load the service IDs from ``ID_FILE`` (one per line); abort with a
         message when the file is missing.
      2. Re-read ``DATA_FILE`` as a checkpoint and skip IDs already stored.
      3. Fetch the remaining IDs with a pool of five worker threads sharing
         one HTTP session, appending each successful record as one JSON line.

    Errors flagged ``skippable`` by the worker are not reported; other errors
    are printed. The completion message is shown only when the run finishes
    without raising.
    """
    console.print(f"[bold cyan]Dossier de données :[/bold cyan] {DATA_DIR}")

    if not ID_FILE.exists():
        console.print(f"[red]Fichier IDs manquant : {ID_FILE}[/red]")
        return

    with open(ID_FILE, "r", encoding="utf-8") as f:
        all_ids = [line.strip() for line in f if line.strip()]

    console.print(f"IDs chargés : {len(all_ids)}")

    processed_ids = set()
    if DATA_FILE.exists():
        console.print("[yellow]Lecture du checkpoint...[/yellow]")
        with open(DATA_FILE, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                try:
                    item = json.loads(line)
                except json.JSONDecodeError:
                    # Skip lines that are not valid JSON, such as a record
                    # truncated by an interrupted run.
                    continue
                if isinstance(item, dict) and "service_id" in item:
                    processed_ids.add(str(item["service_id"]))

    ids_to_process = [sid for sid in all_ids if str(sid) not in processed_ids]

    console.print(f"[green]Déjà traités : {len(processed_ids)}[/green]")
    console.print(f"[bold blue]Reste à traiter : {len(ids_to_process)}[/bold blue]")

    if not ids_to_process:
        console.print("[green]Tout est déjà à jour ![/green]")
        return

    # Line-buffered append so each record reaches the file as it is written.
    with open(DATA_FILE, "a", encoding="utf-8", buffering=1) as f_out:
        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            MofNCompleteColumn(),
            TimeRemainingColumn(),
            console=console
        ) as progress:

            task = progress.add_task("Extraction...", total=len(ids_to_process))

            with requests.Session() as session:
                session.headers.update({'User-Agent': random.choice(USER_AGENTS)})

                with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
                    future_to_id = {
                        executor.submit(fetch_service_worker, session, sid): sid
                        for sid in ids_to_process
                    }

                    for future in concurrent.futures.as_completed(future_to_id):
                        sid = future_to_id[future]
                        try:
                            result = future.result()

                            if result and result.get("success"):
                                data_str = json.dumps(result["data"], ensure_ascii=False)
                                with file_lock:
                                    f_out.write(data_str + "\n")
                                    f_out.flush()
                            elif result and not result.get("skippable"):
                                error_msg = result.get("error", "Unknown")
                                progress.console.print(f"[red]✘ ID {sid} : {error_msg}[/red]")

                        except Exception as e:
                            # Per-item boundary: one failing service must not
                            # abort the whole extraction.
                            progress.console.print(f"[red]Erreur critique {sid}: {e}[/red]")

                        progress.update(task, advance=1)

    # Reached only when the extraction ran to completion; an aborted run no
    # longer claims success.
    console.print(f"[bold green]✔ Terminé ! Fichier : {DATA_FILE}[/bold green]")

# Run the extraction as soon as this cell is executed.
main()