In [1]:
import os, time, json, pathlib, pprint, requests

# Base URL of the job service; taken from the BROWSER_ENDPOINT environment
# variable when set, otherwise the default below is used.
EP = os.environ.get("BROWSER_ENDPOINT", "http://browser:8004")

# NOTE(review): presumably the directory the service writes scraped files to;
# it is not referenced by the cells visible here -- confirm before relying on it.
SCRAPED = pathlib.Path("/storage") / "scraped_data"

def wait_for(job_id, every=2, request_timeout=30):
    """Poll a job until it reaches a terminal state and return its record.

    Repeatedly GETs ``{EP}/jobs/{job_id}``, redrawing a single progress line
    while the job is still running, and returns as soon as the reported
    status is ``"finished"`` or ``"error"``. A job that ends in ``"error"``
    is returned to the caller, not raised.

    Args:
        job_id: Identifier of the job to poll.
        every: Seconds to sleep between polls.
        request_timeout: Seconds allowed for each individual HTTP request
            (passed to ``requests.get``); ``None`` disables the limit.

    Returns:
        The decoded JSON job record from the final poll.

    Raises:
        requests.HTTPError: If a poll comes back with a 4xx/5xx response.
        requests.Timeout: If a single poll exceeds ``request_timeout``.
    """
    terminal_states = {"finished", "error"}
    prev_len = 0  # length of the progress line currently on screen
    while True:
        resp = requests.get(f"{EP}/jobs/{job_id}", timeout=request_timeout)
        # Fail loudly on HTTP errors instead of tripping over a missing key
        # (or invalid JSON) in an error page further down.
        resp.raise_for_status()
        rec = resp.json()
        status = rec["status"]
        if status not in terminal_states:
            # Fall back to the bare status if the elapsed-time label is absent.
            line = rec.get("status_with_elapsed") or status
            # Pad to the previous line's length so a shorter update fully
            # overwrites a longer one (otherwise stale characters linger,
            # e.g. "running 1h 31m" drawn over "running 59m 59s").
            print("\r" + line.ljust(prev_len), end="", flush=True)
            prev_len = len(line)
        else:
            print("\n" + status)
            return rec
        time.sleep(every)

def submit(task, payload):
    """Create a ``task`` job on the service at ``EP`` and block until it ends.

    POSTs ``payload`` as JSON to ``{EP}/jobs/{task}``, prints the id of the
    job that was created, then hands over to ``wait_for``.

    Args:
        task: Task name, used as the last path segment of the jobs URL.
        payload: JSON-serialisable body sent with the request.

    Returns:
        Whatever ``wait_for`` returns for the new job (its final job record).

    Raises:
        requests.HTTPError: If the submission is answered with a 4xx/5xx.
    """
    response = requests.post(f"{EP}/jobs/{task}", json=payload)
    response.raise_for_status()

    job_id = response.json()["job_id"]
    print("🆔", task, "job:", job_id)

    final_record = wait_for(job_id)
    return final_record

In [2]:
# 1) a single dataset, addressed directly by its id
res = submit(
    task="saudi-open-data",
    payload={"dataset_id": "18887141-a088-4167-9aef-177791adb412"},
)
print("\n\n\n")  # blank lines between the progress log and the result dump
pprint.pp(res)

🆔 saudi-open-data job: d137fb44a5534a62a91f1e804d078ee2
running 4s
finished




{'job_id': 'd137fb44a5534a62a91f1e804d078ee2',
 'task_name': 'saudi-open-data',
 'params': {'dataset_id': '18887141-a088-4167-9aef-177791adb412'},
 'status': 'finished',
 'created_at': '2025-08-12T19:16:21.107718',
 'started_at': '2025-08-12T19:16:21.107944',
 'finished_at': '2025-08-12T19:16:26.324560',
 'result': {'dataset_id': '18887141-a088-4167-9aef-177791adb412',
            'dataset_title': None,
            'dataset_url': 'https://open.data.gov.sa/en/datasets/view/18887141-a088-4167-9aef-177791adb412/resources',
            'total_resources': 2,
            'downloaded': 2,
            'failed': 0,
            'resources_json': 'resources.json',
            'downloads_json': 'downloads.json',
            'files_sample': [{'status': 'ok',
                              'via': 'ctx(v1)',
                              'url': 'https://open.data.gov.sa/data/api/v1/datasets/18887141-a088-4167-9aef-177791ad

In [3]:
# 2a) first 500 datasets of a publisher (range)
#     "dataset_range" is [1, 500]; the recorded run below reports
#     total_datasets: 500 and took roughly 45 minutes, so shrink the range
#     (e.g. [1, 5]) for a quick smoke test.
pub_id = "694ebd35-2ea6-4f1d-84b6-2ea875159b95"
range_demo = {"publisher_id": pub_id, "dataset_range": [1, 500]}
res2a = submit("saudi-open-data", range_demo)
pprint.pp(res2a)

🆔 saudi-open-data job: d712e7ab8f99499395f35d10f5437505
running 44m 31s
finished
{'job_id': 'd712e7ab8f99499395f35d10f5437505',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '694ebd35-2ea6-4f1d-84b6-2ea875159b95',
            'dataset_range': [1, 500]},
 'status': 'finished',
 'created_at': '2025-08-12T19:16:27.125826',
 'started_at': '2025-08-12T19:16:27.126009',
 'finished_at': '2025-08-12T20:00:58.942562',
 'result': {'publisher_id': '694ebd35-2ea6-4f1d-84b6-2ea875159b95',
            'total_datasets': 500,
            'datasets_succeeded': 498,
            'datasets_partial': 1,
            'datasets_failed': 1,
            'total_files_ok': 677,
            'total_files_failed': 3,
            'details_file': 'publisher_results.json',
            'datasets_complete': 498},
 'error': None,
 'status_with_elapsed': 'finished'}


In [6]:
# 2b) *all* datasets of that publisher
# `pub_id` is defined in the 2a cell, so that cell has to run first.
all_demo = dict(publisher_id=pub_id)  # no range / limit -> all
res2b = submit(task="saudi-open-data", payload=all_demo)
pprint.pp(res2b)

🆔 saudi-open-data job: 84bc2639c7fd4a4eb74f8152748bec91
running 1h 31ms
finished
{'job_id': '84bc2639c7fd4a4eb74f8152748bec91',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '694ebd35-2ea6-4f1d-84b6-2ea875159b95'},
 'status': 'finished',
 'created_at': '2025-08-12T20:01:00.951446',
 'started_at': '2025-08-12T20:01:00.951840',
 'finished_at': '2025-08-12T21:32:15.691041',
 'result': {'publisher_id': '694ebd35-2ea6-4f1d-84b6-2ea875159b95',
            'total_datasets': 1027,
            'datasets_succeeded': 1025,
            'datasets_partial': 2,
            'datasets_failed': 0,
            'total_files_ok': 1384,
            'total_files_failed': 2,
            'details_file': 'publisher_results.json',
            'datasets_complete': 1025},
 'error': None,
 'status_with_elapsed': 'finished'}


In [5]:
# # 3) full-site crawl of naama.sa (unbounded) -- intentionally disabled; uncomment this cell to run it
# site_job = {
#     "url": "https://naama.sa",
#     "delay_seconds": [1, 2]   # be polite
#     # no max_pages → crawl everything on naama.sa domain
# }
# res3 = submit("scrape-site", site_job)
# pprint.pp(res3)