In [19]:
import os, time, pathlib, pprint, requests

# Base URL used to build the "/jobs/..." request URLs below; taken from the
# BROWSER_ENDPOINT environment variable, falling back to the default when unset.
EP = os.environ.get("BROWSER_ENDPOINT", "http://browser:8004")
# Filesystem location of scraped data (presumably where the scraping jobs
# write their results -- confirm against the service configuration).
SCRAPED = pathlib.Path("/storage", "scraped_data")

def wait_for(job_id, every=2, request_timeout=30):
    """Poll ``{EP}/jobs/{job_id}`` until the job reaches a terminal status.

    While the job is still in progress, the server-provided
    ``status_with_elapsed`` text is shown on a single, continuously
    overwritten console line. Once the status is ``"finished"`` or
    ``"error"``, the status is printed on its own line and the job record
    is returned.

    Parameters
    ----------
    job_id : str
        Identifier of the job to poll.
    every : int or float, default 2
        Seconds to sleep between two consecutive polls.
    request_timeout : float or None, default 30
        Timeout (seconds) for each individual poll request. ``None``
        disables the timeout, i.e. a stalled connection blocks forever.

    Returns
    -------
    dict
        The decoded JSON job record from the last poll.

    Raises
    ------
    requests.HTTPError
        If a poll answers with a 4xx/5xx status code.
    requests.Timeout
        If a single poll exceeds ``request_timeout``.
    """
    terminal_states = {"finished", "error"}
    shown_width = 0  # visible length of the progress line currently on screen

    while True:
        resp = requests.get(f"{EP}/jobs/{job_id}", timeout=request_timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error body.
        resp.raise_for_status()
        rec = resp.json()
        status = rec["status"]

        if status in terminal_states:
            print("\n" + status)
            return rec

        # Fall back to the bare status if the elapsed-time text is absent.
        line = rec.get("status_with_elapsed") or status
        # "\r" only moves the cursor back; pad with spaces so a shorter
        # message fully overwrites the previous, longer one (otherwise stale
        # trailing characters remain, e.g. "running 2h 2mms").
        print("\r" + line.ljust(shown_width), end="", flush=True)
        shown_width = len(line)

        time.sleep(every)

def submit(task, payload, request_timeout=30):
    """Create a job for ``task`` and block until it has completed.

    Posts ``payload`` as JSON to ``{EP}/jobs/{task}``, prints the job id
    from the response, then hands over to ``wait_for`` to poll the job.

    Parameters
    ----------
    task : str
        Task name; used as the last path segment of the submission URL.
    payload : dict
        JSON-serialisable parameters sent as the request body.
    request_timeout : float or None, default 30
        Timeout (seconds) for the submission request only, so that an
        unreachable or stalled service does not hang the cell forever.
        ``None`` disables the timeout.

    Returns
    -------
    dict
        Whatever ``wait_for`` returns for the created job.

    Raises
    ------
    requests.HTTPError
        If the submission is answered with a 4xx/5xx status code.
    requests.Timeout
        If the submission request exceeds ``request_timeout``.
    KeyError
        If the response JSON carries no ``"job_id"`` key.
    """
    resp = requests.post(f"{EP}/jobs/{task}", json=payload, timeout=request_timeout)
    resp.raise_for_status()
    jid = resp.json()["job_id"]
    print("🆔", task, "job:", jid)
    return wait_for(jid)

In [20]:
# #  single dataset
# res = submit("saudi-open-data", {"dataset_id": "18887141-a088-4167-9aef-177791adb412"})
# print("\n\n\n")
# pprint.pp(res)

In [21]:
# # one publisher, limited via dataset_range (here [1, 500])
# pub_id = "694ebd35-2ea6-4f1d-84b6-2ea875159b95"
# range_demo = {"publisher_id": pub_id, "dataset_range": [1, 500]}
# res2a = submit("saudi-open-data", range_demo)
# pprint.pp(res2a)

In [22]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="694ebd35-2ea6-4f1d-84b6-2ea875159b95")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 0ee9855d7632446fb28f4579551d9d29
running 2h 2mms
finished
{'job_id': '0ee9855d7632446fb28f4579551d9d29',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '694ebd35-2ea6-4f1d-84b6-2ea875159b95'},
 'status': 'finished',
 'created_at': '2025-08-18T20:24:45.179135',
 'started_at': '2025-08-18T20:24:45.179344',
 'finished_at': '2025-08-18T22:27:09.733667',
 'result': {'publisher_id': '694ebd35-2ea6-4f1d-84b6-2ea875159b95',
            'status': 'success',
            'total_datasets': 1027,
            'datasets_succeeded': 1027,
            'datasets_failed': 0,
            'total_files_ok': 1377,
            'total_files_failed': 9,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [23]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="3181a11d-14a4-4a9c-93c7-5669b2205e08")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 54d91078c2f448f0b85be6b21b5a8a61
running 1m 8s
finished
{'job_id': '54d91078c2f448f0b85be6b21b5a8a61',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '3181a11d-14a4-4a9c-93c7-5669b2205e08'},
 'status': 'finished',
 'created_at': '2025-08-18T22:27:10.896876',
 'started_at': '2025-08-18T22:27:10.897068',
 'finished_at': '2025-08-18T22:28:19.868461',
 'result': {'publisher_id': '3181a11d-14a4-4a9c-93c7-5669b2205e08',
            'status': 'success',
            'total_datasets': 11,
            'datasets_succeeded': 11,
            'datasets_failed': 0,
            'total_files_ok': 28,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [24]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="b39db5e4-dd11-46c2-8fe9-a436e9b21b3d")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: ee2426e10269486a875090ed648d2ae7
running 54s
finished
{'job_id': 'ee2426e10269486a875090ed648d2ae7',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': 'b39db5e4-dd11-46c2-8fe9-a436e9b21b3d'},
 'status': 'finished',
 'created_at': '2025-08-18T22:28:20.999101',
 'started_at': '2025-08-18T22:28:20.999310',
 'finished_at': '2025-08-18T22:29:15.179103',
 'result': {'publisher_id': 'b39db5e4-dd11-46c2-8fe9-a436e9b21b3d',
            'status': 'success',
            'total_datasets': 7,
            'datasets_succeeded': 7,
            'datasets_failed': 0,
            'total_files_ok': 16,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [25]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="11303c55-3673-4283-9930-f5051eec2b2a")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: b000e8f2662b416f9d73556fd9010558
running 16s
finished
{'job_id': 'b000e8f2662b416f9d73556fd9010558',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '11303c55-3673-4283-9930-f5051eec2b2a'},
 'status': 'finished',
 'created_at': '2025-08-18T22:29:17.084576',
 'started_at': '2025-08-18T22:29:17.084778',
 'finished_at': '2025-08-18T22:29:34.038300',
 'result': {'publisher_id': '11303c55-3673-4283-9930-f5051eec2b2a',
            'status': 'success',
            'total_datasets': 4,
            'datasets_succeeded': 4,
            'datasets_failed': 0,
            'total_files_ok': 8,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [26]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="96cc49ce-1074-4ae3-a429-e00e76dabdbb")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 7abe510bfb2c4db5823cb7e515141c7a
running 30s
finished
{'job_id': '7abe510bfb2c4db5823cb7e515141c7a',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '96cc49ce-1074-4ae3-a429-e00e76dabdbb'},
 'status': 'finished',
 'created_at': '2025-08-18T22:29:35.161160',
 'started_at': '2025-08-18T22:29:35.161378',
 'finished_at': '2025-08-18T22:30:06.451551',
 'result': {'publisher_id': '96cc49ce-1074-4ae3-a429-e00e76dabdbb',
            'status': 'success',
            'total_datasets': 6,
            'datasets_succeeded': 6,
            'datasets_failed': 0,
            'total_files_ok': 12,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [27]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="69b07598-8502-4003-b284-ccd2f3bf0f59")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 3d57a39fbb9f4efca153556aeb0dcce4
running 8m 36s
finished
{'job_id': '3d57a39fbb9f4efca153556aeb0dcce4',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '69b07598-8502-4003-b284-ccd2f3bf0f59'},
 'status': 'finished',
 'created_at': '2025-08-18T22:30:07.213090',
 'started_at': '2025-08-18T22:30:07.213314',
 'finished_at': '2025-08-18T22:38:45.541221',
 'result': {'publisher_id': '69b07598-8502-4003-b284-ccd2f3bf0f59',
            'status': 'success',
            'total_datasets': 68,
            'datasets_succeeded': 68,
            'datasets_failed': 0,
            'total_files_ok': 238,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [28]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="89b405e5-8b83-4fc1-a229-69c5e2f1f888")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 97d5e25d4d614369afd26bc22d75a990
running 52s
finished
{'job_id': '97d5e25d4d614369afd26bc22d75a990',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '89b405e5-8b83-4fc1-a229-69c5e2f1f888'},
 'status': 'finished',
 'created_at': '2025-08-18T22:38:45.913572',
 'started_at': '2025-08-18T22:38:45.913778',
 'finished_at': '2025-08-18T22:39:38.992447',
 'result': {'publisher_id': '89b405e5-8b83-4fc1-a229-69c5e2f1f888',
            'status': 'success',
            'total_datasets': 11,
            'datasets_succeeded': 11,
            'datasets_failed': 0,
            'total_files_ok': 22,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [29]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="91e2baab-2d29-4f47-8791-bb7ddceea273")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 2d6f6dcd584445f394ca30263b019cfd
running 30s
finished
{'job_id': '2d6f6dcd584445f394ca30263b019cfd',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '91e2baab-2d29-4f47-8791-bb7ddceea273'},
 'status': 'finished',
 'created_at': '2025-08-18T22:39:39.994866',
 'started_at': '2025-08-18T22:39:39.995048',
 'finished_at': '2025-08-18T22:40:10.759741',
 'result': {'publisher_id': '91e2baab-2d29-4f47-8791-bb7ddceea273',
            'status': 'success',
            'total_datasets': 7,
            'datasets_succeeded': 7,
            'datasets_failed': 0,
            'total_files_ok': 14,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [30]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="8a5a23f7-9f8f-4e95-8f04-cd2f79a83228")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 6579e0e4d85045baaa3e372155813a6a
running 14s
finished
{'job_id': '6579e0e4d85045baaa3e372155813a6a',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '8a5a23f7-9f8f-4e95-8f04-cd2f79a83228'},
 'status': 'finished',
 'created_at': '2025-08-18T22:40:12.048611',
 'started_at': '2025-08-18T22:40:12.048803',
 'finished_at': '2025-08-18T22:40:26.737173',
 'result': {'publisher_id': '8a5a23f7-9f8f-4e95-8f04-cd2f79a83228',
            'status': 'success',
            'total_datasets': 4,
            'datasets_succeeded': 4,
            'datasets_failed': 0,
            'total_files_ok': 8,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [31]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="16999930-bd31-4e3d-8df1-937799a75b28")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: b4c51f4641ea408ea478ad9366ce6b52
running 14s
finished
{'job_id': 'b4c51f4641ea408ea478ad9366ce6b52',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '16999930-bd31-4e3d-8df1-937799a75b28'},
 'status': 'finished',
 'created_at': '2025-08-18T22:40:28.080727',
 'started_at': '2025-08-18T22:40:28.081562',
 'finished_at': '2025-08-18T22:40:43.821707',
 'result': {'publisher_id': '16999930-bd31-4e3d-8df1-937799a75b28',
            'status': 'success',
            'total_datasets': 3,
            'datasets_succeeded': 3,
            'datasets_failed': 0,
            'total_files_ok': 6,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [32]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="d81c49f1-5f16-47cd-8c6d-7a921840ecc8")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: c12488e3a9a24a64913a0bc7093a4afa
running 6s
finished
{'job_id': 'c12488e3a9a24a64913a0bc7093a4afa',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': 'd81c49f1-5f16-47cd-8c6d-7a921840ecc8'},
 'status': 'finished',
 'created_at': '2025-08-18T22:40:44.114409',
 'started_at': '2025-08-18T22:40:44.114627',
 'finished_at': '2025-08-18T22:40:51.765837',
 'result': {'publisher_id': 'd81c49f1-5f16-47cd-8c6d-7a921840ecc8',
            'status': 'success',
            'total_datasets': 1,
            'datasets_succeeded': 1,
            'datasets_failed': 0,
            'total_files_ok': 2,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [33]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="2d5b6db6-caaa-416d-ae1f-c91400e68ed8")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: c362dc9c0ebf41999c77c37f1c0564fb
running 17m 49s
finished
{'job_id': 'c362dc9c0ebf41999c77c37f1c0564fb',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '2d5b6db6-caaa-416d-ae1f-c91400e68ed8'},
 'status': 'finished',
 'created_at': '2025-08-18T22:40:52.136585',
 'started_at': '2025-08-18T22:40:52.136780',
 'finished_at': '2025-08-18T22:58:42.514490',
 'result': {'publisher_id': '2d5b6db6-caaa-416d-ae1f-c91400e68ed8',
            'status': 'success',
            'total_datasets': 181,
            'datasets_succeeded': 181,
            'datasets_failed': 0,
            'total_files_ok': 181,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [34]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="156cd4a0-3fa2-4e2b-b9af-c1b35fdf9962")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: fda5f3cb42a843f7bd9c8e75e7887c60
running 5m 48s
finished
{'job_id': 'fda5f3cb42a843f7bd9c8e75e7887c60',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '156cd4a0-3fa2-4e2b-b9af-c1b35fdf9962'},
 'status': 'finished',
 'created_at': '2025-08-18T22:58:43.646207',
 'started_at': '2025-08-18T22:58:43.646399',
 'finished_at': '2025-08-18T23:04:32.570701',
 'result': {'publisher_id': '156cd4a0-3fa2-4e2b-b9af-c1b35fdf9962',
            'status': 'success',
            'total_datasets': 47,
            'datasets_succeeded': 47,
            'datasets_failed': 0,
            'total_files_ok': 61,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}


In [35]:
# Submit a "saudi-open-data" job whose payload carries only the publisher id
# (no dataset_range), wait for it, and pretty-print the returned job record.
range_demo = dict(publisher_id="34a5b638-71b4-4cc9-ae56-28583334ec77")
pprint.pp(submit(task="saudi-open-data", payload=range_demo))

🆔 saudi-open-data job: 41dc258e539e46e5ba82a36675efe713
running 14m 45s
finished
{'job_id': '41dc258e539e46e5ba82a36675efe713',
 'task_name': 'saudi-open-data',
 'params': {'publisher_id': '34a5b638-71b4-4cc9-ae56-28583334ec77'},
 'status': 'finished',
 'created_at': '2025-08-18T23:04:34.189604',
 'started_at': '2025-08-18T23:04:34.189811',
 'finished_at': '2025-08-18T23:19:20.192212',
 'result': {'publisher_id': '34a5b638-71b4-4cc9-ae56-28583334ec77',
            'status': 'success',
            'total_datasets': 30,
            'datasets_succeeded': 30,
            'datasets_failed': 0,
            'total_files_ok': 60,
            'total_files_failed': 0,
            'details_file': 'publisher_results.json',
            'organization_metadata_file': 'organization_metadata.json'},
 'error': None,
 'status_with_elapsed': 'finished'}
