In [1]:
import os, time, pathlib, pprint, requests

# Base URL of the job service; override with the BROWSER_ENDPOINT env var.
EP = os.environ.get("BROWSER_ENDPOINT", "http://browser:8004")

# Storage root for scraped data (presumably where the service writes job
# output -- verify against the service configuration).
SCRAPED = pathlib.Path("/storage/scraped_data")

def wait_for(job_id, every=2, request_timeout=30):
    """Poll the job service until the job reaches a terminal status.

    Parameters
    ----------
    job_id : str
        Identifier of the job to poll (``GET {EP}/jobs/{job_id}``).
    every : int or float, default 2
        Seconds to sleep between two polls.
    request_timeout : int, float or None, default 30
        Per-request timeout in seconds handed to ``requests.get`` so that a
        stalled connection cannot hang the cell forever. ``None`` disables it.

    Returns
    -------
    dict
        The decoded JSON job record once its ``"status"`` is ``"finished"``
        or ``"error"``.

    Raises
    ------
    requests.HTTPError
        If a poll is answered with a 4xx/5xx status.
    requests.Timeout
        If a single poll exceeds ``request_timeout``.
    """
    terminal_states = {"finished", "error"}
    while True:
        response = requests.get(f"{EP}/jobs/{job_id}", timeout=request_timeout)
        # Fail loudly on HTTP errors instead of tripping over a missing
        # "status" key (or undecodable body) further down.
        response.raise_for_status()
        rec = response.json()
        if rec["status"] in terminal_states:
            print("\n" + rec["status"])
            return rec
        # "\r" rewrites the same output line; flush so progress shows up
        # immediately even though no newline is emitted.
        print("\r" + rec["status_with_elapsed"], end="", flush=True)
        time.sleep(every)

def submit(task, payload, request_timeout=30):
    """Create a job for ``task`` and block until it has completed.

    Parameters
    ----------
    task : str
        Task name; the job is created via ``POST {EP}/jobs/{task}``.
    payload : dict
        JSON-serialisable parameters sent as the request body.
    request_timeout : int, float or None, default 30
        Timeout in seconds for the submission request, so a stalled service
        cannot block the cell indefinitely. ``None`` disables it.

    Returns
    -------
    dict
        The job record returned by ``wait_for`` for the new job.

    Raises
    ------
    requests.HTTPError
        If the service rejects the submission with a 4xx/5xx status.
    requests.Timeout
        If the submission request exceeds ``request_timeout``.
    """
    response = requests.post(
        f"{EP}/jobs/{task}", json=payload, timeout=request_timeout
    )
    response.raise_for_status()
    job_id = response.json()["job_id"]
    print("🆔", task, "job:", job_id)
    return wait_for(job_id)

In [2]:
# res = submit("saudi-open-data", {"dataset_id": "18887141-a088-4167-9aef-177791adb412"})
# print("\n\n\n")
# pprint.pp(res)

In [3]:
# # 2a) datasets 1-500 of a publisher (range)
# pub_id = "694ebd35-2ea6-4f1d-84b6-2ea875159b95"
# range_demo = {"publisher_id": pub_id, "dataset_range": [1, 500]}
# res2a = submit("saudi-open-data", range_demo)
# pprint.pp(res2a)

In [4]:
# # 2b) *all* datasets of that publisher
# all_demo = {"publisher_id": pub_id}      # no range / limit -> all
# res2b = submit("saudi-open-data", all_demo)
# pprint.pp(res2b)

In [5]:
# GitHub job restricted to code: issues and releases are switched off and
# the number of files is capped via max_files.
github_code_only = dict(
    repo_url="https://github.com/0aub/Qitta",
    include_issues=False,
    include_releases=False,
    max_files=100,
)
res1 = submit("github-repo", payload=github_code_only)
print("GitHub repo (code only):")
pprint.pp(res1)

🆔 github-repo job: c31b09426a7447329089be3e300162d6
running 16s
finished
GitHub repo (code only):
{'job_id': 'c31b09426a7447329089be3e300162d6',
 'task_name': 'github-repo',
 'params': {'repo_url': 'https://github.com/0aub/Qitta',
            'include_issues': False,
            'include_releases': False,
            'max_files': 100},
 'status': 'finished',
 'created_at': '2025-08-16T08:40:46.959744',
 'started_at': '2025-08-16T08:40:46.959982',
 'finished_at': '2025-08-16T08:41:04.070417',
 'result': {'repository_url': 'https://github.com/0aub/Qitta',
            'owner': '0aub',
            'repo_name': 'qitta',
            'files_collected': 36,
            'issues_collected': 0,
            'releases_collected': 0,
            'total_code_size': 184966,
            'vector_items': 36,
            'output_files': {'complete': 'repository_complete.json',
                             'metadata': 'metadata.json',
                             'files': 'files.json',
                    

In [6]:
# Same repository as the code-only run, but with no options set, so the
# service applies its own defaults.
github_test = dict(repo_url="https://github.com/0aub/Qitta")
res2 = submit("github-repo", payload=github_test)
print("GitHub repo scraping:")
pprint.pp(res2)

🆔 github-repo job: 841360e37c9a4533b97a6968bce53a64
running 6s
finished
GitHub repo scraping:
{'job_id': '841360e37c9a4533b97a6968bce53a64',
 'task_name': 'github-repo',
 'params': {'repo_url': 'https://github.com/0aub/Qitta'},
 'status': 'finished',
 'created_at': '2025-08-16T08:41:05.040231',
 'started_at': '2025-08-16T08:41:05.040486',
 'finished_at': '2025-08-16T08:41:12.086783',
 'result': {'repository_url': 'https://github.com/0aub/Qitta',
            'owner': '0aub',
            'repo_name': 'qitta',
            'files_collected': 16,
            'issues_collected': 0,
            'releases_collected': 0,
            'total_code_size': 13197,
            'vector_items': 16,
            'output_files': {'complete': 'repository_complete.json',
                             'metadata': 'metadata.json',
                             'files': 'files.json',
                             'issues': 'issues.json',
                             'releases': 'releases.json',
                   

In [7]:
# # Limited pages scraping
# limited_test = {
#     "url": "https://docs.python.org",
#     "max_pages": 10
# }
# res3 = submit("scrape-site", limited_test)
# print("Limited pages scrape:")
# pprint.pp(res3)

In [8]:
  # Re-test website scraping after the fixes (CSS files should now be skipped).
  # The URL is passed without a scheme; the recorded run reports
  # start_url 'https://www.care-mate.co' in its result.
  scrape_test = dict(url="www.care-mate.co", max_pages=5, use_browser=True)
  res4 = submit("scrape-site", payload=scrape_test)
  print("Fixed website scraping:")
  pprint.pp(res4)

🆔 scrape-site job: 5c133335e8f94b1ba21da93b4f7133db
running 34s
finished
Fixed website scraping:
{'job_id': '5c133335e8f94b1ba21da93b4f7133db',
 'task_name': 'scrape-site',
 'params': {'url': 'www.care-mate.co', 'max_pages': 5, 'use_browser': True},
 'status': 'finished',
 'created_at': '2025-08-16T08:41:13.109105',
 'started_at': '2025-08-16T08:41:13.109314',
 'finished_at': '2025-08-16T08:41:48.235276',
 'result': {'start_url': 'https://www.care-mate.co',
            'base_domain': 'www.care-mate.co',
            'pages_scraped': 5,
            'pages_failed': 0,
            'total_content_size': 488079,
            'html_directory': 'raw_html',
            'metadata_file': 'crawl_metadata.json',
            'urls_file': 'page_urls.json',
            'quality_score_avg': 1.0},
 'error': None,
 'status_with_elapsed': 'finished'}


In [9]:
# Chain extract-content onto the scrape-site job above: its job_id is passed
# as the source, and the step only runs when that job reported "finished".
if res4.get("status") == "finished":
    extract_test = dict(job_id=res4["job_id"], chunk_size=1000, overlap=200)
    res5 = submit("extract-content", payload=extract_test)
    print("Fixed content extraction:")
    pprint.pp(res5)

🆔 extract-content job: 3bd32013fc20455297e9e00dfbd9d135

finished
Fixed content extraction:
{'job_id': '3bd32013fc20455297e9e00dfbd9d135',
 'task_name': 'extract-content',
 'params': {'job_id': '5c133335e8f94b1ba21da93b4f7133db',
            'chunk_size': 1000,
            'overlap': 200},
 'status': 'finished',
 'created_at': '2025-08-16T08:41:49.197931',
 'started_at': '2025-08-16T08:41:49.198118',
 'finished_at': '2025-08-16T08:41:49.216962',
 'result': {'source_directory': '/storage/scraped_data/scrape-site/5c133335e8f94b1ba21da93b4f7133db',
            'html_files_found': 5,
            'successful_extractions': 5,
            'failed_extractions': 0,
            'total_chunks': 48,
            'total_tokens': 7430,
            'content_directory': 'extracted_content',
            'vector_store_file': 'vector_store_ready.json',
            'summary_file': 'extraction_summary.json'},
 'error': None,
 'status_with_elapsed': 'finished'}
