In [1]:
import os, time, pathlib, pprint, requests

# Base URL that the helpers below prefix to their /jobs/... requests;
# override it with the BROWSER_ENDPOINT environment variable.
EP = os.environ.get("BROWSER_ENDPOINT", "http://browser:8004")

# Filesystem location for scraped data — presumably where the service writes
# its output; it is not referenced by the cells visible here (TODO confirm).
SCRAPED = pathlib.Path("/storage") / "scraped_data"

def wait_for(job_id, every=2):
    """Poll ``{EP}/jobs/{job_id}`` until the job reaches a terminal status.

    While the job is still running, the current progress text is rewritten on
    a single output line; once it ends, the final status is printed on its own
    line.

    Args:
        job_id: Identifier of the job to poll.
        every: Seconds to sleep between consecutive polls.

    Returns:
        The decoded JSON job record whose ``"status"`` is ``"finished"`` or
        ``"error"``.

    Raises:
        requests.HTTPError: If a poll returns a 4xx/5xx response.
    """
    terminal_states = {"finished", "error"}
    while True:
        resp = requests.get(f"{EP}/jobs/{job_id}")
        # Fail loudly on HTTP errors (as submit() does) instead of tripping
        # over a missing "status" key in an error payload.
        resp.raise_for_status()
        rec = resp.json()
        status = rec["status"]
        if status not in terminal_states:
            # "\r" returns to the start of the line so each update overwrites
            # the previous one; fall back to the bare status if the record
            # carries no elapsed-time text.
            print("\r" + rec.get("status_with_elapsed", status), end="")
        else:
            print("\n" + status)
            return rec
        time.sleep(every)

def submit(task, payload):
    """Create a job of kind ``task`` and block until it completes.

    Args:
        task: Job type, used as the final path segment of ``{EP}/jobs/{task}``.
        payload: JSON-serialisable request body for the job.

    Returns:
        Whatever ``wait_for`` returns for the newly created job.

    Raises:
        requests.HTTPError: If the POST returns a 4xx/5xx response.
    """
    url = f"{EP}/jobs/{task}"
    response = requests.post(url, json=payload)
    response.raise_for_status()

    new_job_id = response.json()["job_id"]
    print("🆔", task, "job:", new_job_id)
    return wait_for(new_job_id)

In [None]:
# Scrape docs.python.org with the crawl capped at 10 pages, then show the
# resulting job record.
limited_test = dict(url="https://docs.python.org", max_pages=10)
res1 = submit("scrape-site", limited_test)

print("Limited pages scrape:")
pprint.pp(res1)

In [None]:
  # Re-test website scraping with use_browser enabled and a 5-page cap;
  # CSS files are expected to be skipped now.
  # NOTE(review): this URL has no scheme, unlike the https:// URL used in the
  # limited-pages cell — confirm the service normalises it.
  scrape_test = dict(url="www.care-mate.co", max_pages=5, use_browser=True)
  res2 = submit("scrape-site", scrape_test)

  print("Fixed website scraping:")
  pprint.pp(res2)

In [None]:
# Run extract-content against the scrape job started above (res2), but only
# when that job finished; otherwise report why nothing was submitted so the
# cell never completes silently.
if res2.get("status") == "finished":
    extract_test = {
        "job_id": res2["job_id"],  # ties the extraction to the finished scrape job
        # Chunking parameters; their units are defined by the service.
        "chunk_size": 1000,
        "overlap": 200
    }
    res3 = submit("extract-content", extract_test)
    print("Fixed content extraction:")
    pprint.pp(res3)
else:
    print(f"Skipping extract-content: scrape job status is {res2.get('status')!r}")