In [1]:
import json, mimetypes, time, sys, asyncio
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

# Optional Playwright
try:
    from playwright.sync_api import sync_playwright
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False

# Use the Proactor loop on Windows (needed for Playwright subprocess)
if sys.platform.startswith("win"):
    asyncio.set_event_loop_policy(asyncio.WindowsProactorEventLoopPolicy())

# ---------- CONFIG: folder with deduplicated.json ----------
BASE_DIR = Path(r"C:\Users\Ans\Desktop\code\36_PACS_PROS\agents\data\PACS Viewers\2. gehealthcare.com, [GE HealthCare True PACS (formerly Centricity PACS)], PACS - Diagnostic Imaging Viewer")
dedup_file = BASE_DIR / "deduplicated.json"
files_dir = BASE_DIR / "files"
pages_dir = BASE_DIR / "web pages"
files_dir.mkdir(exist_ok=True)
pages_dir.mkdir(exist_ok=True)

UA = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
TIMEOUT = 60
MAX_HTTP_WORKERS = 12

manifest = []
pending_render = []  # queued for main-thread Playwright

def clean_url(url: str) -> str:
    url = url.strip().strip("*_")
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    return url

def classify_via_headers(url: str):
    try:
        r = requests.head(url, allow_redirects=True, timeout=TIMEOUT, headers={"User-Agent": UA})
        return r.headers.get("Content-Type", "").lower(), r.headers.get("Content-Disposition", "")
    except Exception:
        return "", ""

def guess_name(url: str, disp: str, ctype: str):
    if "filename=" in disp:
        name = disp.split("filename=")[-1].strip("\"'; ")
        if name:
            return name
    path_part = url.split("?")[0].rstrip("/").split("/")[-1]
    if path_part:
        return path_part
    ext = mimetypes.guess_extension(ctype.split(";")[0].strip()) if ctype else ".bin"
    return f"download{ext or '.bin'}"

def save_file(url: str, ctype: str, disp: str):
    name = guess_name(url, disp, ctype or "")
    target = files_dir / name
    with requests.get(url, stream=True, allow_redirects=True, timeout=TIMEOUT, headers={"User-Agent": UA}) as r:
        r.raise_for_status()
        with open(target, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)
    return str(target)

def save_html_raw(url: str):
    r = requests.get(url, allow_redirects=True, timeout=TIMEOUT, headers={"User-Agent": UA})
    r.raise_for_status()
    name = guess_name(url, "", "text/html")
    if not name.lower().endswith(".html"):
        name += ".html"
    target = pages_dir / name
    target.write_text(r.text, encoding=r.encoding or "utf-8", errors="ignore")
    return str(target)

def save_html_rendered(url: str):
    if not PLAYWRIGHT_AVAILABLE:
        return None
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url, wait_until="networkidle", timeout=TIMEOUT * 1000)
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000)
        html = page.content()
        name = guess_name(url, "", "text/html")
        if not name.lower().endswith(".html"):
            name += ".rendered.html"
        target = pages_dir / name
        target.write_text(html, encoding="utf-8", errors="ignore")
        browser.close()
        return str(target)

def process_url_http(url: str):
    record = {"url": url, "status": "unknown", "saved": None, "type": None, "error": None}
    try:
        ctype, disp = classify_via_headers(url)
        is_file = any(s in ctype for s in ["application/", "image/", "audio/", "video/"]) or "filename=" in disp
        if is_file:
            record["saved"] = save_file(url, ctype, disp)
            record["type"] = "file"
            record["status"] = "ok"
            return record
        # Try raw HTML
        record["saved"] = save_html_raw(url)
        record["type"] = "html_raw"
        record["status"] = "ok"
        return record
    except Exception as e:
        record["error"] = repr(e)
        # Queue for render fallback (main thread) if available
        if PLAYWRIGHT_AVAILABLE:
            record["status"] = "pending_render"
            pending_render.append(record)
        else:
            record["status"] = "failed"
        return record

# Load URLs
data = json.loads(dedup_file.read_text(encoding="utf-8"))
urls = []
for key, entry in data.items():
    if key == "_meta":
        continue
    raw = entry.get("original form") or entry.get("bare minimum form") or ""
    if raw:
        urls.append(clean_url(raw))

start = time.time()

# Phase 1: HTTP in parallel
with ThreadPoolExecutor(max_workers=MAX_HTTP_WORKERS) as pool:
    futures = {pool.submit(process_url_http, u): u for u in urls}
    for fut in as_completed(futures):
        rec = fut.result()
        manifest.append(rec)
        print(f"{rec['status']:12} {rec['type'] or '-':12} {rec['url']} -> {rec['saved']}")

# Phase 2: Render pending URLs in main thread (sequential)
if PLAYWRIGHT_AVAILABLE and pending_render:
    print(f"\nRendering {len(pending_render)} URLs via Playwright...")
    for rec in pending_render:
        try:
            saved = save_html_rendered(rec["url"])
            rec["saved"] = saved
            rec["type"] = "html_rendered"
            rec["status"] = "ok"
            rec["error"] = None
            print(f"ok           html_rendered  {rec['url']} -> {saved}")
        except Exception as e:
            rec["status"] = "failed"
            rec["error"] = repr(e)
            print(f"failed       -              {rec['url']} -> {rec['error']}")

manifest_path = BASE_DIR / "fetch_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2), encoding="utf-8")
print(f"\nDone in {time.time() - start:0.2f}s. Manifest: {manifest_path}")


ok           html_raw     https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K123174 -> C:\Users\Ans\Desktop\code\36_PACS_PROS\agents\data\PACS Viewers\2. gehealthcare.com, [GE HealthCare True PACS (formerly Centricity PACS)], PACS - Diagnostic Imaging Viewer\web pages\pmn.cfm.html
ok           html_raw     https://www.accessdata.fda.gov/scripts/cdrh/cfdocs/cfpmn/pmn.cfm?ID=K083018 -> C:\Users\Ans\Desktop\code\36_PACS_PROS\agents\data\PACS Viewers\2. gehealthcare.com, [GE HealthCare True PACS (formerly Centricity PACS)], PACS - Diagnostic Imaging Viewer\web pages\pmn.cfm.html
ok           file         https://www.gehealthcare.com/-/jssmedia/gehc/us/files/products/truepacpage/solution-brief-edison-true-radiology-iw.pdf?rev=-1 -> C:\Users\Ans\Desktop\code\36_PACS_PROS\agents\data\PACS Viewers\2. gehealthcare.com, [GE HealthCare True PACS (formerly Centricity PACS)], PACS - Diagnostic Imaging Viewer\files\solution-brief-edison-true-radiology-iw.pdf
ok           file      

In [1]:
a=2

In [1]:
from playwright.sync_api import sync_playwright