In [None]:
#| default_exp core

# PlaywrightNB source
> Helpers for using Playwright from notebooks and more

In [None]:
#| export
from fastcore.utils import *
from fastcore.meta import delegates
import uuid, warnings

from playwright.async_api import async_playwright, TimeoutError as PTimeoutError
from playwright_stealth import stealth_async
from anyio import from_thread
from httpx import get

from bs4 import BeautifulSoup, GuessedAtParserWarning
from html2text import HTML2Text

In [None]:
#| export
async def get_page(*args, stealth=False, **kwargs):
    "Start Playwright, launch Chromium with `args`/`kwargs`, and return a new page (with `stealth_async` applied if `stealth`)"
    p = await async_playwright().start()
    try:
        c = await p.chromium.launch(*args, **kwargs)
        ctx = await c.new_context()
        page = await ctx.new_page()
        if stealth: await stealth_async(page)
    except Exception:
        # Setup failed part-way: no page is handed back, so stop Playwright here,
        # otherwise the caller has no handle left to shut it down.
        await p.stop()
        raise
    # Expose Playwright's `stop` on the page so callers can shut everything down with `await page.stop()`
    page.stop = p.stop
    return page

In [None]:
# Launch a page with default options and navigate to a simple test URL
page = await get_page()
await page.goto('http://example.org')

<Response url='http://example.org/' request=<Request url='http://example.org/' method='GET'>>

In [None]:
#| export
async def page_ready(page, pause=50, timeout=5000):
    "Wait until the main content of `page` is ready"
    for state in ('domcontentloaded', 'networkidle'):
        await page.wait_for_load_state(state)
    await page.wait_for_timeout(pause)
    # Best effort: timing out while waiting for a `meta` element is deliberately ignored
    try:
        await page.wait_for_selector('meta', state="attached", timeout=timeout)
    except PTimeoutError:
        pass
    await page.wait_for_timeout(pause)

In [None]:
#| export
async def frames_ready(page, pause=50, timeout=5000):
    "Wait for every visible iframe on `page` (if any) to be ready"
    for el in (await page.query_selector_all('iframe:visible')) or []:
        await el.wait_for_element_state('visible', timeout=timeout)
        await page.wait_for_timeout(pause)
        inner = await el.content_frame()
        if not inner: continue  # no content frame for this element: nothing more to wait on
        for state in ('domcontentloaded', 'networkidle'):
            await inner.wait_for_load_state(state, timeout=timeout)

In [None]:
#| export
async def wait_page(page, pause=50, timeout=5000):
    "Wait for `page` itself, then for its visible frames (if any), to be ready"
    opts = dict(pause=pause, timeout=timeout)
    await page_ready(page, **opts)
    await frames_ready(page, **opts)

In [None]:
# Test URL used by the iframe-handling examples below
sh_url = 'https://help.dyalog.com/19.0/#UserGuide/Installation%20and%20Configuration/Shell%20Scripts.htm'

In [None]:
# Navigate the existing page to the test URL and wait for the page and its visible frames
await page.goto(sh_url)
await wait_page(page)

In [None]:
#| export
async def get_full_content(page):
    "Return `(html, frames)`: the content of `page` and a dict of the content of each of its iframes"
    html = await page.content()
    frames = {}
    for el in await page.query_selector_all('iframe'):
        inner = await el.content_frame()
        if not inner: continue
        # Key by the iframe's `id` attribute; fall back to a random UUID when it is missing/empty
        key = (await el.get_attribute('id')) or str(uuid.uuid4())
        frames[key] = await inner.content()
    return html, frames

In [None]:
# `cts` is the main page HTML; `iframes` maps each iframe's id (or a generated UUID) to its HTML
cts, iframes = await get_full_content(page)

In [None]:
await page.close()

In [None]:
#| export
async def read_page_async(url, pause=50, timeout=5000, stealth=False, page=None):
    "Return contents of `url` and its iframes using Playwright async"
    # When no `page` is supplied we create one, and are then responsible for tearing it down
    owns_page = not page
    if owns_page: page = await get_page(stealth=stealth)
    try:
        await page.goto(url)
        await wait_page(page, pause=pause, timeout=timeout)
        return await get_full_content(page)
    finally:
        if owns_page:
            try: await page.close()
            finally:
                # `get_page` attaches Playwright's `stop` to the page; closing the page alone
                # leaves the browser and driver running, so stop them too.
                stop = getattr(page, 'stop', None)
                if stop: await stop()

In [None]:
# One-shot async read: no `page` is passed, so `read_page_async` creates and closes its own
cts,iframes = await read_page_async(sh_url)

In [None]:
#| export
def read_page(url, pause=50, timeout=5000, stealth=False, page=None):
    "Synchronous wrapper around `read_page_async`: contents of `url` and its iframes"
    with from_thread.start_blocking_portal() as portal:
        # Arguments are passed positionally, in the order of `read_page_async`'s signature
        return portal.call(read_page_async, url, pause, timeout, stealth, page)

In [None]:
# Same read as above, through the synchronous wrapper (no `await`)
cts,iframes = read_page(sh_url)

In [None]:
# First 50 characters of the content stored under the key `topic`
iframes['topic'][:50]

'<!DOCTYPE html><html xmlns:madcap="http://www.madc'

In [None]:
#| export
def h2md(h):
    "Convert HTML `h` to markdown using `HTML2Text`"
    converter = HTML2Text(bodywidth=5000)
    # Turn on the three boolean options used for every conversion
    for opt in ('ignore_links', 'mark_code', 'ignore_images'): setattr(converter, opt, True)
    # `h` may be any object; it is stringified before conversion
    markup = str(h)
    return converter.handle(markup)

In [None]:
#| export
async def url2md_async(url, sel=None, pause=50, timeout=5000, stealth=False, page=None):
    "Read `url` with `read_page_async` and convert to markdown, optionally selecting CSS selector `sel`"
    warnings.filterwarnings("ignore", category=GuessedAtParserWarning)
    # Only the main page content is converted; iframe contents are discarded
    cts,_ = await read_page_async(url, pause, timeout=timeout, stealth=stealth, page=page)
    soup = BeautifulSoup(cts)
    # `sel` is optional: `select_one` needs a selector string, so with no `sel` convert the whole document
    content = soup.select_one(sel) if sel else soup
    return h2md(content)

In [None]:
#| export
def url2md(url, sel=None, pause=50, timeout=5000, stealth=False, page=None):
    "Read `url` with `read_page` and convert to markdown, optionally selecting CSS selector `sel`"
    warnings.filterwarnings("ignore", category=GuessedAtParserWarning)
    # Only the main page content is converted; iframe contents are discarded
    cts,_ = read_page(url, pause, timeout=timeout, stealth=stealth, page=page)
    soup = BeautifulSoup(cts)
    # `sel` is optional: `select_one` needs a selector string, so with no `sel` convert the whole document
    content = soup.select_one(sel) if sel else soup
    return h2md(content)

In [None]:
#| export
@delegates(get)
def get2md(url, sel=None, **kwargs):
    "Read `url` with `httpx.get` (passing `kwargs`) and convert to markdown, optionally selecting CSS selector `sel`"
    warnings.filterwarnings("ignore", category=GuessedAtParserWarning)
    # The response object is handed to BeautifulSoup as-is, exactly as before
    cts = get(url, **kwargs)
    soup = BeautifulSoup(cts)
    # `sel` is optional: `select_one` needs a selector string, so with no `sel` convert the whole document
    content = soup.select_one(sel) if sel else soup
    return h2md(content)

In [None]:
# Fetch a docs page over plain HTTP (no browser), keep only the `.docs-content` element,
# and show the first 120 characters of the resulting markdown
url = 'https://docs.railway.app/guides/public-api'
md = get2md(url, ".docs-content")
print(md[:120])

# Use the Public API

The Railway public API is built with GraphQL and is the same API that powers the Railway dashboard


## Export -

In [None]:
#|hide
# Run nbdev's export for this notebook (presumably writes the `#| export` cells to the `core` module named by `default_exp` — confirm against nbdev settings)
import nbdev; nbdev.nbdev_export()