In [49]:
#| default_exp download

# Download helpers

- Download and process LLM-ready documents

In [50]:
#| export
from fastcore.utils import *
from httpx import get
from fastcore.meta import delegates
from fastcore.test import *
from llms_txt import *

from html2text import HTML2Text
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin

In [51]:
from IPython.display import Markdown,HTML

In [52]:
#| export
def clean_md(text, rm_comments=True, rm_details=True):
    "Remove HTML comments and `<details>` sections (with or without tag attributes) from `text`"
    # Each pattern also swallows at most one newline on either side of the removed span.
    if rm_comments: text = re.sub(r'\n?<!--.*?-->\n?', '', text, flags=re.DOTALL)
    # Allow attributes on the opening tag (e.g. `<details open>`); the `\s` requirement
    # keeps unrelated tags such as `<detailsx>` or `<details-foo>` from matching.
    if rm_details: text = re.sub(r'\n?<details(?:\s[^>]*)?>.*?</details>\n?', '', text, flags=re.DOTALL)
    return text

In [53]:
#| export
@delegates(get)
def read_md(url, rm_comments=True, rm_details=True, **kwargs):
    "Fetch the text at `url` (extra `kwargs` are passed to `get`) and clean it with `clean_md`"
    raw = get(url, **kwargs).text
    return clean_md(raw, rm_comments=rm_comments, rm_details=rm_details)

In [54]:
# Fetch a sample markdown page over the network to exercise `read_md`
mdurl = 'https://claudette.answer.ai/index.html.md'
md = read_md(mdurl)
# Uncomment to render the cleaned markdown in the notebook
# Markdown(md)

In [55]:
#| export
def html2md(s:str):
    "Convert HTML string `s` to markdown using `HTML2Text`"
    # A very large `bodywidth` is used, presumably so long lines are not hard-wrapped
    converter = HTML2Text(bodywidth=5000)
    # Enable html2text's link/image suppression and code marking options
    for opt in ('ignore_links', 'mark_code', 'ignore_images'): setattr(converter, opt, True)
    return converter.handle(s)

In [56]:
#| export
def read_html(url, sel=None, rm_comments=True, rm_details=True):
    "Get `url`, optionally keeping only the first element matching CSS selector `sel`, and convert to clean markdown"
    page = get(url).text
    if sel:
        soup = BeautifulSoup(page, 'html.parser')
        # `find(sel)` only matches by tag name; `select_one` evaluates real CSS selectors
        # (`#id`, `.class`, `div > p`, ...) and still accepts a plain tag name.
        el = soup.select_one(sel)
        # No match: convert an empty document rather than the literal text 'None'
        page = '' if el is None else str(el)
    md = html2md(page)
    return clean_md(md, rm_comments, rm_details=rm_details)

In [57]:
# Fetch a sample HTML page over the network and convert it to markdown with `read_html`
htmlurl = 'https://hypermedia.systems/hypermedia-a-reintroduction/'
hmd = read_html(htmlurl)
# Uncomment to render the converted markdown in the notebook
# Markdown(hmd)

In [58]:
#| export
def get_llmstxt(url, optional=False, n_workers=None):
    "Get llms.txt file from `url` and expand it with `llms_txt.create_ctx()`; `None` if `url` is not an llms.txt or the status is not 200"
    # Only URLs ending in 'llms.txt' are ever requested
    if url.endswith('llms.txt'):
        resp = get(url)
        if resp.status_code == 200:
            return create_ctx(resp.text, optional=optional, n_workers=n_workers)
    return None

In [59]:
# print(get_llmstxt('https://llmstxt.org/llms.txt'))

In [60]:
#| export
def split_url(url):
    "Split `url` into base (scheme and host), path, and file name; path becomes '/' when path and file name are both empty"
    # Leading/trailing slashes are stripped first, so 'https://x.org/' and 'https://x.org' split identically
    parts = urlparse(url.strip('/'))
    head, sep, tail = parts.path.rpartition('/')
    # The file name keeps its leading slash, e.g. '/index.html'
    fname = sep + tail
    path = head if (head or fname) else '/'
    return f"{parts.scheme}://{parts.netloc}", path, fname

In [61]:
# Sample URLs: a nested file, a top-level file, and bare domains with and without a trailing slash
urls = ('https://claudette.answer.ai/path/index.html', 'https://claudette.answer.ai/',
        'https://claudette.answer.ai/index.html', 'https://llmstxt.org', 'https://llmstxt.org/')

# Show how each sample URL splits into (base, path, file name)
[split_url(o) for o in urls]

[('https://claudette.answer.ai', '/path', '/index.html'),
 ('https://claudette.answer.ai', '/', ''),
 ('https://claudette.answer.ai', '', '/index.html'),
 ('https://llmstxt.org', '/', ''),
 ('https://llmstxt.org', '/', '')]

In [62]:
#| export
def _tryget(url):
    "Return `url` itself if requesting it gives any status other than 404, otherwise `None`"
    if get(url).status_code == 404: return None
    return url

In [63]:
#| export
def find_docs(url):
    "Return the URL of an llms.txt or markdown document for `url`, or `None` if none is found"
    root, dirpath, name = split_url(url)
    url = (root + dirpath + name).strip('/')
    # An explicit llms.txt URL is returned as-is, without a request
    if name == '/llms.txt': return url
    # Already a text-like document: just check that it is not a 404
    if Path(name).suffix in ('.md', '.txt', '.rst'): return _tryget(url)
    # Any other file name with an extension: look for a sibling with '.md' appended
    if '.' in name: return _tryget(url + '.md')
    # No file name: probe the candidate document names in priority order
    for candidate in ('/llms.txt', '/index.md', '/index.html.md', '/index-commonmark.md'):
        found = _tryget(url + candidate)
        if found: return found
    return None

In [64]:
# URL whose last segment has no extension, so `find_docs` probes the candidate document names under it
fl_url = 'https://answerdotai.github.io/fastlite'

In [65]:
find_docs(fl_url)

'https://answerdotai.github.io/fastlite/index.html.md'

In [66]:
# Resolve each sample URL to its docs location (`None` when nothing is found)
for o in urls: print(find_docs(o))

None
https://claudette.answer.ai/index.html.md
https://claudette.answer.ai/index.html.md
https://llmstxt.org/llms.txt
https://llmstxt.org/llms.txt


In [67]:
#| export
def read_docs(url, optional=False, n_workers=None, rm_comments=True, rm_details=True):
    "Return cleaned llms.txt context or markdown text for `url`, walking up parent paths until docs are found; `None` if there are none"
    fn = find_docs(url)
    if not fn:
        parsed_url = urlparse(url)
        path = parsed_url.path.rstrip('/')
        # Already at the site root: nothing left to try
        if not path: return None
        # Step up exactly one path level. `urljoin(url, '..')` resolves '/a/b/c' to '/a/',
        # which skips '/a/b/' so that directory's docs were never probed.
        parent = f"{parsed_url.scheme}://{parsed_url.netloc}{path.rpartition('/')[0]}/"
        return read_docs(parent, optional, n_workers, rm_comments, rm_details)
    if fn.endswith('/llms.txt'): res = get_llmstxt(fn, optional=optional, n_workers=n_workers)
    else: res = get(fn).text
    # `get_llmstxt` returns `None` on a non-200 response, and `clean_md` cannot process `None`
    if res is None: return None
    return clean_md(res, rm_comments=rm_comments, rm_details=rm_details)

In [68]:
# Sub-paths with no docs of their own should fall back to the docs found at the site root
llmstxt = "https://llmstxt.org"
test_eq(read_docs(llmstxt), read_docs(llmstxt + "/path"))
test_eq(read_docs(llmstxt), read_docs(llmstxt + "/path/path"))

## Export -

In [69]:
#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
nbdev_export()