# Python source

> Source code for `llms_txt` Python module, containing helpers to create and use llms.txt files

In [None]:
#| default_exp core

In [None]:
#| export
import re

In [None]:
#| hide
from nbdev.showdoc import *
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
from fastcore.utils import *
from fastcore.xml import *
from fastcore.script import *
import httpx

## Introduction

In [None]:
%ai reset

The llms.txt file spec is for files located at the path `llms.txt` of a website (or, optionally, in a subpath). `llms-sample.txt` is a simple example. A file following the spec contains the following sections as markdown, in this specific order:

- An H1 with the name of the project or site. This is the only required section
- A blockquote with a short summary of the project, containing key information necessary for understanding the rest of the file
- Zero or more markdown sections (e.g. paragraphs, lists, etc) of any type, except headings, containing more detailed information about the project and how to interpret the provided files
- Zero or more markdown sections delimited by H2 headers, containing "file lists" of URLs where further detail is available
  - Each "file list" is a markdown list in which every item contains a required markdown hyperlink `[name](url)`, optionally followed by a `:` and notes about the file.

Here's the start of a sample llms.txt file we'll use for testing:

In [None]:
samp = Path('llms-sample.txt').read_text()
print(samp[:480])

# FastHTML

> FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore's `FT` "FastTags" into a library for creating server-rendered hypermedia applications.

Remember:

- Use `serve()` for running uvicorn (`if __name__ == "__main__"` is not needed since it's automatic)
- When a title is needed with a response, use `Titled`; note that that already wraps children in `Container`, and already includes both the meta title as well as the H1 element


## Reading

We'll implement `parse_llms_file` to pull out the sections of llms.txt into a simple data structure.

In [None]:
#| export
def opt_re(s):
    "Wrap `s` in a non-capturing group that may match zero or one time"
    return '(?:{})?'.format(s)

def named_re(nm, pat):
    "Wrap `pat` in a capture group named `nm`"
    return '(?P<{}>{})'.format(nm, pat)

def search(pat, txt, flags=0):
    "Named groups of the first match of `pat` in `txt` as a dict, or `None` if there is no match"
    found = re.search(pat, txt, flags=flags)
    if not found: return None
    return found.groupdict()

We'll work "outside in" so we can test the innermost matches as we go.

### Parse links

In [None]:
link = '- [FastHTML quick start](https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md): A brief overview of FastHTML features'

In [None]:
%%aip 0
Parse the first part of `link` into a dict

In [None]:
title = named_re('title', r'[^\]]+')
pat =  fr'-\s*\[{title}\]'
search(pat, samp)

{'title': 'FastHTML quick start'}

In [None]:
%%aip 0
Do the next bit.

In [None]:
url = named_re('url', r'[^\)]+')
pat += fr'\({url}\)'
search(pat, samp)

{'title': 'FastHTML quick start',
 'url': 'https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md'}

In [None]:
%%aip 0
Do the final bit. Note it's optional.

In [None]:
desc = named_re('desc', r'.*')
pat += opt_re(fr':\s*{desc}')
search(pat, link)

{'title': 'FastHTML quick start',
 'url': 'https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md',
 'desc': 'A brief overview of FastHTML features'}

In [None]:
%%aip 0
Combine those sections into a function `parse_link(txt)`

In [None]:
#| export
def parse_link(txt):
    "Parse a `- [title](url): desc` link item from llms.txt into a dict with keys `title`, `url` and `desc` (`None` if absent)"
    title = named_re('title', r'[^\]]+')
    url = named_re('url', r'[^\)]+')
    desc = named_re('desc', r'.*')
    # The `: description` tail is optional, so wrap it in an optional group
    desc_pat = opt_re(fr":\s*{desc}")
    pat = fr'-\s*\[{title}\]\({url}\){desc_pat}'
    m = re.search(pat, txt)
    # Without this check a non-link line fails with an opaque AttributeError on `None`;
    # raise an error that shows the offending text instead.
    if m is None: raise ValueError(f'Not an llms.txt link item: {txt!r}')
    return m.groupdict()

In [None]:
parse_link(link)

{'title': 'FastHTML quick start',
 'url': 'https://docs.fastht.ml/tutorials/quickstart_for_web_devs.html.md',
 'desc': 'A brief overview of FastHTML features'}

In [None]:
parse_link('-[foo](http://foo)')

{'title': 'foo', 'url': 'http://foo', 'desc': None}

### Parse sections

In [None]:
sections = '''First bit.

## S1

-[foo](http://foo)
- [foo2](http://foo2): stuff

## S2

- [foo3](http://foo3)'''

In [None]:
start,*rest = re.split(fr'^##\s*(.*?$)', sections, flags=re.MULTILINE)
start

'First bit.\n\n'

In [None]:
rest

['S1',
 '\n\n-[foo](http://foo)\n- [foo2](http://foo2): stuff\n\n',
 'S2',
 '\n\n- [foo3](http://foo3)']

In [None]:
%%aip 0
Concisely create a dict from the pairs in `rest`.

In [None]:
d = dict(chunked(rest, 2))
d

{'S1': '\n\n-[foo](http://foo)\n- [foo2](http://foo2): stuff\n\n',
 'S2': '\n\n- [foo3](http://foo3)'}

In [None]:
links = d['S1']
links.strip()

'-[foo](http://foo)\n- [foo2](http://foo2): stuff'

In [None]:
%%aip 0
Parse `links` into a list of links. There can be multiple newlines between them.

In [None]:
#| export
def _parse_links(links):
    "Split `links` on runs of newlines and parse every non-blank line with `parse_link`"
    lines = re.split(r'\n+', links.strip())
    parsed = []
    for line in lines:
        if line.strip(): parsed.append(parse_link(line))
    return parsed

In [None]:
_parse_links(links)

[{'title': 'foo', 'url': 'http://foo', 'desc': None},
 {'title': 'foo2', 'url': 'http://foo2', 'desc': 'stuff'}]

In [None]:
%%aip 0
Create a function that uses the above steps to parse an llms.txt into `start` and a dict with keys like `d` and parsed list of links as values.

In [None]:
#| export
def _parse_llms(txt):
    "Split `txt` at `##` headings; return the stripped text before the first heading and a dict of heading -> parsed links"
    # Splitting on a pattern with a capture group interleaves the heading text with the bodies
    head, *tail = re.split(r'^##\s*(.*?$)', txt, flags=re.MULTILINE)
    raw = dict(chunked(tail, 2))
    parsed = {}
    for name, body in raw.items(): parsed[name] = _parse_links(body)
    return head.strip(), parsed

In [None]:
start, sects = _parse_llms(samp)
start

'# FastHTML\n\n> FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\'s `FT` "FastTags" into a library for creating server-rendered hypermedia applications.\n\nRemember:\n\n- Use `serve()` for running uvicorn (`if __name__ == "__main__"` is not needed since it\'s automatic)\n- When a title is needed with a response, use `Titled`; note that that already wraps children in `Container`, and already includes both the meta title as well as the H1 element.'

In [None]:
title = named_re('title', r'.+?$')
summ = named_re('summary', '.+?$')
summ_pat = opt_re(fr"^>\s*{summ}$")
info = named_re('info', '.*')

In [None]:
pat = fr'^#\s*{title}\n+{summ_pat}\n+{info}'
search(pat, start, (re.MULTILINE|re.DOTALL))

{'title': 'FastHTML',
 'summary': 'FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\'s `FT` "FastTags" into a library for creating server-rendered hypermedia applications.',
 'info': 'Remember:\n\n- Use `serve()` for running uvicorn (`if __name__ == "__main__"` is not needed since it\'s automatic)\n- When a title is needed with a response, use `Titled`; note that that already wraps children in `Container`, and already includes both the meta title as well as the H1 element.'}

In [None]:
%%aip 0
Let's finish it off!

In [None]:
#| export
def parse_llms_file(txt):
    "Parse llms.txt file contents in `txt` to an `AttrDict` with `title`, `summary`, `info` and `sections`"
    start,sects = _parse_llms(txt)
    title = named_re('title', r'.+?$')
    summ = named_re('summary', '.+?$')
    summ_pat = opt_re(fr"^>\s*{summ}$")
    info = named_re('info', '.*')
    # Only the H1 is required, so the newline runs after the title and after the summary
    # must be optional (`\n*`). With `\n+` a title-only file did not match at all, and a
    # title+summary file with no further text had its blockquote captured as `info`.
    pat = fr'^#\s*{title}\n*{summ_pat}\n*{info}'
    d = search(pat, start, (re.MULTILINE|re.DOTALL))
    # `search` returns None when there is no H1; fail with a clear message rather than
    # a TypeError from assigning into None below.
    if d is None: raise ValueError('llms.txt content must contain an H1 title line starting with "#"')
    d['sections'] = sects
    return dict2obj(d)

In [None]:
llmsd = parse_llms_file(samp)
llmsd.summary

'FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore\'s `FT` "FastTags" into a library for creating server-rendered hypermedia applications.'

In [None]:
llmsd.sections.Examples

(#1) [{'title': 'Todo list application', 'url': 'https://raw.githubusercontent.com/AnswerDotAI/fasthtml/main/examples/adv_app.py', 'desc': 'Detailed walk-thru of a complete CRUD app in FastHTML showing idiomatic use of FastHTML and HTMX patterns.'}]

## XML conversion

For some LLMs such as Claude, XML format is preferred, so we'll provide a function to create that format.

In [None]:
#| export
from fastcore.xml import Sections,Project,Doc

In [None]:
#| export
def _doc(kw):
    "Create a `Doc` FT object with the text retrieved from `url` as the child, and the remaining items of `kw` as attrs."
    # Pop from a shallow copy: `kw` is one of the link dicts stored in the parsed llms.txt
    # sections, so popping from it directly would strip `url` from the caller's data and
    # make a second context build on the same object fail with KeyError.
    attrs = dict(kw)
    url = attrs.pop('url')
    # Drop lines that consist solely of an HTML comment
    re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)
    txt = [o for o in httpx.get(url).text.splitlines() if not re_comment.search(o)]
    return Doc('\n'.join(txt), **attrs)

In [None]:
#| export
def _section(nm, items, n_workers=None):
    "Build an `ft` element named `nm` whose children are the `_doc` results for `items`, computed via `parallel` with a thread pool."
    docs = parallel(_doc, items, n_workers=n_workers, threadpool=True)
    return ft(nm, *docs)

In [None]:
#| export
def mk_ctx(d, optional=True, n_workers=None):
    "Build a `Project` from parsed llms.txt `d`: `d.info` followed by one section per entry of `d.sections`, leaving out 'Optional' unless `optional`."
    # An empty name excludes nothing in practice; 'Optional' drops that one section
    excluded = 'Optional' if not optional else ''
    parts = []
    for name, links in d.sections.items():
        if name == excluded: continue
        parts.append(_section(name, links, n_workers=n_workers))
    proj = Project(title=d.title, summary=d.summary)
    return proj(d.info, *parts)

In [None]:
ctx = mk_ctx(llmsd)
print(to_xml(ctx, do_escape=False)[:260]+'...')

<project title="FastHTML" summary='FastHTML is a python library which brings together Starlette, Uvicorn, HTMX, and fastcore&#39;s `FT` "FastTags" into a library for creating server-rendered hypermedia applications.'>
Remember:

- Use `serve()` for running uvi...


In [None]:
#| export
def get_sizes(ctx):
    "Map each tagged child of `ctx` to a dict of its children's `title` -> length of their first child"
    sizes = {}
    for sect in ctx.children:
        # Children without a `tag` attribute (e.g. plain text) are not sections
        if not hasattr(sect, 'tag'): continue
        tag = sect.tag
        sizes[tag] = {doc.title: len(doc.children[0]) for doc in sect.children}
    return sizes

In [None]:
get_sizes(ctx)

{'docs': {'FastHTML quick start': 27376,
  'HTMX reference': 26427,
  'Starlette quick guide': 7936},
 'examples': {'Todo list application': 18558},
 'optional': {'Starlette full documentation': 48331}}

In [None]:
Path('../fasthtml.md').write_text(to_xml(ctx, do_escape=False))

129814

In [None]:
#| export
def create_ctx(txt, optional=False, n_workers=None):
    "Parse llms.txt contents in `txt`, build its context with `mk_ctx`, and return it rendered by `to_xml` (unescaped); the 'Optional' section is left out unless `optional`."
    parsed = parse_llms_file(txt)
    return to_xml(mk_ctx(parsed, optional=optional, n_workers=n_workers), do_escape=False)

In [None]:
#| export
@call_parse
def llms_txt2ctx(
    fname:str, # File name to read
    optional:bool_arg=False, # Include 'optional' section?
    n_workers:int=None # Number of threads to use for parallel downloading
):
    "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
    # NOTE(review): the docstring and per-parameter comments above are presumably surfaced
    # as command-line help by `call_parse`, so they are kept verbatim — confirm before rewording.
    contents = Path(fname).read_text()
    xml = create_ctx(contents, optional=optional, n_workers=n_workers)
    print(xml)

In [None]:
!llms_txt2ctx llms-sample.txt > ../fasthtml.md

## Export -

In [None]:
#|hide
#|eval: false
from nbdev import nbdev_export
nbdev_export()