-
Notifications
You must be signed in to change notification settings - Fork 119
Expand file tree
/
Copy pathcore.py
More file actions
129 lines (109 loc) · 4.93 KB
/
core.py
File metadata and controls
129 lines (109 loc) · 4.93 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
"""Source code for `llms_txt` Python module, containing helpers to create and use llms.txt files"""
# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_core.ipynb.
# %% auto #0
# Public names exported by a star-import of this module.
__all__ = ['opt_re', 'named_re', 'search', 'parse_link', 'parse_llms_file', 'get_doc_content', 'mk_ctx', 'get_sizes',
'create_ctx', 'llms_txt2ctx']
# %% ../nbs/01_core.ipynb #484b1085
import re
# %% ../nbs/01_core.ipynb #38bed7c7
from fastcore.utils import *
from fastcore.xml import *
from fastcore.script import *
import httpx
from urllib.parse import urlparse
# %% ../nbs/01_core.ipynb #2cbf4527
def opt_re(s):
    "Wrap regex `s` in a non-capturing group that may match zero or one time"
    # Non-capturing, so wrapping `s` adds no extra numbered group to the caller's pattern.
    return '(?:{})?'.format(s)
def named_re(nm, pat):
    "Wrap regex `pat` in a capture group named `nm`"
    return '(?P<{}>{})'.format(nm, pat)
def search(pat, txt, flags=0):
    "Named groups of the first match of `pat` in `txt` as a dict, or `None` when nothing matches"
    m = re.search(pat, txt, flags=flags)
    if m is None:
        return None
    return m.groupdict()
# %% ../nbs/01_core.ipynb #5e8cbd7d
# Compiled once at import time: `- [title](url)` with an optional `: desc` tail.
_LINK_RE = re.compile(
    r'-\s*\[(?P<title>[^\]]+)\]'  # bullet, then the bracketed title
    r'\((?P<url>[^\)]+)\)'         # parenthesised URL
    r'(?::\s*(?P<desc>.*))?'       # optional ": description"
)

def parse_link(txt):
    """Parse a link section from llms.txt

    Returns a dict with the keys `title`, `url` and `desc`; `desc` is `None` when the
    link has no `: description` tail.

    Raises `ValueError` when `txt` contains no `- [title](url)` link, rather than
    failing with an `AttributeError` on the `None` returned by the regex search.
    """
    m = _LINK_RE.search(txt)
    if m is None:
        raise ValueError(f"Not a valid llms.txt link line: {txt!r}")
    return m.groupdict()
# %% ../nbs/01_core.ipynb #23dee0c8
def _parse_links(links):
    "Run `parse_link` over every non-blank line of the section body `links`"
    parsed = []
    for line in re.split(r'\n+', links.strip()):
        # Whitespace-only lines carry no link; skip them.
        if not line.strip():
            continue
        parsed.append(parse_link(line))
    return parsed
# %% ../nbs/01_core.ipynb #60a080c3
def _parse_llms(txt):
    "Split `txt` at its `##` headings into the stripped leading text and a dict of heading -> parsed links"
    # With one capture group, `re.split` yields: leading text, heading, body, heading, body, ...
    pieces = re.split(r'^##\s*(.*?$)', txt, flags=re.MULTILINE)
    head, tail = pieces[0], pieces[1:]
    bodies = dict(chunked(tail, 2))
    sects = {}
    for heading, body in bodies.items():
        sects[heading] = _parse_links(body)
    return head.strip(), sects
# %% ../nbs/01_core.ipynb #32b8641d
def parse_llms_file(txt):
    """Parse llms.txt file contents in `txt` to an `AttrDict`

    The text before the first `##` heading is matched for a `# title` line, an
    optional `> summary` line, and the remaining `info` text; the parsed `##`
    sections are attached under `sections`. `summary` is `None` when the
    blockquote line is absent.

    Raises `ValueError` when that leading text does not match the expected header
    layout, instead of a `TypeError` from assigning into `None`.
    """
    start,sects = _parse_llms(txt)
    title = named_re('title', r'.+?$')
    summ = named_re('summary', '.+?$')
    summ_pat = opt_re(fr"^>\s*{summ}$")
    info = named_re('info', '.*')
    pat = fr'^#\s*{title}\n+{summ_pat}\n+{info}'
    # DOTALL lets `info` span the rest of the header; MULTILINE makes `^`/`$` line-based.
    d = search(pat, start, (re.MULTILINE|re.DOTALL))
    if d is None:
        raise ValueError(
            "Could not parse llms.txt header: expected a `# Title` line followed by "
            "an optional `> summary` line and further text")
    d['sections'] = sects
    return dict2obj(d)
# %% ../nbs/01_core.ipynb #891efae3
from fastcore.xml import Sections,Project,Doc
# %% ../nbs/01_core.ipynb #39c2321a
def _local_docs_pth(path): return path/'_proc'
def _get_config():
    "Look up 'pyproject.toml' via `find_file_parents`; callers in this module treat a falsy result as not found"
    cfg_name = 'pyproject.toml'
    return find_file_parents(cfg_name)
def get_doc_content(url):
    """Return the text for `url`, preferring a local copy when one is found.

    When `_get_config()` returns a truthy path, the path component of `url` is
    looked up under `_local_docs_pth(path)`; if a regular file is there, its text
    is returned. In every other case the URL is fetched with `httpx.get`.
    """
    if (path:=_get_config()):
        relative_path = urlparse(url).path.lstrip('/')
        local_path = _local_docs_pth(path) / relative_path
        # `is_file`, not `exists`: a URL with an empty or directory-like path resolves to a
        # directory, and `read_text` on a directory raises instead of falling back to HTTP.
        if local_path.is_file():
            return local_path.read_text()
    return httpx.get(url).text
# %% ../nbs/01_core.ipynb #7639dab2
def _doc(kw):
    """Create a `Doc` FT object with the text retrieved from `url` as the child, and `kw` as attrs.

    `kw` must contain a `url` key; every other key is passed to `Doc` as a keyword
    argument. Lines that consist solely of an HTML comment, or that contain an
    `<img>` tag with an inline `data:image/` source, are dropped from the text.
    """
    # Work on a copy: popping `url` from the caller's mapping would make a second
    # pass over the same parsed links fail with `KeyError`.
    attrs = dict(kw)
    url = attrs.pop('url')
    txt = get_doc_content(url)
    re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)
    re_base64_img = re.compile(r'<img[^>]*src="data:image/[^"]*"[^>]*>')
    kept = [o for o in txt.splitlines() if not re_comment.search(o) and not re_base64_img.search(o)]
    # Nothing is printed here: stdout is where `llms_txt2ctx` emits the generated context.
    return Doc('\n'.join(kept), **attrs)
# %% ../nbs/01_core.ipynb #3e0c2ff6
def _section(nm, items, n_workers=None):
    "Build an `nm` element whose children are the `_doc` result for each entry of `items`"
    # `threadpool=True` is passed so `parallel` runs `_doc` in threads, using `n_workers` workers.
    docs = parallel(_doc, items, n_workers=n_workers, threadpool=True)
    return ft(nm, *docs)
# %% ../nbs/01_core.ipynb #c0541f10
def mk_ctx(d, optional=True, n_workers=None):
    "Build a `Project` from parsed llms.txt `d`, with one section per H2 part; the 'Optional' part is left out when `optional` is false."
    # Name of the section to leave out; '' when `optional` is set.
    skip = 'Optional' if not optional else ''
    sections = []
    for heading, links in d.sections.items():
        if heading == skip:
            continue
        sections.append(_section(heading, links, n_workers=n_workers))
    project = Project(title=d.title, summary=d.summary)
    return project(d.info, *sections)
# %% ../nbs/01_core.ipynb #52143a58
def get_sizes(ctx):
    "Map each tagged child of `ctx` to a dict of child title -> length of that child's first child"
    sizes = {}
    for sect in ctx.children:
        # Children without a `tag` attribute (e.g. plain strings) are not sections.
        if not hasattr(sect, 'tag'):
            continue
        sizes[sect.tag] = {doc.title: len(doc.children[0]) for doc in sect.children}
    return sizes
# %% ../nbs/01_core.ipynb #fd3e7ce4
def create_ctx(txt, optional=False, n_workers=None):
    "Parse llms.txt contents `txt`, build its `Project` via `mk_ctx`, and return the result of `to_xml` with escaping disabled."
    parsed = parse_llms_file(txt)
    project = mk_ctx(parsed, optional=optional, n_workers=n_workers)
    xml = to_xml(project, do_escape=False)
    return xml
# %% ../nbs/01_core.ipynb #d636436d
@call_parse
def llms_txt2ctx(
    fname:str, # File name to read
    optional:bool_arg=False, # Include 'optional' section?
    n_workers:int=None, # Number of threads to use for parallel downloading
    save_nbdev_fname:str=None #save output to nbdev `{docs_path}` instead of emitting to stdout
):
    "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
    # NOTE(review): the parameter comments and docstring above are kept verbatim; presumably
    # `call_parse` surfaces them as command-line help — confirm before rewording.
    src = Path(fname).read_text()
    ctx = create_ctx(src, optional=optional, n_workers=n_workers)
    # The config lookup only runs when a save name was given.
    cfg = _get_config() if save_nbdev_fname else None
    if not cfg:
        # No save name, or no config found: emit to stdout.
        print(ctx)
        return
    (_local_docs_pth(cfg) / save_nbdev_fname).mk_write(ctx)