In [None]:
#|default_exp xml

# xml source

In [None]:
#| export
import hashlib,xml.etree.ElementTree as ET
from collections import namedtuple
from ghapi.all import GhApi

from fastcore.utils import *
from fastcore.meta import delegates
from fastcore.xtras import hl_md
from fastcore.xml import to_xml, Document, Documents, Document_content, Src, Source,Out,Outs,Cell,Notebook,Md,Code
from fastcore.script import call_parse

In [None]:
from copy import deepcopy

## JSON to XML

In [None]:
#| export
def json_to_xml(d:dict, # JSON dictionary to convert
                rnm:str # Root name
               )->str:
    "Render `d` as an indented XML string whose root element is named `rnm`."
    def _fill(node, value):
        # Dicts contribute one child per key, lists one `<item>` per entry;
        # anything else is a leaf and becomes the node's text.
        if isinstance(value, dict): children = value.items()
        elif isinstance(value, list): children = (('item', entry) for entry in value)
        else:
            node.text = str(value)
            return
        for tag, child in children: _fill(ET.SubElement(node, tag), child)
    top = ET.Element(rnm)
    _fill(top, d)
    ET.indent(top)
    return ET.tostring(top, encoding='unicode')

JSON doesn't map as nicely to XML as the data structure used in `fastcore.xml`, but for simple XML trees it can be convenient -- for example:

In [None]:
a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
         address=dict(state='Queensland',country='Australia'))
hl_md(json_to_xml(a, 'person'))

```xml
<person>
  <surname>Howard</surname>
  <firstnames>
    <item>Jeremy</item>
    <item>Peter</item>
  </firstnames>
  <address>
    <state>Queensland</state>
    <country>Australia</country>
  </address>
</person>
```

## Including documents

### Notebooks

In [None]:
nbp = Path('00_xml.ipynb')
nb = dict2obj(nbp.read_json())
cells = nb.cells
cell = cells[-1]
cell

```python
{ 'cell_type': 'code',
  'execution_count': {},
  'id': '1e9ee5c1',
  'metadata': {},
  'outputs': [],
  'source': ['#|hide\n', '#|eval: false\n', 'from nbdev.doclinks import nbdev_export\n', 'nbdev_export()']}
```

In [None]:
#| export
def get_mime_text(data):
    "Return the joined text of a MIME bundle, trying markdown first and then plain text"
    # Order matters: the first MIME type present in `data` wins
    for mime in ('text/markdown', 'text/plain'):
        if mime in data: return ''.join(data[mime])
    return None

In [None]:
#| export
def cell2out(o):
    "Turn one notebook output into an `Out` element, or `None` if it has nothing usable"
    # Rich outputs: use the text of the MIME bundle, if it has any
    if hasattr(o, 'data'):
        body = get_mime_text(o.data)
        if body:
            mime = 'markdown' if 'text/markdown' in o.data else 'plain'
            return Out(body, mime=mime)
    # Stream outputs: text may be a single string or a list of chunks
    if hasattr(o, 'text'):
        text = o.text
        if not isinstance(text, str): text = ''.join(text)
        return Out(text, type='stream', name=o.get('name', 'stdout'))
    # Error outputs: only the exception name and value are kept
    if hasattr(o, 'ename'): return Out(f"{o.ename}: {o.evalue}", type='error')

In [None]:
for o in cell.outputs: print(to_xml(cell2out(o)))

In [None]:
#| export
def cell2xml(cell, out=True):
    "Build the `Code` or `Md` element for a notebook cell, optionally with its outputs"
    text = ''.join(getattr(cell, 'source', ''))
    tag = Md if cell.cell_type!='code' else Code
    # Without outputs the cell is just its source text
    if not out: return tag(text)
    # `filter()` drops outputs that `cell2out` could not convert
    outs = L(getattr(cell,'outputs',[])).map(cell2out).filter()
    children = [Source(text)]
    if outs: children.append(Outs(*outs))
    return tag(*children)

In [None]:
cell2xml(cell)

```html
<code><source>#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
nbdev_export()</code>
```

In [None]:
cell2xml(cell, out=False)

```html
<code>#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
nbdev_export()</code>
```

In [None]:
#| export
def nb2xml(fname=None, nb=None, out=True):
    "Convert notebook to XML format, from either a file name (`str` or `Path`) or a loaded notebook `nb`"
    # Exactly one of the two inputs must be supplied
    assert bool(fname)^bool(nb), "Pass either `fname` or `nb`"
    # Wrap in `Path` so that plain string file names work as well as `Path` objects
    if not nb: nb = dict2obj(Path(fname).read_json())
    # Only code and markdown cells are converted; `out` controls whether cell outputs are included
    cells_xml = [to_xml(cell2xml(c, out=out), do_escape=False) for c in nb.cells if c.cell_type in ('code','markdown')]
    return to_xml(Notebook(*cells_xml), do_escape=False)

In [None]:
nbsml = deepcopy(nb)
del(nbsml.cells[2:])

print(nb2xml(nb=nbsml))

<notebook><code><source>#|default_exp xml</code><md><source># xml source</md></notebook>


### Documents

According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), "*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*". They recommend using something like the following:

```xml
Here are some documents for you to reference for your task:
    
<documents>
<document index="1">
<source>
(URL, file name, hash, etc)
</source>
<document_content>
(the text content)
</document_content>
</document>
</documents>
```

We will create some small helper functions to make it easier to generate context in this format, although we'll use `<src>` instead of `<source>` to avoid conflict with that HTML tag. Although it's based on Anthropic's recommendation, it's likely to work well with other models too.

In [None]:
#| export
# Pair of a document's source label (`src`) and its text (`content`)
doctype = namedtuple('doctype', ['src', 'content'])

We'll use `doctype` to store our pairs.

In [None]:
#| export
def _add_nls(s):
    "Add newlines to start and end of `s` if missing"
    if not s: return s
    if s[ 0]!='\n': s = '\n'+s
    if s[-1]!='\n': s = s+'\n'
    return s

Since Anthropic's example shows newlines before and after each tag, we'll do the same.

In [None]:
to_xml(Src('a'))

'<src>a</src>'

In [None]:
to_xml(Document('a'))

'<document>a</document>'

In [None]:
#| export
def mk_doctype(content:str,  # The document content
           src:Optional[str]=None # URL, filename, etc; defaults to the first 8 hex chars of `md5(content)` if not provided
          ) -> namedtuple:
    "Create a `doctype` named tuple, with `src` and `content` stripped and wrapped in newlines"
    # The digest is only a short label for the content, not a security measure; flagging it
    # as such keeps md5 usable on Python builds that block it for security purposes.
    if src is None: src = hashlib.md5(content.encode(), usedforsecurity=False).hexdigest()[:8]
    # `str(src)` lets non-string labels (e.g. `Path` objects) be used as sources
    return doctype(_add_nls(str(src).strip()), _add_nls(content.strip()))

This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format.

In [None]:
doc = 'This is a "sample"'
mk_doctype(doc)

doctype(src='\n47e19350\n', content='\nThis is a "sample"\n')

In [None]:
#| export
def mk_doc(index:int,  # The document index
           content:str,  # The document content
           src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided
           **kwargs
          ) -> tuple:
    "Build the `Document` element (a `Src` child plus a `Document_content` child) for a single doc"
    pair = mk_doctype(content, src)
    # NOTE(review): `NotStr` presumably marks the text as already final so it is not escaped -- confirm
    body = Document_content(NotStr(pair.content))
    label = Src(NotStr(pair.src))
    # `index` and any extra `kwargs` are passed through as keyword arguments of `Document`
    return Document(label, body, index=index, **kwargs)

We can now generate XML for one document in the suggested format:

In [None]:
mk_doc(1, doc, title="test")

```html
<document index="1" title="test"><src>
47e19350
</src><document-content>
This is a "sample"
</document-content></document>
```

In [None]:
#| export
def docs_xml(docs:list[str],  # The content of each document
             srcs:Optional[list]=None,  # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
             prefix:bool=False, # Include Anthropic's suggested prose intro?
             details:Optional[list]=None, # Optional list of dicts with additional attrs for each doc
             title:str=None # Optional title attr for Documents element
            )->str:
    "Build the `<documents>` XML string for `docs`, following Anthropic's recommended layout"
    if srcs is None: srcs = [None]*len(docs)
    if details is None: details = [{}]*len(docs)
    # Documents are numbered from 1; each `details` dict supplies extra attrs for its doc
    items = [mk_doc(num, text, label, **attrs)
             for num,(text,label,attrs) in enumerate(zip(docs,srcs,details), start=1)]
    root_attrs = {'title': title} if title else {}
    body = to_xml(Documents(*items, **root_attrs), do_escape=False)
    intro = 'Here are some documents for you to reference for your task:\n\n' if prefix else ''
    return intro + body

Putting it all together, we have our final XML format:

In [None]:
docs = [doc, 'And another one']
srcs = [None, 'doc.txt']
print(docs_xml(docs, srcs))

<documents><document index="1"><src>
47e19350
</src><document-content>
This is a "sample"
</document-content></document><document index="2"><src>
doc.txt
</src><document-content>
And another one
</document-content></document></documents>


## Context creation

Now that we can generate Anthropic's XML format, let's make it easy for a few common cases.

### File list to context

For generating XML context from files, we'll just read them as text and use the file names as `src`.

In [None]:
#| export
def read_file(fname, out=True, max_size=None):
    "Return the text of `fname` (notebooks are converted to XML), or a skip notice if it exceeds `max_size`"
    path = Path(fname)
    text = nb2xml(path, out=out) if path.suffix == '.ipynb' else path.read_text()
    # The limit is applied to the length of the resulting text (after any notebook conversion)
    too_big = bool(max_size) and len(text)>max_size
    return f"[Skipped: {path.name} exceeds {max_size} bytes]" if too_big else text

In [None]:
#| export
@delegates(docs_xml)
def files2ctx(
    fnames:list[Union[str,Path]], # List of file names to add to context
    out:bool=True, # Include notebook cell outputs?
    srcs:Optional[list]=None, # Use the labels instead of `fnames`
    max_size:int=None, # Skip files larger than this (bytes)
    **kwargs
)->str: # XML for LM context
    "Read each file (converting notebooks) and wrap the results as XML context"
    paths = list(map(Path, fnames))
    texts = [read_file(p, out=out, max_size=max_size) for p in paths]
    # The file paths themselves are the labels unless `srcs` supplies replacements
    labels = srcs if srcs else paths
    return docs_xml(texts, labels, **kwargs)

In [None]:
fnames = ['samples/sample_core.py', 'samples/sample_styles.css']
hl_md(files2ctx(fnames, max_size=120))

```xml
<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
[Skipped: sample_core.py exceeds 120 bytes]
</document-content></document><document index="2"><src>
samples/sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>
```

### Folder to context

In [None]:
#| export
@delegates(globtastic)
def folder2ctx(
    folder:Union[str,Path], # Folder to read
    prefix:bool=False, # Include Anthropic's suggested prose intro?
    out:bool=True, # Include notebook cell outputs?
    include_base:bool=True, # Include full path in src?
    title:str=None, # Optional title attr for Documents element
    max_size:int=100_000, # Skip files larger than this (bytes)
    max_total:int=10_000_000,  # Max total output size in bytes
    readme_first:bool=False,  # Prioritize README files at start of context?
    files_only:bool=False,  # Return dict of {filename: size} instead of context?
    **kwargs
)->Union[str,dict]:
    "Convert folder contents to XML context, handling notebooks"
    root = Path(folder)
    paths = globtastic(root, **kwargs)
    # Listing mode: report each file's on-disk size, keyed by its path relative to the folder
    if files_only:
        return {str(Path(p).relative_to(root)): Path(p).stat().st_size for p in paths}
    if readme_first:
        # Files with 'readme' in their name sort ahead of the rest; ties are broken by path
        def _readme_rank(p): return (0 if 'readme' in Path(p).name.lower() else 1, p)
        paths = sorted(paths, key=_readme_rank)
    labels = paths if include_base else [Path(p).relative_to(root) for p in paths]
    ctx = files2ctx(paths, prefix=prefix, out=out, srcs=labels, title=title, max_size=max_size)
    if max_total and len(ctx) > max_total:
        # `{_outsz_}` is left as a literal placeholder and handed to `truncstr` via `sizevar`
        notice = f"\n\n[TRUNCATED: output size {{_outsz_}} exceeded max size {max_total} bytes]"
        ctx = truncstr(ctx, max_total, suf=notice, sizevar='_outsz_')
    return ctx

In [None]:
print(folder2ctx('samples', prefix=True, file_glob='*.py'))

Here are some documents for you to reference for your task:

<documents><document index="1"><src>
samples/sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document></documents>


In [None]:
#| export
#| hide
@call_parse
@delegates(folder2ctx)
def folder2ctx_cli(
    folder:str, # Folder name containing files to add to context
    out:bool=True, # Include notebook cell outputs?
    **kwargs # Passed to `folder2ctx`
)->str: # XML for Claude context
    "CLI to convert folder contents to XML context, handling notebooks"
    # Build the context first, then write it to stdout
    ctx = folder2ctx(folder, out=out, **kwargs)
    print(ctx)

:::{.callout-tip}

After you install `toolslm`, `folder2ctx` becomes available from the command line.

:::

In [None]:
#| eval: false
!folder2ctx -h

usage: folder2ctx [-h] [--recursive] [--symlinks] [--file_glob FILE_GLOB]
                  [--file_re FILE_RE] [--folder_re FOLDER_RE]
                  [--skip_file_glob SKIP_FILE_GLOB]
                  [--skip_file_re SKIP_FILE_RE]
                  [--skip_folder_re SKIP_FOLDER_RE] [--func FUNC]
                  [--ret_folders] [--sort] [--exts EXTS] [--prefix] [--out]
                  [--include_base] [--title TITLE] [--max_size MAX_SIZE]
                  [--max_total MAX_TOTAL] [--readme_first]
                  folder

CLI to convert folder contents to XML context, handling notebooks

positional arguments:
  folder                           Folder name containing files to add to
                                   context

options:
  -h, --help                       show this help message and exit
  --recursive                      search subfolders (default: False)
  --symlinks                       follow symlinks? (default: False)
  --file_glob FILE_GLOB            Only include files matching glob
  --file_re FILE_RE                Only include files matching regex
  --folder_re FOLDER_RE            Only enter folders matching regex
  --skip_file_glob SKIP_FILE_GLOB  Skip files matching glob
  --skip_file_re SKIP_FILE_RE      Skip files matching regex
  --skip_folder_re SKIP_FOLDER_RE  Skip folders matching regex,
  --func FUNC                      function to apply to each matched file
                                   (default: <function join>)
  --ret_folders                    return folders, not just files (default:
                                   False)
  --sort                           sort files by name within each folder
                                   (default: False)
  --exts EXTS
  --prefix                         Include Anthropic's suggested prose intro?
                                   (default: False)
  --out                            Include notebook cell outputs? (default:
                                   False

In [None]:
#| export
def parse_gh_url(url):
    "Parse GitHub URL into a dict with keys `owner`, `repo`, `typ`, `ref`, `path` (missing parts are `None`), or return `None` if `url` doesn't match"
    # A query string or fragment (e.g. `?tab=readme`, `#L10`) is not part of the repo location,
    # so drop it before matching rather than letting it leak into `repo`/`ref`/`path`.
    url = re.split(r'[?#]', url, maxsplit=1)[0]
    m = re.match(r'https?://(?:www\.)?github\.com/([^/]+)/([^/]+)(?:/([^/]+)(?:/([^/]+)(?:/(.+))?)?)?', url)
    if not m: return None
    res = dict(zip('owner repo typ ref path'.split(), m.groups()))
    # Clone URLs end in `.git`, which is not part of the repository name
    repo = res['repo']
    if repo.endswith('.git') and repo != '.git': res['repo'] = repo[:-len('.git')]
    return res

In [None]:
#| export
@delegates(folder2ctx)
def repo2ctx(
    owner:str,  # GitHub repo owner or "owner/repo" or a full github URL
    repo:str=None,   # GitHub repo name (leave empty if using "owner/repo" or URL format for owner param)
    ref:str=None,  # Git ref (branch/tag/sha); taken from the URL if not provided, else the repo's default branch
    folder:str=None,  # Only include files under this path; taken from the URL if not provided
    show_filters:bool=True,  # Include filter info in title?
    token:str=None,  # GitHub token (uses GITHUB_TOKEN env var if None)
    **kwargs  # Passed to `folder2ctx`
)->Union[str,dict]:  # XML for LM context, or dict of file sizes
    "Convert GitHub repo to XML context without cloning"
    import tempfile, tarfile, io
    if owner.startswith('http'):
        parsed = parse_gh_url(owner)
        if not parsed: raise ValueError(f"Invalid GitHub URL: {owner}")
        owner,repo = parsed['owner'], parsed['repo']
        # Explicit `ref`/`folder` arguments take precedence over what the URL contains
        ref = ref or parsed.get('ref')
        folder = folder or parsed.get('path')
    if repo is None:
        # Report a malformed "owner/repo" clearly instead of failing on tuple unpacking
        spec = owner
        owner,_,repo = spec.partition('/')
        if not owner or not repo or '/' in repo:
            raise ValueError(f"Expected 'owner/repo' or a GitHub URL, got: {spec!r}")
    api = GhApi(token=token)
    if ref is None: ref = api.repos.get(owner, repo).default_branch
    data = api.repos.download_tarball_archive(owner, repo, ref)
    title = f"GitHub repository contents from {owner}/{repo}/{ref}"
    if folder: title += f'/{folder}'
    if show_filters:
        # Only truthy options are listed; list/tuple values are shown comma-separated
        parts = [f"{k}: {', '.join(v) if isinstance(v, (list,tuple)) else v}" for k,v in kwargs.items() if v]
        if parts: title += f" (filters applied -- {' | '.join(parts)})"
    # Context managers close the archive and remove the temp dir even if conversion fails
    with tarfile.open(fileobj=io.BytesIO(data)) as tf, tempfile.TemporaryDirectory() as tmp:
        tf.extractall(tmp, filter='data')
        # The first path component of the first member is used as the extracted root folder
        # (presumably the archive has a single top-level directory -- TODO confirm)
        subdir = Path(tmp) / tf.getmembers()[0].name.split('/')[0]
        if folder: subdir = subdir/folder
        return folder2ctx(subdir, include_base=False, title=title, readme_first=True, **kwargs)

In [None]:
#| eval: false
print(repo2ctx('answerdotai/toolslm', exts=('ipynb','py'), skip_file_re='^_', out=False, max_total=500))

<documents title="GitHub repository contents from answerdotai/toolslm/main (filters applied -- exts: ipynb, py | skip_file_re: ^_ | max_total: 500)"><document index="1"><src>
00_xml.ipynb
</src><document-content>
<notebook><code>#|default_exp xml</code><md># xml source</md><code>#| export
import hashlib,xml.etree.ElementTree as ET
from collections import namedtuple
from ghapi.all import GhApi

from fastcore.utils import *
from fastcore.

[TRUNCATED: output size 90253 exceeded max size 500 bytes]


In [None]:
#| eval: false
print(repo2ctx('answerdotai/toolslm', exts=('ipynb','py'), skip_file_re='^_', out=False, files_only=True))

{'00_xml.ipynb': 28232, '01_funccall.ipynb': 65902, '02_shell.ipynb': 6295, '03_download.ipynb': 12306, '04_md_hier.ipynb': 8091, 'index.ipynb': 3189, 'setup.py': 2596, 'samples/sample_core.py': 134, 'toolslm/download.py': 4451, 'toolslm/funccall.py': 11160, 'toolslm/md_hier.py': 11010, 'toolslm/shell.py': 1566, 'toolslm/xml.py': 8095}


In [None]:
#| eval: false
print(repo2ctx('https://github.com/AnswerDotAI/toolslm/tree/main/samples'))

<documents title="GitHub repository contents from AnswerDotAI/toolslm/main/samples"><document index="1"><src>
sample_core.py
</src><document-content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document-content></document><document index="2"><src>
sample_styles.css
</src><document-content>
.cell { margin-bottom: 1rem; }
.cell > .sourceCode { margin-bottom: 0; }
.cell-output > pre { margin-bottom: 0; }
</document-content></document></documents>


## Export -

In [None]:
#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
nbdev_export()