In [None]:
#|default_exp xml

# xml source

In [None]:
#| export
import hashlib,xml.etree.ElementTree as ET
from collections import namedtuple

from fastcore.utils import *
from fastcore.meta import delegates
from fastcore.xtras import hl_md
from fastcore.xml import to_xml, Document, Documents, Document_content, Src, Source,Out,Outs,Cell,Notebook,Md,Code
from fastcore.script import call_parse

In [None]:
from copy import deepcopy

## JSON to XML

In [None]:
#| exports
def json_to_xml(d:dict, # JSON dictionary to convert
                rnm:str # Root name
               )->str:
    "Convert `d` to XML."
    def _fill(node, val):
        # Dicts add one child element per key
        if isinstance(val, dict):
            for k,v in val.items(): _fill(ET.SubElement(node, k), v)
            return
        # Lists add one `<item>` child per entry
        if isinstance(val, list):
            for v in val: _fill(ET.SubElement(node, 'item'), v)
            return
        # Anything else is stored as the element's text
        node.text = str(val)
    root = ET.Element(rnm)
    _fill(root, d)
    ET.indent(root)
    return ET.tostring(root, encoding='unicode')

JSON doesn't map as nicely to XML as the data structure used in `fastcore.xml`, but for simple XML trees it can be convenient -- for example:

In [None]:
# Nested dicts become child elements; each list entry becomes an `<item>` element
a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
         address=dict(state='Queensland',country='Australia'))
hl_md(json_to_xml(a, 'person'))

## Including documents

### Notebooks

In [None]:
# Load a notebook from disk and keep its last cell as a sample for the examples below
nbp = Path('00_xml.ipynb')
nb = dict2obj(nbp.read_json())
cells = nb.cells
cell = cells[-1]
cell

In [None]:
#| exports
def get_mime_text(data):
    "Get text from MIME bundle, preferring markdown over plain"
    # Keys are tried in order of preference; the result is `None` if neither is present
    for mime in ('text/markdown', 'text/plain'):
        if mime in data: return ''.join(data[mime])

In [None]:
#| exports
def cell2out(o):
    "Convert single notebook output to XML format"
    # Outputs with a MIME bundle: use its text if there is any, else fall through
    if hasattr(o, 'data'):
        body = get_mime_text(o.data)
        if body:
            mime = 'markdown' if 'text/markdown' in o.data else 'plain'
            return Out(body, mime=mime)
    # Stream outputs: `text` is either one string or a sequence of chunks
    if hasattr(o, 'text'):
        body = ''.join(o.text) if not isinstance(o.text, str) else o.text
        return Out(body, type='stream', name=o.get('name', 'stdout'))
    # Error outputs are summarized as "name: value"; anything else yields `None`
    if hasattr(o, 'ename'): return Out(f"{o.ename}: {o.evalue}", type='error')

In [None]:
# Show the XML produced for each output of the sample cell
for o in cell.outputs: print(to_xml(cell2out(o)))

In [None]:
#| exports
def cell2xml(cell, out=True):
    "Convert notebook cell to concise XML format"
    text = ''.join(getattr(cell, 'source', ''))
    tag = Md if cell.cell_type!='code' else Code
    # Without outputs, the source text is the tag's only child
    if not out: return tag(text)
    children = [Source(text)]
    converted = L(getattr(cell,'outputs',[])).map(cell2out).filter()
    # Only add the outputs wrapper when at least one output converted to something
    if converted: children.append(Outs(*converted))
    return tag(*children)

In [None]:
# Outputs are included by default
cell2xml(cell)

In [None]:
# Source only, with outputs left out
cell2xml(cell, out=False)

In [None]:
#| exports
def nb2xml(fname=None, nb=None, out=True):
    "Convert a notebook, given either as file `fname` or as loaded object `nb`, to a `Notebook` XML tag"
    # Exactly one of `fname`/`nb` must be truthy
    assert bool(fname)^bool(nb), "Pass either `fname` or `nb`"
    # Wrap in `Path` so that plain string file names work as well as `Path` objects
    if not nb: nb = dict2obj(Path(fname).read_json())
    # Only code and markdown cells are kept; each is serialized (unescaped) before being added as a child
    cells_xml = [to_xml(cell2xml(c, out=out), do_escape=False) for c in nb.cells if c.cell_type in ('code','markdown')]
    return Notebook(*cells_xml)

In [None]:
# Work on a copy trimmed to the first two cells, leaving `nb` itself untouched
nbsml = deepcopy(nb)
del(nbsml.cells[2:])

print(nb2xml(nb=nbsml))

### Documents

According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), "*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*". They recommend using something like the following:

```xml
Here are some documents for you to reference for your task:
    
<documents>
<document index="1">
<source>
(URL, file name, hash, etc)
</source>
<document_content>
(the text content)
</document_content>
</document>
</documents>
```

We will create some small helper functions to make it easier to generate context in this format, although we'll use `<src>` instead of `<source>` to avoid conflict with that HTML tag. Although it's based on Anthropic's recommendation, it's likely to work well with other models too.

In [None]:
#| exports
# A (source, content) pair describing one document
doctype = namedtuple('doctype', 'src content')

We'll use `doctype` to store our pairs.

In [None]:
#| exports
def _add_nls(s):
    "Add newlines to start and end of `s` if missing"
    if not s: return s
    if s[ 0]!='\n': s = '\n'+s
    if s[-1]!='\n': s = s+'\n'
    return s

Since Anthropic's example shows newlines before and after each tag, we'll do the same.

In [None]:
# Serialize a lone `Src` tag
to_xml(Src('a'))

In [None]:
# Serialize a lone `Document` tag
to_xml(Document('a'))

In [None]:
#| exports
def mk_doctype(content:str,  # The document content
           src:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided
          ) -> namedtuple:
    "Create a `doctype` named tuple"
    # No source given: identify the document by the first 8 hex digits of its MD5 digest
    if src is None:
        digest = hashlib.md5(content.encode()).hexdigest()
        src = digest[:8]
    # Both fields are stripped, then wrapped in newlines
    src_txt,body_txt = str(src).strip(),content.strip()
    return doctype(_add_nls(src_txt), _add_nls(body_txt))

This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format.

In [None]:
# No `src` is given here, so `mk_doctype` derives one from an MD5 hash of the content
doc = 'This is a "sample"'
mk_doctype(doc)

In [None]:
#| exports
def mk_doc(index:int,  # The document index
           content:str,  # The document content
           src:Optional[str]=None, # URL, filename, etc; defaults to `md5(content)` if not provided
           **kwargs
          ) -> tuple:
    "Create an `ft` format tuple for a single doc in Anthropic's recommended format"
    # Normalize the pair first (default source, newline wrapping)
    pair = mk_doctype(content, src)
    body_tag = Document_content(NotStr(pair.content))
    src_tag = Src(NotStr(pair.src))
    # `index` and any extra keyword arguments are passed on to `Document`
    return Document(src_tag, body_tag, index=index, **kwargs)

We can now generate XML for one document in the suggested format:

In [None]:
# Extra keyword arguments such as `title` are passed through to `Document`
mk_doc(1, doc, title="test")

In [None]:
#| exports
def docs_xml(docs:list[str],  # The content of each document
             srcs:Optional[list]=None,  # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
             prefix:bool=True, # Include Anthropic's suggested prose intro?
             details:Optional[list]=None # Optional list of dicts with additional attrs for each doc
            )->str:
    "Create an XML string containing `docs` in Anthropic's recommended format"
    # Missing `srcs`/`details` are filled with one placeholder per document
    if srcs is None: srcs = [None]*len(docs)
    if details is None: details = [{}]*len(docs)
    # Documents are numbered from 1, in the order given
    triples = zip(docs, srcs, details)
    doc_tags = (mk_doc(idx, body, src, **attrs) for idx,(body,src,attrs) in enumerate(triples, start=1))
    xml = to_xml(Documents(doc_tags), do_escape=False)
    intro = 'Here are some documents for you to reference for your task:\n\n' if prefix else ''
    return intro + xml

Putting it all together, we have our final XML format:

In [None]:
# The first document has no source, so one is derived from its content; the second uses 'doc.txt'
docs = [doc, 'And another one']
srcs = [None, 'doc.txt']
print(docs_xml(docs, srcs))

## Context creation

Now that we can generate Anthropic's XML format, let's make it easy for a few common cases.

### File list to context

For generating XML context from files, we'll just read them as text and use the file names as `src`.

In [None]:
#| exports
def read_file(fname):
    "Read file content as text, converting notebooks to an XML string if needed"
    fname = Path(fname)
    # `nb2xml` returns a `Notebook` tag object, not text; serialize it so that callers
    # such as `files2ctx` (whose content ends up in `mk_doctype`, which calls `.strip()`)
    # always receive a string. Cells are already serialized, so escaping stays off.
    if fname.suffix == '.ipynb': return to_xml(nb2xml(fname), do_escape=False)
    return fname.read_text()

In [None]:
#| exports
def files2ctx(
    fnames:list[Union[str,Path]], # List of file names to add to context
    prefix:bool=True # Include Anthropic's suggested prose intro?
)->str: # XML for LM context
    "Convert files to XML context, handling notebooks"
    paths = list(map(Path, fnames))
    # Each file's path doubles as the `src` of its document
    texts = [read_file(p) for p in paths]
    return docs_xml(texts, paths, prefix=prefix)

In [None]:
# Build a context from two sample files
fnames = ['samples/sample_core.py', 'samples/sample_styles.css']
hl_md(files2ctx(fnames))

### Folder to context

In [None]:
#| exports
@delegates(globtastic)
def folder2ctx(
    folder:Union[str,Path], # Folder name containing files to add to context
    prefix:bool=True, # Include Anthropic's suggested prose intro?
    **kwargs # Passed to `globtastic`
)->str: # XML for Claude context
    "Convert the files that `globtastic` finds in `folder` to XML context"
    return files2ctx(globtastic(folder, **kwargs), prefix=prefix)

In [None]:
# Extra keywords such as `file_glob` are forwarded to `globtastic` (here selecting `*.py` files)
print(folder2ctx('samples', prefix=False, file_glob='*.py'))

In [None]:
#| exports
#| hide
@call_parse
@delegates(folder2ctx)
def folder2ctx_cli(
    folder:str, # Folder name containing files to add to context
    **kwargs # Passed to `folder2ctx`
)->str: # XML for Claude context
    "Print the XML context that `folder2ctx` builds from `folder`"
    xml = folder2ctx(folder, **kwargs)
    print(xml)

:::{.callout-tip}

After you install `toolslm`, `folder2ctx` becomes available from the command line.  You can see how to use it with the following command:

```bash
folder2ctx -h
```
:::

## Export -

In [None]:
#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
# Invoke nbdev's export command
nbdev_export()