In [None]:
#|default_exp helpers

# Helpers source

## Setup

In [None]:
#| export
import hashlib,xml.etree.ElementTree as ET
from collections import namedtuple

from claudette.core import *
from fastcore.utils import *
from fastcore.meta import delegates
from IPython import display

## XML helpers

Claude works well with XML inputs, but XML can be a bit clunky to work with manually. Therefore, we create a couple of more streamlined approaches for XML generation. You don't need to use these if you don't find them useful -- you can always just use plain strings for XML directly.

In [None]:
#| exports
def xt(tag:str, # XML tag name
       c:Optional[list]=None, # Children
       **kw):
    "Bundle `tag`, children `c`, and attributes `kw` into the tuple format that `to_xml` consumes."
    attrs = {}
    # Leading underscores let callers pass reserved words (e.g. `_class`); values are stringified
    for name,val in kw.items(): attrs[name.lstrip('_')] = str(val)
    return (tag, c, attrs)

An XML node contains a tag, optional children, and optional attributes. `xt` creates a tuple of these three things, which we will use to generate XML shortly. Attributes are passed as kwargs; since these might conflict with reserved words in Python, you can optionally add a `_` prefix and it'll be stripped off.

In [None]:
xt('x-custom', ['hi'], _class='bar')

('x-custom', ['hi'], {'class': 'bar'})

In [None]:
#| exports
# Module namespace, so the tag helpers below can be added as top-level names
g = globals()
# HTML tag names that each get a same-named helper function
tags = 'div img h1 h2 h3 h4 h5 p hr span html'.split()
# Bind each tag name to `xt` with the tag pre-filled, e.g. `p('hi')` is `xt('p', 'hi')`
for o in tags: g[o] = partial(xt, o)

If you have to use a lot of tags of the same type, it's convenient to use `partial` to create specialised functions for them. Here, we're creating functions for some common HTML tags. Here's an example of using them:

In [None]:
a = html([
    p('This is a paragraph'),
    hr(),
    img(src='http://example.prg'),
    div([
        h1('This is a header'),
        h2('This is a sub-header', style='k:v'),
    ], _class='foo')
])
a

('html',
 [('p', 'This is a paragraph', {}),
  ('hr', None, {}),
  ('img', None, {'src': 'http://example.prg'}),
  ('div',
   [('h1', 'This is a header', {}),
    ('h2', 'This is a sub-header', {'style': 'k:v'})],
   {'class': 'foo'})],
 {})

In [None]:
#| exports
def hl_md(s, lang='xml'):
    "Wrap `s` in a Markdown code fence tagged `lang` for notebook display; just print `s` if `display` is falsy."
    if not display:
        print(s)
        return
    fenced = f'```{lang}\n{s}\n```'
    return display.Markdown(fenced)

When we display XML in a notebook, it's nice to highlight it, so we create a function to simplify that:

In [None]:
hl_md('<test><xml foo="bar">a child</xml></test>')

```xml
<test><xml foo="bar">a child</xml></test>
```

In [None]:
#| exports
def to_xml(node:tuple, # XML structure in `xt` format
           hl=False # Syntax highlight response?
          ):
    "Convert `node` to an XML string (passed through `hl_md` if `hl`)."
    def mk_el(tag, cs, attrs):
        el = ET.Element(tag, attrib=attrs)
        if isinstance(cs, list):
            prev = None # Most recently appended child element, if any
            for o in cs:
                if isinstance(o, (tuple,list)):
                    # Nested node in `xt` format
                    prev = mk_el(*o)
                    el.append(prev)
                elif o is not None:
                    # Plain value inside a children list, e.g. `xt('p', ['hi'])`. ElementTree stores
                    # text before the first child in `text`, and later text in the previous child's `tail`
                    if prev is None: el.text = (el.text or '') + str(o)
                    else: prev.tail = (prev.tail or '') + str(o)
        elif cs is not None: el.text = str(cs)
        return el

    root = mk_el(*node)
    # With an empty `space`, child elements still go on their own lines, just unindented
    ET.indent(root, space='  ' if hl else '')
    res = ET.tostring(root, encoding='unicode')
    return hl_md(res) if hl else res

Now we can convert that HTML data structure we created into XML:

In [None]:
to_xml(a, hl=True)

```xml
<html>
  <p>This is a paragraph</p>
  <hr />
  <img src="http://example.prg" />
  <div class="foo">
    <h1>This is a header</h1>
    <h2 style="k:v">This is a sub-header</h2>
  </div>
</html>
```

In [None]:
#| exports
def json_to_xml(d:dict, # JSON dictionary to convert
                rnm:str # Root name
               )->str:
    "Render `d` as an indented XML string whose root element is named `rnm`."
    def _fill(el, val):
        # Dict keys become child tag names; every list entry becomes an `item` child
        if isinstance(val, dict): pairs = val.items()
        elif isinstance(val, list): pairs = (('item', o) for o in val)
        else:
            # Anything else is a leaf: store its string form as the element text
            el.text = str(val)
            return
        for name,child in pairs: _fill(ET.SubElement(el, name), child)

    top = ET.Element(rnm)
    _fill(top, d)
    ET.indent(top)
    return ET.tostring(top, encoding='unicode')

JSON doesn't map as nicely to XML as the data structure used in the previous section, but for simple XML trees it can be convenient -- for example:

In [None]:
a = dict(surname='Howard', firstnames=['Jeremy','Peter'],
         address=dict(state='Queensland',country='Australia'))
hl_md(json_to_xml(a, 'person'))

```xml
<person>
  <surname>Howard</surname>
  <firstnames>
    <item>Jeremy</item>
    <item>Peter</item>
  </firstnames>
  <address>
    <state>Queensland</state>
    <country>Australia</country>
  </address>
</person>
```

## Including documents

According [to Anthropic](https://docs.anthropic.com/claude/docs/long-context-window-tips), "*it's essential to structure your prompts in a way that clearly separates the input data from the instructions*". They recommend using the following format:

```xml
Here are some documents for you to reference for your task:
    
<documents>
<document index="1">
<source>
(URL, file name, hash, etc)
</source>
<document_content>
(the text content)
</document_content>
</document>
</documents>
```

We will create some small helper functions to make it easier to generate context in this format.

In [None]:
#| exports
# Record for one document: where it came from (`source`) and its text (`content`)
doctype = namedtuple('doctype', 'source content')

We'll use `doctype` to store each document's (`source`, `content`) pair.

In [None]:
#| exports
def _add_nls(s):
    "Add newlines to start and end of `s` if missing"
    # `startswith`/`endswith` don't raise on an empty string (unlike `s[0]`/`s[-1]`); '' becomes '\n'
    if not s.startswith('\n'): s = '\n'+s
    if not s.endswith('\n'): s = s+'\n'
    return s

Since Anthropic's example shows newlines before and after each tag, we'll do the same.

In [None]:
#| exports
def mk_doctype(content:str,  # The document content
               source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided
              ) -> namedtuple:
    "Create a `doctype` named tuple"
    # The digest is only a short identifier, so mark it as non-security use; without the flag,
    # `md5` can be refused on FIPS-restricted Python builds
    if source is None: source = hashlib.md5(content.encode(), usedforsecurity=False).hexdigest()[:8]
    # Both fields are trimmed, then wrapped in newlines
    return doctype(_add_nls(str(source).strip()), _add_nls(content.strip()))

This is a convenience wrapper to ensure that a `doctype` has the needed information in the right format.

In [None]:
doc = 'This is a sample'
mk_doctype(doc)

doctype(source='\nb8898fab\n', content='\nThis is a sample\n')

In [None]:
#| exports
def mk_doc(index:int,  # The document index
           content:str,  # The document content
           source:Optional[str]=None # URL, filename, etc; defaults to `md5(content)` if not provided
          ) -> tuple:
    "Build the `xt` tuple for one `document` node (source first, then content) in Anthropic's recommended format"
    # `mk_doctype` returns a (source, content) named tuple, so it unpacks in that order
    src,body = mk_doctype(content, source)
    children = [xt('source', src), xt('document_content', body)]
    return xt('document', children, index=index)

We can now generate XML for one document in the suggested format:

In [None]:
print(to_xml(mk_doc(1, doc)))

<document index="1">
<source>
b8898fab
</source>
<document_content>
This is a sample
</document_content>
</document>


In [None]:
#| exports
def docs_xml(docs:list[str],  # The content of each document
             sources:Optional[list]=None,  # URLs, filenames, etc; each one defaults to `md5(content)` if not provided
             prefix:bool=True # Include Anthropic's suggested prose intro?
            )->str:
    "Create an XML string containing `docs` in Anthropic's recommended format"
    pre = 'Here are some documents for you to reference for your task:\n\n' if prefix else ''
    sources = [] if sources is None else list(sources)
    # Pad with `None` so a short `sources` list can't make `zip` silently drop trailing docs;
    # docs without a matching source then get `mk_doc`'s default source
    sources += [None]*(len(docs)-len(sources))
    docs = [mk_doc(i, *o) for i,o in enumerate(zip(docs,sources), start=1)]
    return pre + to_xml(xt('documents', docs))

Putting it all together, we have our final XML format:

In [None]:
docs = [doc, 'And another one']
sources = [None, 'doc.txt']
print(docs_xml(docs, sources))

Here are some documents for you to reference for your task:

<documents>
<document index="1">
<source>
b8898fab
</source>
<document_content>
This is a sample
</document_content>
</document>
<document index="2">
<source>
doc.txt
</source>
<document_content>
And another one
</document_content>
</document>
</documents>


## Context creation

Now that we can generate Anthropic's XML format, let's make it easy for a few common cases.

### File list to context

For generating XML context from files, we'll just read them as text and use the file names as `source`.

In [None]:
#| exports
def files2ctx(
    fnames:list[Union[str,Path]], # List of file names to add to context
    prefix:bool=True # Include Anthropic's suggested prose intro?
)->str: # XML for Claude context
    "Read each file in `fnames` as text and pass the texts to `docs_xml`, with the paths as their sources"
    paths = list(map(Path, fnames))
    texts = [p.read_text() for p in paths]
    return docs_xml(texts, paths, prefix=prefix)

In [None]:
fnames = ['samples/sample_core.py', 'samples/sample_styles.css']
hl_md(files2ctx(fnames))

```xml
Here are some documents for you to reference for your task:

<documents>
<document index="1">
<source>
samples/sample_core.py
</source>
<document_content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document_content>
</document>
<document index="2">
<source>
samples/sample_styles.css
</source>
<document_content>
.cell { margin-bottom: 1rem; }
.cell &gt; .sourceCode { margin-bottom: 0; }
.cell-output &gt; pre { margin-bottom: 0; }
</document_content>
</document>
</documents>
```

### Folder to context

In [None]:
#| exports
@delegates(globtastic)
def folder2ctx(
    folder:Union[str,Path], # Folder name containing files to add to context
    prefix:bool=True, # Include Anthropic's suggested prose intro?
    **kwargs # Passed to `globtastic`
)->str: # XML for Claude context
    "Find files under `folder` with `globtastic` and convert them to XML context with `files2ctx`"
    return files2ctx(globtastic(folder, **kwargs), prefix=prefix)

In [None]:
print(folder2ctx('samples', prefix=False, file_glob='*.py'))

<documents>
<document index="1">
<source>
samples/sample_core.py
</source>
<document_content>
import inspect
empty = inspect.Parameter.empty
models = 'claude-3-opus-20240229','claude-3-sonnet-20240229','claude-3-haiku-20240307'
</document_content>
</document>
</documents>


## Export -

In [None]:
#|hide
#|eval: false
from nbdev.doclinks import nbdev_export
nbdev_export()