# Basic Callbacks

In [None]:
#| default_exp ie_func.basic

In [None]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import re
from typing import Iterable, Sequence,Union
from numbers import Real
from pathlib import Path

from spannerlib.span import Span
from spannerlib.utils import DefaultIEs,DefaultAGGs,visualize_callback_df

## Variable schema utils

In [None]:
#| export
def span_arity(arity):
    """Build a schema that consists of `arity` entries, each being the `Span` type."""
    schema = [Span] * arity
    return schema

def str_arity(arity):
    """Build a schema that consists of `arity` entries, each being the `str` type."""
    schema = [str] * arity
    return schema

def object_arity(arity):
    """Build a schema that consists of `arity` entries, each being the `object` type."""
    schema = [object] * arity
    return schema

## Debugging IEs

In [None]:
#| export
def print_ie(
        fstring, # a `str.format` style template
        *objects, # the values substituted positionally into the template
    ):
    """
    Debugging helper: renders `objects` into `fstring` via `str.format`,
    prints the rendered text to the console and returns it
    wrapped in a single-element list.
    """
    message = fstring.format(*objects)
    print(message)
    return [message]

# Register `print_ie` under the callback name "print".
# Arguments are: name, function, input schema, output schema.
# The input schema is the `object_arity` callable rather than a fixed list;
# the output schema is a single object (the formatted text returned by `print_ie`).
DefaultIEs().add(
    "print",
    print_ie,
    object_arity,
    [object]
)

In [None]:
_ = print_ie("Hello, {}!", "world")
_ = print_ie("Hello, {!r}", {'complicated': 'object'})

Hello, world!
Hello, {'complicated': 'object'}


## regex functions

In [None]:
#| export

def rgx(pattern: str, # the regular expression to search for
    text: Union[str,Span], # where to search; a plain string or a Span
    ):
    """
    An IE function that searches `text` with python's `re` module and yields
    one result per non-overlapping match.

    When the pattern has capture groups, each result is a tuple holding one span
    per group, ordered by the position at which the group opens in the pattern.
    When the pattern has no capture groups, each result is a single span
    (not wrapped in a tuple) covering the entire match.
    """
    source = Span(text)
    matcher = re.compile(pattern)
    group_count = matcher.groups
    for hit in matcher.finditer(str(source)):
        if group_count == 0:
            # no groups: the whole match is the result
            begin, stop = hit.span()
            yield (source[begin:stop])
        else:
            # group 0 is the whole match, so real groups are numbered from 1
            bounds = [hit.span(group) for group in range(1, group_count + 1)]
            yield tuple(source[begin:stop] for begin, stop in bounds)



# Register `rgx` under the callback name 'rgx'.
# Input schema: a string pattern followed by either a string or a Span.
# Output schema: the `span_arity` callable rather than a fixed list, since the
# number of yielded spans depends on the number of capture groups in the pattern.
DefaultIEs().add(
    'rgx',
    rgx,
    [str, (str,Span)],
    span_arity
)


In [None]:
text = "aaaaa@bbbbbbaa@bb"
# three named groups: `c` wraps the whole match while `a` and `b` are nested in it;
# every yielded tuple follows the order in which the groups open: (c, a, b)
pattern = '(?P<c>(?P<a>a*)@(?P<b>b*))'
assert list(rgx(pattern,text)) == [
    ('aaaaa@bbbbbb', 'aaaaa', 'bbbbbb'),
    ('aa@bb', 'aa', 'bb')
]

In [None]:
text = "aaaaa@bbbbbbaa@bb"
# non-capturing groups `(?:...)` produce no spans, so only the outer capture group (the entire match) is returned
pattern = '((?:a*)@(?:b*))'
assert list(rgx(pattern,text)) == [
    ('aaaaa@bbbbbb',),
    ('aa@bb',)
]
list(rgx(pattern,text))

[([@a254e9,0,12) "aaaaa@bbbb...",), ([@a254e9,12,17) "aa@bb",)]

In [None]:
document = Span('dddaaaaa@bbbbbbaa@bb',name = 'doc1')
document

[@doc1,0,20) "dddaaaaa@b..."

In [None]:
list(rgx('(a*)@(b*)',document))

[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
assert list(rgx('(a*)@(b*)',document)) == [
    (Span(document,3,8),Span(document,9,15)),
    (Span(document,15,17), Span(document,18,20))]
list(rgx('(a*)@(b*)',document))

[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
sub_doc = document.slice(3,None)
assert list(rgx('(a*)@(b*)',sub_doc)) == list(rgx('(a*)@(b*)',document))


In [None]:
#| export
def rgx_split(delim, # the regex pattern of the delimiter to split on
    text, # the text to split; a plain string or a Span
    initial_tag="Start Tag" # the tag paired with the leading chunk when the text does not start with a delimiter
    ):
    """
    An IE function that splits `text` on every match of the delimiter pattern `delim`
    and yields pairs of spans of the form (delimiter_match, text_up_to_the_next_delimiter).
    If the first delimiter is not at position 0, the text before it is yielded first,
    paired with a span built from `initial_tag`. Nothing is yielded when there is no match.
    The pattern must not contain capture groups.
    """
    # NOTE(review): the matches' .start/.end are used directly as indices into `text`;
    # this presumably assumes `text` begins at offset 0 of its document - confirm for sub-spans.
    matches = rgx(delim,text)
    body = Span(text)

    current = next(matches, None)
    if current is None:
        # no delimiter found at all
        return

    if current.start != 0:
        yield(Span(initial_tag),body[:current.start])

    for upcoming in matches:
        yield (current, body[current.end:upcoming.start])
        current = upcoming

    # the last delimiter owns everything up to the end of the text
    yield (current, body[current.end:])

# Register `rgx_split` under the callback name 'rgx_split'.
# Input schema: a string pattern followed by either a string or a Span
# (the optional `initial_tag` parameter is not part of the registered schema).
# Output schema: a pair of Spans.
DefaultIEs().add(
    'rgx_split',
    rgx_split,
    [str, (str,Span)],
    [Span,Span],
)


In [None]:
# the text does not begin with a delimiter, so the leading chunk 'bbb'
# is paired with the default initial tag
assert list(rgx_split('a|x','bbbannnnxdddaca')) == [
    ('Start Tag', 'bbb'),
    ('a', 'nnnn'),
    ('x', 'ddd'),
    ('a', 'c'),
    ('a', '')]

# the text begins with a delimiter, so no initial tag pair is produced
assert list(rgx_split('a|x','abbbannnnxdddaca')) == [
    ('a', 'bbb'),
    ('a', 'nnnn'),
    ('x', 'ddd'),
    ('a', 'c'),
    ('a', '')]


In [None]:
#| export
def rgx_is_match(delim, # the regex pattern to search for
    text, # the text to search in; a plain string or a Span
    ):
    """
    An IE function that reports whether the pattern `delim` matches anywhere in `text`.
    Returns `[True]` as soon as a first match is found and `[False]` when there is none.
    """
    # `any` stops consuming matches after the first one
    found = any(True for _ in rgx(delim,text))
    return [found]

# Register `rgx_is_match` under the callback name 'rgx_is_match'.
# Input schema: a string pattern followed by either a string or a Span.
# Output schema: a single bool.
DefaultIEs().add(
    'rgx_is_match',
    rgx_is_match,
    [str, (str,Span)],
    [bool],
)


In [None]:
assert rgx_is_match('(a*)@(b*)',document) == [True]
assert rgx_is_match('(a*)@(e+)',document) == [False]

## Expression eval

In [None]:
#| export
def expr_eval(template, # The expression template to be evaluated. 
    *inputs, # the inputs to be substituted in the template
    ):
    """
    Evaluate a python expression template against the given inputs.

    Every positional index in the template (`{0}`, `{1}`, ...) is replaced by a
    placeholder variable name, and the resulting expression is evaluated with the
    placeholders bound to the corresponding inputs. The inputs are therefore used
    as live objects (e.g. `{0}.end`), not through their string representation.
    Inputs that the template never refers to are ignored.

    This is a generator: the template is only processed once iteration starts.

    Yields:
        The result of evaluating the expression template with the given inputs.

    Raises:
        ValueError: If the template uses named fields, or refers to an index
                    for which no input was supplied. The underlying
                    KeyError/IndexError is chained as `__cause__`.
    """
    # each input i is exposed to the expression as the local variable arg_i
    names = [f'arg_{i}' for i in range(len(inputs))]
    try:
        expr = template.format(*names)
    except (KeyError, IndexError) as err:
        raise ValueError(f"Invalid expression template {template} for inputs {inputs}\n"
                         f"Make sure the expression template has only numerical indices and the number of inputs match the number of indices") from err
    # SECURITY: `eval` executes arbitrary python code, so templates must come
    # from a trusted source.
    yield eval(expr, None, dict(zip(names, inputs)))

# Register `expr_eval` under the callback name 'expr_eval'.
# The input schema is the `object_arity` callable rather than a fixed list;
# the output schema is a single object (the value of the evaluated expression).
DefaultIEs().add(
    'expr_eval',
    expr_eval,
    object_arity,
    [object]
)

In [None]:
assert next(expr_eval('{0} + {1}',1,2)) == 3

In [None]:
a = Span('aaaa',1,3)
b = Span('bbbb',3,4)

In [None]:
assert next(expr_eval('{0}.end == {1}.start',a,b))
assert not next(expr_eval('{0}.doc == {1}.doc',a,b))
assert next(expr_eval('({0}.doc != {1}.doc) & ({0}.end == {1}.start)',a,b))

In [None]:
#| export
def not_ie(val):
    """
    An IE function for logical negation: returns `[False]` when `val` is truthy
    and `[True]` otherwise.
    """
    return [False] if val else [True]

# Register `not_ie` under the callback name 'not'.
# Input schema: a single bool. Output schema: a single bool.
DefaultIEs().add(
    'not',
    not_ie,
    [bool],
    [bool]
)

In [None]:
assert not_ie(True) == [False]
assert not_ie(False) == [True]

## Span operations

In [None]:
#| export
def as_str(obj):
    """Yield a one-element tuple holding the `str()` conversion of `obj`."""
    converted = str(obj)
    yield (converted,)

# Register `as_str` under the callback name 'as_str'.
# Input schema: a single object. Output schema: a single string.
DefaultIEs().add(
    'as_str',
    as_str,
    [object],
    [str]
)


In [None]:
#| export
def span_contained(s1, s2):
    """
    Yield a single bool: True when both spans have an equal `doc` and the range
    [s1.start, s1.end] lies within [s2.start, s2.end], False otherwise.
    """
    same_doc = s1.doc == s2.doc
    # the bounds are only compared when the documents are equal
    yield bool(same_doc and s1.start >= s2.start and s1.end <= s2.end)

# Register `span_contained` under the callback name 'span_contained'.
# Input schema: two Spans. Output schema: a single bool.
DefaultIEs().add(
    'span_contained',
    span_contained,
    [Span,Span],
    [bool]
)

In [None]:
# usage example
doc1 = Span('hello darkness my old friend',name='doc1')
doc2 = Span('I come to talk to you again',name='doc2')

span1 = Span(doc1,1, 10)
span2 = Span(doc1,0, 11)
span3 = Span(doc1,2, 12)
span4 = Span(doc2,3,5)



assert list(span_contained(span1,span2)) == [True]
assert list(span_contained(span2,span1)) == [False]
assert list(span_contained(span1,span3)) == [False]
assert list(span_contained(span1,span4)) == [False]

In [None]:
#| export
def deconstruct_span(span):
    """Yield one tuple with the span's `name`, `start` and `end` attributes, in that order."""
    parts = (span.name, span.start, span.end)
    yield parts

# Register `deconstruct_span` under the callback name 'deconstruct_span'.
# Input schema: a single Span. Output schema: (str, int, int), matching the
# (name, start, end) tuple the function yields.
DefaultIEs().add(
    'deconstruct_span',
    deconstruct_span,
    [Span],
    [str,int,int]
)

In [None]:
doc = Span('hello darkness my old friend',name='doc1')
doc2 = Span('I come to talk to you again')

assert list(deconstruct_span(doc)) == [('doc1', 0, 28)]
assert list(deconstruct_span(doc2))== [('f8f5e8', 0, 27)]

In [None]:
#| export
def read(text_path, # location of the text file to load
    ):
    """Yields the whole content of the file at `text_path` as a single string."""
    source_file = Path(text_path)
    content = source_file.read_text()
    yield content

In [None]:
#| export
def read_span(
    text_path, # location of the text file to load
    ):
    """Yields the whole content of the file at `text_path` as a single Span
    whose name is set to `text_path`.
    """
    content = Path(text_path).read_text()
    yield Span(content,name=text_path)

In [None]:
#| export
# Register the file reading callbacks; both take a single string (the path).
# 'read' outputs the file content as a string.
DefaultIEs().add(
    'read',
    read,
    [str],
    [str]
)

# 'read_span' outputs the file content as a Span.
DefaultIEs().add(
    'read_span',
    read_span,
    [str],
    [Span]
)

In [None]:
path = Path('sample1.txt')
path.write_text('hello darkness my old friend')
text = list(read('sample1.txt'))[0]
text_span = list(read_span('sample1.txt'))[0]

path.unlink()

assert text == "hello darkness my old friend"
assert text_span == text
text_span

[@sample1.txt,0,28) "hello dark..."

## Basic Aggs

Spannerlib also supports some pandas aggregation functions

In [None]:
#| exports
# Register the built-in aggregations. Here the second argument is a string
# rather than a python callable; the last two arguments are the input and
# output schemas.
# NOTE(review): if these strings are handed to pandas as aggregation names,
# pandas calls the average 'mean', not 'avg' - confirm how 'avg' is resolved.
DefaultAGGs().add('count','count',[object],[int])
DefaultAGGs().add('sum','sum',[Real],[Real])
DefaultAGGs().add('avg','avg',[Real],[Real])
DefaultAGGs().add('max','max',[Real],[Real])
DefaultAGGs().add('min','min',[Real],[Real])

## Callback names and Schemas



In [None]:
#| hide
from itables import show

In [None]:
#| echo: false
df = visualize_callback_df()
show(df,paging=False)

name,function,input_schema,output_schema,type
print,print_ie,object_arity,['object'],IE Function
rgx,rgx,"['str', ('str', 'Span')]",span_arity,IE Function
rgx_split,rgx_split,"['str', ('str', 'Span')]","['Span', 'Span']",IE Function
rgx_is_match,rgx_is_match,"['str', ('str', 'Span')]",['bool'],IE Function
expr_eval,expr_eval,object_arity,['object'],IE Function
not,not_ie,['bool'],['bool'],IE Function
as_str,as_str,['object'],['str'],IE Function
span_contained,span_contained,"['Span', 'Span']",['bool'],IE Function
deconstruct_span,deconstruct_span,['Span'],"['str', 'int', 'int']",IE Function
read,read,['str'],['str'],IE Function


In [None]:
#|hide
import nbdev; nbdev.nbdev_export()
     