# Python Regex

In [None]:
#| default_exp ie_func.python_regex

In [None]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import re
from typing import Iterable, Sequence

from spannerlib.span import Span

## rgx

In [None]:
#| export

def rgx(pattern: str, text: str) -> Iterable[Sequence]:
    """
    An IE function which runs regex using python's `re` and yields the matched spans.

    `text` is wrapped in a `Span` and every result is a slice of that span.

    @param pattern: the pattern of the regex operation.
    @param text: The input text for the regex operation.
    @return: for a pattern with capture groups, one tuple per match holding one
        slice per group, in group order; for a pattern without capture groups,
        the slice of each whole match, yielded bare (not wrapped in a tuple).
    """
    text = Span(text)
    compiled_rgx = re.compile(pattern)
    num_groups = compiled_rgx.groups
    matches = compiled_rgx.finditer(str(text))

    if num_groups == 0:
        for match in matches:
            i, j = match.span()
            # Yielded bare on purpose: `rgx_split` reads `.start` / `.end`
            # directly off these results.
            yield text[i:j]
        return

    # NOTE(review): `match.span(g)` is (-1, -1) for a group that did not take
    # part in the match; how `Span` slices that is not visible here - confirm.
    group_ids = range(1, num_groups + 1)
    for match in matches:
        yield tuple(text[i:j] for i, j in map(match.span, group_ids))

# Descriptor bundling `rgx` with its name and two schema-like entries.
# NOTE(review): presumably (name, function, input types, output types) -
# confirm against the code that consumes this list.
PYRGX =[
    'rgx',
    rgx,
    [str, str],
    # built from an arity: one `str` entry per requested column
    lambda arity: [str]*arity
]


In [None]:
# Nested named groups: every match yields one entry per group, outermost group first.
text = "aaaaa@bbbbbbaa@bb"
pattern = '(?P<c>(?P<a>a*)@(?P<b>b*))'
expected_groups = [
    ('aaaaa@bbbbbb', 'aaaaa', 'bbbbbb'),
    ('aa@bb', 'aa', 'bb'),
]
assert list(rgx(pattern, text)) == expected_groups
list(rgx(pattern, text))

[([@a254e9,0,12) "aaaaa@bbbb...",
  [@a254e9,0,5) "aaaaa",
  [@a254e9,6,12) "bbbbbb"),
 ([@a254e9,12,17) "aa@bb", [@a254e9,12,14) "aa", [@a254e9,15,17) "bb")]

In [None]:
# Only the outer group captures, so every match yields a 1-tuple.
text = "aaaaa@bbbbbbaa@bb"
pattern = '((?:a*)@(?:b*))'
expected_groups = [
    ('aaaaa@bbbbbb',),
    ('aa@bb',),
]
assert list(rgx(pattern, text)) == expected_groups
list(rgx(pattern, text))

[([@a254e9,0,12) "aaaaa@bbbb...",), ([@a254e9,12,17) "aa@bb",)]

In [None]:
# A span over the whole string, named 'doc1'; reused by the cells below.
document = Span('dddaaaaa@bbbbbbaa@bb',name = 'doc1')
document

[@doc1,0,20) "dddaaaaa@b..."

In [None]:
# Run rgx on a Span input; the output below shows the results as spans of `doc1`.
list(rgx('(a*)@(b*)',document))

[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
# The results should equal spans built by hand over the same document.
expected_spans = [
    (Span(document, 3, 8), Span(document, 9, 15)),
    (Span(document, 15, 17), Span(document, 18, 20)),
]
assert list(rgx('(a*)@(b*)', document)) == expected_spans
list(rgx('(a*)@(b*)', document))

[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
# Compare matching on a slice of the document with matching on the whole document.
# `rgx` takes the pattern first and the text second.
sub_doc = document.slice(3,None)
assert list(rgx('(a*)@(b*)',sub_doc)) == list(rgx('(a*)@(b*)',document))


## rgx split

In [None]:
#| export
def rgx_split(delim,text,initial_tag="Start Tag"):
    """
    An IE function which given a delimiter rgx pattern and a text,
    returns tuples of the form (delimiter_match, text_before_next_delimiter).
    Note that the rgx pattern should not have any groups.

    If the first delimiter match does not start at offset 0, the text before it
    is yielded first, paired with `initial_tag`.
    If the delimiter never matches, nothing is yielded.

    @param delim: the delimiter pattern to use.
    @param text: the text to split
    @param initial_tag: the value paired with the text that precedes the first delimiter.
    @return: tuples that represent splitting the text according to delim,
        of the form (delimiter_match, text_before_next_delimiter).
    """
    delim_iter = rgx(delim,text)
    # Only `next` can raise StopIteration, so it is the only guarded statement.
    try:
        prev_span = next(delim_iter)
    except StopIteration:
        return

    # NOTE(review): span offsets are used to slice `text` directly, which
    # assumes they are relative to the start of `text` - confirm for Span inputs.
    if prev_span.start != 0:
        yield (initial_tag, text[:prev_span.start])

    for next_span in delim_iter:
        yield (prev_span, text[prev_span.end:next_span.start])
        prev_span = next_span

    # The last delimiter is paired with everything up to the end of the text.
    yield (prev_span, text[prev_span.end:])

# Descriptor bundling `rgx_split` with its name and two schema-like entries.
# NOTE(review): presumably (name, function, input types, output types) -
# confirm against the code that consumes this list.
PYRGX_SPLIT =[
    'rgx_split',
    rgx_split,
    [str, str],
    [Span,Span],
]


In [None]:
# Both inputs share everything after the first piece; only the first row differs.
shared_tail = [
    ('a', 'nnnn'),
    ('x', 'ddd'),
    ('a', 'c'),
    ('a', ''),
]

# Text before the first delimiter is paired with the default initial tag.
assert list(rgx_split('a|x', 'bbbannnnxdddaca')) == [('Start Tag', 'bbb')] + shared_tail

# A text that starts with a delimiter produces no initial-tag row.
assert list(rgx_split('a|x', 'abbbannnnxdddaca')) == [('a', 'bbb')] + shared_tail


## Span to string

In [None]:
#| export
def as_str(span):
    """
    An IE function which converts its input to a string.

    @param span: the value to convert.
    @return: a single 1-tuple holding `str(span)`.
    """
    as_text = str(span)
    yield (as_text,)

# Descriptor bundling `as_str` with its name and two schema-like entries.
# NOTE(review): presumably (name, function, input types, output types) -
# confirm against the code that consumes this list.
AS_STRING = [
    'as_str',
    as_str,
    [Span],
    [str]
]


## Expression eval

In [None]:
# Look up the built-in `eval`, which `expr_eval` below relies on.
eval

Signature: eval(source, globals=None, locals=None, /)
Docstring:
Evaluate the given source in the context of globals and locals.

The source may be a string representing a Python expression
or a code object as returned by compile().
The globals must be a dictionary and locals can be any mapping,
defaulting to the current globals and locals.
If only globals is given, locals defaults to it.
Type:      builtin_function_or_method

In [None]:
#| export
def expr_eval(template,*inputs):
    """
    An IE function which evaluates a python expression built from a template.

    Every positional field of `template` (`{0}`, `{1}`, ...) is replaced with a
    generated variable name (`arg_0`, `arg_1`, ...) bound to the matching input,
    and the resulting expression is evaluated with `eval`.

    WARNING: the expression is executed as python code; never build `template`
    from untrusted input.

    @param template: a format string describing the expression, e.g. '{0} + {1}'.
    @param inputs: the values the fields of the template refer to.
    @return: a single value, the result of the expression, yielded bare
        (not wrapped in a tuple).
    @raise ValueError: if the template uses a named field, or an index that has
        no matching input. Raised when the generator is first advanced.
    """
    arg_names = [f'arg_{i}' for i in range(len(inputs))]
    try:
        expr = template.format(*arg_names)
    except (KeyError,IndexError) as err:
        # keep the formatting failure attached as the explicit cause
        raise ValueError(f"Invalid expression template {template} for inputs {inputs}\n"
                f"make sure the expression template has only numerical indices and the number of inputs match the number of indices") from err
    # globals=None: the expression sees this module's globals, plus the inputs as locals
    yield eval(expr,None,dict(zip(arg_names,inputs)))

# Descriptor bundling `expr_eval` with its name and two schema-like entries.
# NOTE(review): presumably (name, function, input types, output types) -
# confirm against the code that consumes this list.
EXPR_EVAL = [
    'expr_eval',
    expr_eval,
    # built from an arity: one `object` entry per requested column
    lambda arity: [object]*arity,
    [object]
]

In [None]:
# Positional fields are bound to the inputs in order: 1 + 2.
assert next(expr_eval('{0} + {1}',1,2)) == 3

In [None]:
# Two spans over different strings, used by the expr_eval checks in the next cell.
a = Span('aaaa',1,3)
b = Span('bbbb',3,4)

In [None]:
# Each template is evaluated against the spans `a` and `b`; the flag says
# whether the result is expected to be truthy.
checks = [
    ('{0}.end == {1}.start', True),
    ('{0}.doc == {1}.doc', False),
    ('({0}.doc != {1}.doc) & ({0}.end == {1}.start)', True),
]
for template, should_hold in checks:
    assert bool(next(expr_eval(template, a, b))) is should_hold

In [None]:
#|hide
import nbdev

nbdev.nbdev_export()
     