# Python Regex

In [None]:
#| default_exp ie_func.basic

In [None]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#| export
import re
from typing import Iterable, Sequence
from numbers import Real
from pathlib import Path

from spannerlib.span import Span
from spannerlib.utils import DefaultIEs,DefaultAGGs

## Debugging utilities

In [None]:
#| export
def print_ie(fstring,*objects):
    """
    A debugging IE function which formats a message, prints it and yields it.

    @param fstring: a `str.format` template with positional placeholders.
    @param objects: the values substituted into the template.
    @return: yields the formatted message once, after it has been printed.
    """
    message = fstring.format(*objects)
    print(message)
    yield message

# Register `print_ie` under the name "print". The function is variadic, so
# its input side is a callable: given a count it returns that many `object`
# entries. The parameter is called `arity` (as in the other schema lambdas)
# instead of `len`, so the builtin `len` is not shadowed.
# NOTE(review): assumes the registry calls this lambda positionally - the
# differing parameter names across registrations suggest so; confirm.
DefaultIEs().add(
    "print",
    print_ie,
    lambda arity: [object]*arity,
    [object]
)

## Basic aggregation functions

In [None]:
#| export
# Register the basic aggregations. Each call passes the aggregation name, a
# second string identical to that name, and two one-element type lists.
# `count` is declared over `object` with an `int` result; the numeric
# aggregations are declared over `Real` with a `Real` result.
# NOTE(review): presumably the second string selects the underlying
# aggregation implementation and the lists are the input/output schemas -
# confirm against `DefaultAGGs.add`.
DefaultAGGs().add('count','count',[object],[int])
DefaultAGGs().add('sum','sum',[Real],[Real])
DefaultAGGs().add('avg','avg',[Real],[Real])
DefaultAGGs().add('max','max',[Real],[Real])
DefaultAGGs().add('min','min',[Real],[Real])

## rgx

In [None]:
#| export

def rgx(pattern: str, text: str) -> Iterable[Sequence]:
    """
    An IE function which runs a regex using python's `re` and yields the matches as spans of `text`.

    @param pattern: the regex pattern to search for.
    @param text: the input text; it is wrapped in a `Span` before searching.
    @return: if the pattern has no capture groups, each match is yielded as a single span
        (not wrapped in a tuple); otherwise each match is yielded as a tuple holding one
        span per capture group, in group order.
    """
    text = Span(text)
    regex = re.compile(pattern)
    group_count = regex.groups
    haystack = str(text)

    if not group_count:
        # No capture groups: the whole match is the result.
        for found in regex.finditer(haystack):
            begin, end = found.span()
            yield text[begin:end]
        return

    # One span per capture group; group 0 (the whole match) is not included.
    group_ids = range(1, group_count + 1)
    for found in regex.finditer(haystack):
        bounds = [found.span(group_id) for group_id in group_ids]
        yield tuple(text[lo:hi] for lo, hi in bounds)

# Register `rgx` under the name 'rgx'. The first list has one entry per
# argument of `rgx`: a `str` pattern and a text that may be a `str` or a
# `Span`. The last argument is a callable that, given an arity, returns that
# many `Span` entries.
# NOTE(review): presumably the lists/callable are the input and output
# schemas and the arity is the number of output terms - confirm against
# `DefaultIEs.add`.
DefaultIEs().add(
    'rgx',
    rgx,
    [str, (str,Span)],
    lambda arity: [Span]*arity
)


In [None]:
# Three capture groups (`c` wraps `a` and `b`), so every match yields a
# 3-tuple: the whole `a*@b*` match, the run of a's and the run of b's.
# The yielded spans are compared against plain strings here.
text = "aaaaa@bbbbbbaa@bb"
pattern = '(?P<c>(?P<a>a*)@(?P<b>b*))'
assert list(rgx(pattern,text)) == [
    ('aaaaa@bbbbbb', 'aaaaa', 'bbbbbb'),
    ('aa@bb', 'aa', 'bb')
]

[(0, 12), (0, 5), (6, 12)]
[(12, 17), (12, 14), (15, 17)]


In [None]:
# Only the outer group captures (the inner ones are non-capturing `(?:...)`),
# so each match is yielded as a 1-tuple.
text = "aaaaa@bbbbbbaa@bb"
pattern = '((?:a*)@(?:b*))'
assert list(rgx(pattern,text)) == [
    ('aaaaa@bbbbbb',),
    ('aa@bb',)
]
# Display the raw result to show how the spans are represented.
list(rgx(pattern,text))

[(0, 12)]
[(12, 17)]
[(0, 12)]
[(12, 17)]


[([@a254e9,0,12) "aaaaa@bbbb...",), ([@a254e9,12,17) "aa@bb",)]

In [None]:
# A span covering a whole text, explicitly named 'doc1'; used as the input
# of the following examples.
document = Span('dddaaaaa@bbbbbbaa@bb',name = 'doc1')
document

[@doc1,0,20) "dddaaaaa@b..."

In [None]:
# Running rgx on a Span: one tuple per match, one span per capture group.
list(rgx('(a*)@(b*)',document))

[(3, 8), (9, 15)]
[(15, 17), (18, 20)]


[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
# The same result written as explicit spans over `document`, given by their
# start and end offsets.
assert list(rgx('(a*)@(b*)',document)) == [
    (Span(document,3,8),Span(document,9,15)),
    (Span(document,15,17), Span(document,18,20))]
list(rgx('(a*)@(b*)',document))

[(3, 8), (9, 15)]
[(15, 17), (18, 20)]
[(3, 8), (9, 15)]
[(15, 17), (18, 20)]


[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
# Matching on a slice of the document (from offset 3 on) yields spans that
# compare equal to those obtained by matching on the whole document.
sub_doc = document.slice(3,None)
assert list(rgx('(a*)@(b*)',sub_doc)) == list(rgx('(a*)@(b*)',document))


[(0, 5), (6, 12)]
[(12, 14), (15, 17)]
[(3, 8), (9, 15)]
[(15, 17), (18, 20)]


## rgx_split

In [None]:
#| export
def rgx_split(delim,text,initial_tag="Start Tag"):
    """
    An IE function which splits a text around the matches of a delimiter rgx pattern.
    Note that the rgx pattern should not have any groups.

    @param delim: the delimiter pattern to use.
    @param text: the text to split.
    @param initial_tag: the value used in place of a delimiter for the text that
        precedes the first delimiter match.
    @return: yields tuples of the form (delimiter_match, text_up_to_next_delimiter).
        If the text does not start with a delimiter, the first tuple is
        (initial_tag, text_before_first_delimiter). If the delimiter never matches,
        nothing is yielded.
    """
    matches = rgx(delim,text)
    text = Span(text)

    current = next(matches, None)
    if current is None:
        # the delimiter does not occur in the text at all
        return

    if current.start != 0:
        # text in front of the first delimiter gets the placeholder tag
        yield (initial_tag, text[:current.start])

    for upcoming in matches:
        yield (current, text[current.end:upcoming.start])
        current = upcoming

    # whatever follows the last delimiter (possibly empty)
    yield (current, text[current.end:])

# Register `rgx_split` under the name 'rgx_split': a `str` pattern and a
# `str`-or-`Span` text on the first list, two `Span` entries on the second.
# NOTE(review): presumably these are the input and output schemas. Note that
# `rgx_split` yields `initial_tag` (a plain string by default) in the first
# position of its leading tuple - confirm the registry accepts that.
DefaultIEs().add(
    'rgx_split',
    rgx_split,
    [str, (str,Span)],
    [Span,Span],
)


In [None]:
# The text does not start with a delimiter: the leading 'bbb' is paired with
# the default initial tag.
assert list(rgx_split('a|x','bbbannnnxdddaca')) == [
    ('Start Tag', 'bbb'),
    ('a', 'nnnn'),
    ('x', 'ddd'),
    ('a', 'c'),
    ('a', '')]

# The text starts with a delimiter: no initial-tag tuple is produced.
assert list(rgx_split('a|x','abbbannnnxdddaca')) == [
    ('a', 'bbb'),
    ('a', 'nnnn'),
    ('x', 'ddd'),
    ('a', 'c'),
    ('a', '')]


## Expression eval

In [None]:
#| export
def expr_eval(template,*inputs):
    """
    An IE function which evaluates a python expression over its inputs.

    Every positional placeholder `{i}` in `template` is replaced by a variable name
    that is bound to `inputs[i]`, and the resulting expression is evaluated with `eval`.

    @param template: the expression template, e.g. '{0} + {1}'.
    @param inputs: the values the placeholders refer to.
    @return: yields the value of the evaluated expression once.
    @raise ValueError: if the template uses named placeholders or an index that has
        no matching input; the original formatting error is attached as the cause.
    """
    arg_names = [f'arg_{i}' for i in range(len(inputs))]
    try:
        expr = template.format(*arg_names)
    except (KeyError,IndexError) as err:
        # chain explicitly so the traceback shows which placeholder failed
        raise ValueError(f"Invalid expression template {template} for inputs {inputs}\n"
                f"make sure the expression template has only numerical indices and the number of inputs match the number of indices") from err
    # SECURITY: `eval` executes arbitrary python code, so templates must only
    # come from a trusted source.
    yield eval(expr,None,dict(zip(arg_names,inputs)))

# Register `expr_eval` under the name 'expr_eval'. The function is variadic,
# so the third argument is a callable that, given an arity, returns that many
# `object` entries; the last argument is a single `object` entry.
# NOTE(review): presumably these are the input and output schemas - confirm
# against `DefaultIEs.add`.
DefaultIEs().add(
    'expr_eval',
    expr_eval,
    lambda arity: [object]*arity,
    [object]
)

In [None]:
# `{0}` and `{1}` refer to the first and second input: 1 + 2 == 3.
assert next(expr_eval('{0} + {1}',1,2)) == 3

In [None]:
# Two spans over different texts; `a` ends at offset 3, where `b` starts.
a = Span('aaaa',1,3)
b = Span('bbbb',3,4)

In [None]:
# Expressions may use attributes of the inputs and combine comparisons;
# a placeholder index can be repeated within one template.
assert next(expr_eval('{0}.end == {1}.start',a,b))
assert not next(expr_eval('{0}.doc == {1}.doc',a,b))
assert next(expr_eval('({0}.doc != {1}.doc) & ({0}.end == {1}.start)',a,b))

## Span operations

In [None]:
#| export
def as_str(obj):
    """
    An IE function which converts an object to its string form.

    @param obj: the object to convert.
    @return: yields a single 1-tuple holding `str(obj)`.
    """
    text = str(obj)
    yield (text,)

# Register `as_str` under the name 'as_str', declared with one `object`
# entry and one `str` entry.
# NOTE(review): presumably the two lists are the input and output schemas -
# confirm against `DefaultIEs.add`.
DefaultIEs().add(
    'as_str',
    as_str,
    [object],
    [str]
)


In [None]:
#| export
def span_contained(s1, s2):
    """
    Checks whether span s1 is contained in span s2.

    Parameters:
        s1 (span): the candidate inner span.
        s2 (span): the candidate enclosing span.

    Returns:
        yields a single bool: True if both spans have equal `doc`, s1 starts no
        earlier than s2 and ends no later than s2; otherwise False.
    """
    is_inside = s1.doc == s2.doc and s1.start >= s2.start and s1.end <= s2.end
    yield bool(is_inside)

# Register `span_contained` under the name 'span_contained', declared with
# two `Span` entries and one `bool` entry.
# NOTE(review): presumably the two lists are the input and output schemas -
# confirm against `DefaultIEs.add`.
DefaultIEs().add(
    'span_contained',
    span_contained,
    [Span,Span],
    [bool]
)

In [None]:
# usage example
doc1 = Span('hello darkness my old friend',name='doc1')
doc2 = Span('I come to talk to you again',name='doc2')

# three spans over doc1 and one over doc2
span1 = Span(doc1,1, 10)
span2 = Span(doc1,0, 11)
span3 = Span(doc1,2, 12)
span4 = Span(doc2,3,5)



# [1,10) lies inside [0,11) ...
assert list(span_contained(span1,span2)) == [True]
# ... but not the other way around
assert list(span_contained(span2,span1)) == [False]
# span1 starts before span3 does
assert list(span_contained(span1,span3)) == [False]
# span4 is built over doc2, not doc1
assert list(span_contained(span1,span4)) == [False]

In [None]:
#| export
def deconstruct_span(span):
    """
    Splits a span into its components.

    Parameters:
        span (span): the span to take apart.

    Returns:
        yields a single tuple (span.name, span.start, span.end).
    """
    components = (span.name, span.start, span.end)
    yield components

# Register `deconstruct_span` under the name 'deconstruct_span', declared
# with one `Span` entry and the entries `str`, `int`, `int` (matching the
# name, start and end the function yields).
# NOTE(review): presumably the two lists are the input and output schemas -
# confirm against `DefaultIEs.add`.
DefaultIEs().add(
    'deconstruct_span',
    deconstruct_span,
    [Span],
    [str,int,int]
)

In [None]:
# A span created with an explicit name and one created without a name.
doc = Span('hello darkness my old friend',name='doc1')
doc2 = Span('I come to talk to you again')

assert list(deconstruct_span(doc)) == [('doc1', 0, 28)]
# NOTE(review): 'f8f5e8' is the name the unnamed span ends up with -
# presumably an id that Span derives from the text; confirm it is stable.
assert list(deconstruct_span(doc2))== [('f8f5e8', 0, 27)]

In [None]:
#| export
def read(text_path):
    """
    Reads a text file and yields its content.

    Parameters:
        text_path (str): The path to the text file to read from.

    Returns:
        yields a single str: the whole content of the file.
    """
    source = Path(text_path)
    content = source.read_text()
    yield content


def read_span(text_path):
    """
    Reads a text file and yields its content as a span named after the path.

    Parameters:
        text_path (str): The path to the text file to read from.

    Returns:
        yields a single Span over the whole content of the file, created with
        `name=text_path`.
    """
    content = Path(text_path).read_text()
    yield Span(content,name=text_path)

# Register the two file readers. Both are declared with one `str` entry
# (the path); `read` is declared with a `str` entry on the other side and
# `read_span` with a `Span` entry.
# NOTE(review): presumably the two lists are the input and output schemas -
# confirm against `DefaultIEs.add`.
DefaultIEs().add(
    'read',
    read,
    [str],
    [str]
)

DefaultIEs().add(
    'read_span',
    read_span,
    [str],
    [Span]
)

In [None]:

# Write a small file in the working directory and read it back, once as a
# plain string and once as a span.
path = Path('sample1.txt')
path.write_text('hello darkness my old friend')
text = list(read('sample1.txt'))[0]
text_span = list(read_span('sample1.txt'))[0]

# Remove the file again before checking the results.
path.unlink()

assert text == "hello darkness my old friend"
# The span compares equal to the plain string read from the same file.
assert text_span == text
text_span

[@sample1.txt,0,28) "hello dark..."

In [None]:
#|hide
import nbdev; nbdev.nbdev_export()
     