# Python Regex

In [None]:
#| default_exp ie_func.python_regex

In [None]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import re
from typing import Iterable, Sequence

from spannerlib.span import Span

In [None]:
# TODO add booleans to the type system


In [None]:
#| export
def rgx(pattern: str, text: str) -> Iterable[Sequence]:
    """
    An IE function which runs regex using python's `re` and yields tuples of spans.

    If the pattern has no capture groups, every match yields a 1-tuple holding the
    span of the whole match. Otherwise every match yields one span per capture
    group, ordered by group number (the whole match itself is not included).

    @param pattern: the pattern of the regex operation.
    @param text: The input text for the regex operation (a string or a `Span`).
    @return: tuples of `Span` slices of the input text, one tuple per match.
    """
    text = Span(text)
    compiled_rgx = re.compile(pattern)
    num_groups = compiled_rgx.groups
    # The regex engine works on the plain string; convert once and keep the
    # span only for slicing out the results.
    text_str = str(text)
    if num_groups == 0:
        for match in compiled_rgx.finditer(text_str):
            i, j = match.span()
            # Trailing comma is deliberate: yield a 1-tuple, not a bare span,
            # so both branches produce the same shape of result.
            yield (text[i:j],)
    else:
        for match in compiled_rgx.finditer(text_str):
            # NOTE(review): `match.span(g)` is (-1, -1) for a group that did not
            # participate in the match; how `text[-1:-1]` behaves depends on
            # `Span` slicing - confirm if optional groups must be supported.
            yield tuple(
                text[i:j]
                for i, j in (match.span(g) for g in range(1, num_groups + 1))
            )

# Registration record for `rgx`: name, callable, input types, and a callable that
# maps the requested output arity to a list of output types.
# NOTE(review): the exact meaning of each slot is defined by whatever consumes this
# list, which is not visible here - confirm against the registration API.
PYRGX =[
    'rgx',
    rgx,
    [str, str],
    lambda arity: [str]*arity
]


In [None]:
# Named capture groups: group `c` wraps groups `a` and `b`, so the pattern has
# three capture groups and every match yields a 3-tuple in group-number order.
text = "aaaaa@bbbbbbaa@bb"
pattern = '(?P<c>(?P<a>a*)@(?P<b>b*))'
assert list(rgx(pattern,text)) == [
    ('aaaaa@bbbbbb', 'aaaaa', 'bbbbbb'),
    ('aa@bb', 'aa', 'bb')
]
# Last expression of the cell: displays the yielded tuples.
list(rgx(pattern,text))

[([@a254e9,0,12) "aaaaa@bbbb...",
  [@a254e9,0,5) "aaaaa",
  [@a254e9,6,12) "bbbbbb"),
 ([@a254e9,12,17) "aa@bb", [@a254e9,12,14) "aa", [@a254e9,15,17) "bb")]

In [None]:
# Only the outer group captures (the `(?:...)` groups are non-capturing), so
# every match yields a 1-tuple.
text = "aaaaa@bbbbbbaa@bb"
pattern = '((?:a*)@(?:b*))'
assert list(rgx(pattern,text)) == [
    ('aaaaa@bbbbbb',),
    ('aa@bb',)
]
# Last expression of the cell: displays the yielded tuples.
list(rgx(pattern,text))

[([@a254e9,0,12) "aaaaa@bbbb...",), ([@a254e9,12,17) "aa@bb",)]

In [None]:
# Build a Span named 'doc1' over the sample text; the cells below reuse `document`.
document = Span('dddaaaaa@bbbbbbaa@bb',name = 'doc1')
# Last expression of the cell: displays the span.
document

[@doc1,0,20) "dddaaaaa@b..."

In [None]:
# Run `rgx` with a Span as the text argument and display the yielded tuples.
list(rgx('(a*)@(b*)',document))

[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
# The yielded values compare equal to Spans built from `document` with explicit
# (start, end) offsets.
assert list(rgx('(a*)@(b*)',document)) == [
    (Span(document,3,8),Span(document,9,15)),
    (Span(document,15,17), Span(document,18,20))]
# Last expression of the cell: displays the yielded tuples.
list(rgx('(a*)@(b*)',document))

[([@doc1,3,8) "aaaaa", [@doc1,9,15) "bbbbbb"),
 ([@doc1,15,17) "aa", [@doc1,18,20) "bb")]

In [None]:
# Matching on a slice of the document is expected to give the same spans as
# matching on the whole document (the skipped prefix 'ddd' contains no match).
# NOTE(review): this presumes spans sliced from `sub_doc` keep offsets relative
# to the original document - confirm against `Span.slice`.
# `rgx` takes (pattern, text): the pattern goes first.
sub_doc = document.slice(3,None)
assert list(rgx('(a*)@(b*)',sub_doc)) == list(rgx('(a*)@(b*)',document))


[([@c1d7fe,0,8) "John Doe", [@c1d7fe,10,12) "35"),
 ([@c1d7fe,24,34) "Jane Smith", [@c1d7fe,36,38) "28")]

In [None]:
#| export
def as_str(span):
    """Yield a single 1-tuple holding the `str()` conversion of `span`."""
    as_text = str(span)
    yield (as_text,)

# Registration record for `as_str`: name, callable, input types, output types.
# NOTE(review): the exact meaning of each slot is defined by whatever consumes this
# list, which is not visible here - confirm against the registration API.
AS_STRING = [
    'as_str',
    as_str,
    [Span],
    [str]
]


In [None]:
#|hide
# Run nbdev's export step (writes the cells marked `#| export` out as a module).
import nbdev; nbdev.nbdev_export()
     