# Python Regex

In [None]:
#| default_exp ie_func.python_regex

In [None]:
#| hide
from nbdev.showdoc import show_doc
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import re
from typing import Iterable, Sequence

from spannerlib.span import Span

In [None]:
#| export
def rgx(text: str, regex_pattern: str) -> Iterable[Sequence]:
    """
    An IE function that applies a regular expression (python's `re`) to a text
    and yields, for every match, a list of the matched strings.

    When the pattern has capture groups, each yielded list holds one entry per
    group (`None` for a group that did not take part in the match). When the
    pattern has no capture groups, each yielded list holds the whole match.

    @param text: the text to search.
    @param regex_pattern: the regular expression to search for.
    @return: an iterable of lists of strings, one list per match.
    """
    pattern = re.compile(regex_pattern)
    has_groups = pattern.groups > 0
    for found in pattern.finditer(text):
        yield list(found.groups()) if has_groups else [found.group()]

In [None]:
#| export
# Descriptor bundling the `rgx` function with a name and its type schema, laid
# out as [name, callable, input types, output types].
# NOTE(review): presumably consumed when registering IE functions with the
# engine -- confirm against the caller.
PYRGX_STRING = [
    'rgx',
    rgx,
    # One entry per parameter of `rgx` (text, regex_pattern).
    [str, str],
    # The output types depend on the arity passed in: a list holding `str`
    # repeated `output_arity` times.
    lambda output_arity: [str] * output_arity
]

In [None]:
# Sanity check: with two capture groups, every match yields a two-string list.
assert list(rgx('aaaaa@bbbbbbaa@bb','(a*)@(b*)')) == [['aaaaa', 'bbbbbb'], ['aa', 'bb']]

In [None]:
#| export
def rgx_span(text: str, regex_pattern: str) -> Iterable[Sequence]:
    """
    An IE function which runs regex using python's `re` and yields lists of spans.

    For a pattern with capture groups, each yielded list holds one `Span` per
    group. For a pattern without capture groups, each yielded list holds a
    single `Span` covering the whole match, so the result is always a
    sequence, mirroring what `rgx` does for strings.

    @param text: The input text for the regex operation. A `Span` is accepted
        as well; it is converted with `as_str()` and the resulting spans are
        built over that plain string.
    @param regex_pattern: the pattern of the regex operation.
    @return: lists of spans that represent the results, one list per match.
    """
    compiled_rgx = re.compile(regex_pattern)
    num_groups = compiled_rgx.groups
    if isinstance(text, Span):
        text = text.as_str()
    if num_groups == 0:
        # Group 0 is the whole match; it is the only output column.
        group_indices = [0]
    else:
        group_indices = range(1, num_groups + 1)
    for match in compiled_rgx.finditer(text):
        # NOTE: `match.span(i)` is (-1, -1) for a group that did not
        # participate in the match; such offsets are passed on unchanged.
        yield [Span(text, *match.span(i)) for i in group_indices]

In [None]:
#| export
# Descriptor bundling the `rgx_span` function with a name and its type schema,
# laid out as [name, callable, input types, output types].
# NOTE(review): presumably consumed when registering IE functions with the
# engine -- confirm against the caller.
PYRGX = [
    'rgx_span',
    rgx_span,
    # One entry per parameter of `rgx_span` (text, regex_pattern).
    [str, str],
    # The output types depend on the arity passed in: a list holding `Span`
    # repeated `output_arity` times.
    lambda output_arity: [Span] * output_arity
]

In [None]:
# Build a sample span named 'doc1' over the demo text; the bare expression
# below makes the notebook display it.
document = Span('aaaaa@bbbbbbaa@bb',name = 'doc1')
document

[@doc1,0,17) "aaaaa@bbbb..."

In [None]:
# For every match, print where group 1 starts and where group 2 ends,
# i.e. the outer bounds of the two captured parts.
for m in re.finditer('(a*)@(b*)', 'aaaaa@bbbbbbaa@bb'):
    group1_start = m.start(1)
    group2_end = m.end(2)
    print(group1_start, group2_end)

0 12
12 17


In [None]:
# Run `rgx_span` on the Span document; each match yields one span per capture group.
list(rgx_span(document,'(a*)@(b*)'))

[[[@a254e9,0,5) "aaaaa", [@a254e9,6,12) "bbbbbb"],
 [[@a254e9,12,14) "aa", [@a254e9,15,17) "bb"]]

In [None]:
# Sanity check: the spans produced by `rgx_span` compare equal to spans built
# directly from `document` with the same offsets.
assert list(rgx_span(document,'(a*)@(b*)')) == [
    [Span(document,0,5),Span(document,6,12)],
    [Span(document,12,14), Span(document,15,17)]]
# Display the result as well.
list(rgx_span(document,'(a*)@(b*)'))

[[[@a254e9,0,5) "aaaaa", [@a254e9,6,12) "bbbbbb"],
 [[@a254e9,12,14) "aa", [@a254e9,15,17) "bb"]]

In [None]:
#| export
def as_str(span):
    """
    An IE function which converts a span into its string form.

    @param span: the span to convert; it must provide an `as_str()` method.
    @return: a single-element list holding the result of `span.as_str()`.
    """
    content = span.as_str()
    return [content]

# Descriptor bundling the `as_str` function with a name and its type schema,
# laid out as [name, callable, input types, output types]. Unlike the regex
# descriptors, the output types here are a fixed list rather than a callable.
# NOTE(review): presumably consumed when registering IE functions with the
# engine -- confirm against the caller.
AS_STRING = [
    'as_str',
    as_str,
    [Span],
    [str]
]


In [None]:
#|hide
# Run nbdev's export step (presumably writes the `#| export` cells to the
# module named by `default_exp` -- see the nbdev documentation).
import nbdev; nbdev.nbdev_export()
     