In [1]:
#export
__all__ = ["grep", "grepTemplate"]
import re, k1lib
from k1lib.cli.init import BaseCli; import k1lib.cli as cli
from collections import deque; from typing import Iterator, Union, Callable, Any

In [33]:
#export
def extGuard(cond, s=""):
    """Guard used to reject arguments that are incompatible with extract mode.

:param cond: if truthy, the guard trips
:param s: extra explanation appended to the error message
:raises Exception: when ``cond`` is truthy"""
    # the exception has to actually be raised, otherwise the guard is a silent no-op
    if cond: raise Exception(f"Can't use extract mode. {s}")
inf = float("inf")
class grep(BaseCli):
    def __init__(self, pattern:Union[str, Callable[[Any], bool]], before:int=0, after:int=0, N:int=float("inf"), sep:bool=False, col:int=None, extract:str=None):
        """Find lines that have the specified pattern.
Example::

    # returns ['d', 'd']
    "abcde12d34" | grep("d") | deref()
    # returns ['c', 'd', '2', 'd'], 2 sections of ['c', 'd'] and ['2', 'd']
    "abcde12d34" | grep("d", 1) | deref()
    # returns ['c', 'd']
    "abcde12d34" | grep("d", 1, N=1) | deref()
    # returns ['d', 'e', 'd', '3', '4'], 2 sections of ['d', 'e'] and ['d', '3', '4']
    "abcde12d34" | grep("d", 0, 3).till("e") | deref()
    # returns [['0', '1', '2'], ['3', '1', '4']]
    "0123145" | grep("1", 2, 1, sep=True) | deref()

You can also separate out the sections::

    # returns [['c', 'd'], ['2', 'd']]
    "abcde12d34" | grep("d", 1, sep=True) | deref()
    # returns [['c', 'd']]
    "abcde12d34" | grep("d", 1, N=1, sep=True) | deref()
    # returns [['1', '2', '3'], ['1', '4', '5']]
    "0123145" | grep("1", sep=True).till() | deref()

You can also put in predicates instead of regex patterns::

    # returns ['d', 'd']
    "abcde12d34" | grep(lambda x: x == "d") | deref()
    # also returns ['d', 'd']
    "abcde12d34" | filt(lambda x: x == "d") | deref()
    # returns ['d', 'e', 'd', '3', '4']
    "abcde12d34" | grep(lambda x: x == "d").till(lambda x: x == "e") | deref()

The first scenario looks like a regular filter function, already implemented by :class:`~k1lib.cli.filt.filt`,
but :class:`grep` brings in more clustering features for the price of reduced
execution speed. So for simple scenarios it's advised that you use :class:`~k1lib.cli.filt.filt`.

See also: :class:`~k1lib.cli.structural.groupBy`

Also, there's a `whole tutorial <../tutorials/cli.html>`_ devoted to just this cli

Also also, if each element in the input iterator is not a string/bytes, and
you're searching using regex, then it will get its representation and searches
in it.

.. admonition:: Extract mode

    Sometimes, you want to extract a subsection of a matched string, like extracting
    links in a html file, then you can do something like this::

        # returns ['a.io', 'b.com', 'c.net']
        ["href='a.io'", "href='b.com'", "href='c.net'"] | grep("href='(?P<g>.*)'", extract="g") | deref()
        # returns [['a.io', 3], ['b.com', 4], ['c.net', 5]]
        [["href='a.io'", 3], ["href='b.com'", 4], ["href='c.net'", 5]] | grep("href='(?P<g>.*)'", extract="g", col=0) | deref()

    Essentially, you're defining the group with name "g" to be any string within a quote block
    following "href", and then it will just extract out the group that you want. Because the
    purpose of this mode is to extract matched objects, a few of the arguments don't really make
    sense and thus are disabled, like "before", "after", "sep", "N". Passing any of them
    together with "extract" raises an exception. "col" is still allowed.

Regex quick cheatsheet:

- `\\d`: digit (\\D for inverse)
- `^`: begin of string ($ for end of string)
- `\\w`: unicode word (\\W for inverse)
- `(?!...)`: matches if the inside does not match
- `(?P<name>...)`: matches as group "name"
- `A|B`: matches A or B
- `[aml]`: set of characters "a", "m" and "l"
- `a{3,5}`: matches character "a" 3 to 5 times ("aaa", "aaaa" and "aaaaa")
- `a*`: matches "a" 0 or more times (`a*?` matches "a" 0 or more times non-greedy)

:param pattern: regex pattern to search for in a line
:param before: lines before the hit. Outputs independent lines
:param after: lines after the hit. Outputs independent lines
:param N: max sections to output
:param sep: whether to separate out the sections as lists
:param col: searches for pattern in a specific column
:param extract: name of the regex group to yield instead of the whole line (extract mode)"""
        super().__init__()
        if isinstance(pattern, str):
            self._f = re.compile(pattern).search; self.mode = 0 # make func quickly accessible
        else: self._f = cli.op.solidify(pattern); self.mode = 1 # mode for either regex or normal funcs
        self.before = before; self.after = after; self.col = col; self.N = N; self.sep = sep
        self.tillPattern = None; self.tillAfter = None; self._tillF = lambda x: False; self.extract = extract
        if extract:
            # `col` is deliberately not guarded: __ror__ supports column-wise extraction
            extGuard(before, "`before` has to be zero")
            extGuard(after, "`after` has to be zero")
            extGuard(sep, "`sep` has to be False")
            extGuard(N < inf, "`N` has to be infinite. Just use head() if you want to limit the number of results")
    def till(self, pattern:Union[str, Callable[[Any], bool]]=None):
        """Greps until some other pattern appear. Inclusive, so you might want to
trim the last line. Example::

    # returns ['5', '6', '7', '8'], includes last item
    range(10) | join("") | grep("5").till("8") | deref()
    # returns ['d', 'e', 'd', '3', '4']
    "abcde12d34" | grep("d").till("e") | deref()
    # returns ['d', 'e']
    "abcde12d34" | grep("d", N=1).till("e") | deref()

If initial pattern and till pattern are the same, then you don't have to use this method at
all. Instead, do something like this::

    # returns ['1', '2', '3']
    "0123145" | grep("1", after=1e9, N=1) | deref()

:param pattern: regex string or predicate that closes a section. If None, reuses the main pattern
:raises Exception: if this cli is in extract mode"""
        if self.extract: extGuard(True, "Can't use .till() in extract mode as it makes no sense")
        if pattern is None: self._tillF = self._f
        elif isinstance(pattern, str): self._tillF = re.compile(pattern).search
        else: self._tillF = cli.op.solidify(pattern)
        # the original `after` becomes the tail emitted after the till pattern hits,
        # while the section itself stays open indefinitely (after = inf) until then
        self.tillAfter = self.after; self.after = inf; return self
    def __ror__(self, it:Iterator[str]) -> Iterator[str]:
        self.sectionIdx = 0; col = self.col; _f = self._f; _tillF = self._tillF
        if self.sep:
            # run a non-separated clone and cut its flat output wherever its section index advances
            elems = []; idx = 0
            s = self._clone(); s.sep = False
            for line in (it | s):
                if s.sectionIdx > idx: # outputs whatever remaining
                    if len(elems) > 0: yield list(elems)
                    idx = s.sectionIdx; elems = []
                elems.append(line)
            # no hits at all means no sections, so don't emit a lone empty section
            if len(elems) > 0: yield list(elems)
            return
        if self.extract:
            group = self.extract
            if col is None:
                for line in it:
                    res = _f(line)
                    if res: yield res.group(group)
            else:
                for line in it:
                    # copy the row so the matched column can be replaced by the extracted group
                    line = list(line); res = _f(line[col])
                    if res: line[col] = res.group(group); yield line
            return
        queue = deque([], self.before); counter = 0 # remaining lines after to display
        # NOTE(review): presumably RunOnce.done() reports whether it was already marked done
        # and marks it; this is what keeps the hit line from being yielded twice - confirm
        cRO = k1lib.RunOnce(); cRO.done()
        for line in it:
            if col is not None: line = list(line); elem = line[col]
            else: elem = line
            # regex mode searches the string representation of non-string elements
            if self.mode == 0 and not isinstance(elem, (str, bytes)): elem = f"{elem}"
            if _f(elem): # new section
                self.sectionIdx += 1; counter = self.after+1; cRO.revert()
                if self.sectionIdx > self.N: return
                yield from queue; queue.clear(); yield line
            elif _tillF(elem) and counter == inf: # closing section
                counter = self.tillAfter + 1; cRO.revert(); yield line
            if counter == 0:
                queue.append(line) # saves recent past lines
            elif counter > 0: # yielding "after" section
                if cRO.done(): yield line
                counter -= 1
    def __invert__(self):
        """Flips the pattern, just like how :class:`~k1lib.cli.filt.filt`
works. Example::

    # returns ['a', 'b', 'c', 'e', '1', '2', '3', '4']
    "abcde12d34" | ~grep("d") | deref()

:raises Exception: if this cli is in extract mode"""
        if self.extract: extGuard(True, "Can't invert search condition in extract mode as it makes no sense")
        f = self._f; self._f = lambda s: not f(s); return self
    def _clone(self):
        """Creates a copy of this cli with the same search/till functions. Used by
the ``sep=True`` code path in ``__ror__``."""
        answer = grep(self._f, self.before, self.after, self.N, self.sep, self.col)
        # the constructor sees a callable and would switch to predicate mode, so carry over
        # the real matcher and mode to keep stringifying non-string elements in regex mode
        answer._f = self._f; answer.mode = self.mode
        answer._tillF = self._tillF; answer.tillAfter = self.tillAfter; return answer

In [34]:
# Sanity checks for grep. Each assert pipes a small input through the cli and compares
# the dereferenced output against the expected list.
# joined, normal
assert "abcde12d34" | grep("d") | cli.deref() == ['d', 'd']
assert "abcde12d34" | grep(lambda x: x == "d") | cli.deref() == ['d', 'd']
assert "abcde12d34" | cli.filt(lambda x: x == "d") | cli.deref() == ["d", "d"]
assert "abcde12d34" | grep("d", 1) | cli.deref() == ['c', 'd', '2', 'd']
assert "abcde12d34" | grep("d", 1, N=1) | cli.deref() == ['c', 'd']
assert "0123456789" | grep("4", after=1e9, N=1) | cli.deref() == ['4', '5', '6', '7', '8', '9']
# joined, till
assert "abcde12d34" | grep("d", N=1).till("e") | cli.deref() == ['d', 'e']
assert "abcde12d34" | grep("d").till("e") | cli.deref() == ['d', 'e', 'd', '3', '4']
assert "abcde12d34" | grep(cli.op() == "d").till(cli.op() == "e") | cli.deref() == ['d', 'e', 'd', '3', '4']
assert "abcde12d34" | grep(lambda x: x == "d").till(lambda x: x == "e") | cli.deref() == ['d', 'e', 'd', '3', '4']
assert ["abcde12d34", "abcde12d34"] | grep("d", sep=True).till("e").all() | cli.deref() == [[['d', 'e'], ['d', '3', '4']], [['d', 'e'], ['d', '3', '4']]]
assert range(10) | cli.join("") | grep("5").till("8") | cli.deref() == ['5', '6', '7', '8']
assert range(10) | cli.join("") | grep("5", N=1).till("8") | cli.deref() == ['5', '6', '7', '8']
assert "0123145" | grep("1", N=1).till("1") | cli.deref() == ['1', '2', '3']
assert "0123145" | grep("1", N=1).till() | cli.deref() == ['1', '2', '3']
assert "0123145" | grep("1", after=1e9, N=1) | cli.deref() == ['1', '2', '3']
# separated
assert "abcde12d34" | grep("d", 1, sep=True) | cli.deref() == [['c', 'd'], ['2', 'd']]
assert "abcde12d34" | grep("d", 1, N=1, sep=True) | cli.deref() == [['c', 'd']]
assert "0123145" | grep("1", sep=True).till() | cli.deref() == [['1', '2', '3'], ['1', '4', '5']]
assert "0123145" | grep("1", 4, 2, sep=True) | cli.deref() == [['0', '1', '2', '3'], ['1', '4', '5']]
assert "0123145" | grep("1", 2, 1, sep=True) | cli.deref() == [['0', '1', '2'], ['3', '1', '4']]
# column search; raw string so that `\d` reaches the regex engine without an invalid-escape warning
assert [['/reset', 2902], ['/users', 6], ['/users', 11], ['/user/2', 4],
 ['/user/2', 5], ['/properties', 3], ['/properties', 8], ['/user/2', 4],
 ['/users', 4], ['/user/2', 6], ['/properties', 4], ['/properties', 6]]\
| grep(r"/user/\d+", col=0) | cli.deref() == [['/user/2', 4], ['/user/2', 5], ['/user/2', 4], ['/user/2', 6]]
# inverted
assert "abcde12d34" | ~grep("d") | cli.deref() == ['a', 'b', 'c', 'e', '1', '2', '3', '4']
# extract mode
assert ["href='a.io'", "href='b.com'", "href='c.net'"] | grep("href='(?P<g>.*)'", extract="g") | cli.deref() == ['a.io', 'b.com', 'c.net']
assert [["href='a.io'", 3], ["href='b.com'", 4], ["href='c.net'", 5]] | grep("href='(?P<g>.*)'", extract="g", col=0) | cli.deref() == [['a.io', 3], ['b.com', 4], ['c.net', 5]]

In [35]:
#export
class grepTemplate(BaseCli):
    def __init__(self, pattern:str, template:str):
        """Searches each incoming line for a regex pattern, and for every line
that matches, yields the template expanded with that match. Lines without
a match are skipped.

:param pattern: regex pattern to search for in each line
:param template: template string handed to the match object's ``expand()``"""
        super().__init__()
        self.pattern = re.compile(pattern)
        self.template = template
    def __ror__(self, it:Iterator[str]):
        super().__ror__(it)
        for line in it:
            found = self.pattern.search(line)
            # only matching lines produce output
            if found is not None:
                yield found.expand(self.template)

In [36]:
!../../export.py cli/grep

2023-06-29 12:43:55,515	INFO worker.py:1364 -- Connecting to existing Ray cluster at address: 192.168.1.133:6379...
2023-06-29 12:43:55,519	INFO worker.py:1544 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
6573    1   39%   
10122   0   61%   
rm: cannot remove '__pycache__': No such file or directory
Found existing installation: k1lib 1.4.1
Uninstalling k1lib-1.4.1:
  Successfully uninstalled k1lib-1.4.1
running install
running bdist_egg
running egg_info
creating k1lib.egg-info
writing k1lib.egg-info/PKG-INFO
writing dependency_links to k1lib.egg-info/dependency_links.txt
writing requirements to k1lib.egg-info/requires.txt
writing top-level names to k1lib.egg-info/top_level.txt
writing manifest file 'k1lib.egg-info/SOURCES.txt'
reading manifest file 'k1lib.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'k1lib.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build