In [48]:
#export
"""
This is for all short utilities that convert from 1 data type to another. They
might feel like they have different styles, as :class:`toFloat` converts an object iterator
to a float iterator, while :class:`toPIL` converts a single image url to a single PIL image,
whereas :class:`toSum` converts a float iterator into a single float value.

The general convention is, if the intended operation sounds simple (convert to floats,
strings, types, ...), then most likely it will convert iterator to iterator, as you
can always use the function directly if you only want to apply it on 1 object.

If it sounds complicated (convert to PIL image, tensor, ...) then most likely it will
convert object to object. Lastly, there are some where it just feels right to input
an iterator and output a single object (like getting max, min, std, mean values)."""
__all__ = ["toTensor", "toRange", "toList",
           "toSum", "toProd", "toAvg", "toMean", "toStd", "toMax", "toMin", "toArgmin", "toArgmax",
           "toPIL", "toImg", "toRgb", "toRgba", "toGray", "toDict",
           "toFloat", "toInt", "toBytes", "toDataUri", "toAnchor", "toHtml",
           "toAscii", "toHash", "toCsv", "toAudio", "toUnix", "toIso", "toYMD", "toLinks",
           "toMovingAvg", "toCm"]
import re, k1lib, math, os, numpy as np, io, json, base64, unicodedata, inspect
from k1lib.cli.init import BaseCli, T, yieldT; import k1lib.cli as cli, k1lib.cli.init as init
from k1lib.cli.typehint import *; mpl = k1lib.dep("matplotlib"); plt = k1lib.dep("matplotlib.pyplot")
from collections import deque, defaultdict; from typing import Iterator, Any, List, Set, Tuple, Dict, Callable, Union
# cli-wide settings object; the clis below register their own options on it
settings = k1lib.settings.cli
# Optional dependencies: each has* flag records whether the import succeeded,
# so that the clis below can skip branches for packages that are not installed
try: import PIL; hasPIL = True
except: hasPIL = False
try: import torch; hasTorch = True
except: torch = k1lib.dep("torch"); hasTorch = False # NOTE(review): k1lib.dep() presumably returns a placeholder that complains on use -- confirm
try: import rdkit; hasRdkit = True
except: hasRdkit = False
try: import graphviz; hasGraphviz = True
except: hasGraphviz = False
try: import plotly; import plotly.express as px; hasPlotly = True
except: hasPlotly = False

In [2]:
# NOTE(review): presumably patches numpy arrays so that `array | cli` works in the test cells below -- confirm in k1lib.cli.init
cli.init.patchNumpy()

In [3]:
#export
class toTensor(BaseCli):
    def __init__(self, dtype=None):
        """Converts generator to :class:`torch.Tensor`. Essentially
``torch.tensor(list(it))``. Default dtype is float32. Example::

    # returns tensor([0., 1., 2.])
    range(3) | toTensor()
    # returns tensor with dtype torch.float64
    range(3) | toTensor(torch.float64)

Also checks if input is a PIL Image. If yes, turn it into a :class:`torch.Tensor`
with dimensions (C, H, W) and return.

:param dtype: dtype of the returned tensor. Defaults to ``torch.float32``"""
        self.dtype = dtype or torch.float32
    def __ror__(self, it:Iterator[float]) -> "torch.Tensor":
        # Only the import is guarded: PIL is optional, but a conversion that
        # fails for a real image should surface instead of being swallowed
        try: import PIL.Image; pilImage = PIL.Image.Image
        except ImportError: pilImage = None
        if pilImage is not None and isinstance(it, pilImage): # stolen from torchvision ToTensor transform
            pic = it
            mode_to_nptype = {'I': np.int32, 'I;16': np.int16, 'F': np.float32}
            img = torch.from_numpy(np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True))
            if pic.mode == '1': img = 255 * img
            img = img.view(pic.size[1], pic.size[0], len(pic.getbands()))
            return img.permute((2, 0, 1)).contiguous().to(self.dtype) # put it from HWC to CHW format
        if isinstance(it, np.ndarray): return torch.tensor(it).to(self.dtype)
        return torch.tensor(list(it)).to(self.dtype)

In [4]:
# an explicitly passed dtype overrides the float32 default
assert (range(3) | toTensor(torch.float64)).dtype == torch.float64

In [5]:
#export
class toList(BaseCli): # this still exists cause some LLVM optimizations are done on this, and too tired to change that at the moment
    def __init__(self):
        """Materializes the incoming iterator into a list.
Example::

    # returns [0, 1, 2, 3, 4]
    range(5) | toList()
    # returns [0, 1, 2, 3, 4], this is the preferred spelling
    range(5) | aS(list)

This cli is considered outdated: it works fine, but ``aS(list)`` does
the same thing. It's kept around so that old projects don't break."""
        super().__init__()
    def _typehint(self, inp):
        # list/iterator/set hints become a list of the same element type
        if isinstance(inp, tListIterSet):
            return tList(inp.child)
        # fixed collections pass through as-is, everything else is a list of anything
        return inp if isinstance(inp, tCollection) else tList(tAny())
    def __ror__(self, it:Iterator[Any]) -> List[Any]:
        return [*it]
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        # identity function on the js side
        js = f"const {fName} = ({dName}) => {dName}"
        return js, fName
def _toRange(it):
    for i, _ in enumerate(it): yield i
class toRange(BaseCli):
    def __init__(self):
        """Replaces every element with its index, so effectively
``iter(range(len(it)))``. Example::

    # returns [0, 1, 2]
    [3, 2, 5] | toRange() | deref()"""
        super().__init__()
    def __ror__(self, it:Iterator[Any]) -> Iterator[int]:
        # sized inputs get a real range object, everything else is counted lazily
        try: n = len(it)
        except: return _toRange(it)
        return range(n)
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        js = f"const {fName} = ({dName}) => {dName}.toRange()"
        return js, fName
# Optimization pass matching 2 consecutive toRange clis, keeping only the first of them
# NOTE(review): exact tOpt.addPass() callback contract (cs, ts, _) is defined in k1lib.cli.typehint -- confirm
tOpt.addPass(lambda cs, ts, _: [cs[0]], [toRange, toRange])

In [6]:
# NOTE(review): tCheck() presumably turns on type-hint checking for the downstream clis -- confirm in k1lib.cli.typehint
assert [2, 3.5, "ah", torch.randn(2, 3)] | tCheck() | cli.apply(type) | cli.deref() == [int, float, str, torch.Tensor]
# sized input: toRange() returns a real range object
assert range(10, 20) | tCheck() | toRange() == range(0, 10)
assert [3, 2, 5] | toRange() | cli.deref() == [0, 1, 2]

In [7]:
#export
# Registers the "arrayTypes" setting: the classes that the clis below treat as native
# arrays. torch.Tensor is only included when PyTorch could be imported
settings.add("arrayTypes", (torch.Tensor, np.ndarray) if hasTorch else (np.ndarray,), "default array types used to accelerate clis")
def genericTypeHint(inp):
    """Type hint shared by the reducing clis (sum, product): returns the
    element type of the incoming container hint, or ``tAny()`` if the hint
    is not a recognized container."""
    extractors = ((tListIterSet, lambda hint: hint.child),
                  (tCollection, lambda hint: hint.children[0]),
                  (tArrayTypes, lambda hint: hint.child))
    # first matching container kind wins, checked in the order above
    for hintType, elementOf in extractors:
        if isinstance(inp, hintType): return elementOf(inp)
    return tAny()
class toSum(BaseCli):
    def __init__(self):
        """Adds up all incoming numbers. :class:`torch.Tensor` and
:class:`numpy.ndarray` inputs are summed natively. Example::

    # returns 45
    range(10) | toSum()"""
        super().__init__()
    def _all_array_opt(self, it, level):
        # sums over every axis from `level` onwards, using the array's own library
        if isinstance(it, np.ndarray): backend = np
        elif hasTorch and isinstance(it, torch.Tensor): backend = torch
        else: return NotImplemented
        axes = tuple(range(level, len(it.shape)))
        return backend.sum(it, axes)
    def _typehint(self, inp): return genericTypeHint(inp)
    def __ror__(self, it:Iterator[float]):
        return it.sum() if isinstance(it, settings.arrayTypes) else sum(it)
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        js = f"const {fName} = ({dName}) => {dName}.toSum()"
        return js, fName
class toProd(BaseCli):
    def __init__(self):
        """Multiplies all incoming numbers together. :class:`torch.Tensor` and
:class:`numpy.ndarray` inputs are reduced natively. Example::

    # returns 362880
    range(1,10) | toProd()"""
        super().__init__()
    def _all_array_opt(self, it, level):
        nDims = len(it.shape)
        if isinstance(it, np.ndarray):
            return np.prod(it, tuple(range(level, nDims)))
        if hasTorch and isinstance(it, torch.Tensor):
            # torch.prod is applied one dimension at a time here: reducing dimension
            # `level` repeatedly eats up all trailing dimensions
            for _ in range(nDims - level): it = torch.prod(it, level)
            return it
        return NotImplemented
    def _typehint(self, inp): return genericTypeHint(inp)
    def __ror__(self, it):
        return it.prod() if isinstance(it, settings.arrayTypes) else math.prod(it)
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        js = f"const {fName} = ({dName}) => {dName}.toProd()"
        return js, fName
class toAvg(BaseCli):
    def __init__(self):
        """Averages all incoming numbers. :class:`torch.Tensor` and
:class:`numpy.ndarray` inputs are averaged natively. Example::

    # returns 4.5
    range(10) | toAvg()
    # returns nan (unless the "strict" cli setting is on)
    [] | toAvg()"""
        super().__init__()
    def _all_array_opt(self, it, level):
        # averages over every axis from `level` onwards, using the array's own library
        if isinstance(it, np.ndarray): backend = np
        elif hasTorch and isinstance(it, torch.Tensor): backend = torch
        else: return NotImplemented
        axes = tuple(range(level, len(it.shape)))
        return backend.mean(it, axes)
    def _typehint(self, inp):
        elem = None
        # checked independently, so a later match overrides an earlier one
        if isinstance(inp, tListIterSet): elem = inp.child
        if isinstance(inp, tCollection): elem = inp.children[0]
        if isinstance(inp, tArrayTypes): elem = inp.child
        if elem is None: return tAny()
        # average of ints is a float
        return float if elem == int else elem
    def __ror__(self, it:Iterator[float]):
        if isinstance(it, settings.arrayTypes): return it.mean()
        total = 0; count = 0
        for v in it:
            total += v
            count += 1
        # empty input: nan in lenient mode, division by zero in strict mode
        if count == 0 and not k1lib.settings.cli.strict: return float("nan")
        return total / count
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        js = f"const {fName} = ({dName}) => {dName}.toAvg()"
        return js, fName
# torchStd(it, ddof, dim=None): standard deviation of a tensor with divisor ``N - ddof``,
# optionally over dimensions ``dim``. All 3 variants below share the same signature, so
# that callers don't have to care about which one got picked
if hasTorch:
    torchVer = int(torch.__version__.split(".")[0])
    if torchVer >= 2:
        # PyTorch 2 supports arbitrary correction factors directly
        def torchStd(it, ddof, dim=None): return torch.std(it, dim, correction=ddof)
    else:
        # PyTorch 1 only knows biased (ddof=0) and unbiased (ddof=1) estimates
        def torchStd(it, ddof, dim=None):
            if ddof == 0: return torch.std(it, dim, unbiased=False)
            if ddof == 1: return torch.std(it, dim, unbiased=True)
            raise Exception(f"Please install PyTorch 2, as version 1 don't support correction factor of {ddof}")
else:
    def torchStd(it, ddof, dim=None): raise Exception("PyTorch not installed")
class toStd(BaseCli):
    def __init__(self, ddof:int=0):
        """Calculates standard deviation of list of numbers. Can pipe in :class:`torch.Tensor`
or :class:`numpy.ndarray` to be faster. Example::

    # returns 2.8722813232690143
    range(10) | toStd()
    # returns 3.0276503540974917
    range(10) | toStd(1)
    # returns nan
    [] | toStd()

:param ddof: "delta degree of freedom". The divisor used in calculations is ``N - ddof``.
    Applies to all input types: arrays, tensors and plain iterators"""
        self.ddof = ddof
    def _all_array_opt(self, it, level):
        # reduces over every axis from `level` onwards
        n = len(it.shape); ddof = self.ddof; dim = tuple(range(level, n))
        if isinstance(it, np.ndarray): return np.std(it, ddof=ddof, axis=dim)
        elif hasTorch and isinstance(it, torch.Tensor): return torchStd(it, ddof, dim)
        return NotImplemented
    def __ror__(self, it):
        ddof = self.ddof
        if isinstance(it, np.ndarray): return np.std(it, ddof=ddof)
        if hasTorch and isinstance(it, torch.Tensor): return torchStd(it, ddof)
        # plain iterators have to honor ddof too, else toStd(1) silently computes toStd(0)
        return np.std(np.array(list(it)), ddof=ddof)
    def _jsF(self, meta):
        # NOTE(review): the js side is called without ddof -- confirm that's intended
        fIdx = init._jsFAuto(); dataIdx = init._jsDAuto()
        return f"const {fIdx} = ({dataIdx}) => {dataIdx}.toStd()", fIdx
toMean = toAvg # alias, both names are exported in __all__

In [8]:
assert range(10) | tCheck() | toSum() == 45
assert range(1,10) | tCheck() | toProd() == 362880
assert range(10) | tCheck() | toAvg() == 4.5
# nan != nan, and comparing id()s of 2 temporaries only passes by accident of memory reuse
assert math.isnan([] | toAvg())
assert np.linspace(2, 3) | toAvg() == 2.5
assert 2.8 < (range(10) | toStd()) < 2.9
# .all() reduces everything but the first axis, .all(2) everything but the first 2 axes
a = np.random.randn(2, 3, 4); ta = torch.tensor(a)
assert a  | toSum() .all()  | cli.op().shape == (2,);   assert a  | toProd().all()  | cli.op().shape == (2,)
assert a  | toAvg() .all()  | cli.op().shape == (2,);   assert a  | toStd() .all()  | cli.op().shape == (2,)
assert ta | toSum() .all()  | cli.op().shape == (2,);   assert ta | toProd().all()  | cli.op().shape == (2,)
assert ta | toAvg() .all()  | cli.op().shape == (2,);   assert ta | toStd() .all()  | cli.op().shape == (2,)
assert a  | toSum() .all(2) | cli.op().shape == (2, 3); assert a  | toProd().all(2) | cli.op().shape == (2, 3)
assert a  | toAvg() .all(2) | cli.op().shape == (2, 3); assert a  | toStd() .all(2) | cli.op().shape == (2, 3)
assert ta | toSum() .all(2) | cli.op().shape == (2, 3); assert ta | toProd().all(2) | cli.op().shape == (2, 3)
assert ta | toAvg() .all(2) | cli.op().shape == (2, 3); assert ta | toStd() .all(2) | cli.op().shape == (2, 3)

In [9]:
#export
class toMax(BaseCli):
    def __init__(self):
        """Finds the largest of the incoming numbers. :class:`torch.Tensor` and
:class:`numpy.ndarray` inputs are reduced natively. Example::

    # returns 6
    [2, 5, 6, 1, 2] | toMax()"""
        super().__init__()
    def _all_array_opt(self, it, level):
        nDims = len(it.shape)
        if isinstance(it, np.ndarray):
            return np.max(it, tuple(range(level, nDims)))
        if hasTorch and isinstance(it, torch.Tensor):
            # torch.max over a dimension returns (values, indices): keep the values and
            # reduce dimension `level` repeatedly until all trailing dimensions are gone
            for _ in range(nDims - level): it = torch.max(it, level)[0]
            return it
        return NotImplemented
    def __ror__(self, it:Iterator[float]) -> float:
        return it.max() if isinstance(it, settings.arrayTypes) else max(it)
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        js = f"const {fName} = ({dName}) => {dName}.toMax()"
        return js, fName
class toMin(BaseCli):
    def __init__(self):
        """Finds the smallest of the incoming numbers. :class:`torch.Tensor` and
:class:`numpy.ndarray` inputs are reduced natively. Example::

    # returns 1
    [2, 5, 6, 1, 2] | toMin()"""
        super().__init__()
    def _all_array_opt(self, it, level):
        nDims = len(it.shape)
        if isinstance(it, np.ndarray):
            return np.min(it, tuple(range(level, nDims)))
        if hasTorch and isinstance(it, torch.Tensor):
            # torch.min over a dimension returns (values, indices): keep the values and
            # reduce dimension `level` repeatedly until all trailing dimensions are gone
            for _ in range(nDims - level): it = torch.min(it, level)[0]
            return it
        return NotImplemented
    def __ror__(self, it:Iterator[float]) -> float:
        return it.min() if isinstance(it, settings.arrayTypes) else min(it)
    def _jsF(self, meta):
        fName = init._jsFAuto()
        dName = init._jsDAuto()
        js = f"const {fName} = ({dName}) => {dName}.toMin()"
        return js, fName

In [10]:
assert [2, 5, 6, 1, 2] | toMax() == 6
assert [2, 5, 6, 1, 2] | toMin() == 1
# .all(2) keeps the first 2 axes and reduces the rest, for both numpy and torch inputs
a = np.random.randn(2, 3, 4); ta = torch.tensor(a)
assert a  | toMax().all(2) | cli.op().shape == (2, 3)
assert ta | toMax().all(2) | cli.op().shape == (2, 3)
assert a  | toMin().all(2) | cli.op().shape == (2, 3)
assert ta | toMin().all(2) | cli.op().shape == (2, 3)

In [11]:
#export
class toArgmin(BaseCli):
    def __init__(self):
        """Get the input iterator's index of the min value.
Example::

    [2, 3, 4, 1, 5] | toArgmin() # returns 3
    iter([2, 3, 4, 1, 5]) | toArgmin() # returns 3, generators work too
"""
    def __ror__(self, it):
        if isinstance(it, k1lib.settings.cli.arrayTypes): return it.argmin().item()
        # Materialize anything that isn't a plain sequence (generators, sets, ...), so
        # that numpy builds a proper 1d array out of it instead of a 0d object array
        if not isinstance(it, (list, tuple)): it = list(it)
        return np.array(it).argmin().item()
class toArgmax(BaseCli):
    def __init__(self):
        """Get the input iterator's index of the max value.
Example::

    [2, 3, 4, 1, 5] | toArgmax() # returns 4
    iter([2, 3, 4, 1, 5]) | toArgmax() # returns 4, generators work too
"""
    def __ror__(self, it):
        if isinstance(it, k1lib.settings.cli.arrayTypes): return it.argmax().item()
        # Materialize anything that isn't a plain sequence (generators, sets, ...), so
        # that numpy builds a proper 1d array out of it instead of a 0d object array
        if not isinstance(it, (list, tuple)): it = list(it)
        return np.array(it).argmax().item()

In [12]:
# both return the position of the extreme value, not the value itself
assert [2, 3, 4, 1, 5] | toArgmin() == 3
assert [2, 3, 4, 1, 5] | toArgmax() == 4

In [13]:
#export
# settings read by toPIL(): font used when rendering text, and image size of rdkit molecules
settings.add("font", None, "default font file. Best to use .ttf files, used by toPIL()")
settings.add("chem", k1lib.Settings().add("imgSize", 200, "default image size used in toPIL() when drawing rdkit molecules"), "chemistry-related settings")
def cropToContentNp(ogIm, pad=10):
    """Crops away the uniform border around the content of an image array.

    "Content" is every pixel whose value differs from the image's maximum
    value, so the background is expected to be the brightest value.

    :param ogIm: 2d array (H, W), or 3d array whose first axis gets averaged
        away before locating the content, like the (C, H, W) arrays that
        :func:`cropToContentPIL` passes in
    :param pad: number of background pixels to keep around the content.
        Clamped at the image borders
    :return: cropped view of ``ogIm``, or ``ogIm`` itself if it has no content"""
    dim = len(ogIm.shape); im = ogIm
    if dim > 2: im = im.mean(0)
    coords = np.argwhere(im.max()-im)
    if len(coords) == 0: return ogIm # uniform image, there's nothing to crop to
    x_min, y_min = coords.min(axis=0); x_max, y_max = coords.max(axis=0)
    # Clamp the start at 0, else a negative index wraps around to the other side and
    # yields an empty/wrong crop. Overshooting the end is fine, slices clip there
    x0 = max(x_min-pad, 0); y0 = max(y_min-pad, 0)
    x1 = x_max+1+pad; y1 = y_max+1+pad
    return ogIm[x0:x1, y0:y1] if dim == 2 else ogIm[:, x0:x1, y0:y1]
def cropToContentPIL(im, pad=0):
    """PIL flavor of :func:`cropToContentNp`: converts the image to an integer
    array, crops it to its content with ``pad`` pixels of margin, and converts
    the result back to a PIL image."""
    cropped = im | toTensor(int) | cli.op().numpy() | cli.aS(cropToContentNp, pad)
    if len(cropped.shape) > 2:
        # toTensor() gives (C, H, W), while toImg() wants (H, W, C)
        return torch.from_numpy(cropped).permute(1, 2, 0) | toImg()
    return cropped | toImg()
class toPIL(BaseCli):
    def __init__(self, closeFig=True, crop=True):
        """Converts multiple data types into a PIL image.
Example::

    # grabs first image in the current folder
    ls(".") | toPIL().all() | item()
    # converts from tensor/array to image
    torch.randn(100, 200) | toPIL()
    # grabs image, converts to byte stream, and converts back to image
    "abc.jpg" | toPIL() | toBytes() | toPIL()
    # converts paragraphs to image
    ["abc", "def"] | toPIL()
    # converts SMILES string to molecule, then to image
    "c1ccc(C)cc1" | toMol() | toImg()
    # sketches a graphviz plot, converts to svg then renders the svg as an image
    ["ab", "bc", "ca"] | (kgv.sketch() | kgv.edges()) | toHtml() | toImg()

You can also save a matplotlib figure by piping in a :class:`matplotlib.figure.Figure` object::

    x = np.linspace(0, 4)
    plt.plot(x, x**2)
    plt.gcf() | toPIL()

.. note::

    If you are working with image tensors, which typically have
    dimensions of (C, H, W), you have to permute it to PIL's (H, W, C)
    first before passing it into this cli.

    Also it's expected that
    your tensor image ranges from 0-255, and not 0-1. Make sure you
    renormalize it

:param closeFig: if input is a matplotlib figure, then closes the figure after generating the image
:param crop: whether to crop white spaces around an image or not. Applies to
    inputs that are rendered by this cli: matplotlib figures, rdkit molecules and text"""
        # importing the submodule explicitly, as a bare `import PIL` doesn't guarantee PIL.Image exists
        import PIL.Image; self.PIL = PIL; self.closeFig = closeFig; self.crop = crop
    def _typehint(self, inp):
        return self.PIL.Image.Image
    def __ror__(self, path) -> "PIL.Image.Image":
        if isinstance(path, Svg): # has to come before the str check, as Svg subclasses str
            import tempfile; a = tempfile.NamedTemporaryFile()
            import cairosvg; cairosvg.svg2png(bytestring=path,write_to=a.name); im = a.name | toImg()
            return im
        if isinstance(path, str): # file path
            return self.PIL.Image.open(os.path.expanduser(path))
        if isinstance(path, bytes): # encoded image contents
            return self.PIL.Image.open(io.BytesIO(path))
        # detach + cpu so that tensors that require grad or live on an accelerator also work
        if hasTorch and isinstance(path, torch.Tensor): path = path.detach().cpu().numpy()
        if isinstance(path, np.ndarray):
            return self.PIL.Image.fromarray(path.astype("uint8"))
        if isinstance(path, mpl.figure.Figure):
            # NOTE(review): canvas.tostring_rgb() is deprecated in recent matplotlib releases -- confirm supported versions
            canvas = path.canvas; canvas.draw()
            img = self.PIL.Image.frombytes('RGB', canvas.get_width_height(), canvas.tostring_rgb())
            if self.closeFig: plt.close(path)
            return cropToContentPIL(img) if self.crop else img
        if hasGraphviz and isinstance(path, graphviz.Digraph):
            import tempfile; a = tempfile.NamedTemporaryFile()
            path.render(a.name, format="jpeg");
            fn = f"{a.name}.jpeg"; im = fn | toImg()
            try: os.remove(fn) # best-effort cleanup of the rendered file
            except: pass
            return im
        if hasRdkit and isinstance(path, rdkit.Chem.rdchem.Mol):
            sz = settings.chem.imgSize
            return self.__ror__(rdkit.Chem.Draw.MolsToGridImage([path], subImgSize=[sz, sz]).data) | (cli.aS(cropToContentPIL) if self.crop else cli.iden())
        path = path | cli.deref()
        if len(path) > 0 and isinstance(path[0], str): # list of lines of text
            from PIL import ImageDraw, ImageFont
            # canvas size is derived from the number of lines and the longest line
            h = path | cli.shape(0); w = path | cli.shape(0).all() | cli.aS(max)
            image = self.PIL.Image.new("L", ((w+1)*20, (h+1)*60), 255)
            font = ImageFont.truetype(settings.font, 18) if settings.font else None
            ImageDraw.Draw(image).text((20, 20), path | cli.join("\n"), 0, font=font)
            return np.array(image)/255 | (cli.aS(cropToContentNp) if self.crop else cli.iden()) | cli.op()*255 | toImg()
        return NotImplemented
toImg = toPIL # alias, both names are exported in __all__

In [14]:
# smoke test: collects one image per supported input type (tensor, matplotlib figure, text, graphviz graph)
res = []
torch.randn(100, 200) | toImg() | cli.aS(res.append)
x = np.linspace(0, 4); plt.plot(x, x**2); plt.gcf() | toPIL() | cli.aS(res.append)
["abc", "def"] | toImg() | cli.aS(res.append)
#"c1ccc(C)cc1" | cli.toMol() | toImg() | cli.aS(res.append) # disabled: once rdkit is installed, jpg handling stops working
g = k1lib.digraph(); g("A", "B"); g | toImg() | cli.aS(res.append); res

[<PIL.Image.Image image mode=L size=200x100>,
 <PIL.Image.Image image mode=RGB size=524x391>,
 <PIL.Image.Image image mode=L size=37x42>,
 <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=83x155>]

In [15]:
#export
class toRgb(BaseCli):
    def __init__(self):
        """Makes sure a PIL image has exactly the R, G, B channels.
Images that are already rgb are passed through untouched. Example::

    # reads image file and converts it to rgb
    "a.png" | toPIL() | toRgb()"""
        import PIL; self.PIL = PIL
    def _typehint(self, inp): return inp
    def __ror__(self, i):
        alreadyRgb = i.getbands() == ("R", "G", "B")
        if alreadyRgb: return i
        # paste the image onto a fresh rgb canvas of the same size
        canvas = self.PIL.Image.new("RGB", i.size)
        canvas.paste(i)
        return canvas
class toRgba(BaseCli):
    def __init__(self):
        """Makes sure a PIL image has exactly the R, G, B, A channels.
Images that are already rgba are passed through untouched. Example::

    # reads image file and converts it to rgba
    "a.png" | toPIL() | toRgba()"""
        import PIL; self.PIL = PIL
    def _typehint(self, inp): return inp
    def __ror__(self, i):
        alreadyRgba = i.getbands() == ("R", "G", "B", "A")
        if alreadyRgba: return i
        # paste the image onto a fresh rgba canvas of the same size
        canvas = self.PIL.Image.new("RGBA", i.size)
        canvas.paste(i)
        return canvas
class toGray(BaseCli):
    def __init__(self):
        """Converts random PIL image to a grayscale image. Images that
are already grayscale are passed through untouched. Example::

    # reads image file and converts it to grayscale
    "a.png" | toPIL() | toGray()"""
        # ImageOps is a submodule that a bare `import PIL` doesn't load
        import PIL.Image, PIL.ImageOps; self.PIL = PIL
    def _typehint(self, inp): return inp
    def __ror__(self, i):
        # getbands() returns a tuple, so compare against a 1-tuple, not the string "L"
        if i.getbands() == ("L",): return i
        return self.PIL.ImageOps.grayscale(i)

In [16]:
#export
class toDict(BaseCli):
    def __init__(self, rows=True, f=None):
        """Converts 2 Iterators, 1 key, 1 value into a dictionary.
Example::

    # returns {1: 3, 2: 4}
    [[1, 3], [2, 4]] | toDict()
    # returns {1: 3, 2: 4}
    [[1, 2], [3, 4]] | toDict(False)

If ``rows`` is a string, then it will build a dictionary from key-value
pairs delimited by this character. Each line is split at the first
occurrence of the delimiter only. For example::

    ['gene_id "ENSG00000290825.1"',
     'transcript_id "ENST00000456328.2"',
     'gene_type "lncRNA"',
     'gene_name "DDX11L2"',
     'transcript_type "lncRNA"',
     'transcript_name "DDX11L2-202"',
     'level 2',
     'transcript_support_level "1"',
     'tag "basic"',
     'tag "Ensembl_canonical"',
     'havana_transcript "OTTHUMT00000362751.1"'] | toDict(" ")

That returns::

    {'gene_id': '"ENSG00000290825.1"',
     'transcript_id': '"ENST00000456328.2"',
     'gene_type': '"lncRNA"',
     'gene_name': '"DDX11L2"',
     'transcript_type': '"lncRNA"',
     'transcript_name': '"DDX11L2-202"',
     'level': '2',
     'transcript_support_level': '"1"',
     'tag': '"Ensembl_canonical"',
     'havana_transcript': '"OTTHUMT00000362751.1"'}

Other delimiters work the same way::

    # returns {'a': '1', 'b': '2=3'}
    ["a=1", "b=2=3"] | toDict("=")

:param rows: if True, reads input in row by row, else reads
    in list of columns. If a non-empty string, treats input as lines of
    key-value pairs delimited by that string
:param f: if specified, return a defaultdict that uses this function as its generator"""
        self.rows = rows
        if f is not None: self.f = lambda d: defaultdict(f, d)
        else: self.f = lambda x: x
    def __ror__(self, it:Tuple[Iterator[T], Iterator[T]]) -> dict:
        r = self.rows; f = self.f
        if r:
            if isinstance(r, str):
                # str.partition() splits at the first delimiter only: key is everything
                # before it, value everything after ("" if the delimiter is missing)
                return f({_k:_v for _k, _, _v in (line.partition(r) for line in it)})
            return f({_k:_v for _k, _v in it})
        return f({_k:_v for _k, _v in zip(*it)})
    def _jsF(self, meta):
        fIdx = init._jsFAuto(); dataIdx = init._jsDAuto()
        if not self.rows: raise Exception("toDict._jsF() doesn't support .rows=False yet")
        return f"const {fIdx} = ({dataIdx}) => {dataIdx}.toDict()", fIdx

In [17]:
# column mode: first column holds the keys, second column the values
assert [[1, 2], [3, 4]] | toDict(False) == {1: 3, 2: 4}
# row mode (default): every row is a (key, value) pair
assert [[1, 3], [2, 4]] | toDict() == {1: 3, 2: 4}
# delimiter mode: for duplicated keys ("tag"), the last value wins
assert ['gene_id "ENSG00000290825.1"',
 'transcript_id "ENST00000456328.2"',
 'gene_type "lncRNA"',
 'gene_name "DDX11L2"',
 'transcript_type "lncRNA"',
 'transcript_name "DDX11L2-202"',
 'level 2',
 'transcript_support_level "1"',
 'tag "basic"',
 'tag "Ensembl_canonical"',
 'havana_transcript "OTTHUMT00000362751.1"'] | toDict(" ") == {'gene_id': '"ENSG00000290825.1"',
 'transcript_id': '"ENST00000456328.2"',
 'gene_type': '"lncRNA"',
 'gene_name': '"DDX11L2"',
 'transcript_type': '"lncRNA"',
 'transcript_name': '"DDX11L2-202"',
 'level': '2',
 'transcript_support_level': '"1"',
 'tag': '"Ensembl_canonical"',
 'havana_transcript': '"OTTHUMT00000362751.1"'}

In [18]:
#export
def _toop(toOp, c, force, defaultValue):
    """Builds the cli pipeline shared by :class:`toFloat` and :class:`toInt`.

    First applies ``toOp`` on column ``c``, then deals with the values that
    couldn't be converted: if ``force``, falsy results are replaced with
    ``defaultValue``, else the elements that came out as None are filtered out."""
    convert = cli.apply(toOp, c)
    if force: cleanup = cli.apply(lambda x: x or defaultValue, c)
    else: cleanup = cli.filt(cli.op() != None, c)
    return convert | cleanup
def _toFloat(e) -> Union[float, None]:
    try: return float(e)
    except: return None
class toFloat(BaseCli):
    def __init__(self, *columns, mode=2):
        """Converts every row into a float. Example::

    # returns [1, 3, -2.3]
    ["1", "3", "-2.3"] | toFloat() | deref()
    # returns [[1.0, 'a'], [2.3, 'b'], [8.0, 'c']]
    [["1", "a"], ["2.3", "b"], [8, "c"]] | toFloat(0) | deref()

With weird rows::

    # returns [[1.0, 'a'], [8.0, 'c']]
    [["1", "a"], ["c", "b"], [8, "c"]] | toFloat(0) | deref()
    # returns [[1.0, 'a'], [0.0, 'b'], [8.0, 'c']]
    [["1", "a"], ["c", "b"], [8, "c"]] | toFloat(0, mode=1) | deref()

This also works well with :class:`torch.Tensor` and :class:`numpy.ndarray`,
as they will not be broken up into an iterator::

    # returns a numpy array, instead of an iterator
    np.array(range(10)) | toFloat()

:param columns: if nothing, then will convert each row. If available, then
    convert all the specified columns
:param mode: different conversion styles
    - 0: simple ``float()`` function, fastest, but will throw errors if it can't be parsed
    - 1: if there are errors, then replace it with zero
    - 2: if there are errors, then eliminate the row"""
        self.columns = columns; self.mode = mode
    def __ror__(self, it):
        columns = self.columns; mode = self.mode
        if len(columns) == 0:
            if isinstance(it, np.ndarray): return it.astype(float)
            if hasTorch and isinstance(it, torch.Tensor): return it.float()
            if mode == 0: return (float(e) for e in it)
            return it | _toop(_toFloat, None, mode == 1, 0.0)
        else: return it | cli.init.serial(*(_toop(_toFloat, c, mode == 1, 0.0) for c in columns))
    def _jsF(self, meta):
        fIdx = init._jsFAuto(); dataIdx = init._jsDAuto(); cols = self.columns; mode = self.mode
        if len(cols) == 0:
            # `a === a` is false for NaN only, which is what parseFloat returns on failure
            if mode == 0: return f"const {fIdx} = ({dataIdx}) => {dataIdx}.map((v) => parseFloat(v))", fIdx
            if mode == 1: return f"const {fIdx} = ({dataIdx}) => {dataIdx}.map((v) => {{ const a = parseFloat(v); return a === a ? a : 0 }})", fIdx
            if mode == 2: return f"const {fIdx} = ({dataIdx}) => {{ const ans = []; for (const v of {dataIdx}) {{ const a = parseFloat(v); if (a === a) ans.push(a); }}; return ans; }}", fIdx
            raise Exception(f"toFloat._jsF() doesn't support mode {mode}")
        # NOTE(review): the js below converts every cell of each row, not just the columns in self.columns -- confirm intent
        else: return f"""\
const {fIdx} = ({dataIdx}) => {{
    const ans = [];
    for (const row of {dataIdx}) {{
        {'ans.push(row.map(parseFloat));' if mode == 0 else ''}
        {'ans.push(row.map(parseFloat).map((v) => (v === v ? v : 0)));' if mode == 1 else ''}
        {'const rowp = row.map(parseFloat);if (rowp.map((v) => v === v).every((v) => v)) ans.push(rowp);' if mode == 2 else ''}
    }}
    return ans;
}}""", fIdx

In [19]:
#export
def _toInt(e) -> Union[int, None]:
    try: return int(float(e))
    except: return None
class toInt(BaseCli):
    def __init__(self, *columns, mode=2):
        """Converts every row into an integer. Example::

    # returns [1, 3, -2]
    ["1", "3", "-2.3"] | toInt() | deref()

:param columns: if nothing, then will convert each row. If available, then
    convert all the specified columns
:param mode: different conversion styles
    - 0: simple ``int()`` function, fastest, but will throw errors if it can't be parsed
    - 1: if there are errors, then replace it with zero
    - 2: if there are errors, then eliminate the row

See also: :meth:`toFloat`"""
        self.columns = columns; self.mode = mode;
    def __ror__(self, it):
        columns = self.columns; mode = self.mode
        if len(columns) == 0:
            if isinstance(it, np.ndarray): return it.astype(int)
            if hasTorch and isinstance(it, torch.Tensor): return it.int()
            if mode == 0: return (int(e) for e in it)
            return it | _toop(_toInt, None, mode == 1, 0)
        # integer zero as the replacement value, so that mode=1 doesn't sneak floats into int columns
        else: return it | cli.init.serial(*(_toop(_toInt, c, mode == 1, 0) for c in columns))
    def _jsF(self, meta):
        fIdx = init._jsFAuto(); dataIdx = init._jsDAuto(); cols = self.columns; mode = self.mode
        if len(cols) == 0:
            # `a === a` is false for NaN only, which is what parseInt returns on failure
            if mode == 0: return f"const {fIdx} = ({dataIdx}) => {dataIdx}.map((v) => parseInt(v))", fIdx
            if mode == 1: return f"const {fIdx} = ({dataIdx}) => {dataIdx}.map((v) => {{ const a = parseInt(v); return a === a ? a : 0 }})", fIdx
            if mode == 2: return f"const {fIdx} = ({dataIdx}) => {{ const ans = []; for (const v of {dataIdx}) {{ const a = parseInt(v); if (a === a) ans.push(a); }}; return ans; }}", fIdx
            raise Exception(f"toInt._jsF() doesn't support mode {mode}")
        # parseInt is wrapped in an arrow function everywhere below: Array.map() passes the
        # element index as 2nd argument, which a bare parseInt would interpret as the radix
        # NOTE(review): the js below converts every cell of each row, not just the columns in self.columns -- confirm intent
        else: return f"""\
const {fIdx} = ({dataIdx}) => {{
    const ans = [];
    for (const row of {dataIdx}) {{
        {'ans.push(row.map((v) => parseInt(v)));' if mode == 0 else ''}
        {'ans.push(row.map((v) => parseInt(v)).map((v) => (v === v ? v : 0)));' if mode == 1 else ''}
        {'const rowp = row.map((v) => parseInt(v));if (rowp.map((v) => v === v).every((v) => v)) ans.push(rowp);' if mode == 2 else ''}
    }}
    return ans;
}}""", fIdx

In [20]:
assert ["1", "3", "-2.3"] | toFloat() | cli.deref() == [1, 3, -2.3]
assert [["1", "a"], ["2.3", "b"], [8, "c"]] | toFloat(0) | cli.deref() == [[1.0, 'a'], [2.3, 'b'], [8.0, 'c']]
assert ["1", "3", "-2.3"] | toInt() | cli.deref() == [1, 3, -2]
# mode=1 replaces unparsable values with zero, while the default mode=2 drops the row
assert [["1", "a"], ["c", "b"], [8, "c"]] | toFloat(0, mode=1) | cli.deref() == [[1.0, 'a'], [0.0, 'b'], [8.0, 'c']]
assert [["1", "a"], ["c", "b"], [8, "c"]] | toFloat(0) | cli.deref() == [[1.0, 'a'], [8.0, 'c']]
# arrays stay arrays instead of being broken up into an iterator
assert isinstance(np.array(range(10)) | toFloat(), np.ndarray)

In [21]:
#export
class toBytes(BaseCli):
    def __init__(self, dataType=None):
        """Serializes several kinds of objects into bytes.
Example::

    # converts string to bytes
    "abc" | toBytes()
    # converts image to bytes in jpg format
    torch.randn(200, 100) | toImg() | toBytes()
    # converts image to bytes in png format
    torch.randn(200, 100) | toImg() | toBytes("PNG")

    "some_file.mp3" | toAudio() | toBytes("mp3")

Objects that are none of the above get pickled using ``dill``.

.. admonition:: Custom datatype

    Your own classes can plug into this cli by implementing
    a ``_toBytes`` method, taking either no arguments, or the
    requested data type as its only argument::

        class custom1:
            def __init__(self, config=None): ...
            def _toBytes(self): return b"abc"
        class custom2:
            def __init__(self, config=None): ...
            def _toBytes(self, dataType):
                if dataType == "png": return b"123"
                else: return b"456"

        custom1() | toBytes()      # returns b"abc"
        custom2() | toBytes()      # returns b"456"
        custom2() | toBytes("png") # returns b"123"

    Whenever the input has a ``_toBytes`` method, :class:`toBytes` hands the
    work over to it. Nothing forces that method to actually return bytes,
    but to keep things intuitive, it should return bytes or an iterator of bytes

:param dataType: depending on input. If it's an image then this can be png, jpg. If
    it's a sound then this can be mp3, wav or things like that"""
        self.dataType = dataType
    def __ror__(self, it):
        if isinstance(it, str): return it.encode()
        if hasPIL and isinstance(it, PIL.Image.Image):
            # images are converted to rgb first, then encoded in memory (jpeg by default)
            rgbImg = it | toRgb()
            buf = io.BytesIO()
            rgbImg.save(buf, format=(self.dataType or "JPEG"))
            return buf.getvalue()
        if hasattr(it, "_toBytes"):
            # number of arguments of the custom method, not counting self
            nArgs = len(inspect.getfullargspec(it._toBytes).args[1:])
            if nArgs > 1: raise Exception(f"{it.__class__.__name__} have 2 or more arguments, which is unsupported")
            return it._toBytes() if nArgs == 0 else it._toBytes(self.dataType)
        import dill
        return dill.dumps(it)

In [22]:
# smoke test: prints the size in bytes of a random image encoded with toBytes()'s default format (JPEG)
torch.randn(200, 100) | toImg() | toBytes() | cli.aS(len) | cli.aS(print)

13274


In [23]:
#export
# optional dependency, resolved through k1lib.dep() like matplotlib at the top of the file
# NOTE(review): k1lib.dep() presumably defers the import error until first use -- confirm
mpld3 = k1lib.dep("mpld3")
class Svg(str):
    """String subclass that tags its content as svg markup, so that clis
    like :class:`toPIL` can tell it apart from ordinary strings."""
class DataUri:
    """Thin wrapper around a data uri string, with the mime type parsed out.

    Images and html render as rich output via ``_repr_html_``."""
    def __init__(self, uri:str):
        header = uri.split(";")[0]              # "data:image/png"
        self.uri = uri                          # "data:image/png;base64, ..."
        self.mime = header.split(":")[-1]       # "image/png"
        self.mimeBase = self.mime.split("/")[0] # "image"
    def _repr_html_(self):
        if self.mimeBase == "image":
            return f"<img src=\"{self.uri}\"/>"
        if self.mime == "text/html":
            # html payloads get decoded, so that the html itself is displayed
            payload = self.uri.split("base64,")[-1]
            return base64.b64decode(payload).decode()
        # other mime types have no html representation (implicitly returns None)
    def __repr__(self):
        uri = self.uri
        # long uris are cut off after 75 characters
        shown = uri if len(uri) <= 75 else uri[:75] + '...'
        return f"<DataUri mime='{self.mime}', self.uri='{shown}'>"
def _dataUriHtml(it):
    """Wraps the html string ``it`` inside a base64-encoded "text/html" :class:`DataUri`."""
    payload = base64.b64encode(it.encode()).decode()
    return DataUri(f"data:text/html;base64, {payload}")
class toDataUri(BaseCli):
    def __init__(self):
        """Converts the incoming object into a data uri.
Data uris are strings that look like "data:image/png;base64, ..." or
"data:text/html;base64, ...". This is mostly a building block for
other tools, and is not very useful on its own. Example::

    randomImg = cat("https://mlexps.com/ergun.png", False) | toImg() # returns PIL image
    randomImg | toDataUri()              # returns k1lib.cli.conv.DataUri object with .mime field "image/png" and .uri field "data:image/png;base64, ..."
    randomImg | toDataUri() | toHtml()   # returns html string `<img src="data:image/png;base64, ..."/>`
    randomImg | toHtml()                 # same like above. toHtml() actually calls toDataUri() behind the scenes
    randomImg | toDataUri() | toAnchor() # creates anchor tag (aka link elements "<a></a>") that, when clicked, displays the image in a new tab
    randomImg | toAnchor()               # same as above. toAnchor() actually calls toDataUri() behind the scenes

Strings are treated as html and wrapped in a "text/html" data uri, existing
:class:`DataUri` objects pass through untouched, PIL images are encoded as
png, and any other object is asked for its ``_toDataUri()`` result. If that
fails, the object is converted with :class:`toHtml` and the resulting html
is wrapped instead, unless the ``throw`` attribute is set, in which case an
exception is raised.
"""
        self.throw = False # can be configured by outside clis, like toHtml()
    def __ror__(self, it):
        if isinstance(it, DataUri): return it
        if isinstance(it, str): return _dataUriHtml(it)
        if hasPIL and isinstance(it, PIL.Image.Image):
            png = it | toBytes(dataType="PNG")
            return DataUri(f"data:image/png;base64, {base64.b64encode(png).decode()}")
        try: return DataUri(it._toDataUri())
        except Exception as e:
            # best effort: fall back to the object's html unless the caller asked for failures
            if not self.throw: return _dataUriHtml(it | toHtml())
            raise Exception(f"toDataUri() called on an unfamiliar object, and the object doesn't implement _toDataUri(). Error: {e}")
class toAnchor(BaseCli):
    def __init__(self, text:str="click here"):
        """Turns the incoming object into a html anchor tag. Clicking the link
shows the object's data uri in a new tab. Example::

    randomImg = cat("https://mlexps.com/ergun.png", False) | toImg() # returns PIL image
    randomImg | toAnchor() # returns html string `<a href="data:image/png;base64, ..." target="_blank">click here</a>`

On some browsers, there's sort of a weird bug where a new tab would open, but
there's nothing displayed on that tab. If you see this is happening, just press
F5 or Ctrl+R to refresh the page and it should display everything nicely

:param text: text to display inside of the anchor"""
        self.text = text
    def __ror__(self, it:str):
        # the incoming object goes through toDataUri() first; its uri becomes the link target
        href = (it | toDataUri()).uri
        return f"<a href=\"{href}\" target=\"_blank\">{self.text}</a>"
class toHtml(BaseCli):
    def __init__(self):
        """Converts several object types to html.
Example::

    # converts PIL image to html <img> tag
    torch.randn(200, 100) | toImg() | toHtml()
    # converts graphviz graph to svg text (which is essentially html)
    g = k1.digraph(); g(*"abc"); g(*"bcd"); g | toHtml()

    # converts plotly graphs to html
    import plotly.express as px; import pandas as pd
    df = pd.DataFrame({'x': [1, 2, 3, 4, 5], 'y': [10, 11, 12, 14, 15]})
    fig = px.line(df, x='x', y='y', title='Simple Line Chart')
    fig | toHtml()
    
    # converts matplotlib plot to image, and then to html. Do this if you want a static plot
    x = np.linspace(-2, 2); y = x**2
    plt.plot(x, x**2); plt.gcf() | toImg() | toHtml()
    # converts matplotlib plot to D3.js html sketch
    plt.plot(x, x**2); plt.gcf() | toHtml()

Strings are returned as-is. For objects that are not recognized, this tries
the object's ``_repr_html_()``, then ``_toHtml()``, then :class:`toDataUri`,
and finally falls back to the object's ``__repr__()``.
"""
        pass
    def __ror__(self, it):
        if isinstance(it, str): return it
        if hasPlotly and isinstance(it, plotly.graph_objs._figure.Figure):
            out = io.StringIO(); it.write_html(out); out.seek(0); return out.read()
        # the figure is closed after conversion
        if isinstance(it, mpl.figure.Figure): res = mpld3.fig_to_html(it); plt.close(it); return res
        if hasGraphviz and isinstance(it, graphviz.Digraph):
            import tempfile
            # render inside a throwaway folder, so that both the graph source and the
            # svg are cleaned up deterministically, even if rendering or reading fails
            with tempfile.TemporaryDirectory() as folder:
                base = os.path.join(folder, "graph")
                it.render(base, format="svg")
                return Svg(cli.cat(f"{base}.svg") | cli.join(""))
        # the fallbacks below are deliberately best-effort: any ordinary failure
        # moves on to the next strategy, but KeyboardInterrupt/SystemExit propagate
        try:
            res = it._repr_html_()
            if res: return res
        except Exception: pass
        try:
            res = it._toHtml()
            if res: return res
        except Exception: pass
        try:
            f = toDataUri(); f.throw = True # throw, so toDataUri() doesn't call back into toHtml()
            res = (it | f)._repr_html_()
            if res: return res
        except Exception: pass
        return it.__repr__()

In [24]:
# a plain object() has no html/data uri hooks, so the result should be its repr text
assert object() | toDataUri() | toHtml() | cli.op().startswith("<object object at 0x")
# random tensor -> image -> <img> html, displayed in the notebook
import IPython; torch.randn(200, 100) | toImg() | toHtml() | cli.aS(IPython.display.HTML)

In [25]:
# downloads a sample image (needs network access) and shows it as a clickable anchor
randomImg = cli.cat("https://mlexps.com/ergun.png", False) | toImg()
randomImg | toAnchor() | cli.aS(IPython.display.HTML)

In [26]:
# static matplotlib plot -> image -> html. `x` is defined here so that this cell
# does not depend on the cell that follows it
x = np.linspace(-2, 2)
plt.plot(x, x**2); plt.gcf() | toImg() | toHtml() | cli.aS(IPython.display.HTML)

In [27]:
# matplotlib figure piped directly into toHtml(), which converts it using mpld3
x = np.linspace(-2, 2); y = x**2
plt.plot(x, x**2); plt.gcf() | toHtml() | cli.aS(IPython.display.HTML)

In [28]:
# graph built with k1lib.digraph() -> svg text via toHtml(), displayed in the notebook
g = k1lib.digraph(); g(*"abc"); g(*"bcd")
g | toHtml() | cli.aS(IPython.display.HTML)

In [29]:
#export
# Optional rdkit support: everything below is only defined (and exported through
# __all__) if all of the rdkit imports succeed. Any failure leaves the module as it was
try:
    from rdkit import Chem
    from rdkit.Chem import Draw
    from rdkit.Chem import AllChem
    from rdkit.Chem.Draw import IPythonConsole
    IPythonConsole.drawOptions.addAtomIndices = True # module-level side effect on rdkit's notebook drawing options
    __all__ = [*__all__, "toMol", "toSmiles"]
    def toMol():
        """Smiles to molecule. Returns a cli that applies ``Chem.MolFromSmiles``.
Example::

    "c1ccc(C)cc1" | toMol()"""
        return cli.aS(Chem.MolFromSmiles)
    def toSmiles():
        """Molecule to smiles. Returns a cli that applies ``Chem.MolToSmiles``.
Example::

    "c1ccc(C)cc1" | toMol() | toSmiles()"""
        return cli.aS(Chem.MolToSmiles)
except: pass # best effort: rdkit is optional

In [30]:
# rdkit smoke test, skipped when rdkit is not installed
try:
    from rdkit import Chem
    "c1ccc(C)cc1" | toMol() | toImg()
    # prints whatever EmbedMolecule returns for the parsed molecule
    "c1ccc(C)cc1" | cli.aS(Chem.MolFromSmiles) | cli.aS(Chem.AllChem.EmbedMolecule) | cli.aS(print)
except ModuleNotFoundError: pass

In [31]:
#export
import unicodedata, hashlib
def toAscii():
    """Strips complex unicode text down to its base ascii characters.
Example::

    "hà nội" | toAscii() # returns b"ha noi"

The text is NFKD-normalized and then encoded to ascii, dropping every character
that can't be encoded, so the result is a ``bytes`` object.

Taken from https://stackoverflow.com/questions/2365411/convert-unicode-to-ascii-without-errors-in-python"""
    def strip(word): return unicodedata.normalize('NFKD', word).encode('ascii', 'ignore')
    return cli.aS(strip)
def toHash() -> str:
    """Hashes the incoming object's string form into a stable hash string.
Example::

    "abc" | toHash() # returns 'gASVJAAAAAAAAABDILp4Fr+PAc/qQUFA3l2uIiOwA2Gjlhd6nLQQ/2HyABWtlC4='

Why not just use the builtin function ``hash("abc")``? Because it generates different
hashes for different interpreter sessions, and that breaks many of my applications that
need the hash value to stay constant forever.

Internally, this takes the sha256 digest of the formatted input, and passes it
through ``k1lib.encode``."""
    def hashF(msg:str) -> str:
        digest = hashlib.sha256(f"{msg}".encode()).digest()
        return k1lib.encode(digest)
    return cli.aS(hashF)

In [32]:
# the hash must be identical across interpreter sessions, so it's pinned to a known value
assert "abc" | toHash() == 'gASVJAAAAAAAAABDILp4Fr+PAc/qQUFA3l2uIiOwA2Gjlhd6nLQQ/2HyABWtlC4='

In [33]:
#export
import csv; pd = k1lib.dep("pandas")
class toCsv(BaseCli):
    def __init__(self, allSheets=False):
        """Converts a csv (or Excel) file name into a table.
Example::

    "abc.csv"  | toCsv()     # returns table of values
    "def.xlsx" | toCsv()     # returns table of values in the first sheet
    "def.xlsx" | toCsv(True) # returns List[Sheet name (str), table of values]

.. warning::

    Note that this is pretty slow compared to just splitting by commas. If your
    dataset doesn't have anything complicated like commas in quotes, then just
    do ``op().split(",").all()``
    
    If your dataset does have complicated quotes, then I'd suggest reading the csv
    using this cli, then convert it to a tsv file (tab-separated value). Then you can
    always just split the string using tab characters

For csv files, rows are read lazily: the file is only opened once the returned
iterator is consumed.

:param allSheets: if input is an Excel sheet, whether to read in all sheets or
    just the first sheet. No effect if input is a normal csv file"""
        self.allSheets = allSheets
    def __ror__(self, fn:str):
        fn = os.path.expanduser(fn)
        if fn.endswith(".xls") or fn.endswith(".xlsx"):
            if self.allSheets: return [[k, v.values] for k,v in pd.read_excel(fn, sheet_name=None).items()]
            else: return pd.read_excel(fn).values
        def gen():
            # newline="" is what the csv module requires, so that newlines embedded
            # inside quoted fields are interpreted correctly
            with open(fn, newline="") as f: yield from csv.reader(f)
        return gen()

In [34]:
# relies on the fixture file test/runinfo.csv, relative to the notebook's working directory
assert "test/runinfo.csv" | toCsv() | cli.shape(0) > 0

In [35]:
#export
import validators, shutil, html, io, os; pydub = k1lib.dep("pydub")
class Audio:
    """Wrapper around a pydub audio segment, kept in ``.raw``. Instances are
    created by :class:`toAudio`, and expose a ``_toBytes`` hook so that they can be
    piped into :class:`toBytes`."""
    def __init__(self, raw:"pydub.audio_segment.AudioSegment"): self.raw = raw
    def resample(self, rate) -> "self":
        """Resamples the audio in place and returns self.

        If ``rate`` is falsy, nothing happens. Note that the ``.data`` and ``.rate``
        attributes are only set by this method when a rate is given."""
        if rate:
            self.raw = self.raw.set_frame_rate(rate)
            # NOTE(review): 2.15e9 is presumably ~2**31, to scale 32-bit samples to [-1, 1] - confirm
            self.data = np.array(self.raw.get_array_of_samples())/2.15e9
            self.rate = self.raw.frame_rate
        return self
    # exports to an in-memory buffer, "wav" if no data type is given
    # NOTE(review): f.read() relies on export() leaving the buffer positioned at its start - confirm against pydub
    def _toBytes(self, dataType) -> bytes: f = io.BytesIO(); self.raw.export(f, format=(dataType or "wav")); return f.read()
    def __repr__(self): return f"<Audio duration={k1lib.fmt.time(self.raw.duration_seconds)} rate={self.raw.frame_rate}>"
    def __len__(self): return int(self.raw.frame_count()) # length is measured in frames
    def __getitem__(self, slice_):
        # only slices are supported, any other index gives None
        if not isinstance(slice_, slice): return None
        # groups the samples by channel count before slicing - presumably so that the slice indexes frames, TODO confirm
        data = np.array(self.raw.get_array_of_samples()) | cli.batched(self.raw.channels) | cli.op()[slice_]
        return Audio(pydub.AudioSegment(data.tobytes(), frame_rate=self.raw.frame_rate, sample_width=self.raw.sample_width, channels=self.raw.channels))
    def _repr_html_(self): # plays a short sample, first 10s or sth like that
        return f"{html.escape(self.__repr__())}<br>{self.raw[:10000]._repr_html_()}"
class toAudio(BaseCli):
    def __init__(self, rate=None):
        """Reads audio from either a file or a URL or from bytes directly.
Example::

    au = "some_file.wav" | toAudio() # can display in a notebook, which will preview the first 10 seconds
    au | toBytes()      # exports audio as .wav file
    au | toBytes("mp3") # exports audio as .mp3 file
    au.resample(16000)  # resamples audio to new rate
    au | head(0.1)      # returns new Audio that has the first 10% of the audio only
    au | splitW(8, 2)   # splits Audio into 2 Audios, first one covering 80% and second one covering 20% of the track
    au.raw              # internal pydub.AudioSegment object. If displayed in a notebook, will play the whole thing

You can also use this on any Youtube video or random mp3 links online and on raw bytes::

    "https://www.youtube.com/watch?v=FtutLA63Cp8" | toAudio() # grab Bad Apple song from internet
    cat("some_file.wav", False) | toAudio() # grab from raw bytes of mp3 or wav, etc.

Links are downloaded using the ``yt-dlp`` binary, which has to be installed.

:param rate: if specified, resamples the audio to this rate right after loading
"""
        self.rate = rate
    def __ror__(self, it:"str|bytes") -> Audio:
        import shlex
        if isinstance(it, str):
            if os.path.exists(os.path.expanduser(it)): fn = os.path.expanduser(it); tmp = False
            elif validators.url(it):
                if not shutil.which("yt-dlp"): raise Exception(f"'{it}' looks like a link, but the required 'yt-dlp' binary is not found. Please install it by doing `pip install yt-dlp`")
                # the url is quoted, so that characters like "&" or ";" inside it are
                # not interpreted as part of the command line
                fn = None | cli.cmd(f"yt-dlp -o - -x {shlex.quote(it)}", mode=0, text=False) | cli.item() | cli.file(); tmp = True
            else: raise Exception(f"The file '{it}' does not exist, and it doesn't look like a URL")
        elif isinstance(it, bytes): fn = it | cli.file(); tmp = True
        else: raise Exception(f"Unknown {type(it)} audio type")
        try: return Audio(pydub.AudioSegment.from_file(fn)).resample(self.rate)
        finally:
            # temporary files are removed even when decoding fails
            if tmp: os.remove(fn)

In [36]:
#export
dateutil = k1lib.dep("dateutil")
class toUnix(BaseCli):
    def __init__(self, tz:"str | dateutil.tz.tz.tzfile"=None):
        """Tries to convert anything piped in into a unix timestamp. If it
can't be converted then returns None.

Local time zone independent::

    "2023" | toUnix()                      # returns 2023, or 2023 seconds after unix epoch. Might be undesirable, but has to support raw ints/floats
    "2023-11-01T00Z" | toUnix()            # midnight Nov 1st 2023 GMT
    "2023-11-01T00:00:00-04:00" | toUnix() # midnight Nov 1st 2023 EST
    "2023-11-01" | toUnix("US/Pacific")    # midnight Nov 1st 2023 PST
    "2023-11-01" | toUnix("UTC")           # midnight Nov 1st 2023 UTC

Local time zone dependent (assumes EST)::

    "2023-11" | toUnix() # if today's Nov 2nd EST, then this would be 1698897600, or midnight Nov 2nd 2023 EST
    "2023-11-04" | toUnix() # midnight Nov 4th 2023 EST

Feel free to experiment more, but in general, this is pretty versatile in what it can
convert. With more effort, I'd probably make this so that every example given will not
depend on local time, but since I just use this to calculate time differences, I don't
really care.

Anything that ``float()`` accepts is returned as a float right away. Everything
else goes through ``dateutil.parser.parse``. Note that if ``tz`` is specified, it
replaces whatever time zone was parsed from the input, including explicit offsets.

:param tz: Timezone, like "US/Eastern", "US/Pacific". If not specified, then assumes local timezone"""
        if tz: self.tz = tz if isinstance(tz, dateutil.tz.tz.tzfile) else dateutil.tz.gettz(tz)
        else: self.tz = None
    def __ror__(self, t):
        # numbers and numeric strings are taken as timestamps already
        try: return float(t)
        except Exception:
            try:
                a = dateutil.parser.parse(t)
                if self.tz: a = a.replace(tzinfo=self.tz)
                return a.timestamp()
            # best effort by design: unparseable input gives None instead of raising,
            # but KeyboardInterrupt/SystemExit are no longer swallowed
            except Exception: return None

In [37]:
#export
from datetime import datetime as dt
class toIso(BaseCli):
    def __init__(self):
        """Turns a unix timestamp into an ISO 8601 formatted string.
Example::

    1701382420 | toIso()            # returns '2023-11-30T17:13:40', which is correct in EST time
    1701382420 | toIso() | toUnix() # returns 1701382420, the input timestamp, showing it's correct
    1701382420.123456789 | toIso()  # returns '2023-11-30T17:13:40.123457'

The conversion uses the local time of the host computer, so the result depends
on the host's timezone
"""
    def __ror__(self, it):
        moment = dt.fromtimestamp(it) # naive datetime, in local time
        return moment.isoformat()

In [38]:
# the expected strings match the docstring examples, which are in EST, so these
# checks depend on the host's timezone
assert 1701382420 | toIso() == '2023-11-30T17:13:40'
assert 1701382420.123456789 | toIso() == '2023-11-30T17:13:40.123457'
# round trip: timestamp -> iso string -> timestamp
t = 1463288494
assert t | toIso() | toUnix() == t

In [107]:
#export
class toYMD(BaseCli):
    def __init__(self, idx=None, mode=int):
        """Breaks a unix timestamp down into the list [year, month, day, hour, minute, second],
using the host computer's local time. Example::

    1701382420 | toYMD()  # returns [2023, 11, 30, 17, 13, 40] in EST timezone
    1701382420 | toYMD(0) # returns 2023
    1701382420 | toYMD(1) # returns 11
    
    1701382395 | toYMD(mode=str) # returns ['2023', '11', '30', '17', '13', '15']

:param idx: if specified, take the desired element only. If 0, then take year, 1, then month, etc.
:param mode: either int or str. If str, then returns strings, with everything but the year zero-padded to 2 characters"""
        self.idx = idx; self.mode = mode
    def __ror__(self, it):
        d = dt.fromtimestamp(it)
        fields = [d.year, d.month, d.day, d.hour, d.minute, d.second]
        if self.mode != int: # string mode: the year is kept as is, the rest are left-padded with zeros
            fields = [f"{fields[0]}", *(f"{v}".rjust(2,"0") for v in fields[1:])]
        if self.idx is None: return fields
        return fields[self.idx]

In [108]:
# the expected values match the docstring examples, which are in EST, so these checks depend on the host's timezone
assert 1701382420 | toYMD() == [2023, 11, 30, 17, 13, 40]; assert 1701382420 | toYMD(1) == 11; assert 1701382420 | toYMD(2) == 30
assert 1701382395 | toYMD(mode=str) == ['2023', '11', '30', '17', '13', '15']

In [41]:
#export
# Registers the `toLinks` settings group on the cli settings object. Both values
# are read by toLinks.__init__, so changing them only affects toLinks objects created afterwards
settings.add("toLinks", k1lib.Settings()\
    .add("splitChars", ["<br>", "<div ", *"\n\t<> ,;"], "characters/strings to split the lines by, so that each link has the opportunity to be on a separate line, so that the first instance in a line don't overshadow everything after it")\
    .add("protocols", ["http", "https", "ftp"], "list of recognized protocols to search for links, like 'http' and so on"), "conv.toLinks() settings");
class toLinks(BaseCli):
    def __init__(self, f=None):
        """Extracts links and urls from a paragraph.
Example::

    paragraph = [
        "http://a.c",
        "http://a2.c some other text in between <a href='http://b.d'>some link</a> fdvb"
    ]
    # returns {'http://a.c', 'http://a2.c', 'http://b.d'}
    paragraph | toLinks() | deref()

If the input is a string instead of an iterator of strings, then
it will :meth:`~k1lib.cli.inp.cat` it first, then look for links
inside the result. For example::

    "https://en.wikipedia.org/wiki/Cheese" | toLinks()

At the time of writing, that returns a lot of links::

    {'/wiki/Rind-washed_cheese',
     '#cite_ref-online_5-7',
     'https://web.archive.org/web/20160609031000/http://www.theguardian.com/lifeandstyle/wordofmouth/2012/jun/27/how-eat-cheese-and-biscuits',
     'https://is.wikipedia.org/wiki/Ostur',
     '/wiki/Meat_and_milk',
     '/wiki/Wayback_Machine',
     '/wiki/File:WikiCheese_-_Saint-Julien_aux_noix_01.jpg',
     'https://gv.wikipedia.org/wiki/Caashey',
     '/wiki/Eyes_(cheese)',
     '/wiki/Template_talk:Condiments',
     '#Pasteurization',
     '/wiki/Tuscan_dialect',
     '#cite_note-23',
     '#cite_note-aha2017-48',

So, keep in mind that lots of different things can be considered a
link. That includes absolute links ('https://gv.wikipedia.org/wiki/Caashey'),
relative links within that particular site ('/wiki/Tuscan_dialect'), and
relative links within the page ('#Pasteurization').

How it works underneath is that it's looking for a string like "https://..."
and a string like "href='...'", which usually have a link inside. For the
first detection style, you can specify extra protocols that you want to
search for using ``settings.cli.toLinks.protocols = [...]``.

Also, this will detect links nested within each other multiple times.
For example, the link 'https://web.archive.org/web/20160609031000/http://www.theguardian.com/lifeandstyle/wordofmouth/2012/jun/27/how-eat-cheese-and-biscuits'
will appear twice in the result, once as itself, but also 'https://www.theguardian.com/lifeandstyle/wordofmouth/2012/jun/27/how-eat-cheese-and-biscuits'

Note that if you really try, you will be able to find an example where this won't
work, so don't expect 100% reliability. But for most use cases, this should perform
splendidly.

If the incoming object has a ``_toLinks`` method, then that method is called
instead, and its result is returned directly.

:param f: optional cli that post-processes the set of links that were found"""
        self.f = f or cli.iden()
        # splits lines by every configured separator, so that the first link in a line doesn't overshadow the ones after
        self.preprocess = cli.serial(*[(cli.op().split(ch).all() | cli.joinSt()) for ch in settings.toLinks.splitChars])
        protocols = "|".join([f"({p})" for p in settings.toLinks.protocols])
        # raw string, so that the regex escapes are not treated as (invalid) string escapes
        self.g = cli.grep(f"(?P<g>({protocols})" + r"""://[^\(\)\[\]\<\>\{\}'" ]*)""", extract="g")
        self.href = cli.grep('href="(?P<g>.+)"', extract="g") & cli.grep("href='(?P<g>.+)'", extract="g") | cli.joinSt()
        self.post = cli.joinSt() | cli.aS(set)
    def __ror__(self, it):
        # custom objects can do their own extraction. The filter is passed in only if the hook takes it
        if hasattr(it, "_toLinks"): return it._toLinks(self.f) if len(inspect.getfullargspec(it._toLinks).args) == 2 else it._toLinks()
        if isinstance(it, str): it = cli.cat(it) # reads the website first
        it = it | self.preprocess | cli.aS(list)
        return it | self.href & self.g | self.post | self.f | cli.aS(set)

In [42]:
"https://en.wikipedia.org/wiki/Cheese" | toLinks() | cli.grep("guardian") | cli.deref()

['#cite_ref-guardian_43-2',
 'https://web.archive.org/web/20160609031000/http://www.theguardian.com/lifeandstyle/wordofmouth/2012/jun/27/how-eat-cheese-and-biscuits',
 '#cite_ref-guardian_43-1',
 'https://web.archive.org/web/20220410045214/https://www.theguardian.com/lifeandstyle/shortcuts/2012/jan/10/cheese-most-shoplifted-food-item',
 'https://www.theguardian.com/lifeandstyle/wordofmouth/2012/jun/27/how-eat-cheese-and-biscuits',
 '#cite_ref-guardian_43-0',
 'https://www.theguardian.com/lifeandstyle/shortcuts/2012/jan/10/cheese-most-shoplifted-food-item',
 '#cite_note-guardian-43']

In [43]:
["http://a.c", "http://a2.c some other text in between <a href='http://b.d'>some link</a> fdvb"] | toLinks() | cli.deref()

{'http://a.c', 'http://a2.c', 'http://b.d'}

In [80]:
#export
class toMovingAvg(BaseCli):
    def __init__(self, col:int=None, alpha=0.9, debias=True, v:float=0, dt:float=1):
        """Smoothes out sequential data using momentum.
Example::

    # returns [4.8, 4.62, 4.458]. 4.8 because 0.9*5 + 0.1*3 = 4.8, and so on
    [3, 3, 3] | toMovingAvg(v=5, debias=False) | deref()

Sometimes you want to ignore the initial value, then you can turn on debias mode::

    x = np.linspace(0, 10, 100); y = np.cos(x)
    plt.plot(x, y)
    plt.plot(x, y | toMovingAvg(debias=False)             | deref())
    plt.plot(x, y | toMovingAvg(debias=False, alpha=0.95) | deref())
    plt.plot(x, y | toMovingAvg(debias=True)              | deref())
    plt.plot(x, y | toMovingAvg(debias=True,  alpha=0.95) | deref())
    plt.legend(["Signal", "Normal - 0.9 alpha", "Normal - 0.95 alpha", "Debiased - 0.9 alpha", "Debiased - 0.95 alpha"], framealpha=0.3)
    plt.grid(True)

.. image:: ../images/movingAvg.png

As you can see, normal mode still has the influence of the initial value at
0 and can't rise up fast, whereas the debias mode will ignore the initial
value and immediately snaps to the first value.

Also, the 2 graphs with 0.9 alpha snap together quicker than the 2 graphs
with 0.95 alpha. Here's the effect of several alpha values:

.. image:: ../images/movingAvg-alphas.png

:param col: column to apply moving average to. If None, then the input is an iterator of numbers
:param alpha: momentum term
:param debias: whether to turn on debias mode or not
:param v: initial value, doesn't matter in debias mode
:param dt: pretty much never used, hard to describe, belongs to debias mode, checkout source code for details"""
        self.col = col; self.initV = v; self.alpha = alpha; self.debias = debias; self.dt = dt
        if debias and v != 0: raise Exception("Debias mode activated! This means that the initial value doesn't matter, yet you've specified one")
        if alpha > 1 or alpha < 0: raise Exception("Alpha is outside the [0, 1] range. which does not make sense")
    def __ror__(self, it):
        # m is the running (biased) average. In debias mode, the yielded value is
        # m / (1 - alpha**t), with t advancing by `dt` on every element
        m = value = self.initV; alpha = self.alpha; col = self.col
        if self.debias:
            dt = self.dt; t = 1; tooSmall = False
            if col is None:
                for v in it:
                    m = m * alpha + v * (1 - alpha)
                    if tooSmall: yield m # skips complex exponential calculation once it's small enough to speed things up
                    else:
                        exp = alpha**t; value = m / (1 - exp)
                        tooSmall = 10*exp < (1-alpha); t += dt; yield value
            else:
                for row in it:
                    m = m * alpha + row[col] * (1 - alpha)
                    if tooSmall: yield [*row[:col], m, *row[col+1:]]
                    else:
                        exp = alpha**t; value = m / (1 - exp)
                        # same cutoff as the branch above. This used to be `10**exp`, which
                        # is always >= 1, so the shortcut could never kick in
                        tooSmall = 10*exp < (1-alpha); t += dt; yield [*row[:col], value, *row[col+1:]]
        else:
            if col is None:
                for v in it: m = m * alpha + v * (1 - alpha); yield m
            else:
                for row in it:
                    m = m * alpha + row[col] * (1 - alpha)
                    yield [*row[:col], m, *row[col+1:]]

In [79]:
# pins the docstring example: 0.9*5 + 0.1*3 = 4.8, and so on
assert [3, 3, 3] | toMovingAvg(v=5, debias=False) | cli.deref() == [4.8, 4.62, 4.458]
# the rest of this cell draws the 2 figures referenced by the toMovingAvg docstring and writes them under ../../docs/images
x = np.linspace(0, 10, 100); y = np.cos(x)
plt.plot(x, y)
plt.plot(x, y | toMovingAvg(debias=False)             | cli.deref())
plt.plot(x, y | toMovingAvg(debias=False, alpha=0.95) | cli.deref())
plt.plot(x, y | toMovingAvg(debias=True)              | cli.deref())
plt.plot(x, y | toMovingAvg(debias=True,  alpha=0.95) | cli.deref())
plt.legend(["Signal", "Normal - 0.9 alpha", "Normal - 0.95 alpha", "Debiased - 0.9 alpha", "Debiased - 0.95 alpha"], framealpha=0.3)
plt.grid(True); plt.gcf() | toImg() | toBytes() | cli.file("../../docs/images/movingAvg.png")
# second figure: the same signal smoothed with a range of alpha values (debias mode is on by default)
x = np.linspace(0, 10, 100); y = np.cos(x)
plt.plot(x, y, label="Signal")
[0.6, 0.8, 0.85, 0.9, 0.93, 0.95, 0.98, 0.99, 0.999] | cli.apply(lambda a: plt.plot(x, y | toMovingAvg(None, a) | cli.deref(), label=f"alpha={a}")) | cli.ignore()
plt.legend(framealpha=0.5); plt.grid(True); plt.gcf() | toImg() | toBytes() | cli.file("../../docs/images/movingAvg-alphas.png")

'../../docs/images/movingAvg-alphas.png'

In [82]:
#export
cm = k1lib.dep("matplotlib.cm")
class toCm(BaseCli):
    def __init__(self, col:int, cmap=None, title:str=None):
        """Replaces the values in the specified column with colors from a
color map, and adds a colorbar to the current axes automatically.
"cm" = "color map". Example::

    import matplotlib.cm as cm
    exps = [1, 2, 3, 4, 5]
    x = np.linspace(-2, 2)
    data = exps | apply(lambda exp: [exp, x, x**exp]) | deref()

    # without toCm(), plots fine, demonstrates underlying mechanism, but doesn't allow plotting a separate colorbar
    data | normalize(0, mode=1) | apply(cm.viridis, 0) | ~apply(lambda c,x,y: plt.plot(x, y, color=c)) | ignore()
    # with toCm(), draws a colorbar automatically
    data | toCm(0, cm.viridis, "Exponential") | ~apply(lambda c,x,y: plt.plot(x, y, color=c)) | ignore()

.. image:: ../images/toCm.png

Functionality is kind of niche, but I need this over and over
again, so have to make it

:param col: column to convert float/int to color (tuple of 4 floats). If None, then the input itself is the list of values to convert
:param cmap: colormap to use. If not specified, defaults to ``cm.viridis``
:param title: title of the colorbar, optional"""
        self.col = col; self.cmap = cmap or cm.viridis; self.title = title
    def __ror__(self, it):
        col = self.col; cmap = self.cmap; title = self.title
        # the input is materialized first, as it's read twice: once for the
        # colorbar's value range, and once for the actual color conversion
        if col is None:
            if not isinstance(it, k1lib.settings.cli.arrayTypes): it = list(it)
            bounds = it | cli.toMin() & cli.toMax()
        else:
            it = it | cli.deref(2)
            bounds = it | cli.cut(col) | cli.toMin() & cli.toMax()
        plt.colorbar(cm.ScalarMappable(norm=plt.Normalize(*bounds), cmap=cmap), ax=plt.gca(), label=title)
        if col is None: return it | cli.normalize(None, 1) | cli.apply(cmap)
        return it | cli.normalize(col, 1) | cli.apply(cmap, col)

In [83]:
# draws the figure referenced by the toCm docstring and writes it under ../../docs/images
exps = [1, 2, 3, 4, 5]; x = np.linspace(-2, 2)
exps | cli.apply(lambda exp: [exp, x, x**exp]) | toCm(0, title="Exponential") | ~cli.apply(lambda c,x,y: plt.plot(x,y,color=c)) | cli.deref();
plt.gcf() | toImg() | toBytes() | cli.file("../../docs/images/toCm.png")

'../../docs/images/toCm.png'

In [110]:
!../../export.py cli/conv --upload=True

2023-12-15 18:43:59,125	INFO worker.py:1458 -- Connecting to existing Ray cluster at address: 192.168.1.19:6379...
2023-12-15 18:43:59,131	INFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
----- exportAll
13023   0   60%   
8522    1   40%   
rm: cannot remove '__pycache__': No such file or directory
Found existing installation: k1lib 1.4.4.5
Uninstalling k1lib-1.4.4.5:
  Successfully uninstalled k1lib-1.4.4.5
running install
running bdist_egg
running egg_info
creating k1lib.egg-info
writing k1lib.egg-info/PKG-INFO
writing dependency_links to k1lib.egg-info/dependency_links.txt
writing requirements to k1lib.egg-info/requires.txt
writing top-level names to k1lib.egg-info/top_level.txt
writing manifest file 'k1lib.egg-info/SOURCES.txt'
reading manifest file 'k1lib.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'k1lib.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running inst

In [124]:
!../../export.py cli/conv

2024-01-01 22:08:15,121	INFO worker.py:1458 -- Connecting to existing Ray cluster at address: 192.168.1.19:6379...
2024-01-01 22:08:15,129	INFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
----- exportAll
13429   0   60%   
8971    1   40%   
rm: cannot remove '__pycache__': No such file or directory
Found existing installation: k1lib 1.4.4.5
Uninstalling k1lib-1.4.4.5:
  Successfully uninstalled k1lib-1.4.4.5
running install
running bdist_egg
running egg_info
creating k1lib.egg-info
writing k1lib.egg-info/PKG-INFO
writing dependency_links to k1lib.egg-info/dependency_links.txt
writing requirements to k1lib.egg-info/requires.txt
writing top-level names to k1lib.egg-info/top_level.txt
writing manifest file 'k1lib.egg-info/SOURCES.txt'
reading manifest file 'k1lib.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'k1lib.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running inst

In [118]:
!../../export.py cli/conv --boostrap=True

2023-12-31 18:09:55,384	INFO worker.py:1458 -- Connecting to existing Ray cluster at address: 192.168.1.19:6379...
2023-12-31 18:09:55,393	INFO worker.py:1633 -- Connected to Ray cluster. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m
----- exportAll
13376   0   60%   
8940    1   40%   
rm: cannot remove '__pycache__': No such file or directory
Found existing installation: k1lib 1.4.4.5
Uninstalling k1lib-1.4.4.5:
  Successfully uninstalled k1lib-1.4.4.5
running install
running bdist_egg
running egg_info
creating k1lib.egg-info
writing k1lib.egg-info/PKG-INFO
writing dependency_links to k1lib.egg-info/dependency_links.txt
writing requirements to k1lib.egg-info/requires.txt
writing top-level names to k1lib.egg-info/top_level.txt
writing manifest file 'k1lib.egg-info/SOURCES.txt'
reading manifest file 'k1lib.egg-info/SOURCES.txt'
adding license file 'LICENSE'
writing manifest file 'k1lib.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running inst