In [None]:
import yaml
import copy
import itertools
from fibertree import Payload, Fiber, CoordPayload, Tensor, TensorImage, TensorCanvas, SpacetimeCanvas

# Main codec

In [None]:
# ---- per-rank format identifiers ----

# "U": uncompressed
#   - vector length equals the shape of the fiber
#   - each slot holds 0 when the position is empty, the payload otherwise
#   - fibers are serialized in position order
uncompressed = "U"

# "Bu": untruncated bit vector
#   - vector length equals the shape of this fiber
#   - each slot holds 0 for an empty position and 1 for an occupied one
#   - within a rank, fibers are serialized in order
untruncated_bitvector = "Bu"

# "Bt": truncated bit vector
#   - the bit vector stops at its last 1; the number of bits kept is stored
#     in the payloads of the previous rank
#   - vector length is at most the shape of the fiber
#   - within a rank, fibers are serialized in order
truncated_bitvector = "Bt"

# "C": coordinate list
#   - vector length equals the occupancy of this fiber
#   - holds the sorted, deduplicated coordinates of this fiber
#   - within a rank, fibers are serialized in order
coord_list = "C"

# every format the codec currently accepts
valid_formats = [uncompressed, coord_list, untruncated_bitvector, truncated_bitvector]
# presumably the wider set planned for later: ["U", "C", "R", "A", "B", "D", "Hf", "Hr"]

# the two bit-vector flavours
bitvectors = [untruncated_bitvector, truncated_bitvector]

# NOT IMPLEMENTED YET
# "D": delta compressed
#   - number of elements equals the occupancy of the fiber
#   - holds a delta-compressed coordinate list
#   - serialized in position order
# "Hf": hash table per fiber
#   - TODO

# The format chosen for a rank fixes how coordinates are represented at that rank.
# Formats that store their coordinates explicitly, as metadata:
explicit_coords = [coord_list] + bitvectors

# Every remaining format leaves its coordinates implicit.
implicit_coords = list(set(valid_formats).difference(explicit_coords))

# The format chosen for a rank also fixes how *payloads* look at the previous rank.
# When sizeof(coords) == shape, the previous rank needs no payloads because the
# shape metadata is enough to index into the coordinates; payloads are needed
# exactly when the size is proportional to occupancy.
implicit_payloads = [uncompressed, untruncated_bitvector]
explicit_payloads = list(set(valid_formats).difference(implicit_payloads))

class Codec:
    """Encodes a fiber tree into flat, per-rank coordinate and payload lists.

    format_descriptor holds one entry from valid_formats per rank, ordered from
    the rank directly below the root down to the leaf rank. The tensor is
    assumed to be pre-flattened, so the number of ranks is the descriptor length.
    """
    # order descriptor specified SoA or AoS at each rank (currently unused)
    # AoS / SoA doesn't apply to some formats (e.g. U) -> C (SoA, default should be here) / Ca (AoS)

    def __init__(self, format_descriptor):
        """Store the descriptor after checking every entry is a known format.

        format_descriptor: sequence of format strings, one per rank.
        Raises AssertionError if an entry is not in valid_formats.
        """
        for fmt in format_descriptor:
            assert fmt in valid_formats
        self.format_descriptor = format_descriptor

        # assumes pre-flattened for now
        self.num_ranks = len(format_descriptor)

    def get_format_descriptor(self):
        """Return the format descriptor passed to the constructor."""
        return self.format_descriptor

    def get_num_ranks(self):
        """Return the number of ranks (length of the format descriptor)."""
        return self.num_ranks

    def encode_coord(self, fmt, coords_key, prev_ind, ind, output):
        """Append one explicit coordinate to output[coords_key].

        fmt:        format of the rank being encoded
        coords_key: key of the coordinate list inside output
        prev_ind:   position just past the previous nonzero (0 for the first one)
        ind:        position of the current nonzero
        output:     output dict, modified in place

        For a coordinate list the coordinate itself is appended. For either
        bit-vector format, a 0 is appended for every position in
        [prev_ind, ind) followed by a 1 for the nonzero.

        Returns the number of zeroes that were appended (always 0 for a
        coordinate list).
        """
        elts_added = 0
        # formats are compared by value: descriptor strings may be built at
        # runtime and are then not the same objects as the module constants
        if fmt == coord_list:
            output[coords_key].append(ind)
        if fmt in bitvectors:
            for _ in range(prev_ind, ind):
                output[coords_key].append(0)
                elts_added = elts_added + 1
            output[coords_key].append(1)
        return elts_added

    def encode(self, depth, a, ranks, output):
        """Recursively encode fiber `a` according to the format descriptor.

        depth:  -1 for the initial (root) call, otherwise the index of the rank
                that `a` belongs to
        a:      the fiber to encode; at depth -1 it is handed unchanged to depth 0
        ranks:  rank names indexed by depth; they are lower-cased here to build
                the "coords_<rank>" / "payloads_<rank>" keys
        output: dict of lists keyed "payloads_root", "coords_<rank>" and
                "payloads_<rank>"; entries are appended in place

        Returns None for the root call. For every other depth it returns the
        number of elements iterated in this fiber plus the zeroes that
        encode_coord inserted in front of them; for an internal rank with
        implicit coordinates it returns the number of non-empty children.
        """
        # keys are in the form payloads_{rank name}, coords_{rank name}
        # (at depth -1 this indexes the last rank; payloads_key is replaced below)
        payloads_key = "payloads_{}".format(ranks[depth].lower())
        coords_key = "coords_{}".format(ranks[depth].lower())

        # deal with the root separately
        if depth == -1:
            # recurse one level down without adding to output yet
            size = self.encode(depth + 1, a, ranks, output)

            # store at most one payload at the root (size of first rank)
            if self.format_descriptor[depth + 1] in explicit_payloads:
                payloads_key = "payloads_root"
                output[payloads_key].append(size)

            return None

        fmt = self.format_descriptor[depth]
        dim_len = a.getShape()[0]

        # leaf level
        if depth == self.num_ranks - 1:
            # keep track of the occupancy of this fiber
            occupancy = 0

            # if U, may have to add some zeroes, so we need indexing
            prev_payloads_nz = 0
            prev_coords_nz = 0

            # iterate nonzeroes in the fiber
            for ind, (val) in a:
                assert isinstance(val, Payload)

                # if coords are implicit, add zeroes between nzs
                if fmt in implicit_coords:
                    for _ in range(prev_payloads_nz, ind):
                        output[payloads_key].append(0)
                    prev_payloads_nz = ind + 1

                # output leaf-level payloads at nzs
                output[payloads_key].append(val.value)

                # if this rank has explicit coords
                if fmt in explicit_coords:
                    added = self.encode_coord(fmt, coords_key, prev_coords_nz, ind, output)
                    # zeroes inserted into a bit vector count towards its length
                    occupancy = occupancy + added
                    prev_coords_nz = ind + 1

                # count nzs in fiber
                occupancy = occupancy + 1

            # if coords are implicit, fill in zeroes at end of payloads
            if fmt in implicit_coords:
                for _ in range(prev_payloads_nz, dim_len):
                    output[payloads_key].append(0)

            # if untruncated bitvector, fill in zeroes at end of coords
            if fmt == untruncated_bitvector:
                for _ in range(prev_coords_nz, dim_len):
                    output[coords_key].append(0)

            return occupancy

        # internal levels
        else:
            next_fmt = self.format_descriptor[depth + 1]

            # keep track of occupancy of children and at current height
            cumulative_occupancy = 0
            fiber_occupancy = 0
            prev_nz = 0

            # if coords at this depth are implicit, recurse on *every* coordinate (may be empty)
            if fmt in implicit_coords:
                for i in range(0, dim_len):
                    child_occupancy = self.encode(depth + 1, a.getPayload(i), ranks, output)

                    # keep track of actual occupancy
                    if not a.getPayload(i).isEmpty():
                        fiber_occupancy = fiber_occupancy + 1

                    # whether there are payloads here depends on the format of the next rank
                    if next_fmt not in implicit_payloads:
                        # payload is the running total of child occupancies in this fiber
                        cumulative_occupancy = cumulative_occupancy + child_occupancy
                        output[payloads_key].append(cumulative_occupancy)

            # if coords at this depth are explicit, only the nonzeroes appear at lower ranks
            else:
                # iterate through nonzeroes at this rank
                for ind, (val) in a:
                    assert isinstance(val, Fiber)
                    # keep track of nonzeroes in this fiber
                    fiber_occupancy = fiber_occupancy + 1

                    # recursive call to sub-fibers (DFS traversal)
                    child_occupancy = self.encode(depth + 1, val, ranks, output)

                    # if this level needs to store the payloads explicitly
                    # depends on the next level format: if coords are proportional to nnz
                    if next_fmt not in implicit_payloads:
                        cumulative_occupancy = cumulative_occupancy + child_occupancy
                        output[payloads_key].append(cumulative_occupancy)

                    # store coords explicitly at this rank
                    if fmt in explicit_coords:
                        added = self.encode_coord(fmt, coords_key, prev_nz, ind, output)
                        fiber_occupancy = fiber_occupancy + added
                        prev_nz = ind + 1

                # if bitvector is untruncated, add zeroes to the end
                if fmt == untruncated_bitvector:
                    for _ in range(prev_nz, dim_len):
                        output[coords_key].append(0)

            return fiber_occupancy

    # After encode
    def decode(self, tensor):
        """Placeholder for decoding; not implemented, always returns None."""
        return None

# Testing codec

In [None]:
# depth passed to the first Codec.encode call; encode treats depth -1 as the root
depth_start = -1

# generate blank output dict based on rank names
def get_output_dict(rank_names):
    """Create the empty output dict that Codec.encode appends into.

    rank_names: iterable of rank name strings.

    Returns a dict holding an empty "payloads_root" list followed, for each
    rank in order, by empty "coords_<rank>" and "payloads_<rank>" lists, where
    <rank> is the lower-cased rank name.
    """
    scratch = {"payloads_root": []}

    for rank in rank_names:
        suffix = rank.lower()
        # a fresh list per key so the scratchpads never share storage
        scratch["coords_" + suffix] = []
        scratch["payloads_" + suffix] = []

    return scratch

# given a tensor, descriptor, and dict of tensor encoded in that format
# print and write out yaml in that format
# TODO: change the output file name (currently just writes it to [descriptor string].yaml)
def write_yaml(tensor, descriptor, tensor_in_format):
    """Print the encoded tensor as YAML and write the same text to a file.

    tensor:           object providing getRankIds() and getShape()
    descriptor:       sequence of per-rank format strings
    tensor_in_format: dict with "payloads_root" and, for every rank, the
                      "coords_<rank>" and "payloads_<rank>" lists

    The document has a single "tensor" entry with the name, rank ids, shapes,
    formats and a "scratchpads" mapping. Scratchpads that are empty are left
    out. The file is named "<descriptor strings joined>.yaml" in the current
    directory (TODO: make the output file name configurable).
    """
    rank_names = tensor.getRankIds()

    # header
    header = dict()
    header["name"] = "tensor-a" # TODO: take this as input later
    header["rank_ids"] = rank_names
    header["shapes"] = tensor.getShape()
    # copy to a plain list so a tuple descriptor is not dumped as !!python/tuple
    header["formats"] = list(descriptor)

    # hierarchical yaml according to ranks
    scratchpads = dict()
    if len(tensor_in_format["payloads_root"]) > 0:
        scratchpads["root"] = { "payloads" : tensor_in_format["payloads_root"] }

    # write one rank at a time
    for i, name in enumerate(rank_names):
        rank_name = name.lower()
        coords_key = "coords_{}".format(rank_name)
        payloads_key = "payloads_{}".format(rank_name)
        rank_dict = dict()

        # only write if scratchpad is nonempty
        if len(tensor_in_format[coords_key]) > 0:
            rank_dict["coords"] = tensor_in_format[coords_key]
        if len(tensor_in_format[payloads_key]) > 0:
            rank_dict["payloads"] = tensor_in_format[payloads_key]

        if len(rank_dict) > 0:
            scratchpads["rank_" + str(i)] = rank_dict

    header["scratchpads"] = scratchpads

    data = dict()
    data["tensor"] = header
    outfilename = ''.join(descriptor) + '.yaml'

    # serialize once so the printed text and the file contents are identical,
    # and so a serialization error cannot leave an empty file behind
    text = yaml.dump(data, default_flow_style=None, sort_keys=False)

    with open(outfilename, "w") as f:
        print(text)
        f.write(text)

# given a tensor and format descriptor, write the yaml for that format
def try_format(tensor, descriptor):
    """Encode `tensor` with the given format descriptor and emit the YAML.

    tensor:     tensor providing getRankIds() and getRoot()
    descriptor: sequence of per-rank format strings

    Prints the descriptor, encodes the tensor with a Codec built from it, then
    hands the result to write_yaml, which prints and saves it.
    """
    print("\n" + str(descriptor))
    codec = Codec(tuple(descriptor))

    # rank names come from the tensor being encoded, not from a notebook global
    rank_names = tensor.getRankIds()

    # TODO: move output dict generation into codec
    output = get_output_dict(rank_names)

    codec.encode(depth_start, tensor.getRoot(), rank_names, output)

    # write_yaml leaves out scratchpads that stayed empty
    write_yaml(tensor, descriptor, output)
        
# generate all codecs
def try_all_formats(tensor, possible_formats, descriptor, depth):
    """Run try_format for every descriptor that can be built from possible_formats.

    tensor:           tensor providing getRankIds()
    possible_formats: format strings allowed at each rank
    descriptor:       formats chosen so far for ranks 0 .. depth-1 (not modified)
    depth:            number of ranks already assigned; callers start with 0

    One format is chosen per remaining rank, in the order given by
    possible_formats, and each complete descriptor is passed to try_format.
    """
    # once we have built a full descriptor, try it
    if depth == len(tensor.getRankIds()):
        try_format(tensor, descriptor)
        return

    # extend the descriptor by one format and recurse; a new list is built each
    # time so sibling branches never see each other's choices
    for fmt in possible_formats:
        try_all_formats(tensor, possible_formats, [*descriptor, fmt], depth + 1)

# UxC demo

In [None]:
formats = ["U", "C"]
ranks = ["M", "K"]

# each entry: (2x2 matrix, descriptors to encode it with, in order)
demo_cases = [
    # UU, UC
    ([[0, 1], [2, 3]], [["U", "U"], ["U", "C"]]),
    # CU, then UU
    ([[0, 0], [0, 3]], [["C", "U"], ["U", "U"]]),
    # CC, then UU
    ([[0, 0], [2, 3]], [["C", "C"], ["U", "U"]]),
]

for small_mtx_data, descriptors in demo_cases:
    # rebuild the tensor for every case before encoding it
    small_mtx = Tensor.fromUncompressed(ranks, small_mtx_data)
    for descriptor in descriptors:
        try_format(small_mtx, descriptor)

# UxBu demo

In [None]:
# formats allowed at each rank for this demo
formats = ["U", "Bu"]

# 3x2 test matrix whose only nonzero is in the last row
ranks = ["M", "K"]
small_mtx_data = [[0, 0], [0, 0], [0, 3]]
small_mtx = Tensor.fromUncompressed(ranks, small_mtx_data)

# encode the matrix under every descriptor that can be built from `formats`
try_all_formats(small_mtx, possible_formats=formats, descriptor=[], depth=0)

In [None]:
# formats allowed at each rank for this demo
formats = ["Bu", "Bt"]

# 4x2 test matrix with empty first and last rows
ranks = ["M", "K"]
small_mtx_data = [[0, 0], [0, 1], [2, 0], [0, 0]]
small_mtx = Tensor.fromUncompressed(ranks, small_mtx_data)

# encode the matrix under every descriptor that can be built from `formats`
try_all_formats(small_mtx, possible_formats=formats, descriptor=[], depth=0)

# CxBu demo

In [None]:
# formats allowed at each rank for this demo
formats = ["C", "Bu"]

# 2x2 test matrix whose only nonzero is the last element
ranks = ["M", "K"]
small_mtx_data = [[0, 0], [0, 3]]
small_mtx = Tensor.fromUncompressed(ranks, small_mtx_data)

# encode the matrix under every descriptor that can be built from `formats`
try_all_formats(small_mtx, possible_formats=formats, descriptor=[], depth=0)

# ignore past here

In [None]:
# small 3-D example: four identical 2x2 slices along the first rank

small_3d_data = [[[0, 1], [1, 0]] for _ in range(4)]
small_3d = Tensor.fromUncompressed(["W", "L", "H"], small_3d_data)

# codec with every rank uncompressed
test_3d = Codec(("U",) * 3)

In [None]:
"""
    def encode(self, tensor):
        assert len(tensor.getRankIds()) == self.num_ranks
        
        # lower rank ids for loop gen
        rank_ids = tensor.getRankIds()
        lower_rank_ids = [rank_ids[i].lower() for i in range(0, len(rank_ids))]
        print(lower_rank_ids)
        
        # write the header
        
        #auto generate loop nest
        a = tensor
        loops = "a_{} = a.getRoot()\n".format(lower_rank_ids[0])

        for i in  range(0, len(lower_rank_ids)):
            cur_rank = lower_rank_ids[i]
            if i < self.num_ranks - 1:
                next_rank = lower_rank_ids[i+1]
                loops = loops + "for {}, (a_{}) in a_{}:\n".format(cur_rank, next_rank, cur_rank)    
            else:
                loops = loops + "for {}, (a_val) in a_{}:\n".format(cur_rank, cur_rank)
            loops = loops + "\t" * (i+1)
            
        # for now, just print the coords
        ranks = ",".join(lower_rank_ids)
        str = "print(f\"(({{ {0} }}), {{a_val}})\")".format(ranks)
        loops = loops + str
        print(loops)
        # run it
        exec(loops)
        
        a="def x():\n\t print(42)\n"
        
        exec(a, globals())
        x()

        return None
    """