# Imports

In [1]:
# misc
import functools
import os
import random

# load/save files
import zipfile
import json

# plot
import matplotlib.pyplot as plt
from PIL import Image

# datascience libs
import numpy as np
import pandas as pd



# Resolve the project root: one level above this file when executed as a
# script, or the parent of the working directory inside a notebook, where
# ``__file__`` is not defined.
try:  # python
    path_ = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir)
except NameError:  # jupyter notebook
    path_ = os.path.dirname(os.getcwd())

# Sibling folders of the project root holding the data sets and the models.
dataset_dir, model_dir = (
    os.path.join(path_, subdir) for subdir in ("datasets", "models")
)

# Read mapping file

In [6]:
def emnist_get_mapping(filepath):
    """Read an EMNIST mapping file into a list indexed by class label.

    Each useful line holds two whitespace-separated non-negative integers:
    the class index followed by the value mapped to it. Lines that do not
    provide at least two integers (blank lines, stray text) are skipped
    instead of raising ``IndexError``.

    Args:
        filepath: Path of the mapping text file.

    Returns:
        A list ``m`` where ``m[index]`` is the mapped value and every index
        missing from the file holds ``-1``. An empty file yields ``[-1]``.
    """
    data = {}
    with open(filepath) as f:
        for line in f:
            # split() without arguments also copes with tabs and repeated spaces.
            fields = [int(token) for token in line.split() if token.isdigit()]
            if len(fields) < 2:
                continue
            data[fields[0]] = fields[1]

    # Size the list from the largest index seen (0 when nothing was read).
    emnist_mapping = [-1] * (max(data, default=0) + 1)
    for key, val in data.items():
        emnist_mapping[key] = val
    return emnist_mapping

# Name of the EMNIST split to work on; its mapping file is looked up as
# <dataset_dir>/<set_name>-mapping.txt.
set_name = 'emnist-balanced'
emnist_mapping = emnist_get_mapping(os.path.join(dataset_dir, set_name + '-mapping.txt'))

# Display the lookup list (position = class index, value = mapped integer).
print(emnist_mapping)

[48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 97, 98, 100, 101, 102, 103, 104, 110, 113, 114, 116]


# Converter

## Packer class

In [9]:
class BinPackerNode:
    """A rectangle in the binary tree used for 2-D bin packing.

    A node is free while ``data`` is ``None``. Once occupied through
    :meth:`split`, its remaining area is handed to two child nodes.
    """

    def __init__(self, x=0, y=0, width=0, height=0, data=None, left=None, right=None):
        self.x = x
        self.y = y
        self.width = width
        self.height = height
        self.data = data    # payload of the placed block; None means free
        self.left = left    # free strip below the placed block
        self.right = right  # free strip to the right of the placed block

    def split(self, data, width, height):
        """Occupy this node with a ``width`` x ``height`` block.

        The leftover area becomes two children: ``left`` spans the full node
        width below the block, ``right`` sits beside the block and is as
        tall as the block.

        Args:
            data: Payload to store. It must not be ``None``, otherwise the
                node would still be considered free by :meth:`find`.
            width: Width of the placed block.
            height: Height of the placed block.

        Returns:
            This node.
        """
        self.data = data
        self.left = BinPackerNode(self.x, self.y + height,
                                  self.width, self.height - height)
        self.right = BinPackerNode(self.x + width, self.y,
                                   self.width - width, height)
        return self

    @staticmethod
    def find(node, width, height):
        """Return the first free node that can hold ``width`` x ``height``.

        Occupied nodes are searched depth-first, right child before left
        child. Occupancy is tested with ``is not None`` so that falsy
        payloads (``0``, ``""``, ``{}``) still mark a node as used.

        Returns:
            The matching node, or ``None`` when nothing fits.
        """
        if node is None:
            return None
        if node.data is not None:
            return (BinPackerNode.find(node.right, width, height)
                    or BinPackerNode.find(node.left, width, height))
        if width <= node.width and height <= node.height:
            return node
        return None


class BinPacker:
    """Pack rectangular blocks into a fixed-size area using a binary tree.

    Blocks are dicts with ``"width"`` and ``"height"`` keys and an optional
    ``"data"`` payload.
    """

    def __init__(self, width, height, if_not_found="continue"):
        """Create an empty packer of ``width`` x ``height``.

        Args:
            width: Width of the packing area.
            height: Height of the packing area.
            if_not_found: What :meth:`fit` does with a block that does not
                fit: ``"continue"`` skips it and tries the next block,
                ``"break"`` stops packing at that block.
        """
        self.root = BinPackerNode(0, 0, width, height)
        self.width = width
        self.height = height
        self.if_not_found = if_not_found  # "continue" or "break"

    # Comparators in cmp style: a positive result means ``b`` sorts before
    # ``a``, i.e. larger blocks come first. The long names chain several of
    # the short ones as tie-breakers.
    cbsort = {
        "w": (lambda a, b: b["width"] - a["width"]),
        "h": (lambda a, b: b["height"] - a["height"]),
        "a": (lambda a, b: b["width"]*b["height"] - a["width"]*a["height"]),
        "max": (lambda a, b: max(b["width"], b["height"]) - max(a["width"], a["height"])),
        "min": (lambda a, b: min(b["width"], b["height"]) - min(a["width"], a["height"])),
        "random": (lambda a, b: random.random() - 0.5),
        "height": (lambda a, b: BinPacker.msort(a, b, ['h', 'w'])),
        "width": (lambda a, b: BinPacker.msort(a, b, ['w', 'h'])),
        "area": (lambda a, b: BinPacker.msort(a, b, ['a', 'h', 'w'])),
        "maxside": (lambda a, b: BinPacker.msort(a, b, ['max', 'min', 'h', 'w'])),
    }

    @staticmethod
    def msort(a, b, criteria):
        """Compare two blocks with the first criterion that tells them apart.

        Args:
            a: First block.
            b: Second block.
            criteria: Sequence of keys of :attr:`cbsort`, tried in order.

        Returns:
            The first non-zero comparator result, or ``0`` when every
            criterion ties (or ``criteria`` is empty).
        """
        for name in criteria:
            diff = BinPacker.cbsort[name](a, b)
            if diff != 0:
                return diff
        return 0

    @staticmethod
    def swap(a, i, j):
        """Exchange ``a[i]`` and ``a[j]`` in place."""
        a[i], a[j] = a[j], a[i]

    @staticmethod
    def sort(arr, criteria=('height',)):
        """Sort ``arr`` in place, largest blocks first per ``criteria``.

        The sort is stable: blocks that compare equal keep their relative
        order, so callers can rely on the input order of same-sized blocks.
        """
        arr[:] = sorted(
            arr,
            key=functools.cmp_to_key(
                lambda a, b: BinPacker.msort(a, b, criteria)),
        )

    def fit(self, blocks_src, criteria=('height',)):
        """Place as many blocks as possible into the packing area.

        Args:
            blocks_src: Sequence of block dicts; it is not modified.
            criteria: Sort criteria applied before packing (see :meth:`sort`).

        Returns:
            The list of tree nodes that received a block, in placement
            order. Each node carries ``x``, ``y``, ``width``, ``height`` and
            the block payload in ``data`` (``"empty"`` when the block had no
            payload).
        """
        blocks = list(blocks_src)  # work on a copy: sorting is in place
        BinPacker.sort(blocks, criteria)

        res = []
        for block in blocks:
            w = block["width"]
            h = block["height"]
            node = BinPackerNode.find(self.root, w, h)
            if node is None:
                if self.if_not_found == "break":
                    break
                continue
            # A None payload would leave the node looking free, so it is
            # replaced by the same "empty" marker used for a missing payload.
            data = block.get("data")
            node.split("empty" if data is None else data, w, h)
            # Shrink the node to the block it now holds.
            node.width = w
            node.height = h
            res.append(node)
        return res

In [3]:
# Load the balanced EMNIST test CSV into a DataFrame; header=None keeps the
# first row as data instead of using it for column names.
# NOTE(review): `df` is not referenced by the code visible in this file.
df = pd.read_csv(os.path.join(dataset_dir, 'emnist-balanced-test.csv'), header=None)

In [4]:
"""filepath = os.path.join(dataset_dir, 'emnist-balanced-test.csv')
with open(filepath) as f:
    line = f.readline()
print(line)"""

"filepath = os.path.join(dataset_dir, 'emnist-balanced-test.csv')\nwith open(filepath) as f:\n    line = f.readline()\nprint(line)"

## Converter

In [53]:
def makedirs(directory):
    """Create ``directory`` and any missing parents; do nothing if it exists.

    ``exist_ok=True`` replaces the former check-then-create sequence, which
    could fail when the directory appeared between the check and the call.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    os.makedirs(directory, exist_ok=True)
        
def write_image(raw_data, output_dir, i, X_set='train'):
    """Save one 28x28 sample as a lossless WebP file.

    The flat pixel values are reshaped to 28x28, mirrored horizontally and
    rotated by 90 degrees, the same orientation fix ``emnist_convert``
    applies, before being written as an 8-bit greyscale image.

    Args:
        raw_data: Sequence of 784 pixel values in the range 0-255.
        output_dir: Existing directory that receives the file.
        i: Sample index used in the file name.
        X_set: Data-set name used as file-name prefix.

    The file is written to ``<output_dir>/<X_set>_<i>.webp``.
    """
    image_filename = os.path.join(output_dir, f"{X_set}_{i}.webp")
    # Previously these lines were commented out, leaving `pixels` undefined.
    pixels = np.array(raw_data, dtype='uint8').reshape(28, 28)
    pixels = np.flip(pixels, axis=1)
    pixels = np.rot90(pixels, k=1, axes=(0, 1))
    img = Image.fromarray(pixels, mode='L')
    img.save(image_filename, format='webp', lossless=True)

def draw_object(src, sx, sy, s_width, s_height,
                dst, dx, dy, d_width, d_height):
    """Copy a rectangle of ``src`` into ``dst`` in place.

    The source rectangle covers rows ``sy`` to ``s_height - 1`` and columns
    ``sx`` to ``s_width - 1``; ``s_width`` and ``s_height`` are therefore
    exclusive end coordinates, not sizes. Its top-left corner lands on
    ``dst[dy, dx]``.

    Args:
        src: Source NumPy array, indexed ``[row, column]``.
        sx, sy: First source column / row to copy.
        s_width, s_height: Source column / row at which copying stops.
        dst: Destination NumPy array, modified in place.
        dx, dy: Destination column / row of the top-left corner.
        d_width, d_height: Unused; kept for interface compatibility.

    All coordinates are expected to be non-negative and within bounds; a
    rectangle that does not fit raises ``ValueError`` from the assignment.
    """
    rows = s_height - sy
    cols = s_width - sx
    if rows <= 0 or cols <= 0:
        return  # empty rectangle: nothing to copy
    # One slice assignment replaces the former per-pixel Python loops.
    dst[dy:dy + rows, dx:dx + cols] = src[sy:s_height, sx:s_width]

def list_first_val(arr):
    """Return 1 if ``arr`` contains a value greater than zero, otherwise -1."""
    has_positive = any(value > 0 for value in arr)
    return 1 if has_positive else -1
    
def get_bbox(data):
    """Return the axis-aligned bounding box of the positive cells of a grid.

    Args:
        data: 2-D sequence (list of rows or NumPy array) of numbers.

    Returns:
        ``[x1, y1, x2, y2]`` as plain Python ints, where ``(x1, y1)`` is the
        top-left cell holding a value ``> 0`` and ``(x2, y2)`` lies one past
        the bottom-right one, so ``x2 - x1`` and ``y2 - y1`` are the box
        size. A grid without any positive cell yields ``[0, 0, 0, 0]``
        rather than an inverted box built from sentinel values.
    """
    x1 = y1 = None
    x2 = y2 = 0
    # Indices come from enumerate(), so the result stays JSON-serialisable
    # even when `data` is a NumPy array.
    for row_idx, row in enumerate(data):
        filled = [col_idx for col_idx, value in enumerate(row) if value > 0]
        if not filled:
            continue
        if y1 is None:
            y1 = row_idx  # first row that contains something
        y2 = row_idx      # keeps moving down to the last such row
        x1 = filled[0] if x1 is None else min(x1, filled[0])
        x2 = max(x2, filled[-1])
    if y1 is None:
        return [0, 0, 0, 0]
    return [x1, y1, x2 + 1, y2 + 1]
    
    
def emnist_convert(dataset_dir, input_name, output_dir, emnist_mapping, X_set='train',
                   max_samples=601):
    """Convert an EMNIST CSV split into packed WebP atlases plus a JSON index.

    Each CSV row is read as a class index followed by 784 pixel values.
    Every sample is re-oriented (horizontal flip, then 90 degree rotation)
    and packed into 4096x4096 greyscale atlases saved as lossless WebP files
    named ``X_<X_set>_<first>_to_<last>.webp``. A ``<X_set>.json`` file
    next to them stores the tile size, the atlas paths, the label of every
    sample concatenated into one string (``"id"``) and one bounding box per
    sample (``"bbox"``).

    Args:
        dataset_dir: Directory holding ``<input_name>-<X_set>.csv``.
        input_name: Base name of the data set, e.g. ``'emnist-balanced'``.
        output_dir: Sub-directory of ``dataset_dir`` for the results; files
            go to ``<dataset_dir>/<output_dir>/<X_set>/``.
        emnist_mapping: List mapping a class index to a character code.
        X_set: Split name (``'train'`` or ``'test'``).
        max_samples: Maximum number of rows to convert, or ``None`` for no
            limit. The default matches the previously hard-coded cap.

    Raises:
        RuntimeError: If the packer cannot place any remaining sample.
    """
    X_set_dir = os.path.join(dataset_dir, output_dir, X_set)
    makedirs(X_set_dir)

    input_file = os.path.join(dataset_dir, input_name + '-' + X_set + '.csv')

    blocks = []
    labels = []
    json_output = {
        "width": 28,
        "height": 28,
        "files": [],
        "id": "",
        "bbox": [],
    }

    print("begin: read data")
    with open(input_file) as f:
        for line in f:
            if max_samples is not None and len(blocks) >= max_samples:
                break
            tmp = [int(s) for s in line.strip().split(',') if s.isdigit()]
            if not tmp:
                continue  # blank line: nothing to convert
            label = chr(emnist_mapping[tmp[0]])
            pixels = np.array(tmp[1:], dtype='uint8').reshape(28, 28)
            pixels = np.flip(pixels, axis=1)
            pixels = np.rot90(pixels, k=1, axes=(0, 1))
            labels.append(label)
            json_output['bbox'].append(get_bbox(pixels))
            blocks.append({
                "width": 28, "height": 28,
                "data": {
                    "idx": len(blocks),
                    "val": label,
                    "pixels": pixels
                }
            })
    # Joined once instead of growing a string sample by sample.
    json_output['id'] = ''.join(labels)

    print("begin: export files")
    count = 0
    while blocks:
        packer = BinPacker(4096, 4096)
        res = packer.fit(blocks, ["height"])
        if not res:
            # Without this guard the loop would never terminate.
            raise RuntimeError("no remaining block fits into an empty atlas")
        # Drop exactly the samples that were placed, independent of the
        # order in which the packer processed them.
        packed = {node.data['idx'] for node in res}
        blocks = [block for block in blocks if block['data']['idx'] not in packed]

        # NumPy shape is (rows, columns), i.e. (height, width).
        dst = np.zeros((packer.height, packer.width), dtype="uint8")
        count_prev = count
        for node in res:
            count += 1
            draw_object(node.data['pixels'],
                        0, 0, 28, 28,
                        dst, node.x, node.y, packer.width, packer.height)
        # write image
        image_filename = "X_{}_{}_to_{}.webp".format(X_set, count_prev, count - 1)
        image_filename = os.path.join(X_set_dir, image_filename)
        json_output['files'].append(image_filename)
        print("save:", image_filename)
        img = Image.fromarray(dst, mode='L')
        img.save(image_filename, format='webp', lossless=True)

    # write json file
    json_filename = os.path.join(X_set_dir, X_set + ".json")
    json_dump = json.dumps(json_output, separators=(',', ':'))
    with open(json_filename, 'w', encoding='utf-8') as f:
        f.write(json_dump)
            
# Convert the 'test' split of the balanced EMNIST set; the results are
# written below <dataset_dir>/origin-emnist-balanced/test/.
set_name = 'emnist-balanced'
emnist_convert(dataset_dir, set_name, 'origin-'+set_name, emnist_mapping, 'test')


begin: tree build
begin: export files


KeyboardInterrupt: 