In [1]:
import pandas as pd
from skimage.morphology import thin

In [2]:
# Collect the raw image files that will populate a pandas dataframe with the following columns:
#   'image' absolute path to the image
#   'unicode' the corresponding unicode label (e.g. 'u2190')
import os

# Root folder of the raw dataset: <current working directory>/raw/apl.
raw_apl_dirpath: str = os.path.join(
    os.path.abspath("."),
    "raw",
    "apl"
)

# One path per sub-folder of the raw dataset.  Entries that are not
# directories (stray files such as Thumbs.db or a README) are skipped, because
# calling os.listdir() on them in the loop below would raise
# NotADirectoryError.  os.listdir() returns names in arbitrary order, so the
# names are sorted to keep the row order of the resulting dataframe
# reproducible between runs and machines.
processing_dirpaths: list[str] = [
    dirpath
    for dirpath in (
        os.path.join(raw_apl_dirpath, folder)
        for folder in sorted(os.listdir(raw_apl_dirpath))
    )
    if os.path.isdir(dirpath)
]

# Absolute path of every ".png" file found directly inside those sub-folders.
files_to_process: list[str] = []

user_dir: str
for user_dir in processing_dirpaths:
    for char_filename in sorted(os.listdir(user_dir)):

        # Anything that is not a PNG is not a character sample; skip it.
        if not char_filename.endswith(".png"):
            continue

        char_filepath: str = os.path.join(
            user_dir,
            char_filename
        )

        files_to_process.append(
            char_filepath
        )


In [3]:
# Spot-check: display the first five collected image paths.
files_to_process[:5]

['c:\\Users\\LeonBass\\Documents\\visual_studio_code\\character-dataset\\raw\\apl\\Aiden\\u2190-1730735471726.png',
 'c:\\Users\\LeonBass\\Documents\\visual_studio_code\\character-dataset\\raw\\apl\\Aiden\\u2190-1731063485150.png',
 'c:\\Users\\LeonBass\\Documents\\visual_studio_code\\character-dataset\\raw\\apl\\Aiden\\u2191-1730735474296.png',
 'c:\\Users\\LeonBass\\Documents\\visual_studio_code\\character-dataset\\raw\\apl\\Aiden\\u2191-1731063486973.png',
 'c:\\Users\\LeonBass\\Documents\\visual_studio_code\\character-dataset\\raw\\apl\\Aiden\\u2192-1731063488679.png']

In [4]:
# Derive each label from the file name alone: the part of the base name before
# the first "-" (e.g. "u2190-1730735471726.png" -> "u2190").  Working on the
# base name keeps the result independent of any "-" or "u" characters in the
# directory part of the path, and avoids reusing double quotes inside an
# f-string, which is a SyntaxError on Python versions before 3.12.
unicode_labels: list[str] = [
    os.path.splitext(os.path.basename(image_path))[0].split("-")[0]
    for image_path in files_to_process
]

# One row per image: where it lives on disk and which character it depicts.
metadata: pd.DataFrame = pd.DataFrame(
    {
        "image": files_to_process,
        "unicode": unicode_labels
    }
)
metadata

Unnamed: 0,image,unicode
0,c:\Users\LeonBass\Documents\visual_studio_code...,u2190
1,c:\Users\LeonBass\Documents\visual_studio_code...,u2190
2,c:\Users\LeonBass\Documents\visual_studio_code...,u2191
3,c:\Users\LeonBass\Documents\visual_studio_code...,u2191
4,c:\Users\LeonBass\Documents\visual_studio_code...,u2192
...,...,...
2205,c:\Users\LeonBass\Documents\visual_studio_code...,uaf
2206,c:\Users\LeonBass\Documents\visual_studio_code...,ud7
2207,c:\Users\LeonBass\Documents\visual_studio_code...,ud7
2208,c:\Users\LeonBass\Documents\visual_studio_code...,uf7


In [5]:
import numpy as np
def get_first_index(arr, x):
    """Return the position of the first element of ``arr`` equal to ``x``.

    ``arr`` is consumed front to back exactly once, so any iterable works,
    including one-shot iterators such as ``reversed(...)``.

    Returns:
        The zero-based index of the first match, or ``None`` when no element
        compares equal to ``x``.
    """
    matching_positions = (
        position for position, item in enumerate(arr) if item == x
    )
    return next(matching_positions, None)


def finding_box(im):
    """Crop ``im`` to the bounding box of its zero-valued pixels.

    A row or column is considered part of the character when its minimum
    value is exactly 0.  The crop keeps one extra pixel of margin on every
    side, clamped to the image borders.

    Args:
        im: 2-D image array; pixels equal to 0 are treated as ink.

    Returns:
        The slice ``im[row_start:row_stop, col_start:col_stop]``.

    Raises:
        ValueError: if no row/column has a minimum of 0, i.e. there is
            nothing to put a box around.
    """
    row_min = np.min(im, axis=1)  # one value per row
    col_min = np.min(im, axis=0)  # one value per column

    # Indices of the rows / columns that contain at least one ink pixel.
    ink_rows = np.flatnonzero(row_min == 0)
    ink_cols = np.flatnonzero(col_min == 0)

    if ink_rows.size == 0 or ink_cols.size == 0:
        raise ValueError(
            "finding_box: image contains no zero-valued (ink) pixels to crop around"
        )

    # First ink index minus one, and last ink index plus two (exclusive slice
    # end), give a one-pixel margin; max/min keep the slice inside the image.
    row_start = max(0, int(ink_rows[0]) - 1)
    row_stop = min(int(ink_rows[-1]) + 2, len(row_min))

    col_start = max(0, int(ink_cols[0]) - 1)
    col_stop = min(int(ink_cols[-1]) + 2, len(col_min))

    return im[row_start:row_stop, col_start:col_stop]


In [6]:
from skimage import measure


def downsample_and_binarise(im, target_size=64, threshold=64):
    """Pad ``im`` to a square, shrink it to ``target_size`` x ``target_size`` and binarise it.

    Steps:
      1. Pad with white (255), split as evenly as possible between the two
         sides of each axis, up to a square whose side is the smallest
         multiple of ``target_size`` that fits the longer image side.
      2. Reduce the square with ``measure.block_reduce`` using blocks of
         ``side // target_size`` pixels.
      3. Rescale the reduced values to 0..255 and threshold them.

    Args:
        im: 2-D image array (the two-entry padding spec only fits 2-D input).
        target_size: side length of the returned square image.
        threshold: rescaled values strictly below this become 0, everything
            else becomes 255.  It is an intensity cut-off and is deliberately
            independent of ``target_size``; the default of 64 reproduces the
            result of a default call before this parameter existed.

    Returns:
        ``uint8`` array of shape ``(target_size, target_size)`` holding only
        the values 0 and 255.
    """
    # Smallest multiple of target_size that can hold the longer side.
    max_dim = int(np.ceil(max(im.shape) / target_size)) * target_size

    # Centre the image: any odd leftover pixel goes to the trailing side.
    pad_info = []
    for side_length in (im.shape[0], im.shape[1]):
        missing = max_dim - side_length
        leading = missing // 2
        pad_info.append((leading, missing - leading))

    im = np.pad(im, pad_info, constant_values=255)

    # max_dim is an exact multiple of target_size, so the reduced image is
    # already exactly (target_size, target_size); no further padding needed.
    block_side = max_dim // target_size
    dim = measure.block_reduce(im, block_side)

    # block_reduce changes the scale of the values, so move them back to 8-bit.
    # An all-zero result (entirely black input) is kept at zero instead of
    # dividing by zero.
    peak = np.max(dim)
    if peak > 0:
        dim = (255 * (dim / peak)).astype("uint8")
    else:
        dim = np.zeros(dim.shape, dtype="uint8")

    return np.where(dim < threshold, 0, 255).astype("uint8")



In [7]:
import os
import time
import datetime

def save_image(im, unicode, relative_dir="./"):
    """Save ``im`` as ``<relative_dir>/<unicode>-<id>.png`` without overwriting.

    ``<id>`` starts as the current time in milliseconds and is bumped until
    the resulting file name is not already present in ``relative_dir``.

    Args:
        im: array accepted by ``PIL.Image.fromarray``.
        unicode: label used as the file-name prefix (e.g. ``"u2190"``).
        relative_dir: folder to write into; created if it does not exist.
            Change it to store processed images where you want.
    """
    # Imported here so the function does not rely on another cell having
    # imported PIL before it is called.
    from PIL import Image

    os.makedirs(relative_dir, exist_ok=True)

    # os.listdir() yields bare file names, so the uniqueness check has to be
    # done on the bare candidate name, not on the full output path.
    existing_images = set(os.listdir(relative_dir))

    im_id = int(time.time() * 1000)
    basename = f"{unicode}-{im_id}.png"

    # Two saves within the same millisecond would otherwise collide.
    while basename in existing_images:
        im_id += 1
        basename = f"{unicode}-{im_id}.png"

    filename = f"{relative_dir}/{basename}"

    im = Image.fromarray(im)
    im.save(filename)


In [8]:
# this cell processes the images and saves them
from PIL import Image
import matplotlib.pyplot as plt

total_size = len(metadata)

# Report progress roughly every 1% of the images.  The step is clamped to at
# least 1 because `total_size // 100` is 0 for fewer than 100 images, which
# would make the modulo below raise ZeroDivisionError.
progress_step = max(1, total_size // 100)

for i, row in metadata.iterrows():
    if i % progress_step == 0:
        print(f"{i} out of {total_size} complete")

    image_path: str = row["image"]
    image_unicode: str = row["unicode"]

    # Read the pixels inside a context manager so the file handle is released
    # as soon as the data has been copied into the array.
    with Image.open(image_path) as image_pil:
        image_np: np.ndarray = np.asarray(image_pil)

    # Invert the last channel of the image.
    # NOTE(review): presumably the alpha channel of an RGBA PNG, so drawn
    # pixels become 0 -- confirm against the raw data.
    raw_im: np.ndarray = 255 - image_np[:, :, -1]
    bounded_img: np.ndarray = finding_box(raw_im)

    # True where the cropped image is darker than 100.
    binarised_img: np.ndarray = bounded_img < 100

    skeleton_img: np.ndarray = thin(binarised_img)

    # Processed images are grouped in one folder per unicode label.
    out_dirpath: str = os.path.join(
        os.path.abspath("."),
        "processed",
        image_unicode
    )
    save_image(skeleton_img, image_unicode, relative_dir=out_dirpath)

0 out of 2210 complete
22 out of 2210 complete
44 out of 2210 complete
66 out of 2210 complete
88 out of 2210 complete
110 out of 2210 complete
132 out of 2210 complete
154 out of 2210 complete
176 out of 2210 complete
198 out of 2210 complete
220 out of 2210 complete
242 out of 2210 complete
264 out of 2210 complete
286 out of 2210 complete
308 out of 2210 complete
330 out of 2210 complete
352 out of 2210 complete
374 out of 2210 complete
396 out of 2210 complete
418 out of 2210 complete
440 out of 2210 complete
462 out of 2210 complete
484 out of 2210 complete
506 out of 2210 complete
528 out of 2210 complete
550 out of 2210 complete
572 out of 2210 complete
594 out of 2210 complete
616 out of 2210 complete
638 out of 2210 complete
660 out of 2210 complete
682 out of 2210 complete
704 out of 2210 complete
726 out of 2210 complete
748 out of 2210 complete
770 out of 2210 complete
792 out of 2210 complete
814 out of 2210 complete
836 out of 2210 complete
858 out of 2210 complete
880 ou