In [1]:
import pandas as pd

In [9]:
# Build a pandas DataFrame describing the raw images, with the following columns:
#   'image'   relative path to the image
#   'unicode' the corresponding unicode value (the file-name prefix before the first '-')
import os

# Directory holding the raw character images to be processed.
processing_path = "./raw/apl/Raven"

# os.listdir returns every directory entry, in arbitrary order. Keep only the
# PNG files (anything else would fail when it is opened as an image later) and
# sort them so the resulting row order is the same on every run.
files_to_process = sorted(
    name
    for name in os.listdir(processing_path)
    if name.lower().endswith(".png")
)

# One row per image:
#   'image'   path of the file, joined onto processing_path
#   'unicode' the part of the file name before the first '-'
metadata = pd.DataFrame(
    {
        "image": [os.path.join(processing_path, name) for name in files_to_process],
        "unicode": [name.split("-")[0] for name in files_to_process],
    }
)
metadata

Unnamed: 0,image,unicode
0,./raw/apl/Leon\u2190-1730480871639.png,u2190
1,./raw/apl/Leon\u2190-1730480994847.png,u2190
2,./raw/apl/Leon\u2190-1730481132055.png,u2190
3,./raw/apl/Leon\u2190-1730481344854.png,u2190
4,./raw/apl/Leon\u2190-1730481532404.png,u2190
...,...,...
1033,./raw/apl/Leon\uf7-1730483439240.png,uf7
1034,./raw/apl/Leon\uf7-1730483590760.png,uf7
1035,./raw/apl/Leon\uf7-1730483770121.png,uf7
1036,./raw/apl/Leon\uf7-1730483932594.png,uf7


In [14]:
import numpy as np
def get_first_index(arr, x):
    """Return the position of the first item of ``arr`` that equals ``x``.

    ``arr`` can be any iterable; it is walked once, front to back, and the
    search stops at the first match. When no item compares equal to ``x``
    the result is ``None``.
    """
    matching_positions = (
        position for position, item in enumerate(arr) if item == x
    )
    return next(matching_positions, None)


def finding_box(im, margin=10):
    """Crop ``im`` to the bounding box of its zero-valued pixels plus a margin.

    Parameters
    ----------
    im : 2-D numpy array
        Image in which pixels equal to 0 mark the character to keep.
    margin : int, default 10
        Number of extra rows/columns kept on every side of the bounding box.
        The box is clamped to the image, so it never extends past an edge.

    Returns
    -------
    numpy array
        The slice ``im[row_start:row_stop, col_start:col_stop]``.

    Raises
    ------
    ValueError
        If ``im`` contains no pixel equal to 0, so there is nothing to bound.
    """
    # Reducing over axis 0 collapses the rows and leaves one minimum per
    # column; reducing over axis 1 leaves one minimum per row.
    column_min = np.min(im, axis=0)
    row_min = np.min(im, axis=1)

    # Indices of the columns / rows that contain at least one 0 pixel.
    ink_columns = np.flatnonzero(column_min == 0)
    ink_rows = np.flatnonzero(row_min == 0)
    if ink_rows.size == 0 or ink_columns.size == 0:
        raise ValueError("finding_box: image contains no zero-valued pixels to bound")

    row_start = max(0, int(ink_rows[0]) - margin)
    # +1 turns the last matching index into an exclusive slice end.
    row_stop = min(int(ink_rows[-1]) + 1 + margin, len(row_min))

    col_start = max(0, int(ink_columns[0]) - margin)
    col_stop = min(int(ink_columns[-1]) + 1 + margin, len(column_min))

    return im[row_start:row_stop, col_start:col_stop]


In [15]:
from skimage import measure


def downsample_and_binarise(im, target_size=64, threshold=64):
    """Shrink ``im`` to ``(target_size, target_size)`` and make it two-valued.

    The image is first centred on a square canvas filled with 255 whose side
    is the smallest multiple of ``target_size`` that fits the longer side of
    ``im``. The canvas is then reduced block by block, rescaled to 0-255
    relative to its largest block value, and thresholded.

    Parameters
    ----------
    im : 2-D numpy array
        Greyscale image to shrink.
    target_size : int, default 64
        Side length of the square output.
    threshold : int, default 64
        Rescaled values strictly below this become 0, all others become 255.
        It is independent of ``target_size``; the default reproduces the
        cut-off used when ``target_size`` is left at 64.

    Returns
    -------
    numpy array of dtype uint8 containing only 0 and 255.

    Raises
    ------
    ValueError
        If ``im`` is not a non-empty 2-D array.
    """
    if im.ndim != 2 or im.size == 0:
        raise ValueError("downsample_and_binarise: expected a non-empty 2-D image")

    # How many source pixels are merged into one output pixel along each axis.
    factor = int(np.ceil(max(im.shape) / target_size))
    max_dim = factor * target_size

    # Centre the image on the square canvas; when the difference is odd the
    # extra line of padding goes after the image.
    pad_info = []
    for extent in im.shape:
        missing = max_dim - extent
        before = missing // 2
        pad_info.append((before, missing - before))
    im = np.pad(im, pad_info, constant_values=255)

    # max_dim is an exact multiple of factor, so the reduced image has
    # max_dim / factor == target_size entries along each axis.
    # NOTE(review): block_reduce's default reducer is presumably a sum, which
    # is why the values are rescaled below - confirm for the installed version.
    dim = measure.block_reduce(im, factor)

    peak = np.max(dim)
    if peak == 0:
        # Every block is 0, so every pixel is below the threshold; returning
        # here avoids dividing by zero in the rescale.
        return np.zeros(dim.shape, dtype="uint8")

    # Block reduction changes the scale of the values, so move it back to 8-bit.
    dim = (255 * (dim / peak)).astype("uint8")

    return np.where(dim < threshold, 0, 255).astype("uint8")



In [20]:
import os
import time
import datetime

def save_image(im, unicode, relative_dir="./"):
    """Save ``im`` as ``<relative_dir>/<unicode>-<id>.png`` without overwriting.

    Parameters
    ----------
    im : numpy array
        Pixel data, converted with ``Image.fromarray`` before saving.
    unicode : str
        Prefix of the file name (the part before the ``-``).
    relative_dir : str, default "./"
        Directory the image is written to. It is created if it does not exist.

    Notes
    -----
    ``<id>`` starts as the current time in milliseconds. If a file with that
    name already exists the id is incremented until the name is free, so two
    images saved within the same millisecond do not overwrite each other.
    Relies on ``Image`` (imported from PIL elsewhere in the notebook) being in
    scope when the function is called.
    """
    os.makedirs(relative_dir, exist_ok=True)

    im_id = int(time.time() * 1000)
    filename = f"{relative_dir}/{unicode}-{im_id}.png"

    # Test the full path directly: os.listdir yields bare file names, which
    # never compare equal to a path that includes the directory.
    while os.path.exists(filename):
        im_id += 1
        filename = f"{relative_dir}/{unicode}-{im_id}.png"

    im = Image.fromarray(im)
    im.save(filename)


In [35]:
# this cell processes the images and saves them
from PIL import Image
import matplotlib.pyplot as plt

total_size = len(metadata)

# Report progress about every 1% of the images. Clamp the step to at least 1:
# with fewer than 100 images total_size // 100 is 0 and the modulo below would
# raise ZeroDivisionError.
progress_step = max(1, total_size // 100)

for i, row in metadata.iterrows():
    if i % progress_step == 0:
        print(f"{i} out of {total_size} complete")

    # Copy the pixels out inside the context manager so the file handle is
    # closed straight away instead of staying open for the rest of the run.
    with Image.open(row["image"]) as source_image:
        pixels = np.asarray(source_image)

    # Invert the last channel so that fully "on" pixels become 0.
    # NOTE(review): presumably the last channel is alpha (RGBA PNGs) - confirm.
    raw_im = 255 - pixels[:, :, -1]
    bounded_img = finding_box(raw_im)
    downsample_image = downsample_and_binarise(bounded_img)
    assert downsample_image.shape == (64, 64)
    save_image(downsample_image, row["unicode"], relative_dir="./processed/Raven")

0 out of 1038 complete
10 out of 1038 complete
20 out of 1038 complete
30 out of 1038 complete
40 out of 1038 complete
50 out of 1038 complete
60 out of 1038 complete
70 out of 1038 complete
80 out of 1038 complete
90 out of 1038 complete
100 out of 1038 complete
110 out of 1038 complete
120 out of 1038 complete
130 out of 1038 complete
140 out of 1038 complete
150 out of 1038 complete
160 out of 1038 complete
170 out of 1038 complete
180 out of 1038 complete
190 out of 1038 complete
200 out of 1038 complete
210 out of 1038 complete
220 out of 1038 complete
230 out of 1038 complete
240 out of 1038 complete
250 out of 1038 complete
260 out of 1038 complete
270 out of 1038 complete
280 out of 1038 complete
290 out of 1038 complete
300 out of 1038 complete
310 out of 1038 complete
320 out of 1038 complete
330 out of 1038 complete
340 out of 1038 complete
350 out of 1038 complete
360 out of 1038 complete
370 out of 1038 complete
380 out of 1038 complete
390 out of 1038 complete
400 out of 