In [168]:
from PIL import Image 
from skimage.morphology import thin
from scipy.ndimage import zoom
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

In [169]:
# Directory layout, anchored at the current working directory:
#   <cwd>/raw/apl
#   <cwd>/raw/ascii
#   <cwd>/processed
__filedir__: str = os.path.abspath(".")

raw_dirpath: str = os.path.join(__filedir__, "raw")
processed_dirpath: str = os.path.join(__filedir__, "processed")

# Sub-directories of the raw directory, one per source collection.
apl_dirpath: str = os.path.join(raw_dirpath, "apl")
ascii_dirpath: str = os.path.join(raw_dirpath, "ascii")


In [170]:
def crop_to_bbox(
    image: Image.Image
) -> Image.Image:
    """Crop ``image`` to the bounding box of its non-zero content.

    The content is located on a grayscale (mode ``"L"``) copy of the image;
    the crop itself is applied to the original ``image``.

    Args:
        image: Image to crop.

    Returns:
        The region of ``image`` enclosing every pixel whose grayscale value
        is non-zero.

    Raises:
        ValueError: If the image has no non-zero pixel, i.e. there is no
            bounding box to crop to.
    """
    gray_image: Image.Image = image.convert('L')

    # Binarize: non-zero pixels become 1, zero pixels stay 0.
    binary_image: Image.Image = gray_image.point(lambda p: p > 0)

    # getbbox() returns None when the image is entirely zero; unpacking that
    # directly would fail with an unhelpful TypeError.
    bbox = binary_image.getbbox()
    if bbox is None:
        raise ValueError(
            "Cannot crop to bounding box: image has no non-zero pixels"
        )

    return image.crop(bbox)

def convert_to_zero_bg(
    image_np: np.ndarray
) -> np.ndarray:
    """Binarize an array so that the background is 0 and the foreground is 1.

    Every value greater than zero is first mapped to 1 and everything else
    to 0. The element at ``image_np[0][0]`` is taken as a sample of the
    background: if it is non-zero, the binary mask is inverted.

    Args:
        image_np: Array to binarize. ``image_np[0][0]`` must be a scalar,
            so a 2-D array is expected.

    Returns:
        A ``uint8`` array of zeros and ones with the same shape as
        ``image_np``.
    """
    mask = (image_np > 0).astype(np.uint8)

    # Non-zero top-left element: the background is bright, so flip the mask.
    if image_np[0][0] != 0:
        return 1 - mask

    return mask

def process(
    in_image_path: str,
    out_image_path: str,
    target_height: int = 64
) -> None:
    """Binarize, crop and rescale one image, then save the result.

    Steps:
      1. Load the image as RGBA. The red and alpha channels are each turned
         into a 0/1 mask with ``convert_to_zero_bg`` and merged with an
         element-wise maximum.
      2. Crop the merged mask with ``crop_to_bbox``.
      3. Scale both axes by the same factor so that the height becomes
         ``target_height``.
      4. Write pixels equal to 1 as 255 and everything else as 0, and save
         the image to ``out_image_path``.

    Args:
        in_image_path: Path of the image to read.
        out_image_path: Path the processed image is written to. Its parent
            directory is created if it does not exist yet.
        target_height: Height in pixels of the saved image.

    Raises:
        Any exception raised while opening, cropping, rescaling or saving
        the image is propagated to the caller.
    """
    # The context manager closes the underlying file; convert() returns a
    # separate image, so it stays usable after the file is closed.
    with Image.open(in_image_path) as source_image:
        rgba_image: Image.Image = source_image.convert("RGBA")

    rgba_np: np.ndarray = np.asarray(rgba_image)

    red_mask_np: np.ndarray = convert_to_zero_bg(rgba_np[:, :, 0])
    alpha_mask_np: np.ndarray = convert_to_zero_bg(rgba_np[:, :, 3])

    # A pixel is foreground if either channel marks it as foreground.
    mask_np: np.ndarray = np.maximum(red_mask_np, alpha_mask_np)

    cropped_np: np.ndarray = np.asarray(
        crop_to_bbox(Image.fromarray(mask_np))
    )

    # Same factor on both axes keeps the aspect ratio.
    scale: float = target_height / cropped_np.shape[0]

    # NOTE(review): zoom() interpolates with an order-3 spline by default, so
    # the result of resampling a 0/1 mask is not guaranteed to stay 0/1; only
    # pixels that come out as exactly 1 survive the mapping below. Consider
    # order=0 if a strictly binary resample is wanted.
    resized_np: np.ndarray = zoom(cropped_np, (scale, scale))

    # Thinning is currently switched off; the original call was
    # thin(image_resized_np).
    out_image: Image.Image = Image.fromarray(resized_np).point(
        lambda x: 255 if x == 1 else 0
    )

    # Create the output directory only once there is something to write, so
    # unreadable inputs do not leave empty directories behind. exist_ok avoids
    # the check-then-create race; the guard covers a bare file name, for which
    # dirname() is "".
    out_dirpath: str = os.path.dirname(out_image_path)
    if out_dirpath:
        os.makedirs(out_dirpath, exist_ok=True)

    out_image.save(out_image_path)
    


In [171]:
#direntry: os.DirEntry
#for direntry in os.scandir(r"C:\Users\LeonBass\Documents\visual_studio_code\character-dataset\raw\apl\MrMaker"):    
#    process(direntry.path, str(direntry.path).replace("raw", "TEST"))

In [172]:

# Process every PNG found in <apl_dirpath>/<author>/ and write the result to
# <processed_dirpath>/<unicode>/<filename>. This pass is best-effort: a file
# that cannot be processed is recorded and skipped instead of stopping the run.

# Only directories are kept, so stray files in apl_dirpath cannot make
# os.scandir() fail.
authors: list[str] = [
    entry_name
    for entry_name in os.listdir(apl_dirpath)
    if os.path.isdir(os.path.join(apl_dirpath, entry_name))
]

# (path, reason) for every file that was skipped; summarised after the loop.
apl_failures: list[tuple[str, str]] = []

author: str
for author in tqdm(
    iterable=authors,
    desc="Processing APL...",
    total=len(authors)
):
    author_dirpath: str = os.path.join(
        apl_dirpath,
        author
    )

    dir_entry: os.DirEntry
    with os.scandir(author_dirpath) as dir_entries:
        for dir_entry in dir_entries:
            filename: str = str(dir_entry.name)

            if not filename.endswith(".png"):
                continue

            # The stem is expected to be "<unicode>-<hash>"; anything else is
            # reported rather than aborting the whole run.
            name_parts: list[str] = filename[:-4].split("-")
            if len(name_parts) != 2:
                apl_failures.append(
                    (
                        dir_entry.path,
                        "file name is not of the form '<unicode>-<hash>.png'"
                    )
                )
                continue

            unicode: str = name_parts[0]

            out_dirpath: str = os.path.join(
                processed_dirpath,
                unicode
            )

            out_filepath: str = os.path.join(
                out_dirpath,
                filename
            )

            try:
                process(dir_entry.path, out_filepath)
            except Exception as error:
                # Keep going, but remember what failed and why.
                apl_failures.append((dir_entry.path, repr(error)))

if apl_failures:
    print(f"Skipped {len(apl_failures)} APL file(s) that could not be processed.")
    # Show only the first few; the full list stays available in apl_failures.
    for failed_path, reason in apl_failures[:10]:
        print(f"  {failed_path}: {reason}")
            

Processing APL...: 100%|██████████| 11/11 [02:24<00:00, 13.14s/it]


In [173]:

# Process every PNG found in <ascii_dirpath>/<unicode_dirname>/ and write the
# result to <processed_dirpath>/<unicode>/<filename>. Unlike a best-effort
# pass, any failure here is propagated and stops the run.

# Only directories are kept, so stray files in ascii_dirpath cannot make
# os.scandir() fail.
unicode_dirnames: list[str] = [
    entry_name
    for entry_name in os.listdir(ascii_dirpath)
    if os.path.isdir(os.path.join(ascii_dirpath, entry_name))
]

unicode_dirname: str
for unicode_dirname in tqdm(
    iterable=unicode_dirnames,
    desc="Processing ASCII...",
    total=len(unicode_dirnames)
):
    unicode_dirpath: str = os.path.join(
        ascii_dirpath,
        unicode_dirname
    )

    dir_entry: os.DirEntry
    with os.scandir(unicode_dirpath) as dir_entries:
        for dir_entry in dir_entries:
            filename: str = str(dir_entry.name)

            if not filename.endswith(".png"):
                continue

            # The stem is expected to be "<unicode>-<hash>". Fail with a
            # message that names the offending file instead of a bare
            # tuple-unpacking error.
            name_parts: list[str] = filename[:-4].split("-")
            if len(name_parts) != 2:
                raise ValueError(
                    f"Unexpected file name {dir_entry.path!r}: "
                    "expected '<unicode>-<hash>.png'"
                )

            unicode: str = name_parts[0]

            out_dirpath: str = os.path.join(
                processed_dirpath,
                unicode
            )

            out_filepath: str = os.path.join(
                out_dirpath,
                filename
            )

            process(dir_entry.path, out_filepath)
        

Processing ASCII...: 100%|██████████| 94/94 [01:24<00:00,  1.11it/s]
