# Global Imports

In [66]:
from PIL import Image 
from skimage.morphology import thin
from scipy.ndimage import zoom
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm
from scipy.ndimage import convolve


# Paths

In [67]:
# All dataset locations are resolved relative to the current working directory.
__filedir__: str = os.path.abspath(".")

# Input (raw) and output (processed) roots.
raw_dirpath: str = os.path.join(__filedir__, "raw")
processed_dirpath: str = os.path.join(__filedir__, "processed_128_blur")

# The two raw sub-datasets handled by the processing loops below.
apl_raw_dirpath: str = os.path.join(raw_dirpath, "apl")
ascii_raw_dirpath: str = os.path.join(raw_dirpath, "ascii")


# Constants

In [77]:
# Height, in pixels, that every processed image is rescaled to.
TARGET_HEIGHT: int = 128

# Cut-off applied once intensities have been rescaled to the 0-255 range;
# pixels strictly above it end up white in the saved image.
THRESHOLD: int = 127

# Gaussian blur strength used for the APL and the ASCII images respectively.
BLUR_SIGMA_APL: float = 3.0
BLUR_SIGMA_ASCII: float = 1.25

# When True, the intermediate stages of `process` are shown with matplotlib.
DISPLAY_IMAGES: bool = False

# Functions

In [78]:
def crop_to_bbox(
    image: Image.Image
) -> Image.Image:
    """
    Crops an image to the bounding box of its non-zero pixels.

    The bounding box is measured on a grayscale ('L') conversion of the
    input, while the crop itself is applied to the original image.

    Parameters:
        image (PIL.Image.Image): Input image.

    Returns:
        PIL.Image.Image: The cropped image, or an uncropped copy of the input
            when its grayscale conversion contains no non-zero pixel.
    """
    # getbbox() measures the non-zero region directly, so no separate
    # binarisation pass is needed before calling it.
    bbox = image.convert('L').getbbox()

    # getbbox() returns None for a completely empty image; unpacking that
    # value would raise a TypeError, so hand back the image unchanged instead.
    if bbox is None:
        return image.copy()

    return image.crop(bbox)


def blur_image(
    image: Image.Image, 
    sigma: float=1.0
) -> Image.Image:
    """
    Applies Gaussian blur to a grayscale PIL image and stretches the result
    to the full 0-255 range.

    Parameters:
        image (PIL.Image.Image): Grayscale input image.
        sigma (float): Standard deviation for the Gaussian kernel. Must be
            greater than zero.

    Returns:
        PIL.Image.Image: Blurred 8-bit image whose darkest pixel is 0 and
            brightest pixel is 255. A constant input has no contrast to
            stretch and yields an all-zero image.

    Raises:
        ValueError: If the image is not in mode 'L' or sigma is not positive.
    """
    if image.mode != "L":
        raise ValueError("Input image must be in grayscale mode ('L').")

    # A zero, negative or NaN sigma would produce an invalid kernel.
    if not sigma > 0:
        raise ValueError("sigma must be positive.")

    # Calculate kernel size to cover 2 standard deviations on each side
    kernel_size = int(np.ceil(2 * sigma * 2)) + 1

    # Create a normalised Gaussian kernel centred on zero
    def gaussian_kernel(size, sigma):
        ax = np.arange(-(size // 2), size // 2 + 1)
        xx, yy = np.meshgrid(ax, ax)
        kernel = np.exp(-(xx**2 + yy**2) / (2 * sigma**2))
        kernel /= np.sum(kernel)
        return kernel

    kernel = gaussian_kernel(kernel_size, sigma)

    # Convert the PIL image to a numpy array
    img_array = np.array(image, dtype=np.float32)

    # Apply the Gaussian kernel using convolution
    blurred_array = convolve(img_array, kernel, mode='reflect')

    # Stretch to 0-255. A constant image has max == min, which would turn the
    # stretch into 0/0 (NaN), so that case is mapped to zeros explicitly.
    value_min = blurred_array.min()
    value_range = blurred_array.max() - value_min
    if value_range > 0:
        blurred_array = (blurred_array - value_min) / value_range
        blurred_array = blurred_array * 255
    else:
        blurred_array = np.zeros_like(blurred_array)

    # Convert the blurred array back to a PIL image
    blurred_image = Image.fromarray(np.uint8(blurred_array))
    return blurred_image

def convert_to_zero_bg(
    image_np: np.ndarray
) -> np.ndarray:
    """
    Binarises an array so that the background becomes 0 and the rest 1.

    Parameters:
        image_np (np.ndarray): Input array; its [0][0] element is treated as
            a sample of the background.

    Returns:
        np.ndarray: uint8 array of 0s and 1s. Positive input values map to 1
            when the [0][0] element is 0; otherwise the mapping is inverted.
    """
    positive_mask = np.greater(image_np, 0).astype(np.uint8)

    # A zero top-left pixel means the background is already black.
    if image_np[0][0] == 0:
        return positive_mask

    # Otherwise the background is bright, so flip the mask.
    return 1 - positive_mask

def process(
    in_image_path: str,
    out_image_path: str,
    target_height: int = 128,
    blur_sigma: float = 1.0
) -> None:
    """
    Converts one raw image into a blurred, cropped, rescaled binary image
    and saves it.

    Steps: load as RGBA, binarise the red and alpha channels to a zero
    background and merge them, blur, crop to the non-zero bounding box,
    rescale so the height matches `target_height` (same factor on both
    axes), normalise to 0-255, threshold with the module-level THRESHOLD and
    save the result with values 0 and 255.

    Parameters:
        in_image_path (str): Path of the image to read.
        out_image_path (str): Path the processed image is written to. Its
            parent directory is created when missing.
        target_height (int): Height in pixels of the rescaled image.
        blur_sigma (float): Standard deviation passed to `blur_image`.

    Returns:
        None
    """

    def show_stage(stage) -> None:
        # Optional visual debugging, controlled by the module-level flag.
        if DISPLAY_IMAGES:
            plt.imshow(stage)
            plt.show()

    out_dirpath: str = os.path.dirname(out_image_path)

    # dirname is empty when the output path has no directory component, and
    # os.makedirs("") would fail; exist_ok avoids the check-then-create race.
    if out_dirpath:
        os.makedirs(out_dirpath, exist_ok=True)

    # The context manager releases the file handle once the pixel data has
    # been converted into the independent RGBA image.
    with Image.open(in_image_path) as source_image:
        pil_image: Image.Image = source_image.convert("RGBA")

    image_np_all_channel: np.ndarray = np.asarray(pil_image)

    # Binarise the red and the alpha channel separately, then merge them so a
    # pixel is foreground when either channel marks it as such.
    image_np_r: np.ndarray = convert_to_zero_bg(image_np_all_channel[:, :, 0])
    image_np_a: np.ndarray = convert_to_zero_bg(image_np_all_channel[:, :, 3])
    image_np: np.ndarray = np.maximum(image_np_r, image_np_a)

    pil_image_binary: Image.Image = Image.fromarray(image_np)
    show_stage(pil_image_binary)

    pil_image_blurred: Image.Image = blur_image(
        pil_image_binary,
        blur_sigma
    )
    show_stage(pil_image_blurred)

    pil_image_cropped: Image.Image = crop_to_bbox(
        pil_image_blurred
    )
    show_stage(pil_image_cropped)

    image_cropped_np: np.ndarray = np.asarray(pil_image_cropped)

    # One factor for both axes keeps the aspect ratio while the height is
    # brought to target_height.
    scale: float = target_height / image_cropped_np.shape[0]
    image_resized_np: np.ndarray = zoom(
        image_cropped_np,
        (
            scale,
            scale
        )
    )
    show_stage(image_resized_np)

    # Normalise to 0-255 before thresholding. A constant image has no range
    # to stretch; treat it as all zeros instead of dividing zero by zero.
    value_min = image_resized_np.min()
    value_range = image_resized_np.max() - value_min
    if value_range > 0:
        normalised_np: np.ndarray = (image_resized_np - value_min) / value_range
    else:
        normalised_np = np.zeros(image_resized_np.shape, dtype=np.float64)

    binary_np: np.ndarray = np.uint8(normalised_np * 255 > THRESHOLD)
    show_stage(binary_np)

    # Foreground pixels (1) are stored as 255, everything else as 0.
    Image.fromarray(binary_np * np.uint8(255)).save(out_image_path)
    


# Processing Dataset

In [None]:

# Every entry of the raw APL directory is treated as one author's folder of
# PNG samples.
authors: list[str] = os.listdir(
    apl_raw_dirpath
)

author: str
for author in tqdm(
    iterable=authors,
    desc="Processing APL Authors...",
    total=len(authors)
):
    author_dirpath: str = os.path.join(
        apl_raw_dirpath,
        author
    )

    # os.listdir also returns plain files; scanning one would raise
    # NotADirectoryError, so only real directories are visited.
    if not os.path.isdir(author_dirpath):
        continue

    dir_entry: os.DirEntry
    # The context manager closes the directory iterator deterministically.
    with os.scandir(author_dirpath) as dir_entries:
        for dir_entry in dir_entries:
            filename: str = str(dir_entry.name)

            if not filename.endswith(".png"):
                continue

            # The part of the file stem before the first hyphen names the
            # output sub-directory. Splitting only once keeps stems with
            # extra hyphens (or none at all) from raising on unpacking.
            unicode: str = filename[:-4].split("-", 1)[0]

            out_dirpath: str = os.path.join(
                processed_dirpath,
                unicode
            )

            out_filepath: str = os.path.join(
                out_dirpath,
                filename
            )

            process(dir_entry.path, out_filepath, TARGET_HEIGHT, BLUR_SIGMA_APL)
        
            

In [79]:

# Every entry of the raw ASCII directory is treated as a folder of PNG
# samples.
unicode_dirnames: list[str] = os.listdir(
    ascii_raw_dirpath
)

unicode_dirname: str
for unicode_dirname in tqdm(
    iterable=unicode_dirnames,
    desc="Processing ASCII...",
    total=len(unicode_dirnames)
):
    unicode_dirpath: str = os.path.join(
        ascii_raw_dirpath,
        unicode_dirname
    )

    # os.listdir also returns plain files; scanning one would raise
    # NotADirectoryError, so only real directories are visited.
    if not os.path.isdir(unicode_dirpath):
        continue

    dir_entry: os.DirEntry
    # The context manager closes the directory iterator deterministically.
    with os.scandir(unicode_dirpath) as dir_entries:
        for dir_entry in dir_entries:
            filename: str = str(dir_entry.name)

            if not filename.endswith(".png"):
                continue

            # The part of the file stem before the first hyphen names the
            # output sub-directory. Splitting only once keeps stems with
            # extra hyphens (or none at all) from raising on unpacking.
            unicode: str = filename[:-4].split("-", 1)[0]

            out_dirpath: str = os.path.join(
                processed_dirpath,
                unicode
            )

            out_filepath: str = os.path.join(
                out_dirpath,
                filename
            )

            process(dir_entry.path, out_filepath, TARGET_HEIGHT, BLUR_SIGMA_ASCII)
        
        
        

Processing ASCII...: 100%|██████████| 94/94 [01:11<00:00,  1.32it/s]


In [None]:
# Spot check: blur the image behind the loop variable `dir_entry` left over
# from an earlier cell and inspect the brightest value of the result.
im: Image.Image = Image.open(dir_entry.path)
blur: Image.Image = blur_image(im, sigma=1.0)

blur_np: np.ndarray = np.asarray(blur)
np.max(blur_np)

np.uint8(255)