In [122]:
import os
import numpy as np
from PIL import Image, ImageOps, ImageFont, ImageDraw
import cv2
import matplotlib.pyplot as plt
import random


In [131]:
from utils import read_ngrams, synthetic_line

In [142]:
char_to_class_id = {
    'Alef': 0,
    'Ayin': 1,
    'Bet': 2,
    'Dalet': 3,
    'Gimel': 4,
    'He': 5,
    'Het': 6,
    'Kaf': 7,
    'Kaf-final': 8,
    'Lamed': 9,
    'Mem': 10,
    'Mem-medial': 11,
    'Nun-final': 12,
    'Nun-medial': 13,
    'Pe': 14,
    'Pe-final': 15,
    'Qof': 16,
    'Resh': 17,
    'Samekh': 18,
    'Shin': 19,
    'Taw': 20,
    'Tet': 21,
    'Tsadi-final': 22,
    'Tsadi-medial': 23,
    'Waw': 24,
    'Yod': 25,
    'Zayin': 26,
}

In [92]:

#crop+pad+resize
def image_padding_mid(image):
    #prject
    ink = np.all(image == 0, axis=2)
    vert_proj = np.sum(ink, axis = 1)
    hori_proj = np.sum(ink, axis = 0)

    top = np.argmax(vert_proj > 0)
    bottom = len(vert_proj) - np.argmax(vert_proj[::-1] > 0)
    left = np.argmax(hori_proj > 0)
    right = len(hori_proj) - np.argmax(hori_proj[::-1] > 0)

    #crop
    cropped = image[top:bottom, left:right]

    #pad to mid 1/3
    pad_height = cropped.shape[0]
    pad_width = cropped.shape[1]
    top_pad = np.full((pad_height, pad_width, 3), 255, dtype=np.uint8)
    bottom_pad = np.full((pad_height, pad_width, 3), 255, dtype=np.uint8)
    padded = np.vstack((top_pad, cropped, bottom_pad))

    #resize height to 75
    target_height = 75
    h, w = padded.shape[:2]
    aspect_ratio = w / h
    target_width = int(target_height * aspect_ratio)
    resized = cv2.resize(padded, (target_width, target_height), interpolation=cv2.INTER_AREA)

    return resized

def image_padding_tall(image):
    #prject
    ink = np.all(image == 0, axis=2)
    vert_proj = np.sum(ink, axis = 1)
    hori_proj = np.sum(ink, axis = 0)

    top = np.argmax(vert_proj > 0)
    bottom = len(vert_proj) - np.argmax(vert_proj[::-1] > 0)
    left = np.argmax(hori_proj > 0)
    right = len(hori_proj) - np.argmax(hori_proj[::-1] > 0)

    #crop
    cropped = image[top:bottom, left:right]

    #pad to tall with bottom 1/3
    pad_height = cropped.shape[0]
    pad_width = cropped.shape[1]
    pad_top = int(pad_height/8)
    pad_bottom = int(pad_height/2)
    top_pad = np.full((pad_top, pad_width, 3), 255, dtype=np.uint8)
    bottom_pad = np.full((pad_bottom, pad_width, 3), 255, dtype=np.uint8)
    padded = np.vstack((top_pad, cropped, bottom_pad))

    #resize height to 75
    target_height = 75
    h, w = padded.shape[:2]
    aspect_ratio = w / h
    target_width = int(target_height * aspect_ratio)
    resized = cv2.resize(padded, (target_width, target_height), interpolation=cv2.INTER_AREA)

    return resized

def image_padding_long(image):
    #prject
    ink = np.all(image == 0, axis=2)
    vert_proj = np.sum(ink, axis = 1)
    hori_proj = np.sum(ink, axis = 0)

    top = np.argmax(vert_proj > 0)
    bottom = len(vert_proj) - np.argmax(vert_proj[::-1] > 0)
    left = np.argmax(hori_proj > 0)
    right = len(hori_proj) - np.argmax(hori_proj[::-1] > 0)

    #crop
    cropped = image[top:bottom, left:right]

    #pad to tall with bottom 1/3
    pad_height = cropped.shape[0]
    pad_width = cropped.shape[1]
    pad_bottom = int(pad_height/8)
    pad_top = int(pad_height/2)
    top_pad = np.full((pad_top, pad_width, 3), 255, dtype=np.uint8)
    bottom_pad = np.full((pad_bottom, pad_width, 3), 255, dtype=np.uint8)
    padded = np.vstack((top_pad, cropped, bottom_pad))

    #resize height to 75
    target_height = 75
    h, w = padded.shape[:2]
    aspect_ratio = w / h
    target_width = int(target_height * aspect_ratio)
    resized = cv2.resize(padded, (target_width, target_height), interpolation=cv2.INTER_AREA)

    return resized

In [124]:
#loop through images
load_path = 'monkbrill2'
save_path = 'processed_monkbrill'
char_list_mid = [
    # 'Alef',
    # 'Ayin',
    # 'Bet',
    # 'Dalet',
    # 'Gimel',
    # 'He',
    # 'Het',
    # 'Kaf',
    # 'Mem',
    # 'Mem-medial',
    # 'Nun-medial',
    # 'Pe',
    'Resh',
    'Samekh',
    'Shin',
    'Taw',
    'Tet',
    'Tsadi-medial',
    'Waw',
    # 'Yod',
    # 'Zayin'
]
char_list_tall = [
    'Lamed'
]
char_list_long = [
    'Kaf-final',
    'Nun-final',
    'Pe-final',
    'Qof',
    'Tsadi-final'
]


In [125]:
# mid
os.makedirs(save_path, exist_ok=True)
for char in char_list_mid:
    input_dir = os.path.join(load_path, char)
    output_dir = os.path.join(save_path, char)
    os.makedirs(output_dir, exist_ok=True)

    # List image files
    image_files = [f for f in os.listdir(input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    for idx, filename in enumerate(sorted(image_files), 1):
        image_path = os.path.join(input_dir, filename)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load: {image_path}")
            continue

        processed = image_padding_mid(image)
        save_filename = f"{idx}.jpg"
        save_fullpath = os.path.join(output_dir, save_filename)
        cv2.imwrite(save_fullpath, processed)

        print(f"Saved: {save_fullpath}")

Saved: processed_monkbrill/Resh/1.jpg
Saved: processed_monkbrill/Resh/2.jpg
Saved: processed_monkbrill/Resh/3.jpg
Saved: processed_monkbrill/Resh/4.jpg
Saved: processed_monkbrill/Resh/5.jpg
Saved: processed_monkbrill/Resh/6.jpg
Saved: processed_monkbrill/Resh/7.jpg
Saved: processed_monkbrill/Resh/8.jpg
Saved: processed_monkbrill/Resh/9.jpg
Saved: processed_monkbrill/Resh/10.jpg
Saved: processed_monkbrill/Resh/11.jpg
Saved: processed_monkbrill/Resh/12.jpg
Saved: processed_monkbrill/Resh/13.jpg
Saved: processed_monkbrill/Resh/14.jpg
Saved: processed_monkbrill/Resh/15.jpg
Saved: processed_monkbrill/Resh/16.jpg
Saved: processed_monkbrill/Resh/17.jpg
Saved: processed_monkbrill/Resh/18.jpg
Saved: processed_monkbrill/Resh/19.jpg
Saved: processed_monkbrill/Resh/20.jpg
Saved: processed_monkbrill/Resh/21.jpg
Saved: processed_monkbrill/Resh/22.jpg
Saved: processed_monkbrill/Resh/23.jpg
Saved: processed_monkbrill/Resh/24.jpg
Saved: processed_monkbrill/Resh/25.jpg
Saved: processed_monkbrill/Resh/26

In [119]:
for char in char_list_tall:
    input_dir = os.path.join(load_path, char)
    output_dir = os.path.join(save_path, char)
    os.makedirs(output_dir, exist_ok=True)

    # List image files
    image_files = [f for f in os.listdir(input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    for idx, filename in enumerate(sorted(image_files), 1):
        image_path = os.path.join(input_dir, filename)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load: {image_path}")
            continue

        processed = image_padding_tall(image)
        save_filename = f"{idx}.jpg"
        save_fullpath = os.path.join(output_dir, save_filename)
        cv2.imwrite(save_fullpath, processed)

        print(f"Saved: {save_fullpath}")

Saved: processed_monkbrill/Lamed/1.jpg
Saved: processed_monkbrill/Lamed/2.jpg
Saved: processed_monkbrill/Lamed/3.jpg
Saved: processed_monkbrill/Lamed/4.jpg
Saved: processed_monkbrill/Lamed/5.jpg
Saved: processed_monkbrill/Lamed/6.jpg
Saved: processed_monkbrill/Lamed/7.jpg
Saved: processed_monkbrill/Lamed/8.jpg
Saved: processed_monkbrill/Lamed/9.jpg
Saved: processed_monkbrill/Lamed/10.jpg
Saved: processed_monkbrill/Lamed/11.jpg
Saved: processed_monkbrill/Lamed/12.jpg
Saved: processed_monkbrill/Lamed/13.jpg
Saved: processed_monkbrill/Lamed/14.jpg
Saved: processed_monkbrill/Lamed/15.jpg
Saved: processed_monkbrill/Lamed/16.jpg
Saved: processed_monkbrill/Lamed/17.jpg
Saved: processed_monkbrill/Lamed/18.jpg
Saved: processed_monkbrill/Lamed/19.jpg
Saved: processed_monkbrill/Lamed/20.jpg
Saved: processed_monkbrill/Lamed/21.jpg
Saved: processed_monkbrill/Lamed/22.jpg
Saved: processed_monkbrill/Lamed/23.jpg
Saved: processed_monkbrill/Lamed/24.jpg
Saved: processed_monkbrill/Lamed/25.jpg
Saved: pr

In [120]:
for char in char_list_long:
    input_dir = os.path.join(load_path, char)
    output_dir = os.path.join(save_path, char)
    os.makedirs(output_dir, exist_ok=True)

    # List image files
    image_files = [f for f in os.listdir(input_dir) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]

    for idx, filename in enumerate(sorted(image_files), 1):
        image_path = os.path.join(input_dir, filename)
        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load: {image_path}")
            continue

        processed = image_padding_long(image)
        save_filename = f"{idx}.jpg"
        save_fullpath = os.path.join(output_dir, save_filename)
        cv2.imwrite(save_fullpath, processed)

        print(f"Saved: {save_fullpath}")

Saved: processed_monkbrill/Kaf-final/1.jpg
Saved: processed_monkbrill/Kaf-final/2.jpg
Saved: processed_monkbrill/Kaf-final/3.jpg
Saved: processed_monkbrill/Kaf-final/4.jpg
Saved: processed_monkbrill/Kaf-final/5.jpg
Saved: processed_monkbrill/Kaf-final/6.jpg
Saved: processed_monkbrill/Kaf-final/7.jpg
Saved: processed_monkbrill/Kaf-final/8.jpg
Saved: processed_monkbrill/Kaf-final/9.jpg
Saved: processed_monkbrill/Kaf-final/10.jpg
Saved: processed_monkbrill/Nun-final/1.jpg
Saved: processed_monkbrill/Nun-final/2.jpg
Saved: processed_monkbrill/Nun-final/3.jpg
Saved: processed_monkbrill/Nun-final/4.jpg
Saved: processed_monkbrill/Nun-final/5.jpg
Saved: processed_monkbrill/Nun-final/6.jpg
Saved: processed_monkbrill/Nun-final/7.jpg
Saved: processed_monkbrill/Nun-final/8.jpg
Saved: processed_monkbrill/Nun-final/9.jpg
Saved: processed_monkbrill/Nun-final/10.jpg
Saved: processed_monkbrill/Nun-final/11.jpg
Saved: processed_monkbrill/Nun-final/12.jpg
Saved: processed_monkbrill/Nun-final/13.jpg
Saved:

In [143]:
#create ngrams
def create_line_image(text, char_img_dir, target_height=75, spacing=1):
    chars = text.replace('_', '')  #all characters, no underscores
    words = text.split(' ')
    images = []
    bboxes = []  #for YOLO (class_id, x_center, y_center, width, height)

    x_offset = 0
    y_offset = 0

    for word in words:
        word_chars = word.split('_')
        for ch in word_chars:
            char_folder = os.path.join(char_img_dir, ch)
            if not os.path.exists(char_folder):
                continue
            img_files = os.listdir(char_folder)
            if not img_files:
                continue
            img_path = os.path.join(char_folder, random.choice(img_files))
            img = cv2.imread(img_path)
            if img is None:
                continue

            class_id = char_to_class_id.get(ch, -1)  # -1 = unknown
            if class_id == -1:
                print(f"Unknown class_id for {ch}")
                continue
            h, w = img.shape[:2]
            images.append((img, (x_offset, w, class_id)))
            x_offset += w + spacing

        x_offset += spacing * 5  #space between words

    if not images:
        return None, []

    #line h and w
    max_h = max([img.shape[0] for img, _ in images])
    total_w = x_offset
    line_img = np.full((max_h, total_w, 3), 255, dtype=np.uint8)

    x = 0
    for img, (x_pos, w, class_id) in images:
        h = img.shape[0]
        line_img[0:h, x_pos:x_pos + w] = img


    #resize image to 75, keep ratio
    h, w = line_img.shape[:2]
    scale = target_height / h
    new_w = int(w * scale)
    resized = cv2.resize(line_img, (new_w, target_height), interpolation=cv2.INTER_AREA)

    #calculate bounding boxes
    bboxes = []
    x_offset = 0
    for img, (_, orig_w, class_id) in images:
        box_w = int(orig_w * scale)
        box_h = target_height
        x_center = (x_offset + box_w // 2) / new_w
        y_center = 0.5  #always centered vertically
        w_rel = box_w / new_w
        h_rel = 1.0
        bboxes.append((class_id, round(x_center, 6), round(y_center, 6), round(w_rel, 6), round(h_rel, 6)))
        x_offset += box_w + int(spacing * scale)

    return resized, bboxes

In [148]:
def create_line_image_tightbox(text, char_img_dir, char_to_class_id, target_height=75, spacing=1):
    words = text.split(' ')
    images = []
    x_offset = 0

    for word in words:
        word_chars = word.split('_')
        for ch in word_chars:
            char_folder = os.path.join(char_img_dir, ch)
            if not os.path.exists(char_folder):
                continue
            img_files = [f for f in os.listdir(char_folder) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
            if not img_files:
                continue

            img_path = os.path.join(char_folder, random.choice(img_files))
            img = cv2.imread(img_path)
            if img is None:
                continue

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            ys, xs = np.where(gray < 250)  # ink pixels
            if len(xs) == 0 or len(ys) == 0:
                continue

            class_id = char_to_class_id.get(ch, -1)
            if class_id == -1:
                print(f"Unknown class_id for {ch}")
                continue

            bbox_in_char = (xs.min(), ys.min(), xs.max(), ys.max())  # x_min, y_min, x_max, y_max

            images.append({
                "img": img,  # full padded image
                "x_offset": x_offset,
                "bbox_in_char": bbox_in_char,
                "class_id": class_id
            })

            x_offset += img.shape[1] + spacing  # width of the full image (with padding)

        x_offset += spacing * 5  # extra space between words

    if not images:
        return None, []

    max_h = target_height  # assume already padded and resized to 75 height
    total_w = x_offset
    line_img = np.full((max_h, total_w, 3), 255, dtype=np.uint8)

    bboxes = []

    for item in images:
        img = item["img"]
        x = item["x_offset"]
        h, w = img.shape[:2]

        # Paste full padded character image
        line_img[0:h, x:x + w] = img

        # Get ink-only bounding box inside the char image
        x_min, y_min, x_max, y_max = item["bbox_in_char"]

        # Convert to full-line coordinates
        abs_x_min = x + x_min
        abs_x_max = x + x_max
        abs_y_min = y_min
        abs_y_max = y_max

        box_w = abs_x_max - abs_x_min
        box_h = abs_y_max - abs_y_min
        x_center = (abs_x_min + box_w / 2) / total_w
        y_center = (abs_y_min + box_h / 2) / max_h
        w_rel = box_w / total_w
        h_rel = box_h / max_h

        bboxes.append((
            item["class_id"],
            round(x_center, 6),
            round(y_center, 6),
            round(w_rel, 6),
            round(h_rel, 6)
        ))

    return line_img, bboxes

In [151]:
def save_line_with_annotation_train(image, bboxes, save_dir, filename_base):
    image_subdir = os.path.join(save_dir, 'images', 'train')
    label_subdir = os.path.join(save_dir, 'labels', 'train')
    os.makedirs(image_subdir, exist_ok=True)
    os.makedirs(label_subdir, exist_ok=True)

    image_path = os.path.join(image_subdir, f"{filename_base}.jpg")
    label_path = os.path.join(label_subdir, f"{filename_base}.txt")

    cv2.imwrite(image_path, image)

    with open(label_path, 'w') as f:
        for bbox in bboxes:
            f.write(f"{' '.join(map(str, bbox))}\n")

In [155]:
def save_line_with_annotation_val(image, bboxes, save_dir, filename_base):
    image_subdir = os.path.join(save_dir, 'images', 'test')
    label_subdir = os.path.join(save_dir, 'labels', 'test')
    os.makedirs(image_subdir, exist_ok=True)
    os.makedirs(label_subdir, exist_ok=True)

    image_path = os.path.join(image_subdir, f"{filename_base}.jpg")
    label_path = os.path.join(label_subdir, f"{filename_base}.txt")

    cv2.imwrite(image_path, image)

    with open(label_path, 'w') as f:
        for bbox in bboxes:
            f.write(f"{' '.join(map(str, bbox))}\n")

In [152]:
n_grams = read_ngrams()
char_img_dir = 'processed_monkbrill'
output_dir = 'yolo_synthetic_line_pretrain'

for i in range(5000):  #generate how many images?
    line = synthetic_line(n_grams)
    image, bboxes = create_line_image_tightbox(line, char_img_dir, char_to_class_id)
    if image is not None and bboxes:
        save_line_with_annotation_train(image, bboxes, output_dir, f"line_{i}")
        print(f"Saved line_{i}.jpg with {len(bboxes)} boxes")

Saved line_0.jpg with 23 boxes
Saved line_1.jpg with 15 boxes
Saved line_2.jpg with 22 boxes
Saved line_3.jpg with 14 boxes
Saved line_4.jpg with 12 boxes
Saved line_5.jpg with 19 boxes
Saved line_6.jpg with 9 boxes
Saved line_7.jpg with 23 boxes
Saved line_8.jpg with 13 boxes
Saved line_9.jpg with 25 boxes
Saved line_10.jpg with 28 boxes
Saved line_11.jpg with 21 boxes
Saved line_12.jpg with 13 boxes
Saved line_13.jpg with 26 boxes
Saved line_14.jpg with 32 boxes
Saved line_15.jpg with 14 boxes
Saved line_16.jpg with 15 boxes
Saved line_17.jpg with 18 boxes
Saved line_18.jpg with 11 boxes
Saved line_19.jpg with 11 boxes
Saved line_20.jpg with 19 boxes
Saved line_21.jpg with 12 boxes
Saved line_22.jpg with 25 boxes
Saved line_23.jpg with 15 boxes
Saved line_24.jpg with 18 boxes
Saved line_25.jpg with 24 boxes
Saved line_26.jpg with 16 boxes
Saved line_27.jpg with 19 boxes
Saved line_28.jpg with 19 boxes
Saved line_29.jpg with 8 boxes
Saved line_30.jpg with 13 boxes
Saved line_31.jpg wi

In [156]:
for i in range(500):  #generate how many images?
    line = synthetic_line(n_grams)
    image, bboxes = create_line_image_tightbox(line, char_img_dir, char_to_class_id)
    if image is not None and bboxes:
        save_line_with_annotation_val(image, bboxes, output_dir, f"line_{i}")
        print(f"Saved line_{i}.jpg with {len(bboxes)} boxes")

Saved line_0.jpg with 7 boxes
Saved line_1.jpg with 26 boxes
Saved line_2.jpg with 7 boxes
Saved line_3.jpg with 31 boxes
Saved line_4.jpg with 21 boxes
Saved line_5.jpg with 23 boxes
Saved line_6.jpg with 29 boxes
Saved line_7.jpg with 8 boxes
Saved line_8.jpg with 13 boxes
Saved line_9.jpg with 18 boxes
Saved line_10.jpg with 29 boxes
Saved line_11.jpg with 29 boxes
Saved line_12.jpg with 21 boxes
Saved line_13.jpg with 29 boxes
Saved line_14.jpg with 23 boxes
Saved line_15.jpg with 18 boxes
Saved line_16.jpg with 16 boxes
Saved line_17.jpg with 6 boxes
Saved line_18.jpg with 10 boxes
Saved line_19.jpg with 7 boxes
Saved line_20.jpg with 23 boxes
Saved line_21.jpg with 17 boxes
Saved line_22.jpg with 11 boxes
Saved line_23.jpg with 12 boxes
Saved line_24.jpg with 22 boxes
Saved line_25.jpg with 33 boxes
Saved line_26.jpg with 25 boxes
Saved line_27.jpg with 16 boxes
Saved line_28.jpg with 31 boxes
Saved line_29.jpg with 21 boxes
Saved line_30.jpg with 8 boxes
Saved line_31.jpg with 8