In [1]:
import numpy as np
import os
import tensorflow as tf

In [2]:
# Single source of truth for the random seed so NumPy and TensorFlow are
# always seeded with the same value.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)


In [3]:
# Module-level defaults.
# Directory holding the cropped images and their label files.
base_path = "data_zettel/cropped_images/"  # gets overwritten by config
# Placeholders; both are reassigned once the vocabulary has been computed
# from the labels.
max_len = 0
characters = set()

In [4]:
def read_data(path=None):
    """Collect (image path, label) pairs from a directory of images.

    Every ``*.jpg`` file that has a sibling ``<name>.txt`` file contributes one
    sample. The label is the first line of that text file with surrounding
    whitespace stripped. Images without a label file are skipped.

    Args:
        path: Directory to scan. When omitted, the module-level ``base_path``
            is looked up at call time, so a value assigned after this function
            was defined is honoured.

    Returns:
        A list of ``(img_path, label)`` tuples ordered by image file name.
    """
    directory = base_path if path is None else path
    data_list = []

    # os.listdir() returns entries in arbitrary order. Sorting keeps the sample
    # order stable between runs, which matters because the samples are not
    # shuffled here and callers may split them by position.
    image_files = sorted(f for f in os.listdir(directory) if f.endswith('.jpg'))

    for image_file in image_files:
        image_name = os.path.splitext(image_file)[0]

        img_path = os.path.join(directory, image_file)
        label_file = os.path.join(directory, f"{image_name}.txt")

        if os.path.exists(label_file):
            with open(label_file, "r", encoding="utf-8") as file:
                # Only the first line of the label file is used.
                line = file.readline().strip()
            data_list.append((img_path, line))

    # Shuffling was taken out for testing:
    # np.random.shuffle(data_list)
    return data_list

In [5]:
def get_vocabulary_length(data):
    """Derive the character vocabulary and the longest label length.

    Args:
        data: Iterable of 2-item entries whose second item is the label; the
            first item is ignored.

    Returns:
        A tuple ``(characters, max_len)``: the sorted list of distinct
        characters found in the labels, and the length of the longest label
        (``0`` when there are no labels).

    Both values are also printed.
    """
    labels = [text for _, text in data]

    # Union of every label's characters, in sorted order.
    vocabulary = sorted(set().union(*labels))
    longest = max((len(text) for text in labels), default=0)

    print("Maximum length: ", longest)
    print("Vocab size: ", len(vocabulary))
    return vocabulary, longest

In [6]:
def get_image_paths_and_labels(samples):
    """Split samples into parallel path and label lists.

    Args:
        samples: Iterable of ``(img_path, label)`` pairs.

    Returns:
        A tuple ``(x_img_paths, y_labels)`` of two equally long lists holding
        only the samples whose image path exists on disk, in input order.
    """
    # Drop samples whose image file is missing before separating the columns.
    present = [(path, text) for path, text in samples if os.path.exists(path)]

    x_img_paths = [path for path, _ in present]
    y_labels = [text for _, text in present]
    return x_img_paths, y_labels

In [7]:
def split_data(lines_list):
    """Split samples by position into train, test and validation parts.

    The first 90% of ``lines_list`` becomes the training part. The remainder
    is cut in half: its first half is the validation part and its second half
    is the test part.

    Args:
        lines_list: Sliceable sequence of samples.

    Returns:
        A tuple in the order ``(train_samples, test_samples,
        validation_samples)``.
    """
    train_end = int(0.9 * len(lines_list))
    train_samples, holdout = lines_list[:train_end], lines_list[train_end:]

    half = int(0.5 * len(holdout))
    # Note the return order: test (second half) comes before validation.
    return train_samples, holdout[half:], holdout[:half]

In [12]:
# Scan the image directory once; the original called read_data() twice, which
# listed the directory and read every label file a second time.
data = read_data()
# Shallow copy so that `data` and `all_data` remain independent list objects.
all_data = list(data)
characters, max_len = get_vocabulary_length(all_data)
train_samples, test_samples, validation_samples = split_data(data)


Maximum length:  47
Vocab size:  72


In [9]:
# Report how many samples ended up in each split.
print(f"Total train samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

Total train samples: 2114
Total validation samples: 117
Total test samples: 118


In [10]:

# Turn each split into parallel lists of image paths and labels.
x_train_img_paths, y_train_labels = get_image_paths_and_labels(train_samples)

x_val_img_paths, y_val_labels = get_image_paths_and_labels(validation_samples)

# Named like the train/validation variables for consistency.
x_test_img_paths, y_test_labels = get_image_paths_and_labels(test_samples)
# Original names kept as aliases for any code that still refers to them.
test_path, test_label = x_test_img_paths, y_test_labels


In [11]:
# These imports are deliberately placed here rather than at the top of the
# notebook: the load-data functions have to be called before they run.
# NOTE(review): presumably the handwriting modules depend on state that only
# exists after loading - confirm, and prefer passing it in explicitly.
# NOTE(review): the alias `cgi` is also the name of a Python standard-library
# module; consider a more specific alias.
import handwriting.tokenizer as tokenizer
import handwriting.custom_image_generator as cgi

# Disabled: preparing all data eagerly takes a very long time.
#x_train, y_train = tokenizer.prepare_data(x_train_img_paths, y_train_labels) 
#x_test, y_test = tokenizer.prepare_data(x_test_img_paths, y_test_labels)

# Disabled: generator-based alternative.
#train_generator = cgi.CustomImageGenerator(x_train_img_paths, y_train_labels, BATCH_SIZE, IMAGE_WIDTH, IMAGE_HEIGHT)

# Build the training and validation datasets.
# NOTE(review): (512, 64) and 32 look like the image size and batch size that
# the disabled lines refer to as IMAGE_WIDTH/IMAGE_HEIGHT and BATCH_SIZE -
# confirm against tokenizer.prepare_dataset.
# NOTE(review): the saved output of this cell reports a vocab size of 69, while
# the earlier get_vocabulary_length output reports 72 - verify which vocabulary
# the tokenizer actually uses.
train_ds = tokenizer.prepare_dataset(x_train_img_paths, y_train_labels, (512,64), 32)
val_ds = tokenizer.prepare_dataset(x_val_img_paths, y_val_labels,(512,64), 32)
# Disabled: test dataset and augmented training dataset.
#test_ds = tokenizer.prepare_dataset(x_test_img_paths, y_test_labels,(IMAGE_WIDTH,IMAGE_HEIGHT),BATCH_SIZE)
#aug_train_ds = tokenizer.prepare_augmented_dataset(x_train_img_paths, y_train_labels, BATCH_SIZE)

Maximum length:  47
Vocab size:  69
