In [2]:
!wget -q https://git.io/J0fjL -O IAM_Words.zip 
!unzip -qq IAM_Words.zip 

!mkdir data 
!mkdir data/words 
!tar -xf IAM_Words/words.tgz -C data/words 
!mv IAM_Words/words.txt data

mkdir: cannot create directory ‘data’: File exists
mkdir: cannot create directory ‘data/words’: File exists


In [1]:
!head -18 data/words.txt

#--- words.txt ---------------------------------------------------------------#
#
# iam database word information
#
# format: a01-000u-00-00 ok 154 1 408 768 27 51 AT A
#
#     a01-000u-00-00  -> word id for line 00 in form a01-000u
#     ok              -> result of word segmentation
#                            ok: word was correctly
#                            er: segmentation of word can be bad
#
#     154             -> graylevel to binarize the line containing this word
#     1               -> number of components for this word
#     408 768 27 51   -> bounding box around this word in x,y,w,h format
#     AT              -> the grammatical tag for this word, see the
#                        file tagset.txt for an explanation
#     A               -> the transcription for this word
#


In [2]:
from tensorflow.keras.layers.experimental.preprocessing import StringLookup
from tensorflow import keras

import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import pandas as pd
from sklearn.metrics import classification_report
import optuna

np.random.seed(42)
tf.random.set_seed(42)
# hiding tensorflow warnings
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

# Data Splitting

- We also remove any images that are labeled "err"
- 80/10/10 split for training, validation, test

In [3]:
base_path = "data"
words_list = []

words = open(f"{base_path}/words.txt", "r").readlines()
for line in words:
    if line[0] == "#":
        continue
    if line.split(" ")[1] != "err":  # We don't need to deal with errored entries.
        words_list.append(line)

np.random.shuffle(words_list)

split_idx = int(0.9 * len(words_list))
train_samples = words_list[:split_idx]
test_samples = words_list[split_idx:]

val_split_idx = int(0.5 * len(test_samples))
validation_samples = test_samples[:val_split_idx]
test_samples = test_samples[val_split_idx:]

# make sure they all add up
assert len(words_list) == len(train_samples) + len(validation_samples) + len(
    test_samples
)

print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

Total training samples: 86810
Total validation samples: 4823
Total test samples: 4823


# Data Preprocessing
- Cleaning the data label that came from the words.txt file
- Building the vocabulary of characters (from training data only)

In [4]:
base_image_path = os.path.join(base_path, "words")

def get_image_paths_and_labels(samples):
    paths = []
    corrected_samples = []
    for (i, file_line) in enumerate(samples):
        line_split = file_line.strip()
        line_split = line_split.split(" ")

        # Each line split will have this format for the corresponding image:
        # part1/part1-part2/part1-part2-part3.png
        image_name = line_split[0]
        partI = image_name.split("-")[0]
        partII = image_name.split("-")[1]
        img_path = os.path.join(
            base_image_path, partI, partI + "-" + partII, image_name + ".png"
        )
        if os.path.getsize(img_path):
            paths.append(img_path)
            corrected_samples.append(file_line.split("\n")[0])

    return paths, corrected_samples


train_img_paths, train_labels = get_image_paths_and_labels(train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples)
test_img_paths, test_labels = get_image_paths_and_labels(test_samples)

"""
Then we prepare the ground-truth labels.
"""

# Find maximum length and the size of the vocabulary in the training data.
train_labels_cleaned = []
characters = set()
max_len = 0

for label in train_labels:
    label = label.split(" ")[-1].strip()
    for char in label:
        characters.add(char)

    max_len = max(max_len, len(label))
    train_labels_cleaned.append(label)

characters = sorted(list(characters))

print("Maximum length: ", max_len)
print("Vocab size: ", len(characters))
train_labels_cleaned[:10]

Maximum length:  21
Vocab size:  78


['sure',
 'he',
 'during',
 'of',
 'booty',
 'gastronomy',
 'boy',
 'The',
 'and',
 'in']

In [5]:
def clean_labels(labels):
    cleaned_labels = []
    for label in labels:
        label = label.split(" ")[-1].strip()
        cleaned_labels.append(label)
    return cleaned_labels

validation_labels_cleaned = clean_labels(validation_labels)
test_labels_cleaned = clean_labels(test_labels)

In [6]:
AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters.
num_to_char = StringLookup(
    vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True
)

Metal device set to: Apple M2


In [7]:
def distortion_free_resize(image, img_size):
    """
    * Aspect ratio is preserved.
    * Content of the images is not affected.
    """
    w, h = img_size
    image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True)

    # Check tha amount of padding needed to be done.
    pad_height = h - tf.shape(image)[0]
    pad_width = w - tf.shape(image)[1]

    # Only necessary if you want to do same amount of padding on both sides.
    if pad_height % 2 != 0:
        height = pad_height // 2
        pad_height_top = height + 1
        pad_height_bottom = height
    else:
        pad_height_top = pad_height_bottom = pad_height // 2

    if pad_width % 2 != 0:
        width = pad_width // 2
        pad_width_left = width + 1
        pad_width_right = width
    else:
        pad_width_left = pad_width_right = pad_width // 2

    image = tf.pad(
        image,
        paddings=[
            [pad_height_top, pad_height_bottom],
            [pad_width_left, pad_width_right],
            [0, 0],
        ],
    )

    image = tf.transpose(image, perm=[1, 0, 2])
    image = tf.image.flip_left_right(image)
    return image


In [8]:
batch_size  = 30
padding_token = 99
image_width = 128
image_height = 32


def preprocess_image(image_path, img_size=(image_width, image_height)):
    image = tf.io.read_file(image_path)
    image = tf.image.decode_png(image, 1)
    image = distortion_free_resize(image, img_size)
    image = tf.cast(image, tf.float32) / 255.0
    return image


def vectorize_label(label):
    label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8"))
    length = tf.shape(label)[0]
    pad_amount = max_len - length
    label = tf.pad(label, paddings=[[0, pad_amount]], constant_values=padding_token)
    return label


def process_images_labels(image_path, label):
    image = preprocess_image(image_path)
    label = vectorize_label(label)
    return {"image": image, "label": label}


def prepare_dataset(image_paths, labels):
    dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)).map(
        process_images_labels, num_parallel_calls=AUTOTUNE
    )
    return dataset.batch(batch_size).cache().prefetch(AUTOTUNE)

### Running All Data through Preprocessing Steps

In [9]:
train_ds = prepare_dataset(train_img_paths, train_labels_cleaned)
validation_ds = prepare_dataset(validation_img_paths, validation_labels_cleaned)
test_ds = prepare_dataset(test_img_paths, test_labels_cleaned)

validation_images = []
validation_labels = []

for batch in validation_ds:
    validation_images.append(batch["image"])
    validation_labels.append(batch["label"])

# Optuna Experiments

### Modeling

In [10]:
class CTCLayer(keras.layers.Layer):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.loss_fn = keras.backend.ctc_batch_cost

    def call(self, y_true, y_pred):
        batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64")
        input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64")
        label_length = tf.cast(tf.shape(y_true)[1], dtype="int64")

        input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64")
        loss = self.loss_fn(y_true, y_pred, input_length, label_length)
        self.add_loss(loss)

        # At test time, just return the computed predictions.
        return y_pred


def create_model(trial):
    # Inputs to the model
    input_img = keras.Input(shape=(image_width, image_height, 1), name="image")
    labels = keras.layers.Input(name="label", shape=(None,))

    # First conv block.
    conv1_activation = trial.suggest_categorical("conv1_activation", ["relu", "tanh"])
    conv1_kernal_size = trial.suggest_int("conv1_kernal_size", 2, 4, step=1) 
    conv1_filters = trial.suggest_categorical("conv1_filters", [16, 32, 64, 128])
    x = keras.layers.Conv2D(
        conv1_filters,
        (conv1_kernal_size, conv1_kernal_size),
        activation=conv1_activation,
        kernel_initializer="he_normal",
        padding="same",
        name="Conv1",
    )(input_img)
    #max_pooling1 = trial.suggest_int("max_pooling1", 1, 3, step=1)
    x = keras.layers.MaxPooling2D((2, 2), name="pool1")(x)

    # Second conv block.
    conv2_activation = trial.suggest_categorical("conv2_activation", ["relu", "tanh"])
    conv2_filters = trial.suggest_categorical("conv1_filters", [16, 32, 64, 128])
    conv2_kernal_size = trial.suggest_int("conv2_kernal_size", 2, 4)
    x = keras.layers.Conv2D(
        conv2_filters,
        (conv2_kernal_size, conv2_kernal_size),
        activation=conv2_activation,
        kernel_initializer="he_normal",
        padding="same",
        name="Conv2",
    )(x)
    #max_pooling2 = trial.suggest_int("max_pooling2", 1, 3, step=1)
    x = keras.layers.MaxPooling2D((2, 2), name="pool2")(x)
    new_shape = ((image_width // 4), (image_height // 4) * conv2_filters)
    
    x = keras.layers.Reshape(target_shape=new_shape, name="reshape")(x)
    
    # Dense 1
    dense1_activation = trial.suggest_categorical("dense1_activation", ["relu", "tanh"])
    dense1_filters = trial.suggest_categorical("dense1_filters", [16, 32, 64, 128])
    x = keras.layers.Dense(dense1_filters, 
                           activation=dense1_activation, 
                           name="dense1")(x)
    dropout = trial.suggest_float("dropout", 0.15, 0.3)
    x = keras.layers.Dropout(dropout)(x)

    # RNNs.
    
    dropout1 = trial.suggest_float("dropout1", 0.15, 0.3)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(256, return_sequences=True, dropout=dropout1)
    )(x)
    
    dropout2 = trial.suggest_float("dropout2", 0.15, 0.3)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(128, return_sequences=True, dropout=dropout2)
    )(x)
    
    dropout3 = trial.suggest_float("dropout3", 0.15, 0.3)
    x = keras.layers.Bidirectional(
        keras.layers.LSTM(64, return_sequences=True, dropout=dropout3)
    )(x)
    

    # +2 is to account for the two special tokens introduced by the CTC loss.
    # The recommendation comes here: https://git.io/J0eXP.
    
    x = keras.layers.Dense(
        len(char_to_num.get_vocabulary()) + 2, 
        activation="softmax", 
        name="dense2"
    )(x)

    # Add CTC layer for calculating CTC loss at each step.
    output = CTCLayer(name="ctc_loss")(labels, x)

    # Define the model.
    model = keras.models.Model(
        inputs=[input_img, labels], outputs=output, name="handwriting_recognizer"
    )
    
    optimizer = create_optimizer(trial)
    model.compile(optimizer=optimizer)
    
    return model

### Optuna Optimizer Hyperparameter

In [11]:
def create_optimizer(trial):
    optimizer_name = trial.suggest_categorical("optimizer", ["RMSprop", "Adam", "SGD"])
    if optimizer_name == "RMSprop":
        lrn_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
        return keras.optimizers.RMSprop(learning_rate=lrn_rate)
    elif optimizer_name == "Adam":
        lrn_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
        return keras.optimizers.Adam(learning_rate=lrn_rate)
    else:
        lrn_rate = trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True)
        momentum = trial.suggest_float("sgd_opt_momentum", 1e-5, 1e-1, log=True)
        return keras.optimizers.SGD(learning_rate=lrn_rate, momentum=momentum)

### Running the Optuna Experiment

In [None]:
EPOCHS = 5
TRIALS = 50

def objective(trial):
    model = create_model(trial)
    model.fit(train_ds,
              validation_data=validation_ds,
              epochs=EPOCHS)
    score = model.evaluate(validation_ds, verbose=0)
    print(score)
    return score # loss


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=TRIALS)

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Loss: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-21 22:02:39,734][0m A new study created in memory with name: no-name-55fe4810-6cb4-4b29-9c4b-7c8fa7ce2fec[0m


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 22:46:17,088][0m Trial 0 finished with value: 10.699230194091797 and parameters: {'conv1_activation': 'relu', 'conv1_kernal_size': 3, 'conv1_filters': 128, 'conv2_activation': 'tanh', 'conv2_kernal_size': 3, 'dense1_activation': 'relu', 'dense1_filters': 64, 'dropout': 0.2588281899202205, 'dropout1': 0.24851540784072856, 'dropout2': 0.1957974403031273, 'dropout3': 0.18997362822273584, 'optimizer': 'RMSprop', 'learning_rate': 4.37198621103679e-05}. Best is trial 0 with value: 10.699230194091797.[0m


10.699230194091797
Epoch 1/5


In [None]:
EPOCHS = 5
TRIALS = 50

def objective(trial):
    model = create_model(trial)
    model.fit(train_ds,
              validation_data=validation_ds,
              epochs=EPOCHS)
    score = model.evaluate(validation_ds, verbose=0)
    print(score)
    return score # loss


study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=TRIALS)

print("Number of finished trials: ", len(study.trials))

print("Best trial:")
trial = study.best_trial

print("  Loss: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2022-11-21 13:05:54,598][0m A new study created in memory with name: no-name-9b99ab66-5ede-4d64-9271-0dc96be9fa64[0m


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 13:51:35,862][0m Trial 0 finished with value: 11.282990455627441 and parameters: {'conv1_activation': 'tanh', 'conv1_kernal_size': 3, 'conv1_filters': 128, 'conv2_activation': 'relu', 'conv2_kernal_size': 2, 'dense1_activation': 'tanh', 'dense1_filters': 16, 'dropout': 0.17025617515768346, 'dropout1': 0.24395370077337075, 'dropout2': 0.1678819040278219, 'dropout3': 0.17606772863563322, 'optimizer': 'RMSprop', 'learning_rate': 5.4050553487620996e-05}. Best is trial 0 with value: 11.282990455627441.[0m


11.282990455627441
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 14:36:19,505][0m Trial 1 finished with value: 13.2564058303833 and parameters: {'conv1_activation': 'tanh', 'conv1_kernal_size': 4, 'conv1_filters': 64, 'conv2_activation': 'tanh', 'conv2_kernal_size': 4, 'dense1_activation': 'tanh', 'dense1_filters': 32, 'dropout': 0.17573855933645482, 'dropout1': 0.293158037751315, 'dropout2': 0.1609573703067315, 'dropout3': 0.2512913196726221, 'optimizer': 'RMSprop', 'learning_rate': 0.014123409818068488}. Best is trial 0 with value: 11.282990455627441.[0m


13.2564058303833
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 15:19:07,503][0m Trial 2 finished with value: 5.262175559997559 and parameters: {'conv1_activation': 'tanh', 'conv1_kernal_size': 3, 'conv1_filters': 16, 'conv2_activation': 'tanh', 'conv2_kernal_size': 4, 'dense1_activation': 'relu', 'dense1_filters': 64, 'dropout': 0.2016789077244987, 'dropout1': 0.21986319623915873, 'dropout2': 0.17333798751459284, 'dropout3': 0.24849332369425925, 'optimizer': 'RMSprop', 'learning_rate': 0.003403590295929654}. Best is trial 2 with value: 5.262175559997559.[0m


5.262175559997559
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 16:06:37,729][0m Trial 3 finished with value: 13.154173851013184 and parameters: {'conv1_activation': 'tanh', 'conv1_kernal_size': 4, 'conv1_filters': 128, 'conv2_activation': 'tanh', 'conv2_kernal_size': 4, 'dense1_activation': 'tanh', 'dense1_filters': 32, 'dropout': 0.1855946587372339, 'dropout1': 0.15802481317927508, 'dropout2': 0.21579507510224188, 'dropout3': 0.28999575614135, 'optimizer': 'RMSprop', 'learning_rate': 1.2852743565354296e-05}. Best is trial 2 with value: 5.262175559997559.[0m


13.154173851013184
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 16:51:42,506][0m Trial 4 finished with value: 15.414863586425781 and parameters: {'conv1_activation': 'tanh', 'conv1_kernal_size': 4, 'conv1_filters': 128, 'conv2_activation': 'tanh', 'conv2_kernal_size': 3, 'dense1_activation': 'tanh', 'dense1_filters': 32, 'dropout': 0.24388552837006705, 'dropout1': 0.2339869343152106, 'dropout2': 0.15239959970940184, 'dropout3': 0.23114677100545816, 'optimizer': 'Adam', 'learning_rate': 0.023549062057632502}. Best is trial 2 with value: 5.262175559997559.[0m


15.414863586425781
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 17:36:58,085][0m Trial 5 finished with value: 14.709196090698242 and parameters: {'conv1_activation': 'relu', 'conv1_kernal_size': 2, 'conv1_filters': 128, 'conv2_activation': 'relu', 'conv2_kernal_size': 3, 'dense1_activation': 'relu', 'dense1_filters': 64, 'dropout': 0.29747031828362225, 'dropout1': 0.27438956728485125, 'dropout2': 0.23246832955701627, 'dropout3': 0.284303783937842, 'optimizer': 'Adam', 'learning_rate': 3.0951297051597176e-05}. Best is trial 2 with value: 5.262175559997559.[0m


14.709196090698242
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 18:19:55,271][0m Trial 6 finished with value: 13.934127807617188 and parameters: {'conv1_activation': 'tanh', 'conv1_kernal_size': 3, 'conv1_filters': 32, 'conv2_activation': 'relu', 'conv2_kernal_size': 3, 'dense1_activation': 'tanh', 'dense1_filters': 16, 'dropout': 0.23734796951289494, 'dropout1': 0.22722773832474186, 'dropout2': 0.2214243990711678, 'dropout3': 0.29548491517208475, 'optimizer': 'RMSprop', 'learning_rate': 1.1696320810261748e-05}. Best is trial 2 with value: 5.262175559997559.[0m


13.934127807617188
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 19:02:34,563][0m Trial 7 finished with value: 8.59115219116211 and parameters: {'conv1_activation': 'relu', 'conv1_kernal_size': 4, 'conv1_filters': 32, 'conv2_activation': 'relu', 'conv2_kernal_size': 4, 'dense1_activation': 'relu', 'dense1_filters': 32, 'dropout': 0.2376578003760425, 'dropout1': 0.21318774028985096, 'dropout2': 0.2410417312745997, 'dropout3': 0.2936370552020956, 'optimizer': 'Adam', 'learning_rate': 0.00039238005106890275}. Best is trial 2 with value: 5.262175559997559.[0m


8.59115219116211
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 19:44:45,708][0m Trial 8 finished with value: 10.374730110168457 and parameters: {'conv1_activation': 'relu', 'conv1_kernal_size': 3, 'conv1_filters': 16, 'conv2_activation': 'tanh', 'conv2_kernal_size': 2, 'dense1_activation': 'relu', 'dense1_filters': 64, 'dropout': 0.19523811712859238, 'dropout1': 0.2817981489223422, 'dropout2': 0.1892199189940514, 'dropout3': 0.1543085468240067, 'optimizer': 'Adam', 'learning_rate': 0.00012024157737004158}. Best is trial 2 with value: 5.262175559997559.[0m


10.374730110168457
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[32m[I 2022-11-21 20:28:22,077][0m Trial 9 finished with value: 3.33522367477417 and parameters: {'conv1_activation': 'relu', 'conv1_kernal_size': 2, 'conv1_filters': 64, 'conv2_activation': 'relu', 'conv2_kernal_size': 2, 'dense1_activation': 'tanh', 'dense1_filters': 128, 'dropout': 0.165444182763684, 'dropout1': 0.2929540339456813, 'dropout2': 0.1903758346041926, 'dropout3': 0.2652413191938886, 'optimizer': 'Adam', 'learning_rate': 0.0021336185815240276}. Best is trial 9 with value: 3.33522367477417.[0m


3.33522367477417
Epoch 1/5
