# Imports

In [None]:
%load_ext autoreload

In [None]:
%autoreload
%matplotlib inline
from collections import Counter
from operator import itemgetter

import matplotlib.pyplot as plt
import numpy as np
import xml.etree.ElementTree as ET
from tqdm.auto import tqdm

from handwriting_generator.constants import DATA_DIR
from handwriting_generator.utils import (
    load_line_strokes,
    load_transcriptions,
    plot_strokes,
    filter_line_strokes_and_transcriptions,
    convert_stroke_set_to_array,
)

# Constants

In [None]:
# Dataset sub-folders under DATA_DIR: "ascii" is handed to load_transcriptions
# and "lineStrokes" to load_line_strokes in the cells below.
ascii_dir, line_strokes_dir = DATA_DIR / "ascii", DATA_DIR / "lineStrokes"

# Data

In [None]:
# Load the line strokes stored under line_strokes_dir. The result is used as a
# mapping below (.keys() / .values()), keyed by what later cells call `filename`.
line_strokes = load_line_strokes(line_strokes_dir, show_progress=True)

In [None]:
# Number of line-stroke entries loaded (before filtering).
len(line_strokes)

In [None]:
# Load the transcriptions stored under ascii_dir. The result is used as a
# mapping below and is indexed with the same keys as line_strokes.
transcriptions = load_transcriptions(ascii_dir, show_progress=True)

In [None]:
# Number of transcription entries loaded (before filtering).
len(transcriptions)

In [None]:
# Replace both mappings with their filtered counterparts.
# NOTE(review): presumably this keeps only entries that have both strokes and a
# transcription -- confirm against handwriting_generator.utils.
line_strokes, transcriptions = filter_line_strokes_and_transcriptions(
    line_strokes, transcriptions, show_progress=True
)

In [None]:
# Number of line-stroke entries remaining after filtering.
len(line_strokes)

In [None]:
# Number of transcription entries remaining after filtering.
len(transcriptions)

## Exploration

In [None]:
# Box plot of the length of every transcription.
_ = plt.boxplot(
    sorted((len(text) for text in transcriptions.values()), reverse=True)
)
_ = plt.title("Distribution of transcription length")

In [None]:
# Box plot of the total number of points per line, i.e. the lengths of all
# strokes of a line added together.
_ = plt.boxplot(
    sorted(
        (
            sum(len(stroke) for stroke in stroke_set)
            for stroke_set in line_strokes.values()
        ),
        reverse=True,
    )
)
_ = plt.title("Distribution of line stroke length")

Next, we will look at the distribution of characters in the different transcriptions

In [None]:
# Tally every character across all transcriptions.
all_characters = Counter(
    character for text in transcriptions.values() for character in text
)

# Show the tallies as a dict ordered from most to least frequent.
dict(all_characters.most_common())

As we can see clearly, the distribution of characters is imbalanced:

- There are more lowercase letters than uppercase letters => we will lowercase all the transcriptions
- Some characters appear far less frequently than others => we will treat the least frequent ones as unknown

After that, we randomly select a stroke set and a transcription and plot them

In [None]:
# Draw four random positions into the dataset. np.random.choice samples with
# replacement by default, so the same line may be picked more than once.
indices = np.random.choice(len(line_strokes), size=4)

# Materialise the keys once instead of rebuilding the tuple for every index.
filenames = tuple(line_strokes.keys())
sampled_filenames = [filenames[idx] for idx in indices]

# Strokes and transcription of each sampled line, in matching order.
stroke_set_list = [line_strokes[name] for name in sampled_filenames]
transcription_list = [transcriptions[name] for name in sampled_filenames]

In [None]:
# Draw each sampled line in its own panel of a 2x2 grid, one scatter call per
# stroke, titled with the matching transcription.
fig, axes = plt.subplots(2, 2)
panels = zip(stroke_set_list, transcription_list, axes.ravel())
for stroke_set, transcription, ax in panels:
    for strokes in stroke_set:
        xs = [int(point[0]) for point in strokes]
        ys = [int(point[1]) for point in strokes]
        ax.scatter(xs, ys, s=0.1)
    # Flip the y-axis so that larger y values are drawn lower in the panel.
    ax.invert_yaxis()
    ax.set_aspect("equal", adjustable="datalim")
    ax.set_title(transcription)
fig.tight_layout()

As can be seen in some of the above examples, there is a trend in the y-axis that should be
removed because it will interfere with the model's training.

In [None]:
# Plot the sampled lines again after removing the linear trend of the
# y coordinate over the point index.
fig, axes = plt.subplots(2, 2)

for stroke_set, transcription, ax in zip(
    stroke_set_list, transcription_list, axes.ravel()
):
    # y coordinate of every point of the line, strokes concatenated in order.
    all_y = np.array(
        [int(point[1]) for strokes in stroke_set for point in strokes]
    )

    # Fit a first-degree polynomial y ~ a * index + b and evaluate it at every
    # index; this evaluated line is the trend that gets subtracted.
    positions = np.arange(0, len(all_y))
    z = np.polyfit(positions, all_y, deg=1)
    y_trend = np.polyval(z, positions)
    detrended_y = all_y - y_trend

    # Walk the strokes again, slicing the flat detrended array back per stroke.
    start = 0
    for strokes in stroke_set:
        stop = start + len(strokes)
        x = [int(point[0]) for point in strokes]
        ax.scatter(x, detrended_y[start:stop], s=0.1)
        start = stop
    # Flip the y-axis so that larger y values are drawn lower in the panel.
    ax.invert_yaxis()
    ax.set_aspect("equal", adjustable="datalim")
    ax.set_title(transcription)
fig.tight_layout()

Two other things we should do are to reduce the scale of the x and y values and to replace them with their respective first-order differences.

In [None]:
# Apply the full pre-processing (detrend, rescale, first-order difference) to
# each sampled line and plot the reconstruction obtained by cumulative sum.
fig, axes = plt.subplots(2, 2)

for stroke_set, transcription, ax in zip(
    stroke_set_list, transcription_list, axes.ravel()
):
    # One row per point: column 0 = x, column 1 = y, column 2 = flag that is 1
    # on the last point of each stroke and 0 elsewhere.
    arr = np.zeros((sum(map(len, stroke_set)), 3))

    offset = 0
    for strokes in stroke_set:
        n_points = len(strokes)
        if n_points == 0:
            # Nothing to write; also avoids flagging an unrelated row below.
            continue
        for row, point in enumerate(strokes, start=offset):
            arr[row, 0] = int(point[0])
            arr[row, 1] = int(point[1])
        offset += n_points
        # Mark the final point of this stroke.
        arr[offset - 1, 2] = 1

    # Remove trend on the y-axis
    X = np.arange(0, len(arr))
    z = np.polyfit(X, arr[:, 1], deg=1)
    y_trend = np.polyval(z, X)
    arr[:, 1] -= y_trend
    # Normalize: x and y are divided by the same factor (the largest value
    # found in either column), so the aspect ratio is preserved.
    arr[:, :2] = arr[:, :2] / np.max(arr[:, :2])
    # Difference: prepend=0 keeps the row count and leaves the first row equal
    # to the first (scaled) point, so a cumulative sum restores the positions.
    arr[:, :2] = np.diff(arr[:, :2], prepend=0, axis=0)
    # Plot
    ax.scatter(np.cumsum(arr[:, 0]), np.cumsum(arr[:, 1]), s=0.1)
    ax.invert_yaxis()
    ax.set_aspect("equal", adjustable="datalim")
    ax.set_title(transcription)

fig.tight_layout()

Next, we put all of this pre-processing into a function to be able to reuse it:

In [None]:
# Convert every sampled stroke set with the reusable helper and collect the
# results in the same order as stroke_set_list.
# NOTE(review): convert_stroke_set_to_array presumably applies the
# pre-processing shown in the previous cell -- confirm in
# handwriting_generator.utils.
strokes_array_list = []
for stroke_set in stroke_set_list:
    strokes_array = convert_stroke_set_to_array(stroke_set)
    strokes_array_list.append(strokes_array)