# Optical character recognition for document processing
## External solution
*Python version 3.10*

In [None]:
from pprint import pprint

import skimage
from easyocr import Reader

In [None]:
# Input files are named by index 0..9; index 4 is referenced as a PNG
# rather than a JPEG, so its JPEG name is skipped and the PNG name goes last.
files_collection = [f"{i}.jpeg" for i in range(10) if i != 4]
files_collection += ["4.png"]
# NOTE(review): the folder name begins with a space — confirm it matches the
# directory on disk.
files_location = " DataForOCR"

In [None]:
# Read every listed file from the data folder into an image array
images = [
    skimage.io.imread(f"{files_location}/{file_name}")
    for file_name in files_collection
]

In [None]:
# EasyOCR reader configured with the "ru" language list
new_reader = Reader(lang_list=["ru"])

In [None]:
# Run the reader over every loaded image; one result list per image
group_text = [
    new_reader.readtext(image=picture, batch_size=3, detail=0)
    for picture in images
]

In [None]:
# Flatten the per-image results and keep only predictions that are at least
# two characters long and purely alphabetic, then print them.

pprint([
    fragment
    for image_result in group_text
    for fragment in image_result
    if len(fragment) > 1 and fragment.isalpha()
])

## My Solution

In [None]:
from skimage.color import rgb2gray

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Character -> class index: the 32 consecutive code points starting at
# U+0410 get indices 1..32, and the space character gets index 0.
d = {chr(code_point): code_point - 1039
     for code_point in range(1040, 1072)}  # No letter "ё"
d[" "] = 0
# Class index -> character (reverse lookup of `d`)
inverted_d = dict(zip(d.values(), d.keys()))
len_full_name = 36  # let this be maximum full name length


def char_to_num(character_string, flag=True):
    """Encode a string as a list of class indices taken from ``d``.

    Args:
        character_string: text whose characters are all keys of ``d``.
        flag: when true, the result is right-padded with ``0`` up to
            ``len_full_name`` entries; longer results are left untouched.

    Returns:
        list of int class indices.

    Raises:
        KeyError: if a character is not present in ``d``.
    """
    codes = [d[symbol] for symbol in character_string]

    if flag:
        # A non-positive difference multiplies to an empty list: no padding.
        codes.extend([0] * (len_full_name - len(codes)))

    return codes


def num_to_char(character_numbers):
    """Decode class indices back to characters; unknown indices become "_"."""
    decoded = []
    for number in character_numbers:
        decoded.append(inverted_d.get(number, "_"))
    return decoded

In [None]:
def transform_features(features,
                       ind_slice=None, flag=None, reshape_flag=True,
                       gray_flag=True, normal_flag=False):
    """Collect an image dataset into one array and optionally post-process it.

    Args:
        features: object exposing ``as_numpy_iterator()`` that yields one
            image array per element (the callers pass ``tf.data`` datasets).
        ind_slice: split index used together with ``flag``.
        flag: ``"train"`` keeps samples ``[:ind_slice]``, ``"val"`` keeps
            samples ``[ind_slice:]``; any other value keeps every sample.
        reshape_flag: flatten each sample so the result has shape
            ``(n_samples, values_per_sample)``. Works for any sample rank
            (grayscale as well as multi-channel images).
        gray_flag: convert each kept sample with ``rgb2gray`` first.
        normal_flag: pass the result through a Keras ``Rescaling`` layer
            with scale ``1/255``.

    Returns:
        A ``float64`` numpy array, or the output of the ``Rescaling`` layer
        when ``normal_flag`` is set.
    """
    samples = list(features.as_numpy_iterator())

    # Select the requested part first so that only the kept samples are
    # converted to grayscale.
    if flag == "val":
        samples = samples[ind_slice:]
    elif flag == "train":
        samples = samples[:ind_slice]

    if gray_flag:
        samples = [rgb2gray(item) for item in samples]

    result = np.array(samples, dtype=np.float64)

    if reshape_flag:
        # -1 lets numpy infer the per-sample size, so 4-D colour batches are
        # flattened too instead of failing on a fixed 3-way shape unpack.
        result = result.reshape((result.shape[0], -1))

    if normal_flag:
        result = tf.keras.layers.Rescaling(scale=1.0 / 255.0)(result)

    return result

In [None]:
def reverse_flat_categories(array, initial_shape):
    """Un-flatten ``array`` and return the arg-max index along axis 2.

    Each sample is reshaped by a Keras ``Reshape`` layer to
    ``(initial_shape[1], initial_shape[2])`` before ``np.argmax`` is applied.
    """
    per_sample_shape = (initial_shape[1], initial_shape[2])
    unflattened = tf.keras.layers.Reshape(per_sample_shape)(array)

    return np.argmax(unflattened, axis=2)


def reverse_flat_continuous(array, initial_shape):
    """Un-flatten ``array`` and return the reshaped values unchanged.

    Each sample is reshaped by a Keras ``Reshape`` layer to
    ``(initial_shape[1], initial_shape[2])``.
    """
    per_sample_shape = (initial_shape[1], initial_shape[2])
    restore = tf.keras.layers.Reshape(per_sample_shape)

    return restore(array)

*Load data*

In [None]:
# Each line of a label file is one text label; it is encoded (and padded)
# with char_to_num so all rows have equal length.
# splitlines() is used instead of split("\n") so that a file ending with a
# newline does not contribute an extra empty line, which would be encoded as
# a spurious all-padding label row.
# NOTE(review): the files are opened with the platform default encoding —
# confirm it matches how the label files were saved.
with open("labels.txt") as f:
    dt = f.read().splitlines()
labels = np.array([char_to_num(line) for line in dt], dtype=np.int64)

with open("extra_labels.txt") as f:
    extra_dt = f.read().splitlines()
extra_labels = np.array([char_to_num(line) for line in extra_dt],
                        dtype=np.int64)

In [None]:
# Every image is resized to img_height x img_width on load
img_height, img_width = 224, 224

# Three image folders loaded with identical settings: no labels, no batching,
# no shuffling, bicubic resize.
entry_features = tf.keras.utils.image_dataset_from_directory(
    files_location,
    labels=None,
    batch_size=None,
    shuffle=False,
    image_size=(img_height, img_width),
    interpolation="bicubic",
)
extra_entry_features = tf.keras.utils.image_dataset_from_directory(
    "ExtraData",
    labels=None,
    batch_size=None,
    shuffle=False,
    image_size=(img_height, img_width),
    interpolation="bicubic",
)
extra_eval_data = tf.keras.utils.image_dataset_from_directory(
    "TestData",
    labels=None,
    batch_size=None,
    shuffle=False,
    image_size=(img_height, img_width),
    interpolation="bicubic",
)

In [None]:
# Chain both image datasets, keep the colour channels and the image shape,
# and rescale the values by 1/255.
combined_features_dataset = entry_features.concatenate(extra_entry_features)
tf_features = tf.convert_to_tensor(
    transform_features(
        combined_features_dataset,
        normal_flag=True,
        gray_flag=False,
        reshape_flag=False,
    )
)
# Labels are stacked in the same order as the datasets above
tf_labels = tf.convert_to_tensor(np.concatenate([labels, extra_labels]))

In [None]:
# One-hot encode every label entry into 33 classes (the size of `d`)
cat_label_group = tf.keras.utils.to_categorical(tf_labels, num_classes=33)
# Keep the un-flattened shape so predictions can be reshaped back later
cat_label_group_shape = cat_label_group.shape
# Flatten each sample's one-hot matrix into a single vector
flat = tf.keras.layers.Flatten()
flat_label_group = flat(cat_label_group)

*Training and plotting*

In [None]:
def train_model(model,
                feature_set, epochs, label_set,
                validation_split, batch_size=None):
    """Fit ``model`` and return its per-epoch training curves.

    Args:
        model: object whose ``fit`` accepts the Keras-style keyword
            arguments used below and returns a history object.
        feature_set: training inputs, passed as ``x``.
        epochs: number of epochs to train for.
        label_set: training targets, passed as ``y``.
        validation_split: forwarded to ``fit`` unchanged.
        batch_size: forwarded to ``fit`` unchanged.

    Returns:
        Tuple ``(epoch_indices, loss, val_loss, accuracy, val_accuracy)``;
        the four curves are pandas Series read from the fit history.
    """
    fit_result = model.fit(
        x=feature_set,
        y=label_set,
        batch_size=batch_size,
        epochs=epochs,
        shuffle=True,
        validation_split=validation_split,
    )

    metrics_frame = pd.DataFrame(fit_result.history)
    curve_names = ("loss", "val_loss", "accuracy", "val_accuracy")
    curves = [metrics_frame[name] for name in curve_names]

    return (fit_result.epoch, *curves)

In [None]:
def plot_the_loss_curve(epochs, mse1, mse2):
    """Plot training (``mse1``) and validation (``mse2``) loss per epoch."""
    plt.figure()
    for series, legend_name in ((mse1, "Training"), (mse2, "Validation")):
        plt.plot(epochs, series, label=legend_name)
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.legend()
    plt.show()


def plot_accuracy(epochs, acc1, acc2):
    """Plot training (``acc1``) and validation (``acc2``) accuracy per epoch."""
    plt.figure()
    for series, legend_name in ((acc1, "Training"), (acc2, "Validation")):
        plt.plot(epochs, series, label=legend_name)
    plt.xlabel("Epoch")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.show()

In [None]:
def construct_model(set_learning_rate, output_dims=60, inp_shape=(500, 500, 3)):
    """Build and compile a VGG-16-like convolutional network.

    The network is five stages of 3x3 ``tanh`` convolutions (each stage
    closed by a 2x2 max-pool), followed by two 4096-unit ``relu`` dense
    layers and a ``softmax`` output layer.

    Args:
        set_learning_rate: learning rate handed to the Adadelta optimizer.
            (Previously this argument was accepted but never used, so the
            optimizer always ran with its library default.)
        output_dims: number of units in the output layer.
        inp_shape: shape of one input image, ``(height, width, channels)``.

    Returns:
        The compiled ``tf.keras`` Sequential model.
    """
    # (filters, number of stacked conv layers) for every stage
    vgg_stages = ((64, 2), (128, 2), (256, 3), (512, 3), (512, 3))

    model = tf.keras.models.Sequential()

    # VGG-16 like
    # Only the very first layer declares the input shape.
    extra_kwargs = {"input_shape": inp_shape}
    for filters, conv_count in vgg_stages:
        for _ in range(conv_count):
            model.add(tf.keras.layers.Conv2D(filters=filters,
                                             kernel_size=(3, 3),
                                             padding="same",
                                             activation="tanh",
                                             **extra_kwargs))
            extra_kwargs = {}
        model.add(tf.keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2)))

    model.add(tf.keras.layers.Flatten())
    model.add(tf.keras.layers.Dense(units=4096, activation="relu"))
    model.add(tf.keras.layers.Dense(units=4096, activation="relu"))
    model.add(tf.keras.layers.Dense(units=output_dims,
                                    activation="softmax"))

    # NOTE(review): Accuracy() counts exact matches between predictions and
    # labels, which is unlikely to be informative for softmax probabilities —
    # consider CategoricalAccuracy(name="accuracy"). Left unchanged so the
    # reported "accuracy" history key keeps its current meaning.
    model.compile(
        optimizer=tf.keras.optimizers.Adadelta(learning_rate=set_learning_rate),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.Accuracy()],
    )

    return model

In [None]:
# Hyper-parameters of the training run
learning_rate = 0.9999
epochs = 30
batch_size = 9
validation_split = 0.1

# The output layer has one unit per entry of a flattened one-hot label
new_fit = construct_model(
    learning_rate,
    inp_shape=(img_height, img_width, 3),
    output_dims=flat_label_group.shape[1],
)
# NOTE: `epochs` is rebound here from the epoch count to the list of epoch
# indices returned by train_model (the plotting cells rely on that).
epochs, mse1, mse2, acc1, acc2 = train_model(
    model=new_fit,
    feature_set=tf_features,
    label_set=flat_label_group,
    epochs=epochs,
    validation_split=validation_split,
    batch_size=batch_size,
)

In [None]:
# Training vs. validation loss per epoch
plot_the_loss_curve(epochs, mse1, mse2)

In [None]:
# Training vs. validation accuracy per epoch
plot_accuracy(epochs, acc1, acc2)

In [None]:
# tf.keras.utils.plot_model(new_fit)

In [None]:
# Print the layer-by-layer summary of the trained model
new_fit.summary()

In [None]:
# Preprocess the "TestData" images with the same options as the training
# features (colour kept, shape kept, values rescaled), then predict.
out = new_fit.predict(
    x=tf.convert_to_tensor(
        transform_features(
            extra_eval_data,
            normal_flag=True,
            gray_flag=False,
            reshape_flag=False,
        )
    ),
)

In [None]:
# Turn each predicted row of class indices back into a string and print them
pprint([
    "".join(num_to_char(row))
    for row in reverse_flat_categories(out, cat_label_group_shape)
])