In [None]:
# import everything
from tensorflow import keras
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import cv2
import matplotlib.pyplot as plt

# Seed NumPy's and TensorFlow's global random generators so that the
# np.random.shuffle call used to shuffle the samples later in the notebook
# (and any TensorFlow randomness) is repeatable between runs.
np.random.seed(1)
tf.random.set_seed(1)

In [None]:
# Constants
# Root folder of the dataset; words.txt is read from this folder.
DATASET_BASE_PATH = "Dataset"
# Folder holding the word images, laid out as
# part1/part1-part2/part1-part2-part3.png below it.
BASE_IMAGE_PATH = os.path.join(DATASET_BASE_PATH, "words")
# Default target size (in pixels) used by resize_image for every image.
IMAGE_WIDTH = 80
IMAGE_HEIGHT = 80

In [None]:
# Get the lines that contain data about the words
def get_usefull_lines():
    """Return the lines of words.txt that describe usable word samples.

    Reads ``<DATASET_BASE_PATH>/words.txt`` and keeps every line that

    * does not start with ``#`` (header comments), and
    * has a second space-separated field different from ``"err"``.

    Lines that have fewer than two space-separated fields (for example a
    blank line at the end of the file) are skipped instead of raising
    ``IndexError``.

    Returns:
        list[str]: the kept lines, unchanged (trailing newline included)
        and in file order.
    """
    words_list = []

    # The context manager guarantees the file handle is closed, even if
    # reading fails part-way through.
    with open(f"{DATASET_BASE_PATH}/words.txt", "r") as words_file:
        for line in words_file:

            # Remove initial comments
            if line.startswith("#"):
                continue

            fields = line.split(" ")

            # Blank or truncated lines carry no status field to inspect
            if len(fields) < 2:
                continue

            # Remove errored entries
            if fields[1] != "err":
                words_list.append(line)

    return words_list

# Load the usable annotation lines once; the sample-selection cell further
# down calls get_usefull_lines() again and overwrites this variable.
words_list = get_usefull_lines()

In [None]:
def get_image_paths_and_labels(samples):
    """Map annotation lines to the image files they describe.

    The first space-separated field of each (stripped) line is the image
    name, formatted as ``part1-part2-part3``. The matching file lives at
    ``BASE_IMAGE_PATH/part1/part1-part2/part1-part2-part3.png``.

    Only images whose file size is non-zero are kept.

    Args:
        samples: iterable of annotation lines.

    Returns:
        tuple[list[str], list[str]]: the image paths and, in the same
        order, the annotation lines cut at their first newline.
    """
    paths, labels = [], []

    for file_line in samples:
        image_name = file_line.strip().split(" ")[0]

        # part1 and part2 name the two folder levels above the image file
        name_parts = image_name.split("-")
        first_part, second_part = name_parts[0], name_parts[1]
        img_path = os.path.join(
            BASE_IMAGE_PATH,
            first_part,
            f"{first_part}-{second_part}",
            f"{image_name}.png",
        )

        # Zero-byte files are left out of the sample
        if not os.path.getsize(img_path):
            continue

        paths.append(img_path)
        labels.append(file_line.split("\n")[0])

    return paths, labels


In [None]:
def clean_labels(labels):
    """Reduce each annotation line to its last space-separated field.

    Args:
        labels: iterable of annotation lines.

    Returns:
        numpy.ndarray: one entry per line, holding the last field with
        surrounding whitespace stripped.
    """
    return np.asarray([line.split(" ")[-1].strip() for line in labels])

In [None]:
def getWordsThatAppearBetween(listL, more_than, less_than):
    """List the distinct values whose number of occurrences is in a range.

    Args:
        listL: sequence of values to count.
        more_than: lowest accepted number of occurrences (inclusive).
        less_than: highest accepted number of occurrences (inclusive).

    Returns:
        list: the distinct values of ``listL``, in the sorted order produced
        by ``np.unique``, that occur between ``more_than`` and ``less_than``
        times.
    """
    words, frequencies = np.unique(listL, return_counts=True)

    selected = []
    for word, frequency in zip(words, frequencies):
        if more_than <= frequency <= less_than:
            selected.append(word)
    return selected

In [None]:
def get_final_lines(word_dictionary, listL):
    """Keep the annotation lines whose word belongs to ``word_dictionary``.

    The word of a line is its last space-separated field with surrounding
    whitespace stripped.

    Args:
        word_dictionary: iterable of accepted words (hashable values).
        listL: iterable of annotation lines.

    Returns:
        list[str]: the accepted lines, unchanged and in their original order.
    """
    # Build the lookup once: testing membership against a list for every
    # line costs O(len(listL) * len(word_dictionary)).
    allowed_words = set(word_dictionary)

    return [
        line for line in listL if line.split(" ")[-1].strip() in allowed_words
    ]

In [None]:
# Edit the required number of appearances here
# Re-read the usable annotation lines and reduce each one to its word.
words_list = get_usefull_lines()
all_cleaned_labels = clean_labels(words_list)

# decide on the sample
print("Total sample:", len(all_cleaned_labels))
print("Total unique words:", len(np.unique(all_cleaned_labels)))

# NOTE: both bounds of getWordsThatAppearBetween are inclusive, so the number
# printed here counts the words with 100 or more appearances.
print("Total words with more than 100 appearances:", 
    len(getWordsThatAppearBetween(all_cleaned_labels, 100, len(all_cleaned_labels))))
    
# Words kept in the dataset: those appearing between 110 and 150 times (inclusive).
word_dictionary = getWordsThatAppearBetween(all_cleaned_labels, 110, 150)
print("Total words with appearances between 110 and 150:", len(word_dictionary))
print(word_dictionary)

# Uncomment and edit the following line to change the required range for the
# number of appearances a word needs in order to be added to the dataset.
# word_dictionary = getWordsThatAppearBetween(all_cleaned_labels, more_than, less_than) 

# use only the words that appear the required amount of times
# (words_list is overwritten with the filtered lines)
words_list = get_final_lines(word_dictionary, words_list)
print("Final sample", len(words_list))

In [None]:
def split_dataset(words_list, train_fraction=0.9, validation_fraction=0.5):
    """Split samples, in order, into train, validation and test parts.

    The first ``train_fraction`` of ``words_list`` becomes the training set.
    The remainder is split again: its first ``validation_fraction`` becomes
    the validation set and whatever is left becomes the test set. No
    shuffling happens here; shuffle beforehand if a random split is wanted.

    With the defaults this is a 90% / 5% / 5% split.

    Args:
        words_list: sliceable sequence of samples.
        train_fraction: share of all samples used for training, in [0, 1].
        validation_fraction: share of the non-training samples used for
            validation, in [0, 1].

    Returns:
        tuple: ``(train_samples, validation_samples, test_samples)``.

    Raises:
        ValueError: if either fraction lies outside [0, 1].
    """
    if not 0 <= train_fraction <= 1 or not 0 <= validation_fraction <= 1:
        raise ValueError(
            "train_fraction and validation_fraction must be within [0, 1]"
        )

    split_idx = int(train_fraction * len(words_list))
    train_samples = words_list[:split_idx]
    held_out_samples = words_list[split_idx:]

    val_split_idx = int(validation_fraction * len(held_out_samples))
    validation_samples = held_out_samples[:val_split_idx]
    test_samples = held_out_samples[val_split_idx:]

    # Sanity check: the three parts together cover every sample exactly once
    assert len(words_list) == len(train_samples) + len(validation_samples) + len(
        test_samples
    )

    return train_samples, validation_samples, test_samples

In [None]:
# shuffle the lines
# (np.random.shuffle works in place, so re-running this cell without
# re-running the sample-selection cell shuffles the already shuffled list)
np.random.shuffle(words_list)

# split the dataset
train_samples, validation_samples, test_samples = split_dataset(words_list)

# get labels and image paths
train_img_paths, train_labels = get_image_paths_and_labels(train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples)
test_img_paths, test_labels = get_image_paths_and_labels(test_samples)

# print statistics
# (these count annotation lines; get_image_paths_and_labels drops zero-byte
# images, so the path/label lists can be shorter)
print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")

# get the cleaned labels in arrays
train_labels = clean_labels(train_labels)
validation_labels = clean_labels(validation_labels)
test_labels = clean_labels(test_labels)

# save the data up to now
labels_and_paths_location = 'labels_and_paths/'
# np.save does not create missing folders; make sure the target exists so
# the cell also works on a fresh checkout
os.makedirs(labels_and_paths_location, exist_ok=True)
# labels
np.save(labels_and_paths_location + 'train_labels.npy', train_labels)
np.save(labels_and_paths_location + 'validation_labels.npy', validation_labels)
np.save(labels_and_paths_location + 'test_labels.npy', test_labels)
# paths
np.save(labels_and_paths_location + 'train_img_paths.npy', train_img_paths)
np.save(labels_and_paths_location + 'validation_img_paths.npy', validation_img_paths)
np.save(labels_and_paths_location + 'test_img_paths.npy', test_img_paths)

# the sorted, distinct training words
model_dictionary = np.unique(train_labels)
np.save(labels_and_paths_location + 'model_dictionary.npy', model_dictionary)

In [None]:
# Start from here if you want to have the same dataset as me
# Reads back the arrays written by the previous cell, so the shuffle/split
# does not have to be repeated.
# read the data
# labels
labels_and_paths_location = 'labels_and_paths/'
train_labels = np.load(labels_and_paths_location + 'train_labels.npy')
validation_labels = np.load(labels_and_paths_location + 'validation_labels.npy')
test_labels = np.load(labels_and_paths_location + 'test_labels.npy')
# paths
train_img_paths = np.load(labels_and_paths_location + 'train_img_paths.npy')
validation_img_paths = np.load(labels_and_paths_location + 'validation_img_paths.npy')
test_img_paths = np.load(labels_and_paths_location + 'test_img_paths.npy')

# sorted, distinct words of the training labels (saved as np.unique(train_labels))
model_dictionary = np.load(labels_and_paths_location + 'model_dictionary.npy')

In [None]:
# resize the image without distorting it
def resize_image(img, size=(IMAGE_WIDTH, IMAGE_HEIGHT)):
    """Resize ``img`` to ``size``, padding it to a square first.

    A non-square image is centred on a white square canvas whose side equals
    the longer image edge, so the content keeps its aspect ratio; the canvas
    is then resized. A square image is resized directly.

    Args:
        img: image array of shape (height, width) or (height, width, channels).
        size: target ``(width, height)`` passed to ``cv2.resize`` as ``dsize``.

    Returns:
        The resized image as returned by ``cv2.resize``.
    """
    h, w = img.shape[:2]

    # The interpolation flag must be given by keyword: the third positional
    # parameter of cv2.resize is ``dst``, so a positional flag is not used as
    # the interpolation method.
    if h == w:
        return cv2.resize(img, size, interpolation=cv2.INTER_AREA)

    side = max(h, w)
    # Area interpolation when shrinking, cubic when enlarging
    interpolation = (
        cv2.INTER_AREA if side > (size[0] + size[1]) // 2 else cv2.INTER_CUBIC
    )
    x_pos = (side - w) // 2
    y_pos = (side - h) // 2

    # White padding: the largest value of an integer dtype (255 for uint8).
    # For other dtypes 1 is used -- assumes such images are scaled to [0, 1];
    # TODO confirm if float images are ever passed in.
    if np.issubdtype(img.dtype, np.integer):
        white = np.iinfo(img.dtype).max
    else:
        white = 1

    # img.shape[2:] is empty for 2-D images and (channels,) for 3-D ones
    canvas = np.full((side, side) + img.shape[2:], white, dtype=img.dtype)
    canvas[y_pos:y_pos + h, x_pos:x_pos + w] = img

    return cv2.resize(canvas, size, interpolation=interpolation)

In [None]:
# difference between original and resized pictures
# Loads training image number 10 as a grayscale array, prints its label and
# shows the image before and after resize_image.
img_array = cv2.imread(train_img_paths[10], cv2.IMREAD_GRAYSCALE)  # convert to array
print(train_labels[10])
plt.imshow(img_array, cmap='gray')  # graph it
plt.show()
img_array = resize_image(img_array)
plt.imshow(img_array, cmap='gray')  # graph it
plt.show()

In [None]:
def create_dataset(img_paths, labels, label_vocabulary=None):
    """Load, resize and normalise images and pair them with label indices.

    Each image is read as grayscale, passed through ``resize_image`` and then
    through ``tf.keras.utils.normalize(..., axis=1)``. Its label is replaced
    by the position of that label in the vocabulary.

    Args:
        img_paths: sequence of image file paths.
        labels: sequence of labels; ``labels[i]`` belongs to ``img_paths[i]``.
        label_vocabulary: optional sequence of all known labels. When omitted,
            ``np.unique(labels)`` is used, so the indices are positions in the
            sorted distinct labels of *this* call only. Indices produced by
            separate calls are therefore comparable only if the calls see the
            same set of labels; pass one shared vocabulary to guarantee that.

    Returns:
        list: one ``[image_array, label_index]`` pair per image path.

    Raises:
        ValueError: if an image cannot be read, or a label is missing from
            ``label_vocabulary``.
    """
    if label_vocabulary is None:
        label_vocabulary = np.unique(labels)

    # Build the label -> index table once instead of scanning the vocabulary
    # for every image. setdefault keeps the first position of a repeated label.
    index_of_label = {}
    for position, word in enumerate(np.asarray(label_vocabulary).tolist()):
        index_of_label.setdefault(word, position)

    dataset = []
    for index, path in enumerate(img_paths):
        # convert to array
        img_array = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if img_array is None:
            # cv2.imread returns None instead of raising for unreadable files
            raise ValueError(f"Could not read image: {path}")

        # resize to normalize data size
        img_array = resize_image(img_array)

        label = labels[index]
        if label not in index_of_label:
            raise ValueError(f"Label {label!r} is not part of the vocabulary")

        # normalise the pixel values of the image along axis 1
        img_array = tf.keras.utils.normalize(img_array, axis=1)
        dataset.append([img_array, index_of_label[label]])

    return dataset

In [None]:
# save the data into numpy objects to save time
def save_datasets(img_paths, labels, save_name):
    """Build a dataset with ``create_dataset`` and store it as ``.npy`` files.

    Writes ``my_dataset/X_<save_name>.npy`` (the images) and
    ``my_dataset/y_<save_name>.npy`` (the label indices), creating the
    ``my_dataset`` folder when it does not exist yet.

    Args:
        img_paths: sequence of image file paths.
        labels: sequence of labels matching ``img_paths``.
        save_name: suffix used in the two file names.
    """
    dataset = create_dataset(img_paths, labels)
    X = [features for features, _ in dataset]
    y = [label for _, label in dataset]

    # keras needs the features as (samples, rows, columns, channels).
    # resize_image asks cv2.resize for (IMAGE_WIDTH, IMAGE_HEIGHT), which
    # yields arrays with IMAGE_HEIGHT rows and IMAGE_WIDTH columns, so the
    # height comes first here.
    X = np.array(X).reshape(-1, IMAGE_HEIGHT, IMAGE_WIDTH, 1)

    # np.save does not create missing folders
    os.makedirs('my_dataset', exist_ok=True)
    file_name = save_name + '.npy'
    np.save('my_dataset/X_' + file_name, X)
    np.save('my_dataset/y_' + file_name, y)

In [None]:
# Build the three splits and store them as my_dataset/X_<name>.npy and
# my_dataset/y_<name>.npy.
# NOTE(review): create_dataset derives the label indices from the labels of
# each call separately, so the indices of the three splits only agree if
# every split contains the same set of words -- worth verifying.
save_datasets(train_img_paths, train_labels, 'train')
save_datasets(test_img_paths, test_labels, 'test')
save_datasets(validation_img_paths, validation_labels, 'validate')

In [None]:
# load the data from numpy objects
# X_* hold the images written by save_datasets (reshaped there to end with a
# single channel axis); y_* hold the matching label indices.
X_train = np.load('my_dataset/X_train.npy')
X_test = np.load('my_dataset/X_test.npy')
X_validate = np.load('my_dataset/X_validate.npy')

y_train = np.load('my_dataset/y_train.npy')
y_test = np.load('my_dataset/y_test.npy')
y_validate = np.load('my_dataset/y_validate.npy')

In [None]:
# check an object from the dataset
# y_train stores positions in the sorted distinct training labels, so indexing
# np.unique(train_labels) with it recovers the word of sample 10.
unique_train_labels = np.unique(train_labels)
print(unique_train_labels[[y_train[10]]])
plt.imshow(X_train[10], cmap='gray')  # graph it
plt.show()