## Organizing the EMNIST Letters Image Set
***

#### Importing Auxiliary Packages
***

In [4]:
from torchvision.datasets import EMNIST
from PIL import Image
import numpy as np
import idx2numpy
import os

#### Downloading the Dataset
***

In [2]:
# fetch the "letters" split into the current folder (files end up under ./EMNIST)
EMNIST(root='.', split='letters', download=True)

Dataset EMNIST
    Number of datapoints: 124800
    Root location: .
    Split: Train

#### Organizing Images in Directories
***

In [3]:
# paths of the raw IDX files, relative to the folder the notebook runs in
train_images_files = 'EMNIST/raw/emnist-letters-train-images-idx3-ubyte'
train_labels_files = 'EMNIST/raw/emnist-letters-train-labels-idx1-ubyte'
test_images_files = 'EMNIST/raw/emnist-letters-test-images-idx3-ubyte'
test_labels_files = 'EMNIST/raw/emnist-letters-test-labels-idx1-ubyte'

# decode every IDX file into a numpy array
train_images_array = idx2numpy.convert_from_file(train_images_files)
train_labels_array = idx2numpy.convert_from_file(train_labels_files)
test_images_array = idx2numpy.convert_from_file(test_images_files)
test_labels_array = idx2numpy.convert_from_file(test_labels_files)

# images and labels of the same split must agree on their first dimension
print('== Data Dimension ==\n')
print(f'Train Images Shape: {train_images_array.shape}')
print(f'Train Labels Shape: {train_labels_array.shape}')
print(f'Test Images Shape: {test_images_array.shape}')
print(f'Test Labels Shape: {test_labels_array.shape}')

== Data Dimension ==

Train Images Shape: (124800, 28, 28)
Train Labels Shape: (124800,)
Test Images Shape: (20800, 28, 28)
Test Images Shape: (20800,)


In [4]:
def analyze_arrays(labels_array, extract, num_classes=26):
    '''
    prints the amount of images of each class and the overall total.

    Only labels in the range 1..num_classes are counted; any other value
    is left out of both the per-class lines and the total.

    Input:
        labels_array (array) --> array with all image labels.
        extract (string) --> name of the data extract shown in the report (e.g. 'Train').
        num_classes (int) --> amount of classes to report, numbered from 1
                              (default 26, one per letter).
    '''

    labels = np.asarray(labels_array)

    total = 0
    for num in range(1, num_classes + 1):
        # vectorised comparison instead of a Python-level scan of every label
        count = int(np.count_nonzero(labels == num))
        print(f'{extract} images class {num}: {count} images')
        total += count

    print(f'\n== {extract} total images: {total} images ==')

In [5]:
# per-class image count of the training labels
analyze_arrays(labels_array=train_labels_array, extract='Train')

Train images class 1: 4800 images
Train images class 2: 4800 images
Train images class 3: 4800 images
Train images class 4: 4800 images
Train images class 5: 4800 images
Train images class 6: 4800 images
Train images class 7: 4800 images
Train images class 8: 4800 images
Train images class 9: 4800 images
Train images class 10: 4800 images
Train images class 11: 4800 images
Train images class 12: 4800 images
Train images class 13: 4800 images
Train images class 14: 4800 images
Train images class 15: 4800 images
Train images class 16: 4800 images
Train images class 17: 4800 images
Train images class 18: 4800 images
Train images class 19: 4800 images
Train images class 20: 4800 images
Train images class 21: 4800 images
Train images class 22: 4800 images
Train images class 23: 4800 images
Train images class 24: 4800 images
Train images class 25: 4800 images
Train images class 26: 4800 images

== Train total images: 124800 images ==


In [6]:
# per-class image count of the test labels
analyze_arrays(labels_array=test_labels_array, extract='Test')

Test images class 1: 800 images
Test images class 2: 800 images
Test images class 3: 800 images
Test images class 4: 800 images
Test images class 5: 800 images
Test images class 6: 800 images
Test images class 7: 800 images
Test images class 8: 800 images
Test images class 9: 800 images
Test images class 10: 800 images
Test images class 11: 800 images
Test images class 12: 800 images
Test images class 13: 800 images
Test images class 14: 800 images
Test images class 15: 800 images
Test images class 16: 800 images
Test images class 17: 800 images
Test images class 18: 800 images
Test images class 19: 800 images
Test images class 20: 800 images
Test images class 21: 800 images
Test images class 22: 800 images
Test images class 23: 800 images
Test images class 24: 800 images
Test images class 25: 800 images
Test images class 26: 800 images

== Test total images: 20800 images ==


In [7]:
def array_to_png(image_array, image_dir):
    '''
    writes a single image array to disk as an image file.

    Input:
        image_array (array) --> pixel data of the image.
        image_dir (str) --> destination path including the file name (name.png).
    '''

    # a clockwise quarter turn followed by a left-right mirror, which swaps
    # the first two axes of the array before it is handed to PIL
    oriented_pixels = np.fliplr(np.rot90(image_array, k=-1))
    Image.fromarray(oriented_pixels).save(image_dir)

In [2]:
def make_directories(main_directory, sub_directories):
    '''
    builds the folder tree that will hold the dataset images.

    Input:
        main_directory (string) --> path of the top-level folder.
        sub_directories (iterable of strings) --> names of the folders created inside it.
    '''

    # folders that already exist are accepted as they are (exist_ok)
    os.makedirs(main_directory, exist_ok=True)

    child_paths = (os.path.join(main_directory, name) for name in sub_directories)
    for child_path in child_paths:
        os.makedirs(child_path, exist_ok=True)

In [9]:
def array_to_png_all_images(directory, sub_dir, labels_array, images_array):
    '''
    saves every labelled image as a png file inside the folder of its class.

    Each image is written to <directory>/<sub_dir[label - 1]>/<index>.png, where
    index is the position of the image in the arrays. Images whose label is not
    in 1..26 are skipped. The class folders are not created here.

    Input:
        directory (string) --> main directory (with or without a trailing separator).
        sub_dir (list of strings) --> sub directories indexed in order with classes.
        labels_array (array) --> array list containing all labels.
        images_array (array) --> array list containing all images.
    '''

    valid_labels = set(range(1, 27))

    # one pass over the labels instead of rescanning the whole array per class
    for index, label in enumerate(labels_array):
        if label not in valid_labels:
            continue

        # os.path.join keeps the path correct whether or not `directory`
        # ends with a separator
        image_path = os.path.join(directory, sub_dir[int(label) - 1], f'{index}.png')
        array_to_png(images_array[index], image_path)

In [10]:
# one folder per letter, in label order (label 1 -> 'a', ..., label 26 -> 'z')
sub_directories = list('abcdefghijklmnopqrstuvwxyz')

# folder trees for both extracts must exist before any image is written
make_directories(main_directory='EMNIST/original-data/train', sub_directories=sub_directories)
make_directories(main_directory='EMNIST/original-data/test', sub_directories=sub_directories)

# export the train images first, then the test images
array_to_png_all_images(directory='EMNIST/original-data/train/',
                        sub_dir=sub_directories,
                        labels_array=train_labels_array,
                        images_array=train_images_array)
array_to_png_all_images(directory='EMNIST/original-data/test/',
                        sub_dir=sub_directories,
                        labels_array=test_labels_array,
                        images_array=test_images_array)

In [5]:
def analyze_directory(directory, extract):
    '''
    prints the amount of entries found in each subdirectory and the overall total.

    Subdirectories are reported in alphabetical order. Entries of `directory`
    that are not directories themselves are ignored.

    Input:
        directory (string) --> main directory to be parsed (with or without
                               a trailing separator).
        extract (string) --> data extract type shown in the report (e.g. 'Train').
    '''

    total = 0
    # os.listdir gives no ordering guarantee, so sort for a stable report
    for label in sorted(os.listdir(directory)):
        class_path = os.path.join(directory, label)
        if not os.path.isdir(class_path):
            # stray files next to the class folders cannot be listed
            continue

        count = len(os.listdir(class_path))
        print(f'{extract} images in "{label}" directory: {count} images')
        total += count

    print(f'\n== {extract} total images: {total} images ==')

In [6]:
# check how many files were exported for each training class
analyze_directory(directory='EMNIST/original-data/train/', extract='Train')

Train images in "a" directory: 4800 images
Train images in "b" directory: 4800 images
Train images in "c" directory: 4800 images
Train images in "d" directory: 4800 images
Train images in "e" directory: 4800 images
Train images in "f" directory: 4800 images
Train images in "g" directory: 4800 images
Train images in "h" directory: 4800 images
Train images in "i" directory: 4800 images
Train images in "j" directory: 4800 images
Train images in "k" directory: 4800 images
Train images in "l" directory: 4800 images
Train images in "m" directory: 4800 images
Train images in "n" directory: 4800 images
Train images in "o" directory: 4800 images
Train images in "p" directory: 4800 images
Train images in "q" directory: 4800 images
Train images in "r" directory: 4800 images
Train images in "s" directory: 4800 images
Train images in "t" directory: 4800 images
Train images in "u" directory: 4800 images
Train images in "v" directory: 4800 images
Train images in "w" directory: 4800 images
Train image

In [7]:
# check how many files were exported for each test class
analyze_directory(directory='EMNIST/original-data/test/', extract='Test')

Test images in "a" directory: 800 images
Test images in "b" directory: 800 images
Test images in "c" directory: 800 images
Test images in "d" directory: 800 images
Test images in "e" directory: 800 images
Test images in "f" directory: 800 images
Test images in "g" directory: 800 images
Test images in "h" directory: 800 images
Test images in "i" directory: 800 images
Test images in "j" directory: 800 images
Test images in "k" directory: 800 images
Test images in "l" directory: 800 images
Test images in "m" directory: 800 images
Test images in "n" directory: 800 images
Test images in "o" directory: 800 images
Test images in "p" directory: 800 images
Test images in "q" directory: 800 images
Test images in "r" directory: 800 images
Test images in "s" directory: 800 images
Test images in "t" directory: 800 images
Test images in "u" directory: 800 images
Test images in "v" directory: 800 images
Test images in "w" directory: 800 images
Test images in "x" directory: 800 images
Test images in "

#### Creating the Directories to Store the Artificial Images
***

In [3]:
# one folder per letter, 'a' through 'z'
sub_directories = list('abcdefghijklmnopqrstuvwxyz')

# empty folder trees that will receive the artificially generated images
make_directories(main_directory='EMNIST/artificial-augmentation/train',
                 sub_directories=sub_directories)
make_directories(main_directory='EMNIST/artificial-augmentation/test',
                 sub_directories=sub_directories)