## Organizing the MNIST Image Set
***

#### Importing Auxiliary Packages
***

In [2]:
from torchvision.datasets import MNIST
from PIL import Image
import idx2numpy
import os

#### Downloading the Dataset
***

In [3]:
# Fetch MNIST with '.' as root; the cells below read the raw IDX files from MNIST/raw/.
MNIST('.', download = True)

Dataset MNIST
    Number of datapoints: 60000
    Root location: .
    Split: Train

#### Organizing Images in Directories
***

In [4]:
# Raw IDX files of the train and test (t10k) splits.
train_images_files = 'MNIST/raw/train-images-idx3-ubyte'
train_labels_files = 'MNIST/raw/train-labels-idx1-ubyte'
test_images_files = 'MNIST/raw/t10k-images-idx3-ubyte'
test_labels_files = 'MNIST/raw/t10k-labels-idx1-ubyte'

# Decode every IDX file into an array.
train_images_array = idx2numpy.convert_from_file(train_images_files)
train_labels_array = idx2numpy.convert_from_file(train_labels_files)
test_images_array = idx2numpy.convert_from_file(test_images_files)
test_labels_array = idx2numpy.convert_from_file(test_labels_files)

# Report the shape of each array; every caption names the array it describes.
print('== Data Dimension ==\n')
print(f'Train Images Shape: {train_images_array.shape}')
print(f'Train Labels Shape: {train_labels_array.shape}')
print(f'Test Images Shape: {test_images_array.shape}')
print(f'Test Labels Shape: {test_labels_array.shape}')

== Data Dimension ==

Train Images Shape: (60000, 28, 28)
Train Labels Shape: (60000,)
Test Images Shape: (10000, 28, 28)
Test Images Shape: (10000,)


In [5]:
def analyze_arrays(labels_array, extract):
    '''
    prints how many labels belong to each digit class (0 to 9), followed by
    the total over those ten classes.

    Input:
        labels_array (array) --> array with all image labels.
        extract (string) --> name of the data extract, used as prefix of every printed line.
    '''

    # Labels outside 0-9 match no class, so they are not part of the total.
    total = 0
    for num in range(0, 10):
        # One fresh count per class; no manual reset and no shadowed built-in.
        count = sum(1 for label in labels_array if num == label)
        print(f'{extract} images class {num}: {count} images')
        total += count

    print(f'\n== {extract} total images: {total} images ==')

In [6]:
# Per-class label counts of the train split.
analyze_arrays(train_labels_array, 'Train')

Train images class 0: 5923 images
Train images class 1: 6742 images
Train images class 2: 5958 images
Train images class 3: 6131 images
Train images class 4: 5842 images
Train images class 5: 5421 images
Train images class 6: 5918 images
Train images class 7: 6265 images
Train images class 8: 5851 images
Train images class 9: 5949 images

== Train total images: 60000 images ==


In [7]:
# Per-class label counts of the test split.
analyze_arrays(test_labels_array, 'Test')

Test images class 0: 980 images
Test images class 1: 1135 images
Test images class 2: 1032 images
Test images class 3: 1010 images
Test images class 4: 982 images
Test images class 5: 892 images
Test images class 6: 958 images
Test images class 7: 1028 images
Test images class 8: 974 images
Test images class 9: 1009 images

== Test total images: 10000 images ==


In [8]:
def array_to_png(image_array, image_dir):
    '''
    writes one image array to disk as an image file.

    Input:
        image_array (array) --> pixel data handed to Image.fromarray.
        image_dir (str) --> destination path including the file name
                            (the callers in this notebook pass a name ending in .png).
    '''

    # Build the PIL image and store it in a single chained call.
    Image.fromarray(image_array).save(image_dir)

In [9]:
def make_directories(main_directory, sub_directories):
    '''
    builds the folder tree that will hold the dataset images.

    Input:
        main_directory (string) --> path of the main folder.
        sub_directories (list of strings) --> names of the folders created inside it.
    '''

    # The main folder is created on its own so it exists even when no
    # sub directory name is given.
    os.makedirs(main_directory, exist_ok = True)

    for folder_name in sub_directories:
        os.makedirs(os.path.join(main_directory, folder_name), exist_ok = True)

In [10]:
def array_to_png_all_images(directory, sub_dir, labels_array, images_array):
    '''
    saves every image as <index>.png inside the sub directory of its class.

    Input:
        directory (string) --> main directory, with or without a trailing separator.
        sub_dir (list of strings) --> sub directory names indexed in order with classes
                                      (sub_dir[0] receives class 0, and so on).
        labels_array (array) --> array list containing all labels.
        images_array (array) --> array list containing all images; images_array[i]
                                 is stored under the class given by labels_array[i].
    '''

    for num in range(0, 10):
        for index, label in enumerate(labels_array):
            if num == label:
                # os.path.join supplies the separators, so the path is correct
                # whether or not `directory` ends with one.
                image_path = os.path.join(directory, sub_dir[num], f'{index}.png')
                array_to_png(images_array[index], image_path)

In [11]:
# Folder name of each digit class; position i holds the name used for class i.
sub_directories = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
# Create the class folders of both splits before any image is written.
make_directories('MNIST/original-data/train', sub_directories)
make_directories('MNIST/original-data/test', sub_directories)

# Export every image as <index>.png into the folder of its class.
array_to_png_all_images('MNIST/original-data/train/', sub_directories, train_labels_array, train_images_array)
array_to_png_all_images('MNIST/original-data/test/', sub_directories, test_labels_array, test_images_array)

In [12]:
def analyze_directory(directory, extract):
    '''
    prints the number of entries found in each sub directory of `directory`,
    followed by the overall total.

    Input:
        directory (string) --> main directory to be parsed, with or without a trailing separator.
        extract (string) --> data extract type, used as prefix of every printed line.
    '''

    total = 0
    # os.listdir gives no ordering guarantee; sorting keeps the report stable.
    for label in sorted(os.listdir(directory)):
        sub_directory = os.path.join(directory, label)
        # A stray file next to the class folders cannot be listed; skip it.
        if not os.path.isdir(sub_directory):
            continue

        count = len(os.listdir(sub_directory))
        print(f'{extract} images in "{label}" directory: {count} images')
        total += count

    print(f'\n== {extract} total images: {total} images ==')

In [13]:
# Count the exported train files per class folder; compare with the label counts printed earlier.
analyze_directory('MNIST/original-data/train/', 'Train')

Train images in "eight" directory: 5851 images
Train images in "five" directory: 5421 images
Train images in "four" directory: 5842 images
Train images in "nine" directory: 5949 images
Train images in "one" directory: 6742 images
Train images in "seven" directory: 6265 images
Train images in "six" directory: 5918 images
Train images in "three" directory: 6131 images
Train images in "two" directory: 5958 images
Train images in "zero" directory: 5923 images

== Train total images: 60000 images ==


In [14]:
# Count the exported test files per class folder; compare with the label counts printed earlier.
analyze_directory('MNIST/original-data/test/', 'Test')

Train images in "eight" directory: 974 images
Train images in "five" directory: 892 images
Train images in "four" directory: 982 images
Train images in "nine" directory: 1009 images
Train images in "one" directory: 1135 images
Train images in "seven" directory: 1028 images
Train images in "six" directory: 958 images
Train images in "three" directory: 1010 images
Train images in "two" directory: 1032 images
Train images in "zero" directory: 980 images

== Test total images: 10000 images ==
