## Organizing the CelebA Image Set
***

#### Importing Auxiliary Packages
***

In [1]:
from torchvision.datasets import CelebA
from PIL import Image
import numpy as np
import idx2numpy
import shutil
import os

#### Downloading the Dataset
***

* The dataset can be downloaded from the official website: [Large-scale CelebFaces Attributes (CelebA) Dataset](http://mmlab.ie.cuhk.edu.hk/projects/CelebA.html).

#### Organizing the Dataset Structure
***

In [2]:
def read_file(file_name):
    '''
    Loads a text file and returns its lines without surrounding whitespace.

    Input:
        file_name (str) --> path of the .txt file to read.
    Returns:
        lines (list) --> one string per line of the file, each stripped of
                         leading/trailing whitespace (newline included).
    '''

    with open(file_name, 'r') as text_file:
        return [raw_line.strip() for raw_line in text_file]

In [12]:
def return_sex(file_name, initial_line, images_number):
    '''
    Pairs every image file name with the gender label read from the attribute file.

    Input:
        file_name (str) --> relative path of the .txt attribute file.
        initial_line (int) --> index of the first line that holds image data.
        images_number (int) --> number of consecutive lines (images) to read.

    Returns:
        image_with_sex (list) --> list of [image_file, gender] pairs, where gender
                                  is either 'male' or 'female'.
    '''

    file_lines = read_file(file_name)

    image_with_sex = []
    for line_number in range(initial_line, initial_line + images_number):
        fields = file_lines[line_number].split()

        # Field 0 is the image file name; field 21 is read as the gender flag,
        # where '1' means male and any other value means female.
        # NOTE(review): presumably field 21 is the "Male" attribute column of
        # list_attr_celeba.txt -- confirm against the file header.
        gender = 'male' if fields[21] == '1' else 'female'

        image_with_sex.append([fields[0], gender])

    return image_with_sex

In [13]:
# One line is read per file found in the image folder, starting at line index 2
# (presumably the first two lines of the attribute file are headers -- confirm).
file_name = 'celeba/raw/list_attr_celeba.txt'
image_with_sex = return_sex(
    file_name,
    initial_line = 2,
    images_number = len(os.listdir('celeba/raw/img_align_celeba/')),
)

In [23]:
# The first 90% of the listed images form the training set and the remaining
# 10% the test set; the split is positional, no shuffling is applied.
split_index = int(len(image_with_sex) * 0.90)
train_images = image_with_sex[:split_index]
test_images = image_with_sex[split_index:]

#### Organizing Images in Directories
***

In [2]:
def make_directories(main_directory, sub_directories):
    '''
    Builds the folder layout used to store the dataset images.

    Existing folders are left untouched, so the call can be repeated safely.

    Input:
        main_directory (string) --> path of the main folder.
        sub_directories (list) --> names of the folders to create inside the main folder.
    '''

    # Created explicitly so the main folder exists even when no subdirectory is given.
    os.makedirs(main_directory, exist_ok = True)

    for folder_path in (os.path.join(main_directory, name) for name in sub_directories):
        os.makedirs(folder_path, exist_ok = True)

In [17]:
# Same pair of label folders is created under both the train and the test roots.
sub_directories = ['male', 'female']
for split_directory in ('celeba/original-data/train', 'celeba/original-data/test'):
    make_directories(split_directory, sub_directories)

In [30]:
def organize_images(original_image_dir, destination_dir, images_with_sex):
    '''
    Copies every image into the subdirectory named after its gender label.

    Paths are built with os.path.join, so both directory arguments work with or
    without a trailing path separator.

    Input: 
        original_image_dir (str) --> relative path where the original images are stored.
        destination_dir (str) --> relative path of the folder that contains one
                                  subdirectory per gender label; the images are copied there.
        images_with_sex (list) --> list containing the name of the images and the gender of the celebrities.
    '''

    for entry in images_with_sex:
        image_file, image_sex = entry[0], entry[1]

        # The destination is the label folder itself: shutil.copy keeps the
        # original file name when the destination is an existing directory.
        shutil.copy(os.path.join(original_image_dir, image_file),
                    os.path.join(destination_dir, image_sex))

In [31]:
# Both splits are copied from the same folder of raw images.
raw_images_dir = 'celeba/raw/img_align_celeba/'
organize_images(raw_images_dir, 'celeba/original-data/train/', train_images)
organize_images(raw_images_dir, 'celeba/original-data/test/', test_images)

In [2]:
def analyze_directory(directory, extract):
    '''
    Prints how many entries each subdirectory holds, followed by the overall total.

    Subdirectories are reported in alphabetical order; entries of the main
    directory that are not directories are ignored. The path may be given with
    or without a trailing separator.

    Input:
        directory (string) --> main directory to be parsed.
        extract (string) --> data extract type, used as a label in the report (e.g. 'Train').
    '''

    total = 0

    # os.listdir returns entries in arbitrary order; sorting keeps the report stable.
    for label in sorted(os.listdir(directory)):
        label_path = os.path.join(directory, label)

        # Stray files in the main directory are not label folders; listing them would fail.
        if not os.path.isdir(label_path):
            continue

        count = len(os.listdir(label_path))
        print(f'{extract} images in "{label}" directory: {count} images')
        total += count

    print(f'\n== {extract} total images: {total} images ==')

In [3]:
# Report how many images each label folder of the training split contains.
analyze_directory(directory = 'celeba/original-data/train/', extract = 'Train')

Train images in "female" directory: 105746 images
Train images in "male" directory: 76593 images

== Train total images: 182339 images ==


In [4]:
# Report how many images each label folder of the test split contains.
analyze_directory(directory = 'celeba/original-data/test/', extract = 'Test')

Test images in "female" directory: 12419 images
Test images in "male" directory: 7841 images

== Test total images: 20260 images ==


#### Creating the Directories to Store the Artificial Images
***

In [3]:
# Same pair of label folders is created under both the train and the test roots.
sub_directories = ['male', 'female']
for split_directory in ('celeba/artificial-augmentation/train',
                        'celeba/artificial-augmentation/test'):
    make_directories(split_directory, sub_directories)