In [7]:
import os
import sys
from dotenv import load_dotenv

load_dotenv()
sys.path.append(os.environ.get('PATH_CUSTOM_MODULES'))

import augment_image
import data_prep

import pandas as pd

### Prepare all basic variable

In [8]:
path_source = os.environ.get('PATH_DATASET_DESTINATION')
scenario_names = ['scenario_2', 'scenario_3'] # scenario 1 is the original dataset
dataset_names = ['rimone', 'g1020', 'refuge', 'papila']
fold_names = ['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5']
labels_name = ['normal', 'glaukoma']
image_size = {'rimone': (300,300),
            'g1020': (240,300),
            'refuge': (300,300),
            'papila': (200,300)}

### Prepare the path source and destination

In [9]:
# merge path source and path destination
# for each dataset, scenario, and label
path_dataset_src = {}
path_dataset_aug = {}
path_dataset_merge = {}

for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            for label in labels_name:
                ## create the source path for training data
                path_dataset_src[f'{scenario}_'
                                + f'{dataset}_'
                                + f'{fold}_'
                                + label] = os.path.join(path_source,
                                                        scenario,
                                                        dataset,
                                                        fold,
                                                        'train',
                                                        label)
                ## create the destination path a.k.a. augmented path for training data
                path_dataset_aug[scenario + '_'
                                + dataset + '_'
                                + fold + '_'
                                + label] = os.path.join(path_source,
                                                        scenario,
                                                        dataset,
                                                        fold,
                                                        'train_augmented',
                                                        label)
                ## create the merge path for training data
                path_dataset_merge[f'{scenario}_'
                                    + f'{dataset}_'
                                    + f'{fold}_'
                                    + label] = os.path.join(path_source,
                                                            scenario,
                                                            dataset,
                                                            fold,
                                                            'train_merged',
                                                            label)
del scenario, dataset, fold, label

### Prepare the merged directory

In [17]:
# create the directory for the augmented dataset
directory_result = augment_image.create_directory(path_dict=path_dataset_merge)

## print the result
for key, values in directory_result.items():
    if key == 'Already Exists' and values != []:
        for value in values:
            print('Directory already exists:', value)
        del value
del key, values, directory_result

### Get the image list

In [11]:
# initiate the variables to store the file names
original_files = {}
augmented_files = {}
# get file for the original image
for key, value in path_dataset_src.items():
    original_files[key] = data_prep.get_file_names(path=value)
del key, value
# get file for the augmented image
for key, value in path_dataset_aug.items():
        augmented_files[key] = data_prep.get_file_names(path=value)
del key, value

### Validate the image list

In [12]:
# validate the categories
len(original_files) == len(augmented_files) == len(path_dataset_src) == len(path_dataset_aug) == len(path_dataset_merge)

True

In [13]:
# validate the images
for key in original_files.keys():
    if len(original_files[key]) != len(augmented_files[key]):
        print(f'{key} are not equal')

### Merge the original and augmented data

In [14]:
# create the dataframe to store the result
copy_result = {
    'image type': [],
    'id': [],
    'Already Exists': [],
    'Success': []
}

df_result = pd.DataFrame(copy_result)
del copy_result

df_result.head()

Unnamed: 0,image type,id,Already Exists,Success


In [27]:
# copy the original images
for key in path_dataset_src.keys():
    result = data_prep.copy_files(source_path=path_dataset_src[key],
                                destination_path=path_dataset_merge[key],
                                file_names=original_files[key])
    df_result.loc[len(df_result)] = ['original',
                                    key,
                                    len(result['Already Exists']),
                                    len(result['Success'])]

In [28]:
# copy the augmented images
for key in path_dataset_aug.keys():
    result = data_prep.copy_files(source_path=path_dataset_aug[key],
                                destination_path=path_dataset_merge[key],
                                file_names=augmented_files[key])
    df_result.loc[len(df_result)] = ['augmented',
                                    key,
                                    len(result['Already Exists']),
                                    len(result['Success'])]

In [32]:
# check the result
df_result.loc[df_result['Already Exists'] > 0]

Unnamed: 0,image type,id,Already Exists,Success


### Remove the source file

In [34]:
# merge the path with file name
## define the variable to store the files path
rm_files = {}
## merge the original image path
for key, value in path_dataset_src.items():
    rm_files[f'ori_{key}'] = [os.path.join(value,
                                        file) for file in original_files[key]]
del key, value
# merge the augmented image path
for key, value in path_dataset_aug.items():
    rm_files[f'aug_{key}'] = [os.path.join(value,
                                        file) for file in augmented_files[key]]
del key, value

In [33]:
# validate the length of the files
len(rm_files) == len(df_result)

True

In [38]:
for files in rm_files.values():
    result_status = augment_image.remove_file(files)

print('Remove all the files:',
        f'Files removed: {len(result_status["Success"])}',
        f'Files not removed: {len(result_status["Not Found"])}',
        sep='\n')

del files, result_status

Remove all the files:
Files removed: 0
Files not removed: 109


### Remove the source directory

In [39]:
# prepare the variable
rm_dir = []
for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            for data_type in ['train', 'train_augmented']:
                rm_dir.append(os.path.join(path_source,
                                        scenario,
                                        dataset,
                                        fold,
                                        data_type))

In [41]:
# remove the directory
result_status = augment_image.remove_dir(dir_path=rm_dir)

print('Remove all the directory:',
        f'Directory removed: {len(result_status["Success"])}',
        f'Directory not removed: {len(result_status["Not Found"])}',
        sep='\n')

Remove all the directory:
Directory removed: 80
Directory not removed: 0
