In [2]:
import os
import sys
from dotenv import load_dotenv

load_dotenv()
sys.path.append(os.environ.get('PATH_CUSTOM_MODULES'))

import augment_image

import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.1 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


### Prepare all basic variable

In [3]:
path_source = os.environ.get('PATH_DATASET_DESTINATION')
scenario_names = ['scenario_1', 'scenario_2', 'scenario_3']
dataset_names = ['rimone', 'g1020', 'refuge', 'papila']
fold_names = ['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5']
labels_name = ['normal', 'glaukoma']
image_size = {'rimone': (300,300),
            'g1020': (240,300),
            'refuge': (300,300),
            'papila': (200,300)}

### Prepare the path source and detination

In [12]:
# merge path source and path destination
# for each dataset, scenario, and label
path_dataset_src = {}
path_dataset_val_test_src = {}
path_dataset_aug = {}
path_dataset_val_test_dest = {}
## create the source path for training data
for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
                path_dataset_src[scenario + '_'
                                + dataset + '_'
                                + fold] = os.path.join(path_source,
                                                        scenario,
                                                        dataset,
                                                        fold,
                                                        'train')
del scenario, dataset, fold
## create the source path for validation and testing data
for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            for data_type in ['val', 'test']:
                path_dataset_val_test_src[scenario + '_'
                                        + dataset + '_'
                                        + fold + '_'
                                        + data_type] = os.path.join(path_source,
                                                                    scenario,
                                                                    dataset,
                                                                    fold,
                                                                    data_type)
del scenario, dataset, fold, data_type
## create the destination path a.k.a. augmented path for training data
for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            for label in labels_name:
                path_dataset_aug[scenario + '_'
                                + dataset + '_'
                                + fold + '_'
                                + label] = os.path.join(path_source,
                                                        scenario,
                                                        dataset,
                                                        fold,
                                                        'train_augmented',
                                                        label)
del scenario, dataset, fold, label
## create the destination path a.k.a. augmented path for validation and testing data
for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            for data_type in ['val', 'test']:
                for label in labels_name:
                    path_dataset_val_test_dest[scenario + '_'
                                                + dataset + '_'
                                                + fold + '_'
                                                + data_type + '_'
                                                + label] = os.path.join(path_source,
                                                                        scenario,
                                                                        dataset,
                                                                        fold,
                                                                        data_type,
                                                                        label)
del scenario, dataset, fold, data_type, label

### Prepare the image data generator for each scenario

In [5]:
# create the image data generator
## data generator for scenario 1 (without augmentation)
datagenerator_s1 = ImageDataGenerator(
    rescale=1./255
)
## data generator for scenario 2 (with augmentation)
datagenerator_s2 = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[1, 1.5]
)
## data generator for scenario 3 (with augmentation and clahe)
datagenerator_s3 = ImageDataGenerator(
    rescale=1./255,
    horizontal_flip=True,
    vertical_flip=True,
    brightness_range=[1, 1.5],
    preprocessing_function=augment_image.clahe_augmentation
)
## data generator for scenario 3(only clahe)
datagenerator_s3_val_test = ImageDataGenerator(
    rescale=1./255,
    preprocessing_function=augment_image.clahe_augmentation
)

### Prepare Augment Directory
only run this code once to be safe

In [11]:
# create the directory for the augmented dataset
directory_result = augment_image.create_directory(path_dict=path_dataset_aug)
## print the result
for key, values in directory_result.items():
    if key == 'Already Exists' and values != []:
        for value in values:
            print('Directory already exists:', value)
del key, values, value, directory_result

Directory already exists: scenario_1_rimone_fold_1_normal
Directory already exists: scenario_1_rimone_fold_1_glaukoma
Directory already exists: scenario_1_rimone_fold_2_normal
Directory already exists: scenario_1_rimone_fold_2_glaukoma
Directory already exists: scenario_1_rimone_fold_3_normal
Directory already exists: scenario_1_rimone_fold_3_glaukoma
Directory already exists: scenario_1_rimone_fold_4_normal
Directory already exists: scenario_1_rimone_fold_4_glaukoma
Directory already exists: scenario_1_rimone_fold_5_normal
Directory already exists: scenario_1_rimone_fold_5_glaukoma
Directory already exists: scenario_1_g1020_fold_1_normal
Directory already exists: scenario_1_g1020_fold_1_glaukoma
Directory already exists: scenario_1_g1020_fold_2_normal
Directory already exists: scenario_1_g1020_fold_2_glaukoma
Directory already exists: scenario_1_g1020_fold_3_normal
Directory already exists: scenario_1_g1020_fold_3_glaukoma
Directory already exists: scenario_1_g1020_fold_4_normal
Direc

### Scenario 2
**Condition**:
- basic augmentation, 
- rgb color
- no clahe
#### Import the image into data generator

In [12]:
# define the augmentation mode for scenario 2
s2_src = {}

In [13]:
# get the image using image data generator
## load image using image data generator
for dataset in dataset_names:
    for fold in fold_names:
        for label in labels_name:
            print(f'Loading {dataset} {fold} {label}...')
            s2_src[dataset + '_'
                    + fold + '_'
                    + label] = (datagenerator_s2.flow_from_directory(
                                path_dataset_src[scenario_names[1] + '_'
                                                + dataset + '_'
                                                + fold],
                                target_size=image_size[dataset],
                                class_mode='binary',
                                classes=[label],
                                shuffle=True,
                                seed=1915026018,
                                save_to_dir=path_dataset_aug[scenario_names[1] + '_'
                                                            + dataset + '_'
                                                            + fold + '_'
                                                            + label],
                                save_prefix=f's2_{dataset}_{fold}_{label}',
                                save_format='jpg'))
del dataset, fold, label

Loading rimone fold_1 normal...
Found 218 images belonging to 1 classes.
Loading rimone fold_1 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone fold_2 normal...
Found 218 images belonging to 1 classes.
Loading rimone fold_2 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone fold_3 normal...
Found 218 images belonging to 1 classes.
Loading rimone fold_3 glaukoma...


Found 121 images belonging to 1 classes.
Loading rimone fold_4 normal...
Found 219 images belonging to 1 classes.
Loading rimone fold_4 glaukoma...
Found 120 images belonging to 1 classes.
Loading rimone fold_5 normal...
Found 219 images belonging to 1 classes.
Loading rimone fold_5 glaukoma...
Found 120 images belonging to 1 classes.
Loading g1020 fold_1 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_1 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_2 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_2 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_3 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_3 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_4 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_4 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_5 normal...
Found 507 images belonging to 1 classes.
Loading

#### Generate the augmented image & saved it

In [14]:
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s2_src,
                                data_type='train')

Generating augmented image for rimone/fold_1/normal...


Elapsed time: 9.53 seconds
Generating augmented image for rimone/fold_1/glaukoma...
Elapsed time: 5.15 seconds
Generating augmented image for rimone/fold_2/normal...
Elapsed time: 8.52 seconds
Generating augmented image for rimone/fold_2/glaukoma...
Elapsed time: 4.47 seconds
Generating augmented image for rimone/fold_3/normal...
Elapsed time: 7.16 seconds
Generating augmented image for rimone/fold_3/glaukoma...
Elapsed time: 5.06 seconds
Generating augmented image for rimone/fold_4/normal...
Elapsed time: 8.07 seconds
Generating augmented image for rimone/fold_4/glaukoma...
Elapsed time: 4.21 seconds
Generating augmented image for rimone/fold_5/normal...
Elapsed time: 8.06 seconds
Generating augmented image for rimone/fold_5/glaukoma...
Elapsed time: 5.03 seconds
Generating augmented image for g1020/fold_1/normal...
Elapsed time: 55.82 seconds
Generating augmented image for g1020/fold_1/glaukoma...
Elapsed time: 28.52 seconds
Generating augmented image for g1020/fold_2/normal...
Elaps

#### Validate the augmented image

In [15]:
# create a varible to store the file name
s2_src_fname = {}
s2_aug_fname = {}

# collecting the source file name
for key, value in path_dataset_src.items():
    if key.split('_')[1] == '2':
        for label in labels_name:
            s2_src_fname[key + '_'
                        + label] = [file for file in os.listdir(os.path.join(value,label))]
del key, value, label

# collecting the augmented file name
for key, value in path_dataset_aug.items():
    if key.split('_')[1] == '2':
        s2_aug_fname[key] = [file for file in os.listdir(value)]
del key, value

In [16]:
# merge the two dictionary into dataframe
s2_df_result = pd.concat([pd.DataFrame({
                                'category': s2_src_fname.keys(),
                                'file_count': [len(value) for value in s2_src_fname.values()],
                                'type': 'source'
                            }),
                            pd.DataFrame({
                                'category': s2_aug_fname.keys(),
                                'file_count': [len(value) for value in s2_aug_fname.values()],
                                'type': 'augmented'
                            })])
s2_df_result.head(2)

Unnamed: 0,category,file_count,type
0,scenario_2_rimone_fold_1_normal,218,source
1,scenario_2_rimone_fold_1_glaukoma,121,source


In [17]:
# validate the file count
s2_df_validate = pd.DataFrame(s2_df_result.groupby(['category', 'type']).file_count.sum())

s2_df_validate.sort_values(by='category', inplace=True)
s2_df_validate = s2_df_validate.pivot_table(index='category',
                                            columns='type',
                                            values='file_count')

s2_df_validate.loc[s2_df_validate.augmented == s2_df_validate.source,
                    'status'] = 'valid'
s2_df_validate.loc[s2_df_validate.augmented != s2_df_validate.source,
                    'status'] = 'invalid'

s2_df_validate.head(5)

type,augmented,source,status
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
scenario_2_g1020_fold_1_glaukoma,208.0,208.0,valid
scenario_2_g1020_fold_1_normal,506.0,506.0,valid
scenario_2_g1020_fold_2_glaukoma,208.0,208.0,valid
scenario_2_g1020_fold_2_normal,506.0,506.0,valid
scenario_2_g1020_fold_3_glaukoma,208.0,208.0,valid


In [18]:
# print the result
print(f'Total categories: {len(s2_df_validate)}',
    f'\nTotal valid categories: {len(s2_df_validate[s2_df_validate.status == "valid"])}',
    f'\nTotal invalid categories: {len(s2_df_validate[s2_df_validate.status == "invalid"])}',
    sep='\n')

Total categories: 40

Total valid categories: 40

Total invalid categories: 0


### Scenario 3
**Condition**:
- basic augmentation, 
- rgb color, 
- clahe
#### Import the image into data generator

In [19]:
# define the augmentation mode for scenario 3
s3_src = {}
s3_val_test_src = {}
s3_file_code_name = []

In [20]:
# load image using image data generator for training data
for dataset in dataset_names:
    for fold in fold_names:
        for label in labels_name:
            print(f'Loading {dataset} {fold} {label}...')
            s3_src[dataset + '_'
                    + fold + '_'
                    + label] = (datagenerator_s3.flow_from_directory(
                                path_dataset_src[scenario_names[2] + '_'
                                                + dataset + '_'
                                                + fold],
                                target_size=image_size[dataset],
                                class_mode='binary',
                                classes=[label],
                                shuffle=True,
                                seed=1915026018,
                                save_to_dir=path_dataset_aug[scenario_names[2] + '_'
                                                            + dataset + '_'
                                                            + fold + '_'
                                                            + label],
                                save_prefix=f's3_{dataset}_{fold}_{label}',
                                save_format='jpg'))
del dataset, fold, label

Loading rimone fold_1 normal...
Found 218 images belonging to 1 classes.
Loading rimone fold_1 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone fold_2 normal...
Found 218 images belonging to 1 classes.
Loading rimone fold_2 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone fold_3 normal...
Found 218 images belonging to 1 classes.
Loading rimone fold_3 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone fold_4 normal...
Found 219 images belonging to 1 classes.
Loading rimone fold_4 glaukoma...
Found 120 images belonging to 1 classes.
Loading rimone fold_5 normal...
Found 219 images belonging to 1 classes.
Loading rimone fold_5 glaukoma...
Found 120 images belonging to 1 classes.


Loading g1020 fold_1 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_1 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_2 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_2 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_3 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_3 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_4 normal...
Found 506 images belonging to 1 classes.
Loading g1020 fold_4 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 fold_5 normal...
Found 507 images belonging to 1 classes.
Loading g1020 fold_5 glaukoma...
Found 207 images belonging to 1 classes.
Loading refuge fold_1 normal...
Found 756 images belonging to 1 classes.
Loading refuge fold_1 glaukoma...
Found 84 images belonging to 1 classes.
Loading refuge fold_2 normal...
Found 756 images belonging to 1 classes.
Loading refuge fold_2 glaukoma...
Found 84 images 

In [21]:
# load image using image data generator for validation and test data
for dataset in dataset_names:
    for fold in fold_names:
        for data_type in ['val', 'test']:
            for label in labels_name:
                print(f'Loading {dataset} {fold} {data_type} {label}...')
                s3_val_test_src[dataset + '_'
                                + fold + '_'
                                + data_type + '_'
                                + label] = (datagenerator_s3_val_test.flow_from_directory(
                                            path_dataset_val_test_src[scenario_names[2] + '_'
                                                                    + dataset + '_'
                                                                    + fold + '_'
                                                                    + data_type],
                                            target_size=image_size[dataset],
                                            class_mode='binary',
                                            classes=[label],
                                            shuffle=True,
                                            seed=1915026018,
                                            save_to_dir=path_dataset_val_test_dest[scenario_names[2] + '_'
                                                                                    + dataset + '_'
                                                                                    + fold + '_'
                                                                                    + data_type + '_'
                                                                                    + label],
                                            save_prefix=f's3_{dataset}_{fold}_{data_type}_{label}',
                                            save_format='jpg'))
                s3_file_code_name.append(f's3_{dataset}_{fold}_{data_type}_{label}')
del dataset, fold, data_type, label

Loading rimone fold_1 val normal...
Found 32 images belonging to 1 classes.
Loading rimone fold_1 val glaukoma...
Found 17 images belonging to 1 classes.
Loading rimone fold_1 test normal...
Found 63 images belonging to 1 classes.
Loading rimone fold_1 test glaukoma...
Found 34 images belonging to 1 classes.
Loading rimone fold_2 val normal...
Found 32 images belonging to 1 classes.
Loading rimone fold_2 val glaukoma...
Found 17 images belonging to 1 classes.
Loading rimone fold_2 test normal...
Found 63 images belonging to 1 classes.
Loading rimone fold_2 test glaukoma...
Found 34 images belonging to 1 classes.
Loading rimone fold_3 val normal...
Found 32 images belonging to 1 classes.
Loading rimone fold_3 val glaukoma...
Found 17 images belonging to 1 classes.
Loading rimone fold_3 test normal...
Found 63 images belonging to 1 classes.
Loading rimone fold_3 test glaukoma...
Found 34 images belonging to 1 classes.
Loading rimone fold_4 val normal...
Found 32 images belonging to 1 cla

#### Generate the augmented image & saved it

In [22]:
# generate the augmented image for training data
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s3_src,
                                data_type='train')

Generating augmented image for rimone/fold_1/normal...


Elapsed time: 53.67 seconds
Generating augmented image for rimone/fold_1/glaukoma...
Elapsed time: 24.23 seconds
Generating augmented image for rimone/fold_2/normal...
Elapsed time: 39.54 seconds
Generating augmented image for rimone/fold_2/glaukoma...
Elapsed time: 22.64 seconds
Generating augmented image for rimone/fold_3/normal...
Elapsed time: 39.28 seconds
Generating augmented image for rimone/fold_3/glaukoma...
Elapsed time: 21.82 seconds
Generating augmented image for rimone/fold_4/normal...
Elapsed time: 39.27 seconds
Generating augmented image for rimone/fold_4/glaukoma...
Elapsed time: 21.44 seconds
Generating augmented image for rimone/fold_5/normal...
Elapsed time: 39.38 seconds
Generating augmented image for rimone/fold_5/glaukoma...
Elapsed time: 22.08 seconds
Generating augmented image for g1020/fold_1/normal...
Elapsed time: 70.65 seconds
Generating augmented image for g1020/fold_1/glaukoma...
Elapsed time: 27.64 seconds
Generating augmented image for g1020/fold_2/norma

In [23]:
# generate the augmented image for validation and testing data
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s3_val_test_src,
                                data_type='val_test')

Generating augmented image for rimone/fold_1/val...
Elapsed time: 7.39 seconds
Generating augmented image for rimone/fold_1/test...
Elapsed time: 12.72 seconds
Generating augmented image for rimone/fold_2/val...
Elapsed time: 6.18 seconds
Generating augmented image for rimone/fold_2/test...
Elapsed time: 12.33 seconds
Generating augmented image for rimone/fold_3/val...
Elapsed time: 6.25 seconds
Generating augmented image for rimone/fold_3/test...
Elapsed time: 12.40 seconds
Generating augmented image for rimone/fold_4/val...
Elapsed time: 6.20 seconds
Generating augmented image for rimone/fold_4/test...
Elapsed time: 12.64 seconds
Generating augmented image for rimone/fold_5/val...
Elapsed time: 6.33 seconds
Generating augmented image for rimone/fold_5/test...
Elapsed time: 12.36 seconds
Generating augmented image for g1020/fold_1/val...
Elapsed time: 14.02 seconds
Generating augmented image for g1020/fold_1/test...
Elapsed time: 27.97 seconds
Generating augmented image for g1020/fold

#### Validate the augmented image

In [24]:
# create a varible to store the file name
s3_src_fname = {}
s3_aug_fname = {}

# collecting the source file name
for key, value in path_dataset_src.items():
    if key.split('_')[1] == '3':
        for label in labels_name:
            s3_src_fname[key + '_'
                        + label] = [file for file in os.listdir(os.path.join(value,label))]
del key, value, label

# collecting the augmented file name
for key, value in path_dataset_aug.items():
    if key.split('_')[1] == '3':
        s3_aug_fname[key] = [file for file in os.listdir(value)]
del key, value

In [25]:
# merge the two dictionary into dataframe
s3_df_result = pd.concat([pd.DataFrame({
                                'category': s3_src_fname.keys(),
                                'file_count': [len(value) for value in s3_src_fname.values()],
                                'type': 'source'
                            }),
                            pd.DataFrame({
                                'category': s3_aug_fname.keys(),
                                'file_count': [len(value) for value in s3_aug_fname.values()],
                                'type': 'augmented'
                            })])
s3_df_result.head(2)

Unnamed: 0,category,file_count,type
0,scenario_3_rimone_fold_1_normal,218,source
1,scenario_3_rimone_fold_1_glaukoma,121,source


In [26]:
# validate the file count
s3_df_validate = pd.DataFrame(s3_df_result.groupby(['category', 'type']).file_count.sum())

s3_df_validate.sort_values(by='category', inplace=True)
s3_df_validate = s3_df_validate.pivot_table(index='category',
                                            columns='type',
                                            values='file_count')

s3_df_validate.loc[s3_df_validate.augmented == s3_df_validate.source,
                    'status'] = 'valid'
s3_df_validate.loc[s3_df_validate.augmented != s3_df_validate.source,
                    'status'] = 'invalid'

s3_df_validate.head(5)

type,augmented,source,status
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
scenario_3_g1020_fold_1_glaukoma,208.0,208.0,valid
scenario_3_g1020_fold_1_normal,506.0,506.0,valid
scenario_3_g1020_fold_2_glaukoma,208.0,208.0,valid
scenario_3_g1020_fold_2_normal,506.0,506.0,valid
scenario_3_g1020_fold_3_glaukoma,208.0,208.0,valid


In [27]:
# print the result
print(f'Total categories: {len(s3_df_validate)}',
    f'\nTotal valid categories: {len(s3_df_validate[s3_df_validate.status == "valid"])}',
    f'\nTotal invalid categories: {len(s3_df_validate[s3_df_validate.status == "invalid"])}',
    sep='\n')

Total categories: 40

Total valid categories: 40

Total invalid categories: 0


#### Remove the previous image
##### Getting the image file name

In [28]:
# define variable to store the file name
s3_rm_file, s3_aug_file = augment_image.get_file(files_code=s3_file_code_name,
                                            path_dest=path_dataset_val_test_dest,
                                            scenario=scenario_names[2])

##### Validate the file

In [29]:
# create the dataframe to store the result
s3_df_file_check = pd.DataFrame(columns = ['type',
                                        'category',
                                        'file_path',
                                        'file_name'])
s3_df_file_check.head()

Unnamed: 0,type,category,file_path,file_name


In [30]:
# add the file name that will be removed result into the dataframe
for category, files_list in s3_rm_file.items():
    for file in files_list:
        s3_df_file_check.loc[len(s3_df_file_check)] = ['remove',
                                                    category[3:].replace('_', ' '),
                                                    file,
                                                    os.path.basename(file)]
del category, files_list, file

In [31]:
# add the file name that is augmented into the dataframe
for category, Files_list in s3_aug_file.items():
    for file in Files_list:
        s3_df_file_check.loc[len(s3_df_file_check)] = ['augment',
                                                    category[3:].replace('_', ' '),
                                                    file,
                                                    os.path.basename(file)]
del category, Files_list, file

In [32]:
s3_df_file_check.isna().sum()

type         0
category     0
file_path    0
file_name    0
dtype: int64

In [33]:
# handle the result into the compare able shape
s3_df_validate = pd.DataFrame(s3_df_file_check.groupby(by=['category',
                                                        'type']).count()['file_name'])


s3_df_validate = s3_df_validate.pivot_table(values='file_name',
                                            index='category',
                                            columns='type')
s3_df_validate.loc[s3_df_validate.augment == s3_df_validate.remove,
                    'status'] = 'valid'
s3_df_validate.loc[s3_df_validate.augment != s3_df_validate.remove,
                    'status'] = 'invalid'
s3_df_validate.head(3)

type,augment,remove,status
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
g1020 fold 1 test glaukoma,59.0,59.0,valid
g1020 fold 1 test normal,145.0,145.0,valid
g1020 fold 1 val glaukoma,29.0,29.0,valid


In [34]:
# print the result
print(f'total categories: {s3_df_validate.shape[0]}',
        f'valid file(s)   : {s3_df_validate.loc[s3_df_validate.status == "valid"].shape[0]}',
        f'invalid file(s) : {s3_df_validate.loc[s3_df_validate.status == "invalid"].shape[0]}',
        sep='\n')

total categories: 80
valid file(s)   : 80
invalid file(s) : 0


##### Removing the file

In [35]:
for files in s3_rm_file.values():
    result_status = augment_image.remove_file(files)

print(f'Files removed: {len(result_status["Success"])}',
        f'Files already removed: {len(result_status["Not Found"])}',
        sep='\n')

del files, result_status

Files removed: 31
Files already removed: 0


### Removing scenario one augmented directory

In [15]:
s1_rmdir = []
for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            s1_rmdir.append(os.path.join(path_source,
                                        scenario,
                                        dataset,
                                        fold,
                                        'train_augmented'))
    break

result_status = augment_image.remove_dir(s1_rmdir)
print(f'Directory removed: {len(result_status["Success"])}',
        f'Directory already removed: {len(result_status["Not Found"])}',
        sep='\n')

del scenario, dataset, fold

Directory removed: 20
Directory already removed: 0
