In [2]:
import os
import sys
from dotenv import load_dotenv

# Load the variables from the .env file, then make the folder that holds the
# project's custom modules importable.
load_dotenv()
custom_modules_path = os.environ.get('PATH_CUSTOM_MODULES')
# os.environ.get returns None when the variable is missing; never put None on
# sys.path, and do not append the same folder again when the cell is re-run.
if custom_modules_path and custom_modules_path not in sys.path:
    sys.path.append(custom_modules_path)

import augment_image

import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator

### Prepare all basic variables

In [3]:
# Root folder of the prepared dataset, read from the environment.
path_source = os.environ.get('PATH_DATASET_DESTINATION')

# Names used to build both the folder paths and the dictionary keys.
scenario_names = [f'scenario_{number}' for number in (2, 3)]
dataset_names = 'rimone g1020 refuge papila'.split()
fold_names = [f'fold_{number}' for number in range(1, 6)]
labels_name = ['normal', 'glaukoma']

# Per-dataset size passed as `target_size` when the images are loaded.
image_size = dict(rimone=(300, 300),
                  g1020=(240, 300),
                  refuge=(300, 300),
                  papila=(200, 300))

### Prepare the source and destination paths

In [11]:
# Build every source / destination folder path used by the notebook.
# Keys follow '<scenario>_<dataset>_<fold>[_<data type>][_<label>]'.
path_dataset_src = {}            # .../<fold>/train
path_dataset_val_test_src = {}   # .../<fold>/<val|test>
path_dataset_aug = {}            # .../<fold>/train_augmented/<label>
path_dataset_clahe_dst = {}      # .../<fold>/train/<label>
path_dataset_val_test_dest = {}  # .../<fold>/<val|test>/<label>

for scenario in scenario_names:
    for dataset in dataset_names:
        for fold in fold_names:
            # everything below lives under the same fold folder
            fold_key = f'{scenario}_{dataset}_{fold}'
            fold_dir = os.path.join(path_source, scenario, dataset, fold)

            ## source path for training data
            path_dataset_src[fold_key] = os.path.join(fold_dir, 'train')

            for data_type in ['val', 'test']:
                ## source path for validation and testing data
                path_dataset_val_test_src[f'{fold_key}_{data_type}'] = os.path.join(fold_dir,
                                                                                    data_type)
                for label in labels_name:
                    ## destination path for validation and testing data
                    path_dataset_val_test_dest[f'{fold_key}_{data_type}_{label}'] = os.path.join(fold_dir,
                                                                                                 data_type,
                                                                                                 label)

            for label in labels_name:
                ## destination path for training data with only clahe augmentation
                path_dataset_clahe_dst[f'{fold_key}_{label}'] = os.path.join(fold_dir,
                                                                             'train',
                                                                             label)
                ## destination path a.k.a. augmented path for training data
                path_dataset_aug[f'{fold_key}_{label}'] = os.path.join(fold_dir,
                                                                       'train_augmented',
                                                                       label)
del scenario, dataset, fold, data_type, label, fold_key, fold_dir

### Prepare the image data generator for each scenario

In [14]:
# create the image data generators
# Every generator scales pixel values by 1/255.
rescale_option = dict(rescale=1./255)
# The "basic augmentation" generators also flip in both directions.
flip_options = dict(horizontal_flip=True, vertical_flip=True)

## data generator for scenario 2 (with augmentation)
datagenerator_s2 = ImageDataGenerator(brightness_range=[1, 1.5],
                                      **rescale_option,
                                      **flip_options)
## data generator for scenario 3 (with augmentation and clahe)
datagenerator_s3 = ImageDataGenerator(brightness_range=[1, 1.5],
                                      preprocessing_function=augment_image.clahe_augmentation,
                                      **rescale_option,
                                      **flip_options)
## data generator for scenario 3 (only clahe)
datagenerator_s3_clahe = ImageDataGenerator(preprocessing_function=augment_image.clahe_augmentation,
                                            **rescale_option)
del rescale_option, flip_options

### Prepare Augment Directory
Run this cell only once to be safe.

In [15]:
# create the directories for the augmented dataset
directory_result = augment_image.create_directory(path_dict=path_dataset_aug)
## report only the directories listed under the 'Already Exists' key
for key, values in directory_result.items():
    if not (key == 'Already Exists' and values != []):
        continue
    for value in values:
        print('Directory already exists:', value)
    del value
del key, values, directory_result

### Scenario 2
**Condition**:
- basic augmentation, 
- rgb color
- no clahe
#### Import the image into data generator

In [16]:
# container for the scenario 2 generators, keyed by '<dataset>_<fold>_<label>'
s2_src = dict()

In [17]:
# get the images using the image data generator
## one generator per dataset / fold / label; each one is configured to save
## its output into the matching 'train_augmented/<label>' folder
s2_scenario = scenario_names[0]
for dataset in dataset_names:
    for fold in fold_names:
        fold_number = fold.split("_")[-1]
        for label in labels_name:
            print(f'{dataset} {fold} {label}')
            s2_src[f'{dataset}_{fold}_{label}'] = datagenerator_s2.flow_from_directory(
                path_dataset_src[f'{s2_scenario}_{dataset}_{fold}'],
                target_size=image_size[dataset],
                class_mode='binary',
                classes=[label],
                shuffle=True,
                seed=1915026018,
                save_to_dir=path_dataset_aug[f'{s2_scenario}_{dataset}_{fold}_{label}'],
                save_prefix=f's2_{dataset}_f{fold_number}_{label}',
                save_format='jpg')
del dataset, fold, label, s2_scenario, fold_number

rimone fold_1 normal
Found 218 images belonging to 1 classes.
rimone fold_1 glaukoma
Found 121 images belonging to 1 classes.
rimone fold_2 normal
Found 218 images belonging to 1 classes.
rimone fold_2 glaukoma
Found 121 images belonging to 1 classes.
rimone fold_3 normal
Found 218 images belonging to 1 classes.
rimone fold_3 glaukoma
Found 121 images belonging to 1 classes.
rimone fold_4 normal
Found 219 images belonging to 1 classes.
rimone fold_4 glaukoma
Found 120 images belonging to 1 classes.
rimone fold_5 normal
Found 219 images belonging to 1 classes.
rimone fold_5 glaukoma
Found 120 images belonging to 1 classes.
g1020 fold_1 normal
Found 506 images belonging to 1 classes.
g1020 fold_1 glaukoma
Found 208 images belonging to 1 classes.
g1020 fold_2 normal
Found 506 images belonging to 1 classes.
g1020 fold_2 glaukoma
Found 208 images belonging to 1 classes.
g1020 fold_3 normal
Found 506 images belonging to 1 classes.
g1020 fold_3 glaukoma
Found 208 images belonging to 1 classes

#### Generate the augmented images and save them

In [18]:
# Run the project helper over the scenario 2 generators stored in `s2_src`.
# The recorded output shows one "Generating augmented image ..." line plus an
# elapsed time per dataset / fold / label.
# NOTE(review): presumably the helper iterates each generator so that the
# images are written to the generator's `save_to_dir` - confirm in augment_image.
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s2_src,
                                data_type='train')

Generating augmented image for rimone f1 normal...
Elapsed time: 5.95 seconds
Generating augmented image for rimone f1 glaukoma...
Elapsed time: 3.17 seconds
Generating augmented image for rimone f2 normal...
Elapsed time: 6.89 seconds
Generating augmented image for rimone f2 glaukoma...
Elapsed time: 4.31 seconds
Generating augmented image for rimone f3 normal...
Elapsed time: 7.91 seconds
Generating augmented image for rimone f3 glaukoma...
Elapsed time: 9.65 seconds
Generating augmented image for rimone f4 normal...
Elapsed time: 12.46 seconds
Generating augmented image for rimone f4 glaukoma...
Elapsed time: 6.17 seconds
Generating augmented image for rimone f5 normal...
Elapsed time: 9.67 seconds
Generating augmented image for rimone f5 glaukoma...
Elapsed time: 5.32 seconds
Generating augmented image for g1020 f1 normal...
Elapsed time: 89.51 seconds
Generating augmented image for g1020 f1 glaukoma...
Elapsed time: 25.73 seconds
Generating augmented image for g1020 f2 normal...
E

#### Validate the augmented image

In [19]:
# create the variables to store the file names
s2_src_fname = {}
s2_aug_fname = {}

# collecting the source file names, one entry per label sub-folder
for key, value in path_dataset_src.items():
    # keys look like 'scenario_<n>_...'; keep only scenario 2
    if key.split('_')[1] != '2':
        continue
    for label in labels_name:
        s2_src_fname[f'{key}_{label}'] = list(os.listdir(os.path.join(value, label)))
del key, value, label

# collecting the augmented file names
for key, value in path_dataset_aug.items():
    if key.split('_')[1] != '2':
        continue
    s2_aug_fname[key] = list(os.listdir(value))
del key, value

In [20]:
# merge the two dictionaries into one dataframe (one row per category and type)
s2_source_counts = pd.DataFrame({'category': s2_src_fname.keys(),
                                 'file_count': list(map(len, s2_src_fname.values())),
                                 'type': 'source'})
s2_augmented_counts = pd.DataFrame({'category': s2_aug_fname.keys(),
                                    'file_count': list(map(len, s2_aug_fname.values())),
                                    'type': 'augmented'})
s2_df_result = pd.concat([s2_source_counts, s2_augmented_counts])
del s2_source_counts, s2_augmented_counts
s2_df_result.head(2)

Unnamed: 0,category,file_count,type
0,scenario_2_rimone_fold_1_normal,218,source
1,scenario_2_rimone_fold_1_glaukoma,121,source


In [23]:
# validate the file count
# One row per category with the 'augmented' and 'source' counts side by side.
s2_df_validate = (s2_df_result
                  .groupby(['category', 'type'])
                  .file_count.sum()
                  .to_frame()
                  .sort_values(by='category')
                  .pivot_table(index='category',
                               columns='type',
                               values='file_count'))

# A category is valid when both folders hold the same number of files.
s2_counts_match = s2_df_validate.augmented == s2_df_validate.source
s2_df_validate['status'] = s2_counts_match.map({True: 'valid', False: 'invalid'})
del s2_counts_match

s2_df_validate.head(5)

type,augmented,source,status
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
scenario_2_g1020_fold_1_glaukoma,208.0,208.0,valid
scenario_2_g1020_fold_1_normal,506.0,506.0,valid
scenario_2_g1020_fold_2_glaukoma,208.0,208.0,valid
scenario_2_g1020_fold_2_normal,506.0,506.0,valid
scenario_2_g1020_fold_3_glaukoma,208.0,208.0,valid


In [24]:
# print the result
# count the categories per validation status before printing the summary
s2_status_total = {status: len(s2_df_validate[s2_df_validate.status == status])
                   for status in ('valid', 'invalid')}
print(f'Total categories: {len(s2_df_validate)}',
      f'\nTotal valid categories: {s2_status_total["valid"]}',
      f'\nTotal invalid categories: {s2_status_total["invalid"]}',
      sep='\n')
del s2_status_total

Total categories: 40

Total valid categories: 40

Total invalid categories: 0


### Scenario 3
**Condition**:
- basic augmentation, 
- rgb color, 
- clahe
#### Import the image into data generator

In [25]:
# containers for scenario 3
s3_src = dict()             # generators: augmentation + clahe (training data)
s3_tclahe_src = dict()      # generators: clahe only (training data)
s3_val_test_src = dict()    # generators: clahe only (validation / test data)
s3_cfile_val_test = list()  # save prefixes used for validation / test data
s3_cfile_train = list()     # save prefixes used for clahe-only training data

In [26]:
# load images using the image data generator for training data
# (augmentation + clahe); output is saved into 'train_augmented/<label>'
s3_scenario = scenario_names[1]
for dataset in dataset_names:
    for fold in fold_names:
        fold_number = fold.split("_")[-1]
        for label in labels_name:
            print(f'Loading {dataset} f{fold_number} {label}...')
            s3_src[f'{dataset}_{fold}_{label}'] = datagenerator_s3.flow_from_directory(
                path_dataset_src[f'{s3_scenario}_{dataset}_{fold}'],
                target_size=image_size[dataset],
                class_mode='binary',
                classes=[label],
                shuffle=True,
                seed=1915026018,
                save_to_dir=path_dataset_aug[f'{s3_scenario}_{dataset}_{fold}_{label}'],
                save_prefix=f's3_{dataset}_f{fold_number}_{label}',
                save_format='jpg')
del dataset, fold, label, s3_scenario, fold_number

Loading rimone f1 normal...
Found 218 images belonging to 1 classes.
Loading rimone f1 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone f2 normal...
Found 218 images belonging to 1 classes.
Loading rimone f2 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone f3 normal...
Found 218 images belonging to 1 classes.
Loading rimone f3 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone f4 normal...
Found 219 images belonging to 1 classes.
Loading rimone f4 glaukoma...
Found 120 images belonging to 1 classes.
Loading rimone f5 normal...
Found 219 images belonging to 1 classes.
Loading rimone f5 glaukoma...
Found 120 images belonging to 1 classes.
Loading g1020 f1 normal...
Found 506 images belonging to 1 classes.
Loading g1020 f1 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 f2 normal...
Found 506 images belonging to 1 classes.
Loading g1020 f2 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 f3 normal.

In [27]:
# load images using the image data generator for training data with only
# clahe augmentation; output is saved next to the originals in 'train/<label>'
s3_scenario = scenario_names[1]
for dataset in dataset_names:
    for fold in fold_names:
        fold_number = fold.split("_")[-1]
        for label in labels_name:
            print(f'Loading {dataset} f{fold_number} {label}...')
            # the save prefix is also recorded so the saved files can be looked up later
            file_code = f's3_{dataset}_f{fold_number}_clahe_{label}'
            s3_tclahe_src[f'{dataset}_{fold}_{label}'] = datagenerator_s3_clahe.flow_from_directory(
                path_dataset_src[f'{s3_scenario}_{dataset}_{fold}'],
                target_size=image_size[dataset],
                class_mode='binary',
                classes=[label],
                shuffle=True,
                seed=1915026018,
                save_to_dir=path_dataset_clahe_dst[f'{s3_scenario}_{dataset}_{fold}_{label}'],
                save_prefix=file_code,
                save_format='jpg')
            s3_cfile_train.append(file_code)
del dataset, fold, label, s3_scenario, fold_number, file_code

Loading rimone f1 normal...
Found 218 images belonging to 1 classes.
Loading rimone f1 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone f2 normal...
Found 218 images belonging to 1 classes.
Loading rimone f2 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone f3 normal...
Found 218 images belonging to 1 classes.
Loading rimone f3 glaukoma...
Found 121 images belonging to 1 classes.
Loading rimone f4 normal...
Found 219 images belonging to 1 classes.
Loading rimone f4 glaukoma...
Found 120 images belonging to 1 classes.
Loading rimone f5 normal...
Found 219 images belonging to 1 classes.
Loading rimone f5 glaukoma...
Found 120 images belonging to 1 classes.
Loading g1020 f1 normal...
Found 506 images belonging to 1 classes.
Loading g1020 f1 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 f2 normal...
Found 506 images belonging to 1 classes.
Loading g1020 f2 glaukoma...
Found 208 images belonging to 1 classes.
Loading g1020 f3 normal.

In [28]:
# load images using the image data generator for validation and test data
# (clahe only); output is saved into the same '<val|test>/<label>' folder
s3_scenario = scenario_names[1]
for dataset in dataset_names:
    for fold in fold_names:
        fold_number = fold.split("_")[-1]
        for data_type in ['val', 'test']:
            split_key = f'{s3_scenario}_{dataset}_{fold}_{data_type}'
            for label in labels_name:
                print(f'{dataset} {fold} {data_type} {label}')
                # the save prefix is also recorded so the saved files can be looked up later
                file_code = f's3_{dataset}_f{fold_number}_{data_type}_{label}'
                s3_val_test_src[f'{dataset}_{fold}_{data_type}_{label}'] = datagenerator_s3_clahe.flow_from_directory(
                    path_dataset_val_test_src[split_key],
                    target_size=image_size[dataset],
                    class_mode='binary',
                    classes=[label],
                    shuffle=True,
                    seed=1915026018,
                    save_to_dir=path_dataset_val_test_dest[f'{split_key}_{label}'],
                    save_prefix=file_code,
                    save_format='jpg')
                s3_cfile_val_test.append(file_code)
del dataset, fold, data_type, label, s3_scenario, fold_number, split_key, file_code

rimone fold_1 val normal
Found 32 images belonging to 1 classes.
rimone fold_1 val glaukoma
Found 17 images belonging to 1 classes.
rimone fold_1 test normal
Found 63 images belonging to 1 classes.
rimone fold_1 test glaukoma
Found 34 images belonging to 1 classes.
rimone fold_2 val normal
Found 32 images belonging to 1 classes.
rimone fold_2 val glaukoma
Found 17 images belonging to 1 classes.
rimone fold_2 test normal
Found 63 images belonging to 1 classes.
rimone fold_2 test glaukoma
Found 34 images belonging to 1 classes.
rimone fold_3 val normal
Found 32 images belonging to 1 classes.
rimone fold_3 val glaukoma
Found 17 images belonging to 1 classes.
rimone fold_3 test normal
Found 63 images belonging to 1 classes.
rimone fold_3 test glaukoma
Found 34 images belonging to 1 classes.
rimone fold_4 val normal
Found 32 images belonging to 1 classes.
rimone fold_4 val glaukoma
Found 17 images belonging to 1 classes.
rimone fold_4 test normal
Found 62 images belonging to 1 classes.
rimo

#### Generate the augmented images and save them

In [29]:
# generate the augmented image for training data
# Uses the scenario 3 generators in `s3_src` (augmentation + clahe), whose
# `save_to_dir` points at the 'train_augmented/<label>' folders.
# NOTE(review): presumably the helper iterates each generator so the images
# get written to disk - confirm in augment_image.
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s3_src,
                                data_type='train')

Generating augmented image for rimone f1 normal...
Elapsed time: 51.06 seconds
Generating augmented image for rimone f1 glaukoma...
Elapsed time: 20.56 seconds
Generating augmented image for rimone f2 normal...
Elapsed time: 26.37 seconds
Generating augmented image for rimone f2 glaukoma...
Elapsed time: 13.31 seconds
Generating augmented image for rimone f3 normal...
Elapsed time: 25.03 seconds
Generating augmented image for rimone f3 glaukoma...
Elapsed time: 14.95 seconds
Generating augmented image for rimone f4 normal...
Elapsed time: 25.64 seconds
Generating augmented image for rimone f4 glaukoma...
Elapsed time: 12.90 seconds
Generating augmented image for rimone f5 normal...
Elapsed time: 23.89 seconds
Generating augmented image for rimone f5 glaukoma...
Elapsed time: 13.01 seconds
Generating augmented image for g1020 f1 normal...
Elapsed time: 58.13 seconds
Generating augmented image for g1020 f1 glaukoma...
Elapsed time: 23.76 seconds
Generating augmented image for g1020 f2 no

In [30]:
# generate the augmented image for training data with only clahe augmentation
# Uses the generators in `s3_tclahe_src`, whose `save_to_dir` points at the
# 'train/<label>' folders, i.e. the same folders the source images are read from.
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s3_tclahe_src,
                                data_type='train')

Generating augmented image for rimone f1 normal...
Elapsed time: 24.98 seconds
Generating augmented image for rimone f1 glaukoma...
Elapsed time: 13.25 seconds
Generating augmented image for rimone f2 normal...
Elapsed time: 23.80 seconds
Generating augmented image for rimone f2 glaukoma...
Elapsed time: 13.46 seconds
Generating augmented image for rimone f3 normal...
Elapsed time: 27.99 seconds
Generating augmented image for rimone f3 glaukoma...
Elapsed time: 15.60 seconds
Generating augmented image for rimone f4 normal...
Elapsed time: 26.03 seconds
Generating augmented image for rimone f4 glaukoma...
Elapsed time: 13.28 seconds
Generating augmented image for rimone f5 normal...
Elapsed time: 23.96 seconds
Generating augmented image for rimone f5 glaukoma...
Elapsed time: 13.39 seconds
Generating augmented image for g1020 f1 normal...
Elapsed time: 60.24 seconds
Generating augmented image for g1020 f1 glaukoma...
Elapsed time: 24.33 seconds
Generating augmented image for g1020 f2 no

In [31]:
# generate the augmented image for validation and testing data
# Uses the generators in `s3_val_test_src` (clahe only); `data_type='val_test'`
# is the only call in this notebook that does not pass 'train'.
augment_image.generate_aug_img(dataset_names=dataset_names,
                                fold_names=fold_names,
                                labels_names=labels_name,
                                batch_datasets=s3_val_test_src,
                                data_type='val_test')

Generating augmented image for rimone f1 val...
Elapsed time: 6.52 seconds
Generating augmented image for rimone f1 test...
Elapsed time: 11.93 seconds
Generating augmented image for rimone f2 val...
Elapsed time: 6.20 seconds
Generating augmented image for rimone f2 test...
Elapsed time: 11.46 seconds
Generating augmented image for rimone f3 val...
Elapsed time: 5.61 seconds
Generating augmented image for rimone f3 test...
Elapsed time: 11.45 seconds
Generating augmented image for rimone f4 val...
Elapsed time: 6.82 seconds
Generating augmented image for rimone f4 test...
Elapsed time: 11.93 seconds
Generating augmented image for rimone f5 val...
Elapsed time: 8.07 seconds
Generating augmented image for rimone f5 test...
Elapsed time: 12.43 seconds
Generating augmented image for g1020 f1 val...
Elapsed time: 18.06 seconds
Generating augmented image for g1020 f1 test...
Elapsed time: 27.19 seconds
Generating augmented image for g1020 f2 val...
Elapsed time: 13.34 seconds
Generating aug

#### Validate the augmented image

In [32]:
# create the variables to store the file names
s3_src_fname = {}
s3_aug_fname = {}

# collecting the source file names, one entry per label sub-folder
for key, value in path_dataset_src.items():
    # keys look like 'scenario_<n>_...'; keep only scenario 3
    if key.split('_')[1] != '3':
        continue
    for label in labels_name:
        s3_src_fname[f'{key}_{label}'] = list(os.listdir(os.path.join(value, label)))
del key, value, label

# collecting the augmented file names
for key, value in path_dataset_aug.items():
    if key.split('_')[1] != '3':
        continue
    s3_aug_fname[key] = list(os.listdir(value))
del key, value

In [33]:
# merge the two dictionaries into one dataframe (one row per category and type)
s3_source_counts = pd.DataFrame({'category': s3_src_fname.keys(),
                                 'file_count': list(map(len, s3_src_fname.values())),
                                 'type': 'source'})
s3_augmented_counts = pd.DataFrame({'category': s3_aug_fname.keys(),
                                    'file_count': list(map(len, s3_aug_fname.values())),
                                    'type': 'augmented'})
s3_df_result = pd.concat([s3_source_counts, s3_augmented_counts])
del s3_source_counts, s3_augmented_counts
s3_df_result.head(2)

Unnamed: 0,category,file_count,type
0,scenario_3_rimone_fold_1_normal,436,source
1,scenario_3_rimone_fold_1_glaukoma,242,source


In [36]:
# validate the file count
# One row per category with the 'augmented' and 'source' counts side by side.
s3_df_validate = (s3_df_result
                  .groupby(['category', 'type'])
                  .file_count.sum()
                  .to_frame()
                  .sort_values(by='category')
                  .pivot_table(index='category',
                               columns='type',
                               values='file_count'))

# A category is valid when the source folder holds exactly twice as many files
# as the augmented folder.
# NOTE(review): presumably the factor 2 is because the clahe-only copies are
# saved into the same 'train/<label>' folder as the originals - confirm.
s3_counts_match = s3_df_validate.augmented*2 == s3_df_validate.source
s3_df_validate['status'] = s3_counts_match.map({True: 'valid', False: 'invalid'})
del s3_counts_match

s3_df_validate.head(5)

type,augmented,source,status
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
scenario_3_g1020_fold_1_glaukoma,208.0,416.0,valid
scenario_3_g1020_fold_1_normal,506.0,1012.0,valid
scenario_3_g1020_fold_2_glaukoma,208.0,416.0,valid
scenario_3_g1020_fold_2_normal,506.0,1012.0,valid
scenario_3_g1020_fold_3_glaukoma,208.0,416.0,valid


In [37]:
# print the result
# count the categories per validation status before printing the summary
s3_status_total = {status: len(s3_df_validate[s3_df_validate.status == status])
                   for status in ('valid', 'invalid')}
print(f'Total categories: {len(s3_df_validate)}',
      f'\nTotal valid categories: {s3_status_total["valid"]}',
      f'\nTotal invalid categories: {s3_status_total["invalid"]}',
      sep='\n')
del s3_status_total

Total categories: 40

Total valid categories: 40

Total invalid categories: 0


#### Remove the previous image
##### Getting the image file name

In [None]:
# define variable to store the file name for validation and testing data
# `get_file` returns two objects: the files to remove and the augmented files.
# Later cells iterate both with `.items()` as category -> list of file paths.
# The lookup uses the save prefixes recorded in `s3_cfile_val_test`.
s3_rm_file_vt, s3_aug_file_vt = augment_image.get_file(files_code=s3_cfile_val_test,
                                    path_dest=path_dataset_val_test_dest,
                                    scenario=scenario_names[1])

In [None]:
# define variable to store the file name for training data with only clahe
# Same helper as for validation/test data, this time with the save prefixes
# recorded in `s3_cfile_train` and the 'train/<label>' folders.
s3_rm_file_tc, s3_aug_file_tc = augment_image.get_file(files_code=s3_cfile_train,
                                    path_dest=path_dataset_clahe_dst,
                                    scenario=scenario_names[1])

##### Validate the file

In [None]:
# create the (still empty) dataframe that collects one row per checked file
file_check_columns = ['data', 'type', 'category', 'file_path', 'file_name']
s3_df_file_check = pd.DataFrame(columns=file_check_columns)
del file_check_columns
s3_df_file_check.head()

In [None]:
# add the file names that will be removed into the dataframe
for category, files_list in s3_rm_file_vt.items():
    # drop the first three characters of the key and use spaces as separators
    category_label = category[3:].replace('_', ' ')
    for file in files_list:
        next_row = len(s3_df_file_check)
        s3_df_file_check.loc[next_row] = ['val_test',
                                          'remove',
                                          category_label,
                                          file,
                                          os.path.basename(file)]
del category, files_list, file, category_label, next_row
# add the file names that are augmented into the dataframe
for category, files_list in s3_aug_file_vt.items():
    category_label = category[3:].replace('_', ' ')
    for file in files_list:
        next_row = len(s3_df_file_check)
        s3_df_file_check.loc[next_row] = ['val_test',
                                          'augment',
                                          category_label,
                                          file,
                                          os.path.basename(file)]
del category, files_list, file, category_label, next_row

In [None]:
# add the file names that will be removed into the dataframe
for category, files_list in s3_rm_file_tc.items():
    # drop the first three characters of the key and use spaces as separators
    category_label = category[3:].replace('_', ' ')
    for file in files_list:
        next_row = len(s3_df_file_check)
        s3_df_file_check.loc[next_row] = ['train_clahe',
                                          'remove',
                                          category_label,
                                          file,
                                          os.path.basename(file)]
del category, files_list, file, category_label, next_row
# add the file names that are augmented into the dataframe
for category, files_list in s3_aug_file_tc.items():
    category_label = category[3:].replace('_', ' ')
    for file in files_list:
        next_row = len(s3_df_file_check)
        s3_df_file_check.loc[next_row] = ['train_clahe',
                                          'augment',
                                          category_label,
                                          file,
                                          os.path.basename(file)]
del category, files_list, file, category_label, next_row

In [None]:
# reshape the file check into a comparable form for validation and testing:
# one row per category with the 'augment' and 'remove' file counts side by side
s3_df_validate_vt = (s3_df_file_check
                     .loc[s3_df_file_check['data'] == 'val_test']
                     .groupby(by=['category', 'type'])['file_name']
                     .count()
                     .to_frame()
                     .pivot_table(values='file_name',
                                  index='category',
                                  columns='type'))

# a category is valid when as many files were augmented as will be removed
s3_vt_counts_match = s3_df_validate_vt.augment == s3_df_validate_vt.remove
s3_df_validate_vt['status'] = s3_vt_counts_match.map({True: 'valid', False: 'invalid'})
del s3_vt_counts_match
s3_df_validate_vt.head(3)

In [None]:
# reshape the file check into a comparable form for training data with only clahe:
# one row per category with the 'augment' and 'remove' file counts side by side.
# The result gets its own name (`s3_df_validate_tc`) so that it does not
# overwrite the validation/test result kept in `s3_df_validate_vt`.
s3_df_validate_tc = (s3_df_file_check
                     .loc[s3_df_file_check['data'] == 'train_clahe']
                     .groupby(by=['category', 'type'])['file_name']
                     .count()
                     .to_frame()
                     .pivot_table(values='file_name',
                                  index='category',
                                  columns='type'))

# a category is valid when as many files were augmented as will be removed
s3_tc_counts_match = s3_df_validate_tc.augment == s3_df_validate_tc.remove
s3_df_validate_tc['status'] = s3_tc_counts_match.map({True: 'valid', False: 'invalid'})
del s3_tc_counts_match
s3_df_validate_tc.head(3)

In [None]:
# print the result
# Summarise the remove-vs-augment file check for every data group. The counts
# are rebuilt from `s3_df_file_check` so the summary reflects the file check
# itself rather than `s3_df_validate`, which holds the training image counts.
s3_file_check_counts = (s3_df_file_check
                        .groupby(['data', 'category', 'type'])['file_name']
                        .count()
                        .unstack('type')
                        # guarantee both columns exist even if one type has no rows
                        .reindex(columns=['augment', 'remove']))

for data_group, group_counts in s3_file_check_counts.groupby(level='data'):
    # a missing count is NaN and never compares equal, so it is reported as invalid
    valid_total = int((group_counts['augment'] == group_counts['remove']).sum())
    print(f'data group      : {data_group}',
          f'total categories: {group_counts.shape[0]}',
          f'valid file(s)   : {valid_total}',
          f'invalid file(s) : {group_counts.shape[0] - valid_total}',
          sep='\n')

##### Removing the file

In [None]:
# Remove every file listed for removal, for both the validation/test data
# (`s3_rm_file_vt`) and the clahe-only training data (`s3_rm_file_tc`).
# NOTE(review): the original cell read an undefined `s3_rm_file`; using both
# dictionaries is inferred from the validation cells above - confirm.
s3_rm_results = [augment_image.remove_file(files)
                 for rm_files in (s3_rm_file_vt, s3_rm_file_tc)
                 for files in rm_files.values()]

# Add up the outcome of every call instead of reporting only the last one.
print(f'Files removed: {sum(len(result["Success"]) for result in s3_rm_results)}',
        f'Files already removed: {sum(len(result["Not Found"]) for result in s3_rm_results)}',
        sep='\n')

del s3_rm_results

### Removing scenario one augmented directory

In [None]:
# Collect the 'train_augmented' folder of every dataset / fold of scenario one.
# The scenario folder is named explicitly: `scenario_names` starts at
# 'scenario_2', so taking its first entry would remove the scenario 2 output
# generated earlier in this notebook.
# NOTE(review): the folder name 'scenario_1' is inferred from the naming
# pattern of `scenario_names` - confirm it matches the dataset on disk.
s1_rmdir = [os.path.join(path_source,
                         'scenario_1',
                         dataset,
                         fold,
                         'train_augmented')
            for dataset in dataset_names
            for fold in fold_names]

result_status = augment_image.remove_dir(s1_rmdir)
print(f'Directory removed: {len(result_status["Success"])}',
        f'Directory already removed: {len(result_status["Not Found"])}',
        sep='\n')