In [1]:
import os
import sys
from dotenv import load_dotenv

load_dotenv()
sys.path.append(os.environ.get('PATH_CUSTOM_MODULES'))

import pandas as pd
import time

import data_prep

### Prepare all the variables

In [2]:
# Resolve every directory this notebook reads from or writes to.
# All locations come from environment variables. They are read with
# os.environ[...] instead of os.environ.get(...) so that a missing variable
# fails right here with a KeyError naming it, rather than later as an opaque
# TypeError inside os.path.join or only after the long-running split finished.
path_destination = os.environ['PATH_DATASET_DESTINATION']

# Root folder that holds one sub-folder per source dataset.
path_dataset_combined = os.environ['PATH_DATASET_COMBINED']
path_source_rimone = os.path.join(path_dataset_combined, 'rimone')
path_source_g1020 = os.path.join(path_dataset_combined, 'g1020')
path_source_refuge = os.path.join(path_dataset_combined, 'refuge')
path_source_papila = os.path.join(path_dataset_combined, 'papila')

# Folder that receives the CSV summaries written at the end of the notebook.
path_data = os.environ['PATH_DATA_DASHBOARD']

In [3]:
# Source datasets, in the order used everywhere else in this notebook.
dataset_names = [
    'rimone',
    'g1020',
    'refuge',
    'papila',
]

# Class labels; each one is also the name of a sub-folder inside a dataset.
dataset_labels = [
    'normal',
    'glaukoma',
]

### Get the file names
The file names are stored in a nested dictionary whose innermost values are lists of file names:
- file_names
  - rimone
    - normal
    - glaukoma
  - g1020
    - normal
    - glaukoma
  - refuge
    - normal
    - glaukoma
  - papila
    - normal
    - glaukoma

In [4]:
# Build {dataset name: {label: file names}} by listing each label folder of
# each source dataset. The source paths are paired with dataset_names by
# position, so both sequences must stay in the same order.
file_names = {
    name: {
        label: data_prep.get_file_names(path=os.path.join(source_dir, label))
        for label in dataset_labels
    }
    for name, source_dir in zip(dataset_names,
                                [path_source_rimone,
                                 path_source_g1020,
                                 path_source_refuge,
                                 path_source_papila])
}

### Validate the file count

In [5]:
# Print the number of files per label for every dataset, followed by the
# dataset total, so the counts can be compared with the expected sizes.
for name, files_by_label in file_names.items():
    counts = {label: len(files) for label, files in files_by_label.items()}
    for label, count in counts.items():
        print(f'{name} {label} : {count}')
    print(f'total: {sum(counts.values())}', end='\n\n')

# Remove the loop temporaries from the notebook namespace.
del name, files_by_label, counts, label, count

rimone normal : 313
rimone glaukoma : 172
total: 485

g1020 normal : 724
g1020 glaukoma : 296
total: 1020

refuge normal : 1080
refuge glaukoma : 120
total: 1200

papila normal : 333
papila glaukoma : 155
total: 488



### Prepare the data-splitting process

In [6]:
# Split ratios.
# The test set takes 20% of each dataset.
test_size = 0.2
# The validation set should be 10% of each dataset. Expressed as a share of
# the 80% that remains once the test set is removed: 0.10 / 0.80 = 0.125.
val_size = 0.125

In [7]:
# Convert the file-name dictionary of each dataset with data_prep.fname_dict_df.
# The results are unpacked in the order of dataset_names
# (rimone, g1020, refuge, papila).
rimone_file, g1020_file, refuge_file, papila_file = (
    data_prep.fname_dict_df(file_names=file_names[name],
                            label_names=dataset_labels)
    for name in dataset_names
)

In [8]:
# Column-oriented log of the split: every key is a column name and gets its
# own empty list that the split loop appends to, one entry per logged row.
result_dict = {
    column: []
    for column in ("scenario",
                   "dataset",
                   "fold",
                   "data_type",
                   "label",
                   "success file",
                   "failed file")
}

### Split the data

In [9]:
# Run the split once per scenario (1-3) for every dataset, writing into
# <path_destination>/scenario_<n>/<dataset>, and log for every
# fold / data type / label how many files were reported under 'Success'
# and how many under 'Already Exists' (the latter is logged as "failed file").
# NOTE(review): the same random_state is passed for every scenario, so the
# scenarios presumably share identical splits - confirm in data_prep.split_file.
for scenario in range(1, 4):
    for dataset_name, dataset_file, path_dataset in zip(dataset_names,
                                                        [rimone_file, g1020_file, refuge_file, papila_file],
                                                        [path_source_rimone, path_source_g1020, path_source_refuge, path_source_papila]):
        start_time = time.perf_counter()
        print(f'split scenario {scenario} dataset {dataset_name} ...')
        result = data_prep.split_file(val_size=val_size, test_size=test_size,
                                    random_state=1915026018,
                                    df_file_name=dataset_file,
                                    source_path=path_dataset,
                                    destination_path=os.path.join(path_destination,
                                                                    f'scenario_{scenario}',
                                                                    dataset_name))
        for folds, dataset_per_fold_result in result.items():
            for data_type, data_label_value in dataset_per_fold_result['copy files'].items():
                # The loop variable must not be called `file_names`: that name
                # belongs to the notebook-level dictionary of source file names,
                # and reusing it here would overwrite (and then delete) it,
                # breaking any re-run of the earlier cells.
                for label_name, copy_status in data_label_value.items():
                    result_dict['scenario'].append(scenario)
                    result_dict['dataset'].append(dataset_name)
                    result_dict['fold'].append(folds)
                    result_dict['data_type'].append(data_type)
                    result_dict['label'].append(label_name)
                    result_dict['success file'].append(len(copy_status['Success']))
                    result_dict['failed file'].append(len(copy_status['Already Exists']))
        end_time = time.perf_counter()
        print(f'splitting completed in {end_time - start_time: .2f} seconds')

# Remove only the loop temporaries from the notebook namespace.
del scenario, dataset_name, dataset_file, path_dataset
del start_time, end_time, result
del folds, dataset_per_fold_result, data_type, data_label_value, label_name, copy_status

split scenario 1 dataset rimone ...


splitting completed in  6.23 seconds
split scenario 1 dataset g1020 ...
splitting completed in  31.81 seconds
split scenario 1 dataset refuge ...
splitting completed in  27.56 seconds
split scenario 1 dataset papila ...
splitting completed in  11.76 seconds
split scenario 2 dataset rimone ...
splitting completed in  8.11 seconds
split scenario 2 dataset g1020 ...
splitting completed in  22.17 seconds
split scenario 2 dataset refuge ...
splitting completed in  22.30 seconds
split scenario 2 dataset papila ...
splitting completed in  7.44 seconds
split scenario 3 dataset rimone ...
splitting completed in  6.62 seconds
split scenario 3 dataset g1020 ...
splitting completed in  28.91 seconds
split scenario 3 dataset refuge ...
splitting completed in  41.00 seconds
split scenario 3 dataset papila ...
splitting completed in  15.00 seconds


### Validate the file count after splitting

In [10]:
# Summarise the split log: one row per logged entry with the total number of
# files ("success file" + "failed file") and the percentage of them that were
# counted as successful. The two raw count columns are dropped afterwards.
split_result = (
    pd.DataFrame(result_dict)
    .assign(
        total_file=lambda log: log['success file'] + log['failed file'],
        success_rate=lambda log: log['success file'] / log['total_file'] * 100,
    )
    .drop(columns=['success file', 'failed file'])
)

# Save the summary next to the other dashboard data, then display it.
split_result.to_csv(os.path.join(path_data, 'split_result.csv'), index=False)
split_result

Unnamed: 0,scenario,dataset,fold,data_type,label,total_file,success_rate
0,1,rimone,fold 1,train,normal,218,100.0
1,1,rimone,fold 1,train,glaukoma,121,100.0
2,1,rimone,fold 1,val,normal,32,100.0
3,1,rimone,fold 1,val,glaukoma,17,100.0
4,1,rimone,fold 1,test,normal,63,100.0
...,...,...,...,...,...,...,...
355,3,papila,fold 5,train,glaukoma,109,100.0
356,3,papila,fold 5,val,normal,34,100.0
357,3,papila,fold 5,val,glaukoma,15,100.0
358,3,papila,fold 5,test,normal,66,100.0


In [11]:
# Spot check: total files per dataset, label and data type
# for fold 4 of scenario 1.
is_fold4_scenario1 = (split_result.fold == 'fold 4') & (split_result.scenario == 1)
(
    split_result.loc[is_fold4_scenario1]
    .groupby(by=['dataset', 'label', 'data_type'], as_index=False)
    .agg({'total_file': 'sum'})
)

Unnamed: 0,dataset,label,data_type,total_file
0,g1020,glaukoma,test,59
1,g1020,glaukoma,train,208
2,g1020,glaukoma,val,29
3,g1020,normal,test,145
4,g1020,normal,train,506
5,g1020,normal,val,73
6,papila,glaukoma,test,31
7,papila,glaukoma,train,109
8,papila,glaukoma,val,15
9,papila,normal,test,66


In [12]:
# Lookup table describing each scenario: one row per scenario, with the
# colour schema and a 0/1 value for every aug_* column.
scenario_guide_columns = [
    'scenario', 'color_schema',
    'aug_resize', 'aug_rescale',
    'aug_horizontal_flip', 'aug_vertical_flip',
    'aug_rotation', 'aug_brightness', 'aug_contrast_clahe',
]
scenario_guide_rows = [
    (1, 'rgb', 1, 1, 0, 0, 0, 0, 0),
    (2, 'rgb', 1, 1, 1, 1, 1, 1, 0),
    (3, 'rgb', 1, 1, 1, 1, 1, 1, 1),
]
scenario_guide = pd.DataFrame(scenario_guide_rows, columns=scenario_guide_columns)
scenario_guide

Unnamed: 0,scenario,color_schema,aug_resize,aug_rescale,aug_horizontal_flip,aug_vertical_flip,aug_rotation,aug_brightness,aug_contrast_clahe
0,1,rgb,1,1,0,0,0,0,0
1,2,rgb,1,1,1,1,1,1,0
2,3,rgb,1,1,1,1,1,1,1


In [13]:
# Save the scenario lookup table as CSV (without the index column)
# in the dashboard data folder.
scenario_guide_csv = os.path.join(path_data, 'scenario_guide.csv')
scenario_guide.to_csv(scenario_guide_csv, index=False)