In [1]:
# Standard library.
import os
import sys
# python-dotenv; load_dotenv() is expected to populate os.environ from a local .env file.
from dotenv import load_dotenv

# Runs before any os.environ lookup so the PATH_* settings read below are available.
load_dotenv()
# Extend the import search path with the directory named by PATH_CUSTOM_MODULES.
# NOTE(review): if PATH_CUSTOM_MODULES is unset, os.environ.get returns None and None is
# appended to sys.path; presumably `import data_prep` below then fails unless the module
# is importable some other way -- confirm.
sys.path.append(os.environ.get('PATH_CUSTOM_MODULES'))

# NOTE(review): StratifiedKFold is not referenced in the visible cells -- confirm it is still needed.
from sklearn.model_selection import StratifiedKFold
import pandas as pd

# Project-local helper module; presumably resolved through the sys.path entry added above.
import data_prep

### Prepare all the variables

In [2]:
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        KeyError: if the variable is not set. Failing here, with the variable
            name in the message, avoids an opaque ``TypeError`` from
            ``os.path.join(None, ...)`` later on -- possibly only after the
            long-running file copy has already finished.
    """
    value = os.environ.get(name)
    if value is None:
        raise KeyError(f'environment variable {name} is not set '
                       '(check the .env file)')
    return value


# Root directory under which the per-scenario, per-dataset splits are written.
path_destination = _require_env('PATH_DATASET_DESTINATION')

# Every source dataset lives in its own sub-directory of one combined root.
_path_combined = _require_env('PATH_DATASET_COMBINED')
path_source_rimone = os.path.join(_path_combined, 'rimone')
path_source_g1020 = os.path.join(_path_combined, 'g1020')
path_source_refuge = os.path.join(_path_combined, 'refuge')
path_source_papila = os.path.join(_path_combined, 'papila')

# Directory that receives the split summary CSV (split_result.csv).
path_data = _require_env('PATH_DATA_DASHBOARD')

In [3]:
# Dataset identifiers; the order matters because later cells index into this list.
dataset_names = [
    'rimone',
    'g1020',
    'refuge',
    'papila',
]

# Class labels; each label is also the name of a sub-directory inside every dataset folder.
dataset_labels = [
    'normal',
    'glaukoma',
]

### Get the file names
The file names are stored as nested dictionaries (dataset → label), with a list of file names at each leaf:
- file_names
  - rimone
    - normal
    - glaukoma
  - g1020
    - normal
    - glaukoma
  - refuge
    - normal
    - glaukoma
  - papila
    - normal
    - glaukoma

In [4]:
# Source directories, listed in the same order as `dataset_names`.
source_paths = [path_source_rimone,
                path_source_g1020,
                path_source_refuge,
                path_source_papila]

# file_names[dataset][label] -> whatever data_prep.get_file_names returns for
# the "<dataset source dir>/<label>" directory.
file_names = {
    name: {
        label: data_prep.get_file_names(path=os.path.join(source_dir, label))
        for label in dataset_labels
    }
    for name, source_dir in zip(dataset_names, source_paths)
}

# Keep the notebook namespace free of this cell's temporary.
del source_paths

### Validate the file count

In [5]:
# Report the number of files per label, then the per-dataset total.
for ds_name, per_label in file_names.items():
    sizes = {label: len(names) for label, names in per_label.items()}
    for label, n_files in sizes.items():
        print(f'{ds_name} {label} : {n_files}')
    # end='\n\n' leaves a blank line between datasets.
    print(f'total: {sum(sizes.values())}', end='\n\n')

# Drop the loop temporaries so they do not linger in the notebook namespace.
del ds_name, per_label, sizes, label, n_files

rimone normal : 313
rimone glaukoma : 172
total: 485

g1020 normal : 724
g1020 glaukoma : 296
total: 1020

refuge normal : 1080
refuge glaukoma : 120
total: 1200

papila normal : 333
papila glaukoma : 155
total: 488



### Prepare the data-splitting process

In [6]:
# Split ratios handed to data_prep.split_file; how each fraction is applied
# (to the whole set or to the remainder) is decided inside that helper.
test_size = 0.2  # fraction for the test split
val_size = 0.1   # fraction for the validation split

In [7]:
# Build one file-name table per dataset via data_prep.fname_dict_df.
# The targets are unpacked in the order of `dataset_names`
# (rimone, g1020, refuge, papila), so that list must hold exactly these four.
rimone_file, g1020_file, refuge_file, papila_file = (
    data_prep.fname_dict_df(file_names=file_names[name],
                            label_names=dataset_labels)
    for name in dataset_names
)

In [8]:
# Column-oriented log of the splitting run: one key per column, each starting
# with its own empty list that the split loop appends to.
result_dict = {
    column: []
    for column in ("scenario",
                   "dataset",
                   "fold",
                   "data_type",
                   "label",
                   "success file",
                   "failed file")
}

### Split the data

In [9]:
# Split every dataset once per scenario and log, per fold / data type / label,
# how many files data_prep.split_file reported as 'Success' and 'Already Exists'.

# Start from an empty log so that re-running this cell does not append
# duplicate rows to the lists created in the previous cell.
for logged_values in result_dict.values():
    logged_values.clear()

# (dataset name, file-name table, source directory) for each dataset.
split_jobs = list(zip(dataset_names,
                      [rimone_file, g1020_file, refuge_file, papila_file],
                      [path_source_rimone, path_source_g1020,
                       path_source_refuge, path_source_papila]))

for scenario in range(1, 6):
    for dataset_name, dataset_file, path_dataset in split_jobs:
        # NOTE(review): random_state is the same for every scenario; unless
        # split_file varies its behaviour by destination, all scenarios may
        # receive identical splits -- confirm this is intended.
        result = data_prep.split_file(val_size=val_size, test_size=test_size,
                                      random_state=1915026018,
                                      df_file_name=dataset_file,
                                      source_path=path_dataset,
                                      destination_path=os.path.join(path_destination,
                                                                    f'scenario_{scenario}',
                                                                    dataset_name))
        for fold_name, fold_result in result.items():
            for data_type, files_by_label in fold_result['copy files'].items():
                # The loop variable is deliberately NOT called `file_names`:
                # that name holds the notebook-level dict of source file names
                # and must not be overwritten here.
                for label_name, copy_status in files_by_label.items():
                    result_dict['scenario'].append(scenario)
                    result_dict['dataset'].append(dataset_name)
                    result_dict['fold'].append(fold_name)
                    result_dict['data_type'].append(data_type)
                    result_dict['label'].append(label_name)
                    result_dict['success file'].append(len(copy_status['Success']))
                    # Files reported under 'Already Exists' are counted in the
                    # 'failed file' column.
                    result_dict['failed file'].append(len(copy_status['Already Exists']))

### Validate the file count

In [10]:
# Turn the raw log into a summary table: total files handled per row and the
# percentage of them that were reported as successful, then drop the raw counts.
split_result = (
    pd.DataFrame(result_dict)
    .assign(
        total_file=lambda frame: frame['success file'] + frame['failed file'],
        success_rate=lambda frame: frame['success file'] / frame['total_file'] * 100,
    )
    .drop(columns=['success file', 'failed file'])
)

# Persist the summary next to the other dashboard data, without the index column.
split_result.to_csv(os.path.join(path_data, 'split_result.csv'), index=False)
split_result

Unnamed: 0,scenario,dataset,fold,data_type,label,total_file,success_rate
0,1,rimone,fold 1,train,normal,225,100.0
1,1,rimone,fold 1,train,glaukoma,124,100.0
2,1,rimone,fold 1,val,normal,25,100.0
3,1,rimone,fold 1,val,glaukoma,14,100.0
4,1,rimone,fold 1,test,normal,63,100.0
...,...,...,...,...,...,...,...
595,5,papila,fold 5,train,glaukoma,111,100.0
596,5,papila,fold 5,val,normal,27,100.0
597,5,papila,fold 5,val,glaukoma,13,100.0
598,5,papila,fold 5,test,normal,66,100.0


In [15]:
# Sanity check: for scenario 1 / fold 5, the train + val + test counts of each
# dataset and label should add up to the source file counts printed earlier.
(
    split_result
    .loc[lambda frame: (frame['fold'] == 'fold 5') & (frame['scenario'] == 1)]
    .groupby(by=['dataset', 'label'], as_index=False)
    .agg({'total_file': 'sum'})
)

Unnamed: 0,dataset,label,total_file
0,g1020,glaukoma,296
1,g1020,normal,724
2,papila,glaukoma,155
3,papila,normal,333
4,refuge,glaukoma,120
5,refuge,normal,1080
6,rimone,glaukoma,172
7,rimone,normal,313
