In [1]:
import os
import sys
from dotenv import load_dotenv

load_dotenv()
sys.path.append(os.environ.get('PATH_CUSTOM_MODULES'))


import pandas as pd
import json

import data_prep

### Prepare the destination directory

In [4]:
label = ["normal", "glaukoma"]
dataset_names = ["rimone", 'g1020', 'refuge', 'papila']

# get source dataset paths
path_rimone = os.path.join(os.environ.get("PATH_DATASET_RIMONE"),
                           'partitioned_by_hospital')
path_g1020 = os.path.join(os.environ.get("PATH_DATASET_G1020"),
                          'Images')
path_refuge = os.path.join(os.environ.get("PATH_DATASET_REFUGE"),
                           'Images_Square')
path_papila = os.path.join(os.environ.get("PATH_DATASET_PAPILA"),
                           'FundusImages')

# get destination dataset path
path_dataset_destination = os.environ.get("PATH_DATASET_COMBINED")

In [8]:
result = data_prep.create_destination_directory(path=path_dataset_destination,
                                          dataset_names=dataset_names,
                                          label_names=label)
print(f"Success: {len(result['Success'])}",
      f"Already Exists: {len(result['Already Exists'])}",
      sep='\n')

Success: 0
Already Exists: 8


## RIM-ONE Dataset
### Getting the file name

In [4]:
file_names_raw_rimone = {
    "training_set": {
        "normal": [],
        "glaucoma": []
    },
    "test_set": {
        "normal": [],
        "glaucoma": []
    }
}

In [5]:
for dataset_name_key, dataset_name_value in file_names_raw_rimone.items():
    for label_name_key, label_name_value in dataset_name_value.items():
        file_names_raw_rimone[dataset_name_key][label_name_key] = data_prep.get_file_names(path=os.path.join(path_rimone,
                                                                                                            dataset_name_key,
                                                                                                            label_name_key))

### Validate the file count

In [6]:
for dataset_name_key, dataset_name_value in file_names_raw_rimone.items():
    for label_name_key, label_name_value in dataset_name_value.items():
        print(f"{dataset_name_key[:5]} {label_name_key}",
              f"Total: {len(label_name_value)}",
              sep="\n")

train normal
Total: 195
train glaucoma
Total: 116
test_ normal
Total: 118
test_ glaucoma
Total: 56


### Transform the raw data into cleaned one

In [7]:
file_names_cleaned_rimone = {
    label[0]: [],
    label[1]: []
}
for data_type_key, data_type_value in file_names_raw_rimone.items():
    for file_name_key, file_name_value in data_type_value.items():
        if file_name_key == "normal":
            file_names_cleaned_rimone[label[0]].extend(file_name_value)
        elif file_name_key == "glaucoma":
            file_names_cleaned_rimone[label[1]].extend(file_name_value)

### Check the validity

In [8]:
for label_name, label_file in file_names_cleaned_rimone.items():
    print(f"Total {label_name} : {len(label_file)}")

Total normal : 313
Total glaukoma : 172


### Copying the image

In [9]:
for dataset_name_key, dataset_name_value in file_names_raw_rimone.items():
    for label_name_key, label_name_value in dataset_name_value.items():
        if label_name_key == "normal":
            data_prep.copy_files(source_path=os.path.join(path_rimone,
                                                        dataset_name_key,
                                                        label_name_key),
                                destination_path=os.path.join(path_dataset_destination,
                                                            dataset_names[0],
                                                            label[0]),
                                file_names=label_name_value)
        elif label_name_key == "glaucoma":
            data_prep.copy_files(source_path=os.path.join(path_rimone,
                                                        dataset_name_key,
                                                        label_name_key),
                                destination_path=os.path.join(path_dataset_destination,
                                                            dataset_names[0],
                                                            label[1]),
                                file_names=label_name_value)

### Validate the file

In [12]:
for label_name in label:
    for file_name in data_prep.get_file_names(path=os.path.join(path_dataset_destination,
                                                                 dataset_names[0])):
        if file_name in file_names_cleaned_rimone[label_name]:
            pass
        else:
            print(f"{file_name} not found in {label_name}")

## G1020 Dataset
### Getting the file name

In [17]:
file_names_raw_g1020 = {
    label[0]: [],
    label[1]: []
}
metadata_g1020 = pd.read_csv(os.path.join(path_g1020, "../G1020.csv"))

In [18]:
metadata_g1020.head()

Unnamed: 0,imageID,binaryLabels
0,image_0.jpg,0
1,image_1.jpg,0
2,image_3.jpg,0
3,image_4.jpg,0
4,image_5.jpg,0


In [19]:
metadata_g1020.binaryLabels.value_counts()

binaryLabels
0    724
1    296
Name: count, dtype: int64

In [20]:
metadata_g1020.loc[metadata_g1020.binaryLabels == 0, "binaryLabels"] = label[0]
metadata_g1020.loc[metadata_g1020.binaryLabels == 1, "binaryLabels"] = label[1]
metadata_g1020.binaryLabels.value_counts()

  metadata_g1020.loc[metadata_g1020.binaryLabels == 0, "binaryLabels"] = label[0]


binaryLabels
normal      724
glaukoma    296
Name: count, dtype: int64

In [29]:
metadata_g1020.rename(columns={"imageID":"file name",
                               "binaryLabels":"label"},
                      inplace=True)
metadata_g1020.head()

Unnamed: 0,file name,label
0,image_0.jpg,normal
1,image_1.jpg,normal
2,image_3.jpg,normal
3,image_4.jpg,normal
4,image_5.jpg,normal


In [33]:
for row in metadata_g1020.iterrows():
    if row[1]['label'] == label[0]:
        file_names_raw_g1020[label[0]].append(row[1]['file name'])
    elif row[1]['label'] == label[1]:
        file_names_raw_g1020[label[1]].append(row[1]['file name'])

### Validate the file count

In [34]:
for key in file_names_raw_g1020.keys():
    print(f"Total {key} : {len(file_names_raw_g1020[key])}")

Total normal : 724
Total glaukoma : 296


### Copying the image

In [36]:
for label_name, label_file in file_names_raw_g1020.items():
    data_prep.copy_files(source_path=path_g1020,
                         destination_path=os.path.join(path_dataset_destination,
                                                     dataset_names[1],
                                                     label_name),
                         file_names=label_file)

### Validate the file

In [37]:
for label_name in label:
    for file_name in data_prep.get_file_names(path=os.path.join(path_dataset_destination,
                                                                 dataset_names[1])):
        if file_name in file_names_raw_g1020[label_name]:
            pass
        else:
            print(f"{file_name} not found in {label_name}")

## REFUGE Dataset
### Creating the matadata

In [10]:
print(path_refuge)

d:\\Programming\\Python\\Skripsi\\dataset\\REFUGE\Images_Square


In [5]:
file_names_raw_refuge = {
    label[0]: [],
    label[1]: []
}

meta_refuge_train = json.load(open(os.path.join(path_refuge, "../train/index.json")))
meta_refuge_val = json.load(open(os.path.join(path_refuge, "../val/index.json")))
meta_refuge_test = pd.read_csv(os.path.join(path_refuge, "../REFUGE1Test.csv"))

In [6]:
metadata_refuge_test_cleaned = meta_refuge_test[['imgName', 'label']].copy()
metadata_refuge_test_cleaned.rename(columns={"imgName":"nama file"},
                                    inplace=True)
metadata_refuge_test_cleaned['nama file'] = metadata_refuge_test_cleaned['nama file'].str.rsplit(pat='/', n=1, expand=True)[1]
metadata_refuge_test_cleaned.head()

Unnamed: 0,nama file,label
0,T0001.jpg,0
1,T0002.jpg,0
2,T0003.jpg,0
3,T0004.jpg,0
4,T0005.jpg,0


In [7]:
temp_metadata = {
    "nama file": [],
    "label": []
}

for metadata_temp in [meta_refuge_train, meta_refuge_val]:
    for key, value in metadata_temp.items():
        temp_metadata["nama file"].append(value['ImgName'])
        temp_metadata["label"].append(value['Label'])

In [8]:
metadata_refuge = pd.concat([pd.DataFrame(temp_metadata),
                             metadata_refuge_test_cleaned],
                             axis=0,
                             ignore_index=True)
metadata_refuge.loc[metadata_refuge.label == 0, "label"] = label[0]
metadata_refuge.loc[metadata_refuge.label == 1, "label"] = label[1]
metadata_refuge.head()

  metadata_refuge.loc[metadata_refuge.label == 0, "label"] = label[0]


Unnamed: 0,nama file,label
0,g0001.jpg,glaukoma
1,g0002.jpg,glaukoma
2,g0003.jpg,glaukoma
3,g0004.jpg,glaukoma
4,g0005.jpg,glaukoma


In [11]:
metadata_refuge.to_csv(os.path.join(path_refuge, "../metadata_REFUGE.csv"), index=False)

### Validate the file count

In [9]:
metadata_refuge.label.value_counts()

label
normal      1080
glaukoma     120
Name: count, dtype: int64

### Transform the raw data into the prefered structures

In [163]:
for row in metadata_refuge.iterrows():
    if row[1]['label'] == label[0]:
        file_names_raw_refuge[label[0]].append(row[1]['nama file'])
    elif row[1]['label'] == label[1]:
        file_names_raw_refuge[label[1]].append(row[1]['nama file'])

### Check the validity

In [164]:
for label_name, label_file in file_names_raw_refuge.items():
    print(f"Total {label_name} : {len(label_file)}")

Total normal : 1080
Total glaukoma : 120


### Copying the image

In [165]:
for label_name, label_file in file_names_raw_refuge.items():
    data_prep.copy_files(source_path=path_refuge,
                         destination_path=os.path.join(path_dataset_destination,
                                                     dataset_names[2],
                                                     label_name),
                         file_names=label_file)

### Validate the file

In [166]:
for label_name in label:
    for file_name in data_prep.get_file_names(path=os.path.join(path_dataset_destination,
                                                                 dataset_names[2])):
        if file_name in file_names_raw_refuge[label_name]:
            pass
        else:
            print(f"{file_name} not found in {label_name}")

## PAPILA Dataset
### Creating metadata

In [12]:
file_names_raw_papila = {
    label[0]: [],
    label[1]: []
}

metadata_papila_od_raw = pd.read_excel(os.path.join(path_papila, "../ClinicalData/patient_data_od.xlsx"))
metadata_papila_os_raw = pd.read_excel(os.path.join(path_papila, "../ClinicalData/patient_data_os.xlsx"))

In [13]:
metadata_papila_od = metadata_papila_od_raw[['Unnamed: 0', 'Diagnosis']].copy()

metadata_papila_od.drop(index=[0,1], inplace=True)
metadata_papila_od.reset_index(inplace=True, drop=True)
metadata_papila_od.rename(columns={"Unnamed: 0": "id",
                                   "Diagnosis": "label"}, inplace=True)

metadata_papila_od['file name'] = "RET" + metadata_papila_od.id.str.replace('#', '') + "OD.jpg"
metadata_papila_od['eye side'] = "right eye (OD)"

metadata_papila_od.drop(columns=['id'], inplace=True)

metadata_papila_od.head()

Unnamed: 0,label,file name,eye side
0,2,RET002OD.jpg,right eye (OD)
1,1,RET004OD.jpg,right eye (OD)
2,1,RET005OD.jpg,right eye (OD)
3,2,RET006OD.jpg,right eye (OD)
4,2,RET007OD.jpg,right eye (OD)


In [14]:
metadata_papila_os = metadata_papila_os_raw[['Unnamed: 0', 'Diagnosis']].copy()

metadata_papila_os.drop(index=[0,1], inplace=True)
metadata_papila_os.reset_index(inplace=True, drop=True)
metadata_papila_os.rename(columns={"Unnamed: 0": "id",
                                   "Diagnosis": "label"}, inplace=True)

metadata_papila_os['file name'] = "RET" + metadata_papila_os.id.str.replace('#', '') + "OS.jpg"
metadata_papila_os['eye side'] = "left eye (OS)"

metadata_papila_os.drop(columns=['id'], inplace=True)

metadata_papila_os.head()

Unnamed: 0,label,file name,eye side
0,2,RET002OS.jpg,left eye (OS)
1,1,RET004OS.jpg,left eye (OS)
2,1,RET005OS.jpg,left eye (OS)
3,2,RET006OS.jpg,left eye (OS)
4,2,RET007OS.jpg,left eye (OS)


In [15]:
metadata_papila = pd.concat([metadata_papila_od, metadata_papila_os],
                            axis=0,
                            ignore_index=True)
metadata_papila.loc[metadata_papila.label == 0, 'label'] = label[0]
metadata_papila.loc[metadata_papila.label == 1, 'label'] = label[1]
metadata_papila.loc[metadata_papila.label == 2, 'label'] = "suspect"
metadata_papila.head()

Unnamed: 0,label,file name,eye side
0,suspect,RET002OD.jpg,right eye (OD)
1,glaukoma,RET004OD.jpg,right eye (OD)
2,glaukoma,RET005OD.jpg,right eye (OD)
3,suspect,RET006OD.jpg,right eye (OD)
4,suspect,RET007OD.jpg,right eye (OD)


In [17]:
metadata_papila.to_csv(os.path.join(path_papila, "../metadata_papila.csv"), index=False)

### Validate the file count

In [115]:
pd.DataFrame(metadata_papila[['eye side', 'label']].groupby(['label', 'eye side']).size()).rename(columns={0: "image"})

Unnamed: 0_level_0,Unnamed: 1_level_0,image
label,eye side,Unnamed: 2_level_1
glaukoma,left eye (OS),47
glaukoma,right eye (OD),40
normal,left eye (OS),163
normal,right eye (OD),170
suspect,left eye (OS),34
suspect,right eye (OD),34


In [116]:
metadata_papila.loc[metadata_papila.label == "suspect", "label"] = label[1]
metadata_papila.label.value_counts()

label
normal      333
glaukoma    155
Name: count, dtype: int64

### Getting the file name

In [117]:
for row in metadata_papila.iterrows():
    if row[1]['label'] == label[0]:
        file_names_raw_papila[label[0]].append(row[1]['file name'])
    elif row[1]['label'] == label[1]:
        file_names_raw_papila[label[1]].append(row[1]['file name'])

### Validate the file

In [118]:
for key in file_names_raw_papila.keys():
    print(f"Total {key} : {len(file_names_raw_papila[key])}")

Total normal : 333
Total glaukoma : 155


### Copying the image

In [120]:
for label_name, label_file in file_names_raw_papila.items():
    data_prep.copy_files(source_path=path_papila,
                         destination_path=os.path.join(path_dataset_destination,
                                                     dataset_names[3],
                                                     label_name),
                         file_names=label_file)

### Validate the file

In [121]:
for label_name in label:
    for file_name in data_prep.get_file_names(path=os.path.join(path_dataset_destination,
                                                                 dataset_names[3])):
        if file_name in file_names_raw_papila[label_name]:
            pass
        else:
            print(f"{file_name} not found in {label_name}")