In [199]:
import pandas as pd
import os
import numpy as np
import random

# One shared seed for the whole notebook. Both NumPy's and Python's global
# random generators are initialised with it so repeated runs behave the same.
seed = 0
np.random.seed(seed)
random.seed(seed)

In [200]:
# Root folder of the dataset release; the label table is read from the
# 'labels_brset.csv' file located directly inside it.
BRSET_root = './data/brazilian-ophthalmological/1.0.1'
labels_csv_path = os.path.join(BRSET_root, 'labels_brset.csv')
BRSET_df = pd.read_csv(labels_csv_path)

In [203]:
# Add a binary 'abnormal' column: 0 when the columns at positions 20..32
# (iloc slice 20:33, end-exclusive) sum to zero for a row, otherwise 1.
# The row sums are computed in a single vectorized pass rather than by calling
# a Python lambda once per row; the resulting values are identical.
# NOTE(review): the slice is positional - presumably these are the per-finding
# label columns of labels_brset.csv; confirm against the CSV header.
BRSET_df['abnormal'] = BRSET_df.iloc[:, 20:33].sum(axis=1).ne(0).astype('int64')

In [204]:
# Subset of rows whose 'abnormal' indicator is 0.
BRSET_df_normal_original = BRSET_df.loc[BRSET_df['abnormal'].eq(0)]

In [None]:
# Number of rows in the normal subset.
print(BRSET_df_normal_original.shape[0])

In [None]:
# Subset of rows whose 'abnormal' indicator is 1.
BRSET_df_abnormal_original = BRSET_df.loc[BRSET_df['abnormal'].eq(1)]
print(len(BRSET_df_abnormal_original))

# From the abnormal rows, keep only those where every finding listed below is 0.
excluded_findings = [
    'macular_edema',
    'scar',
    'nevus',
    'vascular_occlusion',
    'hypertensive_retinopathy',
    'hemorrhage',
    'retinal_detachment',
    'other',
]
has_no_excluded_finding = BRSET_df_abnormal_original[excluded_findings].eq(0).all(axis=1)
BRSET_df_abnormal_original_no_others = BRSET_df_abnormal_original.loc[has_no_excluded_finding]
print(len(BRSET_df_abnormal_original_no_others))

In [207]:
# Validation set: 50 randomly drawn normal rows plus 50 randomly drawn rows from
# the filtered abnormal subset. Both draws use the shared seed as random_state.
BRSET_df_normal_val = BRSET_df_normal_original.sample(n=50, random_state=seed)
BRSET_df_abnormal_val = BRSET_df_abnormal_original_no_others.sample(n=50, random_state=seed)
BRSET_df_val = pd.concat([BRSET_df_normal_val, BRSET_df_abnormal_val])

# Build the output location from BRSET_root instead of repeating the literal
# path, and create the folder first: to_csv does not create missing directories.
benchread_dir = os.path.join(BRSET_root, 'BenchReAD')
os.makedirs(benchread_dir, exist_ok=True)
BRSET_df_val.to_csv(os.path.join(benchread_dir, 'valid.csv'), index=False)

In [None]:
# Number of rows in the combined validation set.
print(BRSET_df_val.shape[0])

In [209]:
# Drop the validation rows (matched by index label) from each pool so the
# remaining rows can be split into the training subsets.
normal_not_in_val = ~BRSET_df_normal_original.index.isin(BRSET_df_normal_val.index)
BRSET_df_normal = BRSET_df_normal_original.loc[normal_not_in_val]

abnormal_not_in_val = ~BRSET_df_abnormal_original_no_others.index.isin(BRSET_df_abnormal_val.index)
BRSET_df_abnormal_no_others = BRSET_df_abnormal_original_no_others.loc[abnormal_not_in_val]

In [None]:
# Remaining pool sizes after removing the validation rows, one per line.
print(len(BRSET_df_normal), len(BRSET_df_abnormal_no_others), sep='\n')

In [211]:
# Row counts of the two remaining pools; used to size the labeled subsets.
normal_num, abnormal_num = BRSET_df_normal.shape[0], BRSET_df_abnormal_no_others.shape[0]

In [None]:
# Labeled training set: a random third (integer division) of the remaining
# normal rows and a random third of the remaining abnormal rows, both drawn
# with the shared seed as random_state.
BRSET_df_normal_labeled = BRSET_df_normal.sample(n=normal_num // 3, random_state=seed)
print(len(BRSET_df_normal_labeled))

BRSET_df_abnormal_labeled = BRSET_df_abnormal_no_others.sample(n=abnormal_num // 3, random_state=seed)
print(len(BRSET_df_abnormal_labeled))

BRSET_df_train_labeled = pd.concat([BRSET_df_normal_labeled, BRSET_df_abnormal_labeled])
print(len(BRSET_df_train_labeled))

# Build the output location from BRSET_root instead of repeating the literal
# path, and create the folder first: to_csv does not create missing directories.
benchread_dir = os.path.join(BRSET_root, 'BenchReAD')
os.makedirs(benchread_dir, exist_ok=True)
BRSET_df_train_labeled.to_csv(os.path.join(benchread_dir, 'train_labeled.csv'), index=False)


In [None]:
# Unlabeled training set: every remaining normal / abnormal row that was not
# drawn into the labeled subset (rows are matched by index label).
BRSET_df_normal_unlabeled = BRSET_df_normal[~BRSET_df_normal.index.isin(BRSET_df_normal_labeled.index)]
print(len(BRSET_df_normal_unlabeled))
BRSET_df_abnormal_unlabeled = BRSET_df_abnormal_no_others[~BRSET_df_abnormal_no_others.index.isin(BRSET_df_abnormal_labeled.index)]
print(len(BRSET_df_abnormal_unlabeled))
BRSET_df_unlabeled = pd.concat([BRSET_df_normal_unlabeled, BRSET_df_abnormal_unlabeled])

# Build the output location from BRSET_root instead of repeating the literal
# path, and create the folder first: to_csv does not create missing directories.
benchread_dir = os.path.join(BRSET_root, 'BenchReAD')
os.makedirs(benchread_dir, exist_ok=True)
BRSET_df_unlabeled.to_csv(os.path.join(benchread_dir, 'train_unlabeled.csv'), index=False)
print(len(BRSET_df_unlabeled))

In [214]:
# Sanity check: the labeled, unlabeled and validation parts together must
# contain exactly as many rows as the pool they were carved from.
normal_parts = (BRSET_df_normal_labeled, BRSET_df_normal_unlabeled, BRSET_df_normal_val)
assert sum(len(part) for part in normal_parts) == len(BRSET_df_normal_original)

abnormal_parts = (BRSET_df_abnormal_labeled, BRSET_df_abnormal_unlabeled, BRSET_df_abnormal_val)
assert sum(len(part) for part in abnormal_parts) == len(BRSET_df_abnormal_original_no_others)
