# Notebook for dataset processing
Please specify the location of ODIR dataset and designated location for images and run all the codes.

In [6]:
ODIR_path = 'D:/Workdir/ODIR-5K'
img_path = 'D:/Workdir/522proj_data'

# Processing CSV file
Results are saved in this directory.

In [7]:
import pandas as pd
import os
xlsx = pd.read_excel(os.path.join(ODIR_path, 'data.xlsx'))

['ID',
 'Patient Age',
 'Patient Sex',
 'Left-Fundus',
 'Right-Fundus',
 'Left-Diagnostic Keywords',
 'Right-Diagnostic Keywords',
 'N',
 'D',
 'G',
 'C',
 'A',
 'H',
 'M',
 'O']

In [8]:
gender_onehot = pd.get_dummies(xlsx['Patient Sex'], dtype=int)
xlsx = xlsx.drop('Patient Sex', axis=1)
xlsx = xlsx.join(gender_onehot)
del gender_onehot

In [9]:
left_columns = ['ID',
 'Patient Age',
 'Female',
 'Male',
 'Left-Fundus',
 'Left-Diagnostic Keywords']
left_eye = xlsx[left_columns]
left_eye.rename(columns={
    'Left-Fundus': 'Fundus',
    'Left-Diagnostic Keywords': 'Keywords'
}, inplace=True)
left_eye['Left'] = 1
left_eye['Right'] = 0

right_columns = ['ID',
 'Patient Age',
 'Female',
 'Male',
 'Right-Fundus',
 'Right-Diagnostic Keywords']
right_eye = xlsx[right_columns]
right_eye.rename(columns={
    'Right-Fundus': 'Fundus',
    'Right-Diagnostic Keywords': 'Keywords'
}, inplace=True)
right_eye['Left'] = 0
right_eye['Right'] = 1

concat_eyes = pd.concat((left_eye, right_eye))
del left_eye, left_columns, right_eye, right_columns

def label_diagnosis(row):
    diagnosis = row['Keywords']
    Normal = Diabetes = Glaucoma = Cataract = Age_related = Hypertension = Pathological = Other = 0
    other_lst = ['post retinal laser surgery', 'occlusion', 'epiretinal membrane', 'spotted membranous change', 
                 'refractive media opacity', 'vitreous degeneration','white vessel', 'laser spot', 'tessellated fundus', 
                 'drusen', 'myelinated nerve fibers', 'chorioretinal atrophy', 'retinal pigmentation',
                 'retinitis pigmentosa', 'old chorioretinopathy', 'atrophic change', 'maculopathy',
                 'peripapillary atrophy', 'post laser photocoagulation', 'pigment',
                 'optic nerve atrophy', 'retinochoroidal coloboma', 'optic disc edema']
    null_lst = ['anterior segment image', 'image offset', 'low image quality',
                'lens dust', 'no fundus image', 'optic disk photographically invisible',
                ]
    
    diags = diagnosis.split('，')
    for diag in diags:
        normal_flag = True
        if diag in null_lst:
            continue
        if 'cataract' in diag: # cataract
            Cataract = 1
            normal_flag = False
        if 'hypertensive' in diag: # Hypertension
            Hypertension = 1
            normal_flag = False
        if 'glaucoma' in diag: # glaucoma
            Glaucoma = 1
            normal_flag = False
        if sum([1 if dia in diag else 0 for dia in ['nonproliferative', 'non proliferative', 'diabetic']]) > 0: # diabete
            Diabetes = 1
            normal_flag = False
        if 'age-related' in diag:# age-related
            Age_related = 1
            normal_flag = False
        if 'pathological' in diag: # pathological
            Pathological = 1
            normal_flag = False
        if 'myopi' in diag: # pathological
            Pathological = 1
            normal_flag = False
        if sum([1 if oth in diag else 0 for oth in other_lst]) > 0:
            Other = 1
            normal_flag = False

        if 'normal fundus' in diag and normal_flag: # normal
            Normal = 1
        elif normal_flag:
            Other = 1
    return Normal, Diabetes, Glaucoma, Cataract, Age_related, Hypertension, Pathological, Other

eyes_diag = concat_eyes.apply(label_diagnosis, axis=1, result_type='expand')
eyes_diag.rename(columns={
    i: ['Normal', 'Diabetes', 'Glaucoma', 'Cataract', 'Age_related', 'Hypertension', 'Pathological', 'Other'][int(i)] for i in eyes_diag.columns.to_list()
}, inplace=True)

concat_eyes = concat_eyes.drop('Keywords', axis=1)
concat_eyes = concat_eyes[['ID', 'Patient Age', 'Female', 'Male', 'Left', 'Right', 'Fundus']]
dataset = pd.concat((concat_eyes, eyes_diag), axis=1)
dataset.to_csv('dataset.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  left_eye.rename(columns={
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  left_eye['Left'] = 1
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  left_eye['Right'] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-v

# Processing Images
Please specify another path (different from this directory) to avoid uploading the whole dataset on GitHub.

In [None]:
import cv2
import matplotlib.pyplot as plt

def img_normalization(image_path, re_size=(2048, 2048)):
    image = cv2.imread(image_path)
    img = cv2.medianBlur(image, 5)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, filtered = cv2.threshold(gray, 10, 255, cv2.THRESH_BINARY)

    x_axis = filtered.max(axis=1)
    y_axis = filtered.max(axis=0)
    x_min = (x_axis>0).argmax()
    x_max = x_axis.shape[0] - (x_axis[::-1]>0).argmax()
    y_min = (y_axis>0).argmax()
    y_max = y_axis.shape[0] - (y_axis[::-1]>0).argmax()
    cutted = image[x_min:x_max, y_min:y_max, :]
    res_img = cv2.resize(cutted, re_size)
    return res_img

In [None]:
import os
def get_filenames(dir):
    paths = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            paths.append([root, file])
    return paths

test_paths = get_filenames(os.path.join(ODIR_path, 'Testing Images'))
train_paths = get_filenames(os.path.join(ODIR_path, 'Training Images'))

In [None]:
if not os.path.isdir(os.path.join(img_path, 'test')):
    os.mkdir(os.path.join(img_path, 'test'))
if not os.path.isdir(os.path.join(img_path, 'train')):
    os.mkdir(os.path.join(img_path, 'train'))

for path in test_paths:
    print(os.path.join(path[0], path[1]))
    img = img_normalization(os.path.join(path[0], path[1]))
    cv2.imwrite(os.path.join(img_path, 'test', path[1]), img)
for path in train_paths:
    print(os.path.join(path[0], path[1]))
    img = img_normalization(os.path.join(path[0], path[1]))
    cv2.imwrite(os.path.join(img_path, 'train', path[1]), img)