In [24]:
%%capture
!pip install openpyxl

In [25]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
from tqdm import tqdm

In [26]:
# Spreadsheet with the per-patient metadata, eye keywords and label columns.
DATA_PATH = 'archive/ODIR-5K/ODIR-5K/data.xlsx'
# Directory the source fundus photographs are read from (by file name).
IMG_DIR = 'archive/ODIR-5K/ODIR-5K/Training Images/'

In [27]:
# Load the annotation spreadsheet. The shape is printed explicitly because
# only the cell's last expression (the head preview) is displayed on its own.
main_df = pd.read_excel(DATA_PATH)
print(main_df.shape)
main_df.head()

(3500, 15)


Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1


In [28]:
# Side length, in pixels, of the square images produced by preprocess_image.
IMG_SIZE = 128

In [34]:
def crop(image):
    """Trim black left/right borders, then centre-crop to a square.

    Args:
        image: array indexed as (row, column) or (row, column, channel).
            Border pixels are expected to be exactly zero, i.e. the image
            must already be normalized.

    Returns:
        A slice (view) of ``image`` whose height and width are equal.

    Notes:
        * Only the leading and trailing all-black columns are removed; an
          all-black column inside the picture is kept so the content is not
          squeezed together.
        * If the image contains no non-zero pixel there is nothing to trim
          against, so the columns are left untouched instead of collapsing
          the image to zero width.
    """
    # A column counts as content when any pixel in it, in any channel, is
    # non-zero. Reduce over every axis except the column axis (axis 1).
    other_axes = tuple(ax for ax in range(image.ndim) if ax != 1)
    content_cols = np.flatnonzero((image != 0).any(axis=other_axes))
    if content_cols.size:
        image = image[:, content_cols[0]:content_cols[-1] + 1]

    # Crop to a square shape, centred along the longer dimension.
    h, w = image.shape[:2]
    if h < w:
        x = (w - h) // 2
        image = image[:, x:x + h]
    elif h > w:
        x = (h - w) // 2
        image = image[x:x + w]

    return image

In [49]:
def preprocess_image(file_name):
    """Load one image from IMG_DIR and return it normalized, cropped, resized.

    Args:
        file_name: name of the image file, relative to ``IMG_DIR``.

    Returns:
        The image as an ``IMG_SIZE`` x ``IMG_SIZE`` array.

    Raises:
        FileNotFoundError: if the file is missing or OpenCV cannot decode it.
    """
    path = os.path.join(IMG_DIR, file_name)
    image = cv2.imread(path)
    # cv2.imread does not raise on failure; it returns None, which would
    # otherwise surface later as an unrelated AttributeError.
    if image is None:
        raise FileNotFoundError('Could not read image: ' + path)

    # Min-max stretch to the 0..255 range. Passing None lets OpenCV allocate
    # the output itself, so no placeholder buffer is needed.
    norm_img = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX)

    image = crop(norm_img)
    image = cv2.resize(image, (IMG_SIZE, IMG_SIZE))

    return image

def preprocess_patient(patient_id):
    """Preprocess the image whose file name is ``<patient_id>.jpg``.

    ``patient_id`` is converted with ``str`` and given a ``.jpg`` suffix; the
    resulting name is handed to ``preprocess_image`` and its result returned.
    """
    return preprocess_image(str(patient_id) + '.jpg')

Create image files

In [50]:
# Preprocess both eyes of every row in main_df and keep the results in
# memory, keyed by "<ID>_left" / "<ID>_right" (the file name without ".jpg").
images = {}
# Iterate the ID column directly rather than materialising a full row with
# iloc on every step just to read one field.
for patient_id in tqdm(main_df['ID']):
    for side in ('left', 'right'):
        eye_key = str(patient_id) + '_' + side
        images[eye_key] = preprocess_patient(eye_key)

100%|██████████████████████████████████████████████████████████████████████████████| 3500/3500 [18:21<00:00,  3.18it/s]


In [55]:
output_dir = "ocular2"
img_dir = os.path.join(output_dir, 'images')
# Create the output tree up front so the notebook runs from a clean checkout:
# cv2.imwrite does not create missing directories, it only returns False.
os.makedirs(img_dir, exist_ok=True)

In [56]:
# Sanity check: show what the output directory currently contains.
os.listdir(output_dir)

['images']

In [57]:
# Write every preprocessed image to img_dir as "<key>.jpg".
for key, image in tqdm(images.items()):
    out_file_path = os.path.join(img_dir, str(key) + '.jpg')
    # cv2.imwrite reports failure through its return value, not an exception;
    # fail loudly instead of silently skipping the file.
    if not cv2.imwrite(out_file_path, image):
        raise IOError('Failed to write image: ' + out_file_path)

100%|█████████████████████████████████████████████████████████████████████████████| 7000/7000 [00:13<00:00, 514.93it/s]


In [58]:
# Count every file below img_dir (os.walk recurses into subdirectories).
total_files = sum(len(files) for _, _, files in os.walk(img_dir))

total_files

7000

Create CSV

In [18]:
# fix the comma-like symbol: replace the full-width comma used in the keyword
# columns with an ordinary ", " separator.
# Done column-wise so it does not depend on the row index being 0..n-1 and
# so missing cells are passed through instead of raising.
for col in ('Left-Diagnostic Keywords', 'Right-Diagnostic Keywords'):
    # regex=False: treat the pattern as a literal character, not a regex.
    main_df[col] = main_df[col].str.replace('，', ', ', regex=False)

main_df.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0
2,2,42,Male,2_left.jpg,2_right.jpg,"laser spot, moderate non proliferative retinop...",moderate non proliferative retinopathy,0,1,0,0,0,0,0,1
3,3,66,Male,3_left.jpg,3_right.jpg,normal fundus,branch retinal artery occlusion,0,0,0,0,0,0,0,1
4,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1


In [19]:
# Save the table as data.csv in output_dir; index=False leaves the
# DataFrame's row index out of the file.
main_df.to_csv(os.path.join(output_dir, 'data.csv'), index=False)

In [20]:
# Sanity check: list the output directory after writing data.csv.
os.listdir(output_dir)

['data.csv', 'images']

In [21]:
# Read the exported CSV back and preview its last rows as a round-trip check.
csv_path = os.path.join(output_dir, 'data.csv')
df = pd.read_csv(csv_path)
df.tail()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O
3495,4686,63,Male,4686_left.jpg,4686_right.jpg,severe nonproliferative retinopathy,proliferative diabetic retinopathy,0,1,0,0,0,0,0,0
3496,4688,42,Male,4688_left.jpg,4688_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0
3497,4689,54,Male,4689_left.jpg,4689_right.jpg,mild nonproliferative retinopathy,normal fundus,0,1,0,0,0,0,0,0
3498,4690,57,Male,4690_left.jpg,4690_right.jpg,mild nonproliferative retinopathy,mild nonproliferative retinopathy,0,1,0,0,0,0,0,0
3499,4784,58,Male,4784_left.jpg,4784_right.jpg,"hypertensive retinopathy, age-related macular ...","hypertensive retinopathy, age-related macular ...",0,0,0,0,1,1,0,0
