In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import cv2
from turbojpeg import TurboJPEG
import os
import sys
import joblib
sys.path.append('/home/simon/Code/MasterThesis/project/include')
import utils as utl

In [8]:
DATA_BASE = '/data/simon/Datasets/'
OUTPUT_PATH = 'processed_messidor2'
OUTPUT_RESOLUTION = (700, 700)

labels = pd.read_csv(os.path.join(DATA_BASE, 'processed_messidor2.csv'))
labels.head()

Unnamed: 0,image_id,adjudicated_dr_grade,adjudicated_dme,adjudicated_gradable
0,20051020_43808_0100_PP.png,0.0,0.0,1
1,20051020_43832_0100_PP.png,1.0,0.0,1
2,20051020_43882_0100_PP.png,1.0,0.0,1
3,20051020_43906_0100_PP.png,2.0,1.0,1
4,20051020_44261_0100_PP.png,0.0,0.0,1


In [3]:
labels['severity'] = labels.adjudicated_dr_grade.map(lambda v: 0 if v <= 1 else 1)
labels['image'] = labels.image_id.map(lambda v: os.path.join(DATA_BASE, 'messidor2/', v))

print(labels.adjudicated_dr_grade.unique())
print(labels.severity.unique())

labels.tail()

[ 0.  1.  2.  3.  4. nan]
[0 1]


Unnamed: 0,image_id,adjudicated_dr_grade,adjudicated_dme,adjudicated_gradable,severity,image
1743,IM004806.jpg,0.0,0.0,1,0,/data/simon/messidor2/IM004806.jpg
1744,IM004811.jpg,1.0,0.0,1,0,/data/simon/messidor2/IM004811.jpg
1745,IM004812.jpg,2.0,0.0,1,1,/data/simon/messidor2/IM004812.jpg
1746,IM004831.jpg,0.0,0.0,1,0,/data/simon/messidor2/IM004831.jpg
1747,IM004832.jpg,0.0,0.0,1,0,/data/simon/messidor2/IM004832.jpg


## Adjust image size and type
- crop black borders
- resize to 300x300
- change to png

In [4]:
print('Table size before filtering: ', len(labels))

def crop_image(image, tolerance=20):
    if len(image.shape) == 3:
        flatImage = np.max(image, 2)
    else:
        flatImage = image
    assert len(flatImage.shape) == 2
    flatImage = cv2.medianBlur(flatImage.copy(), 5)

    rows = np.where(np.max(flatImage, 0) > tolerance)[0]
    if rows.size:
        cols = np.where(np.max(flatImage, 1) > tolerance)[0]
        image = image[cols[0]: cols[-1] + 1, rows[0]: rows[-1] + 1]
    else:
        image = image[:1, :1]

    return image

def process_image(image_path, df, idx, size, tolerance=30):    
    print(image_path)
    if 'jpg' in image_path:
        image_path = image_path[:-3] + 'JPG'
    
    img = cv2.imread(image_path)
    
    if img is None:
        return 
    if df.adjudicated_gradable[idx] == 0:
        return
    img_crop = crop_image(img, tolerance)
    if img_crop.shape[0] < OUTPUT_RESOLUTION[0] or img_crop.shape[1] < OUTPUT_RESOLUTION[1]:
        return
    
    img_crop = cv2.resize(img_crop, size, interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(os.path.join(DATA_BASE, OUTPUT_PATH, f'{os.path.splitext(os.path.basename(image_path))[0]}.png'), img_crop)
    return df

joblib.Parallel(n_jobs=-1, batch_size=32, verbose=10)(joblib.delayed(process_image)(row.image, labels, index, OUTPUT_RESOLUTION) for index, row in labels.iterrows())

#for index, row in labels.iterrows():
    
print('Table size after filtering: ', len(labels))


Table size before filtering:  1748
Table size after filtering:  1748


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   17.3s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   29.6s
[Parallel(n_jobs=-1)]: Done 568 tasks      | elapsed:   42.9s
[Parallel(n_jobs=-1)]: Done 856 tasks      | elapsed:   51.7s
[Parallel(n_jobs=-1)]: Done 1208 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1250 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1263 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1307 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 1322 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1368 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1540 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1650 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 1669 tasks      | e

## Remove unsuable images from CSV
- check if processed file exists
- remove if necassary from dataframe

In [5]:
labels['image'] = labels.image.map(lambda v: os.path.join(DATA_BASE, OUTPUT_PATH, f'{os.path.splitext(os.path.basename(v))[0]}.png'))
print(labels.image.unique())

for index, row in labels.iterrows():
    try:
        in_file = open(row.image, 'rb')
        in_file.close() 
        #img = cv2.imread(row.image)
    except FileNotFoundError:
        labels.drop(index, inplace=True)

    if index % (len(labels) // 10) == (len(labels) // 10 - 1):
        print('Progress: ', index)

labels['image'] = labels.image.map(lambda v: os.path.basename(v)[:-4])
labels.to_csv(os.path.join(DATA_BASE, 'processed_messidor2_v2.csv'), index=False)

['/data/simon/processed_messidor2/20051020_43808_0100_PP.png'
 '/data/simon/processed_messidor2/20051020_43832_0100_PP.png'
 '/data/simon/processed_messidor2/20051020_43882_0100_PP.png' ...
 '/data/simon/processed_messidor2/IM004812.png'
 '/data/simon/processed_messidor2/IM004831.png'
 '/data/simon/processed_messidor2/IM004832.png']
Progress:  173
Progress:  347
Progress:  521
Progress:  695
Progress:  869
Progress:  1043
Progress:  1217
Progress:  1391
Progress:  1565
Progress:  1739


## Join together messidor2 and eyepacs1 dataframes
- join dataframes
- create unified image folder
- ignore unnecassary columns

In [9]:
df_messidor = pd.read_csv(os.path.join(DATA_BASE, 'processed_messidor2_v2.csv'))
df_eyepacs = pd.read_csv(os.path.join(DATA_BASE, 'processed_eyepacs1_v2.csv'))
df_aptos = pd.read_csv(os.path.join(DATA_BASE, 'processed_aptos2019_v2.csv'))

df_messidor = df_messidor[['image', 'severity']]
df_eyepacs = df_eyepacs[['image', 'severity']]
df_aptos = df_aptos[['image', 'severity']]

joined_df = pd.concat([df_messidor, df_eyepacs, df_aptos], axis=0, ignore_index=True)
joined_df.to_csv(os.path.join(DATA_BASE, 'combined_retina_dataset.csv'), index=False)