In [24]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import cv2
from turbojpeg import TurboJPEG
import os
import sys
import joblib
sys.path.append('/home/simon/Code/MasterThesis/project/include')
import utils as utl

In [25]:
DATA_BASE = '/data/simon'
OUTPUT_PATH = 'processed_eyepacs1'
OUTPUT_RESOLUTION = (500, 500)

labels = pd.read_csv(os.path.join(DATA_BASE, 'eyepacs1.csv'))
labels.head()

Unnamed: 0,image,level
0,10_left,0
1,10_right,0
2,13_left,0
3,13_right,0
4,15_left,1


In [26]:
labels['severity'] = labels.level.map(lambda v: 0 if v <= 1 else 1)
labels['image'] = labels.image.map(lambda v: os.path.join(DATA_BASE, 'eyepacs1/', f'{v}.jpeg'))

print(labels.level.unique())
print(labels.severity.unique())

labels.head()

[0 1 2 4 3]
[0 1]


Unnamed: 0,image,level,severity
0,/data/simon/eyepacs1/10_left.jpeg,0,0
1,/data/simon/eyepacs1/10_right.jpeg,0,0
2,/data/simon/eyepacs1/13_left.jpeg,0,0
3,/data/simon/eyepacs1/13_right.jpeg,0,0
4,/data/simon/eyepacs1/15_left.jpeg,1,0


In [27]:
print('Table size before filtering: ', len(labels))
jpeg = TurboJPEG('/opt/libjpeg-turbo/lib64/libturbojpeg.so')

def crop_image(image, tolerance=20):
    if len(image.shape) == 3:
        flatImage = np.max(image, 2)
    else:
        flatImage = image
    assert len(flatImage.shape) == 2
    flatImage = cv2.medianBlur(flatImage.copy(), 5)

    rows = np.where(np.max(flatImage, 0) > tolerance)[0]
    if rows.size:
        cols = np.where(np.max(flatImage, 1) > tolerance)[0]
        image = image[cols[0]: cols[-1] + 1, rows[0]: rows[-1] + 1]
    else:
        image = image[:1, :1]

    return image

def process_image(image_path, df, idx, size, tolerance=30):    
    img = cv2.imread(image_path)
    
    if img is None:
        return 
    img_crop = crop_image(img, tolerance)
    if img_crop.shape[0] < OUTPUT_RESOLUTION[0] or img_crop.shape[1] < OUTPUT_RESOLUTION[1]:
        return
    
    img_crop = cv2.resize(img_crop, size, interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(os.path.join(DATA_BASE, OUTPUT_PATH, f'{os.path.splitext(os.path.basename(image_path))[0]}.png'), img_crop)
    return df

joblib.Parallel(n_jobs=-1, batch_size=32, verbose=10)(joblib.delayed(process_image)(row.image, labels, index, OUTPUT_RESOLUTION) for index, row in labels.iterrows())

#for index, row in labels.iterrows():
    
print('Table size after filtering: ', len(labels))

Table size before filtering:  35126
Table size after filtering:  35126


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   21.1s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   26.6s
[Parallel(n_jobs=-1)]: Done 568 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 856 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 1208 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 1976 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 2392 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 2872 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 3352 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done 3896 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 4440 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 5048 tasks      | e

In [28]:
#labels.groupby('ratio')['image'].nunique()

labels['image'] = labels.image.map(lambda v: os.path.join(DATA_BASE, OUTPUT_PATH, f'{os.path.splitext(os.path.basename(v))[0]}.png'))
print(labels.image.unique())

for index, row in labels.iterrows():
    try:
        in_file = open(row.image, 'rb')
        in_file.close() 
        #img = cv2.imread(row.image)
    except FileNotFoundError:
        labels.drop(index, inplace=True)

    if index % (len(labels) // 10) == (len(labels) // 10 - 1):
        print('Progress: ', index)

labels['image'] = labels.image.map(lambda v: os.path.basename(v)[:-4])
labels.to_csv(os.path.join(DATA_BASE, 'processed_eyepacs1_v2.csv'), index=False)

['/data/simon/processed_eyepacs1/10_left.png'
 '/data/simon/processed_eyepacs1/10_right.png'
 '/data/simon/processed_eyepacs1/13_left.png' ...
 '/data/simon/processed_eyepacs1/44348_right.png'
 '/data/simon/processed_eyepacs1/44349_left.png'
 '/data/simon/processed_eyepacs1/44349_right.png']
Progress:  3511
Progress:  7023
Progress:  10532
Progress:  14039
Progress:  17549
Progress:  21059
Progress:  24569
Progress:  28071
Progress:  31580
Progress:  35079
