In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import cv2
from turbojpeg import TurboJPEG
import os
import sys
import joblib
sys.path.append('/home/simon/Code/MasterThesis/project/include')
import utils as utl

In [10]:
DATA_BASE = '/data/simon/Datasets/'
OUTPUT_PATH = 'processed_aptos2019'
OUTPUT_RESOLUTION = (700, 700)

labels = pd.read_csv(os.path.join(DATA_BASE, 'aptos2019.csv'))
labels.head()

Unnamed: 0,id_code,diagnosis
0,000c1434d8d7,2
1,001639a390f0,4
2,0024cdab0c1e,1
3,002c21358ce6,0
4,005b95c28852,0


In [14]:
labels['severity'] = labels.diagnosis.map(lambda v: 0 if v <= 1 else 1)
labels['image'] = labels.id_code.map(lambda v: os.path.join(DATA_BASE, 'aptos2019/', f'{v}.png'))

print(labels.diagnosis.unique())
print(labels.severity.unique())
print(labels.severity.value_counts())

labels.head()

[2 4 1 0 3]
[1 0]
0    1889
1    1442
Name: severity, dtype: int64


Unnamed: 0,id_code,diagnosis,severity,image
0,000c1434d8d7,2,1,/data/simon/Datasets/aptos2019/000c1434d8d7.png
1,001639a390f0,4,1,/data/simon/Datasets/aptos2019/001639a390f0.png
2,0024cdab0c1e,1,0,/data/simon/Datasets/aptos2019/0024cdab0c1e.png
3,002c21358ce6,0,0,/data/simon/Datasets/aptos2019/002c21358ce6.png
4,005b95c28852,0,0,/data/simon/Datasets/aptos2019/005b95c28852.png


In [12]:
print('Table size before filtering: ', len(labels))

def crop_image(image, tolerance=20):
    if len(image.shape) == 3:
        flatImage = np.max(image, 2)
    else:
        flatImage = image
    assert len(flatImage.shape) == 2
    flatImage = cv2.medianBlur(flatImage.copy(), 5)

    rows = np.where(np.max(flatImage, 0) > tolerance)[0]
    if rows.size:
        cols = np.where(np.max(flatImage, 1) > tolerance)[0]
        image = image[cols[0]: cols[-1] + 1, rows[0]: rows[-1] + 1]
    else:
        image = image[:1, :1]

    return image

def process_image(image_path, df, idx, size, tolerance=30):    
    img = cv2.imread(image_path)
    
    if img is None:
        return 
    img_crop = crop_image(img, tolerance)
    if img_crop.shape[0] < OUTPUT_RESOLUTION[0] or img_crop.shape[1] < OUTPUT_RESOLUTION[1]:
        return
    
    img_crop = cv2.resize(img_crop, size, interpolation=cv2.INTER_LINEAR)
    cv2.imwrite(os.path.join(DATA_BASE, OUTPUT_PATH, f'{os.path.splitext(os.path.basename(image_path))[0]}.png'), img_crop)
    return df

joblib.Parallel(n_jobs=-1, batch_size=32, verbose=10)(joblib.delayed(process_image)(row.image, labels, index, OUTPUT_RESOLUTION) for index, row in labels.iterrows())

#for index, row in labels.iterrows():
    
print('Table size after filtering: ', len(labels))

Table size before filtering:  3662
Table size after filtering:  3662


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 568 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 856 tasks      | elapsed:   24.1s
[Parallel(n_jobs=-1)]: Done 1208 tasks      | elapsed:   32.2s
[Parallel(n_jobs=-1)]: Done 1560 tasks      | elapsed:   38.2s
[Parallel(n_jobs=-1)]: Done 1976 tasks      | elapsed:   47.5s
[Parallel(n_jobs=-1)]: Done 2392 tasks      | elapsed:   55.7s
[Parallel(n_jobs=-1)]: Done 2872 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 3135 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3245 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3324 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 3374 tasks      | e

In [13]:
#labels.groupby('ratio')['image'].nunique()

labels['image'] = labels.image.map(lambda v: os.path.join(DATA_BASE, OUTPUT_PATH, f'{os.path.splitext(os.path.basename(v))[0]}.png'))
print(labels.image.unique())

for index, row in labels.iterrows():
    try:
        in_file = open(row.image, 'rb')
        in_file.close() 
        #img = cv2.imread(row.image)
    except FileNotFoundError:
        labels.drop(index, inplace=True)

    if index % (len(labels) // 10) == (len(labels) // 10 - 1):
        print('Progress: ', index)

labels['image'] = labels.image.map(lambda v: os.path.basename(v)[:-4])
labels.to_csv(os.path.join(DATA_BASE, 'processed_aptos2019_v2.csv'), index=False)

['/data/simon/Datasets/processed_aptos2019/000c1434d8d7.png'
 '/data/simon/Datasets/processed_aptos2019/001639a390f0.png'
 '/data/simon/Datasets/processed_aptos2019/0024cdab0c1e.png' ...
 '/data/simon/Datasets/processed_aptos2019/ffcf7b45f213.png'
 '/data/simon/Datasets/processed_aptos2019/ffd97f8cd5aa.png'
 '/data/simon/Datasets/processed_aptos2019/ffec9a18a3ce.png']
Progress:  362
Progress:  719
Progress:  1070
Progress:  1411
Progress:  1749
Progress:  2081
Progress:  2407
Progress:  2727
Progress:  3041
Progress:  3349
