In [1]:
import pandas as pd
import os
import numpy as np
import h5py

In [3]:
image_path = '/home/ge209/Documents/Data/ISIC_2018_feature_segm/ISIC2018_Task1-2_Training_Input/'

In [4]:
img_names = os.listdir(image_path)
img_names = filter(lambda x: x.endswith('jpg'), img_names)

def get_ind(img_name):
    return img_name.split('.')[0]

img_inds = list(map(get_ind, img_names))

In [5]:
img_inds[:10]

['ISIC_0000080',
 'ISIC_0014263',
 'ISIC_0014013',
 'ISIC_0014599',
 'ISIC_0000042',
 'ISIC_0012911',
 'ISIC_0013512',
 'ISIC_0015455',
 'ISIC_0014708',
 'ISIC_0012700']

In [6]:
def get_ind_int(img_ind): return int(img_ind.split('_')[1])

img_inds_sorted = sorted(img_inds, key=lambda x: get_ind_int(x))

In [7]:
img_inds_sorted[:10]

['ISIC_0000000',
 'ISIC_0000001',
 'ISIC_0000003',
 'ISIC_0000004',
 'ISIC_0000006',
 'ISIC_0000007',
 'ISIC_0000008',
 'ISIC_0000009',
 'ISIC_0000011',
 'ISIC_0000012']

In [8]:
NUMBER_OF_ISIC_IMAGES = 2594

In [11]:
assert len(img_inds_sorted) == NUMBER_OF_ISIC_IMAGES

In [12]:
np.random.seed(10)
valid_indices = np.random.choice(list(range(NUMBER_OF_ISIC_IMAGES)), 400, replace=False)
train_indices = set(range(NUMBER_OF_ISIC_IMAGES)).difference(set(valid_indices))
train_indices = sorted(list(train_indices))
valid_indices = sorted(list(valid_indices))
is_train = [i in train_indices for i in range(NUMBER_OF_ISIC_IMAGES)]

In [13]:
def summarize_indices(indices):
    print(indices[:10], 'len', len(indices), 'min', min(indices), 'max', max(indices))

In [14]:
summarize_indices(train_indices)
summarize_indices(valid_indices)

[0, 2, 3, 4, 5, 6, 7, 8, 9, 10] len 2194 min 0 max 2592
[1, 16, 20, 25, 27, 28, 43, 47, 54, 56] len 400 min 1 max 2593


In [15]:
attr_types = ['pigment_network', 'negative_network', 'streaks', 'milia_like_cyst', 'globules']

In [55]:
df2 = pd.DataFrame({'ID': img_inds_sorted, 'Split': ['train' if i else 'valid' for i in is_train]})

In [56]:
df2.head()

Unnamed: 0,ID,Split
0,ISIC_0000000,train
1,ISIC_0000001,valid
2,ISIC_0000003,train
3,ISIC_0000004,train
4,ISIC_0000006,train


In [18]:
df3 = pd.DataFrame(columns = attr_types, data = np.zeros((NUMBER_OF_ISIC_IMAGES, len(attr_types)), dtype=np.int))

In [19]:
df3.head()

Unnamed: 0,pigment_network,negative_network,streaks,milia_like_cyst,globules
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [25]:
def load_mask(image_path, img_id, attribute='pigment_network'):
    mask_file = image_path + '%s_attribute_%s.h5' % (img_id, attribute)
    f = h5py.File(mask_file, 'r')
    mask_np = f['img'].value

    mask_np = mask_np.astype('uint8')
    return mask_np

In [27]:
save_path = '/home/ge209/Documents/Data/ISIC_2018_feature_segm/h5/'

In [28]:
for i in range(NUMBER_OF_ISIC_IMAGES):
    print(f'\r {i}', end='')
    for attr in attr_types:
        xxx = load_mask(save_path, img_inds_sorted[i], 'all')
        df3.loc[i, attr_types] = np.any(xxx, axis=(0,1)) * 1

 0 1 2 3 4 5 6 7



 2593

In [33]:
df3.head()

Unnamed: 0,pigment_network,negative_network,streaks,milia_like_cyst,globules
0,1,0,0,0,0
1,1,0,1,0,1
2,1,0,0,1,0
3,0,0,0,0,0
4,1,0,0,0,0


In [44]:
df2.head()

Unnamed: 0,ID,Split
0,ISIC_0000000,True
1,ISIC_0000001,False
2,ISIC_0000003,True
3,ISIC_0000004,True
4,ISIC_0000006,True


In [57]:
df2.to_csv('train_test_id.csv', index=False)

In [58]:
df2_2 = pd.read_csv('train_test_id.csv')

In [59]:
df2_2.head()

Unnamed: 0,ID,Split
0,ISIC_0000000,train
1,ISIC_0000001,valid
2,ISIC_0000003,train
3,ISIC_0000004,train
4,ISIC_0000006,train


In [51]:
np.all((df2 == df2_2).values)

True

In [52]:
df3.to_csv('mask_ind.csv', index=False)

In [78]:
df3_2 = pd.read_csv('mask_ind.csv')
np.all((df3 == df3_2).values)

True

In [79]:
df3.loc[298, attr_types].values.astype('uint8')

array([0, 0, 0, 0, 1], dtype=uint8)

In [80]:
df3.columns

Index(['pigment_network', 'negative_network', 'streaks', 'milia_like_cyst',
       'globules'],
      dtype='object')

In [82]:
df3_2[df2['Split'] == 'valid'].head()

Unnamed: 0,pigment_network,negative_network,streaks,milia_like_cyst,globules
1,1,0,1,0,1
16,1,0,0,0,0
20,0,0,0,0,0
25,1,0,0,0,0
27,0,0,0,0,1


In [76]:
df5 = pd.DataFrame(df3[df2['Split'] == 'valid']).reset_index(drop=True)

In [77]:
df5.head()

Unnamed: 0,pigment_network,negative_network,streaks,milia_like_cyst,globules
0,1,0,1,0,1
1,1,0,0,0,0
2,0,0,0,0,0
3,1,0,0,0,0
4,0,0,0,0,1


In [84]:
df3_2.values

array([[1, 0, 0, 0, 0],
       [1, 0, 1, 0, 1],
       [1, 0, 0, 1, 0],
       ...,
       [1, 0, 0, 1, 0],
       [1, 0, 0, 0, 0],
       [1, 0, 0, 0, 0]])

In [83]:
(df3_2 == df5).values

ValueError: Can only compare identically-labeled DataFrame objects

In [72]:
df5.loc[4, df5.columns]

KeyError: 4