In [7]:
from time import time
import pandas as pd
from os import listdir
from os.path import isfile, join
import os, shutil, glob
import cv2, csv
from PIL import Image
import random
import albumentations as A
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Prepare directories with images on classified folders
#
# Rebuilds data_images/<dx>/ from scratch and copies every file found in the two
# HAM10000 image folders into the sub-folder named after its `dx` label.
# Produces:
#   images            - list of copied paths ('data_images/<dx>/<file>')
#   images_by_folder  - dict mapping each dx label to the file names copied for it

images = []
df = pd.read_csv('HAM10000_metadata.csv')
df_dx = df.dx.unique()
images_by_folder = {}

# Start from a clean output tree so re-running the cell does not mix old and new files.
if os.path.exists('data_images'):
    shutil.rmtree('data_images')

os.mkdir('data_images')

for dx in df_dx:
    images_by_folder[dx] = []
    os.makedirs('data_images/' + dx, exist_ok=True)

# image_id -> list of dx labels, built once so each file is a dictionary lookup
# instead of a boolean-mask scan over the whole DataFrame.
dx_labels_by_image_id = df.groupby('image_id')['dx'].agg(list).to_dict()

images_1 = [f for f in listdir('HAM10000_images_part_1') if isfile(join('HAM10000_images_part_1', f))]
images_2 = [f for f in listdir('HAM10000_images_part_2') if isfile(join('HAM10000_images_part_2', f))]

for source_dir, file_names in (('HAM10000_images_part_1', images_1),
                               ('HAM10000_images_part_2', images_2)):
    for f in file_names:
        image_id = os.path.splitext(f)[0]
        labels = dx_labels_by_image_id.get(image_id, [])
        # Exactly one metadata row is required per image; fail with a readable
        # message rather than a cryptic array-to-scalar conversion error.
        if len(labels) != 1:
            raise ValueError(
                'Expected exactly one metadata row for image ' + repr(f)
                + ' in ' + source_dir + ', found ' + str(len(labels))
            )
        dx = labels[0]
        images_by_folder[dx].append(f)
        dx_dir = 'data_images/' + dx
        shutil.copy(source_dir + '/' + f, dx_dir)
        images.append(dx_dir + '/' + f)

df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [9]:
# Augmentation pipelines applied to every oversampled image: two flips followed
# by seven rotations whose angle range is (upper - 1, upper) for
# upper = 45, 90, ..., 315. Every transform is forced to apply (p=1.0).
# NOTE(review): interpolation=0 / border_mode=1 are presumably the OpenCV
# nearest-neighbour and replicate-border codes - confirm against cv2 constants.
transforms = [
    A.Compose([A.Flip(always_apply=True, p=1.0)]),
    A.Compose([A.HorizontalFlip(always_apply=True, p=1.0)]),
] + [
    A.Compose([
        A.Rotate(
            always_apply=True,
            p=1.0,
            limit=(upper_angle - 1, upper_angle),
            interpolation=0,
            border_mode=1,
            value=(0, 0, 0),
            mask_value=None,
        )
    ])
    for upper_angle in range(45, 316, 45)
]

In [None]:
# Oversampling

# Number of original images held by each class folder.
files_amount = [len(file_names) for file_names in images_by_folder.values()]

# Per-class cap: ten times the size of the smallest class.
max_copies = 10 * min(files_amount)
# Numeric suffix given to the first generated image ('ISIC_00<n>.jpg').
# NOTE(review): presumably one past the highest original image id - confirm.
last_sample_name = 34321

def visualize(image):
    """Display `image` in a new 10x10 matplotlib figure with the axes hidden."""
    plt.figure(figsize=(10, 10))
    current_axes = plt.gca()
    current_axes.set_axis_off()
    plt.imshow(image)

def writeLinesInCsv(file, last_sample_name, csv_line):
    """Append one metadata row for a generated image to a CSV file.

    The first seven fields of `csv_line` are written, with the second field
    (index 1) replaced by 'ISIC_00' + str(last_sample_name).

    Parameters:
        file: path of the CSV file to append to.
        last_sample_name: numeric suffix used to build the new image id.
        csv_line: sequence with at least seven fields; it is NOT modified.

    Raises:
        IndexError: if `csv_line` has fewer than seven fields.
    """
    if len(csv_line) < 7:
        raise IndexError('csv_line must contain at least 7 fields, got ' + str(len(csv_line)))

    # Work on a copy so the caller's row keeps its original image id.
    row = list(csv_line[:7])
    row[1] = 'ISIC_00' + str(last_sample_name)

    # csv.writer quotes fields containing commas or quotes, which plain string
    # concatenation would silently turn into a malformed row.
    with open(file, 'a', newline='') as fd:
        csv.writer(fd, lineterminator='\n').writerow(row)

# Oversample: for each class folder, write one augmented copy per transform of
# each original image, and append a metadata row per copy, until the folder
# holds more than `max_copies` files.
metadata_csv = 'HAM10000_metadata_with_more_samples.csv'

# Index the metadata once (first row wins per image id, column 1) instead of
# rescanning the growing CSV for every image. Rows appended below carry newly
# generated ids, so they are never looked up here.
metadata_rows = {}
with open(metadata_csv) as f_obj:
    for line in csv.reader(f_obj, delimiter=','):
        if len(line) > 1:
            metadata_rows.setdefault(line[1], line)

for folder in images_by_folder:
    folder_dir = 'data_images/' + folder

    for image in images_by_folder[folder]:
        # Check the cap before augmenting so classes that are already large
        # enough are left untouched.
        if len(os.listdir(folder_dir)) > max_copies:
            break

        csv_line = metadata_rows.get(image.replace('.jpg', ''))
        if csv_line is None:
            # Without this guard the previous image's metadata would be reused.
            print('Skipping ' + image + ': no metadata row found')
            continue

        imageToTrans = cv2.imread(folder_dir + '/' + image)
        if imageToTrans is None:
            # cv2.imread returns None for unreadable/missing files.
            print('Skipping ' + image + ': image could not be read')
            continue

        for transform in transforms:
            image_path = folder_dir + '/' + 'ISIC_00' + str(last_sample_name) + '.jpg'
            transformed = transform(image=imageToTrans)['image']
            if not cv2.imwrite(image_path, transformed):
                # Do not record metadata for a file that was not written.
                print('Could not write ' + image_path)
                continue
            # Pass a copy: the row is shared by every transform of this image.
            writeLinesInCsv(metadata_csv, last_sample_name, list(csv_line))
            last_sample_name += 1


In [5]:
# Undersampling
#
# Any class folder holding more than `max_copies` files plus a 5% tolerance is
# trimmed back to exactly `max_copies` files by deleting randomly chosen files.
# NOTE: rows for deleted files are not removed from any metadata CSV here.

for folder in images_by_folder:
    folder_dir = 'data_images/' + folder
    folder_list = os.listdir(folder_dir)
    if len(folder_list) > max_copies + (max_copies * .05):
        surplus = len(folder_list) - max_copies
        for to_remove in random.sample(folder_list, surplus):
            os.remove(folder_dir + '/' + to_remove)

6706
5556


In [6]:
# Separate a hold-out dataset in another folder by moving 30% of the images of
# each class from data_images/<class>/ to data_test/<class>/.
# (TODO: balance the data?)
# NOTE: the selection is random and unseeded, so the split differs on each run.

if os.path.exists('data_test'):
    shutil.rmtree('data_test')

os.mkdir('data_test')

# Only class directories are considered; stray files in data_images/ are ignored.
class_folders = [entry for entry in os.listdir('data_images/')
                 if os.path.isdir('data_images/' + entry)]

for folder in class_folders:
    os.mkdir('data_test/' + folder)
    folder_list = os.listdir('data_images/' + folder)
    for to_move in random.sample(folder_list, int(len(folder_list) * .3)):
        shutil.move('data_images/' + folder + '/' + to_move,
                    'data_test/' + folder + '/' + to_move)

654
1028
1151
230
1151
1150
284
