In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
from sklearn.model_selection import train_test_split
import shutil #pip install pytest-shutil
from PIL import Image
import PIL
import imageio
import matplotlib.pyplot as plt
import glob

In [22]:
# Create new directory for the images
base_dir = 'base_dir'
os.mkdir(base_dir)

In [23]:
# Training file directory
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

# Validation file directory
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)

# Testing file directory
test_dir = os.path.join(base_dir, 'test_dir')
os.mkdir(test_dir)

In [24]:
# Create new folders in training directory for each of the 7 classes

nv = os.path.join(train_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(train_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(train_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(train_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(train_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(train_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(train_dir, 'df')
os.mkdir(df)

# Create new folders in validation directory for each of the 7 classes

nv = os.path.join(val_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(val_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(val_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(val_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(val_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(val_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(val_dir, 'df')
os.mkdir(df)

# Create new folders in test directory for each of the 7 classes

nv = os.path.join(test_dir, 'nv')
os.mkdir(nv)
mel = os.path.join(test_dir, 'mel')
os.mkdir(mel)
bkl = os.path.join(test_dir, 'bkl')
os.mkdir(bkl)
bcc = os.path.join(test_dir, 'bcc')
os.mkdir(bcc)
akiec = os.path.join(test_dir, 'akiec')
os.mkdir(akiec)
vasc = os.path.join(test_dir, 'vasc')
os.mkdir(vasc)
df = os.path.join(test_dir, 'df')
os.mkdir(df)

In [25]:
# Read metadata file
mdf = pd.read_csv('HAM10000_metadata.csv')

mdf.sample(5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
6289,HAM_0000660,ISIC_0028953,nv,follow_up,40.0,male,abdomen
4183,HAM_0005538,ISIC_0028246,nv,follow_up,45.0,male,abdomen
3409,HAM_0005629,ISIC_0029317,nv,follow_up,45.0,female,upper extremity
6947,HAM_0006747,ISIC_0029254,nv,histo,50.0,male,face
3316,HAM_0003438,ISIC_0031151,nv,follow_up,45.0,female,trunk


In [26]:
# Set y as the data labels (column dx)
y = mdf['dx']

In [27]:
# Split into training, validation, and test
val_size = 0.1
test_size = 0.1
mdf_train_temp, mdf_remain, y_train_temp, y_remain = train_test_split(mdf, y, test_size = (0.2), random_state = 101)
new_test_size = np.around(test_size / (val_size + test_size),2)
mdf_val, mdf_test, y_val, y_test = train_test_split(mdf_remain, y_remain, test_size = new_test_size, random_state = 101)
mdf_train, mdf_discard, y_train, y_discard = train_test_split(mdf_train_temp, y_train_temp, test_size = 0.5, random_state = 101)
#print(mdf_train_temp.shape)
print(mdf_val.shape)
print(mdf_test.shape)
print(mdf_train.shape)

(1001, 7)
(1002, 7)
(4006, 7)


In [28]:
# Transfer the images into folders, set the image id as the index
mdf.set_index('image_id', inplace = True)

# Get a list of the images in each of the two folders
folder_1 = os.listdir('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_1')
folder_2 = os.listdir('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_2')

In [29]:
# Get a list of the train and validation images
train_list = list(mdf_train['image_id'])
val_list = list(mdf_val['image_id'])
test_list = list(mdf_test['image_id'])

In [30]:
akiec = 0
bcc = 0
bkl = 0
df = 0
mel = 0
nv = 0
vasc = 0
for image in train_list:
  r=mdf.loc[image, 'dx']
  if r == 'akiec':
    akiec = akiec + 1
  if r == 'bcc':
    bcc = bcc + 1
  if r == 'bkl':
    bkl = bkl + 1
  if r == 'df':
    df = df + 1
  if r == 'mel':
    mel = mel + 1
  if r == 'nv':
    nv = nv + 1
  if r == 'vasc':
    vasc = vasc + 1
print('akiec =', akiec)
print('bcc =', bcc)
print('bkl =', bkl)
print('df =', df)
print('mel =', mel)
print('nv =', nv)
print('vasc =', vasc)

akiec = 138
bcc = 195
bkl = 464
df = 46
mel = 429
nv = 2669
vasc = 65


In [31]:
# Transfer the training images

for image in train_list:
    
    fname = image + '.jpg'
    label = mdf.loc[image, 'dx']
    
    if fname in folder_1:
        # Source path to image
        src = os.path.join('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_1', fname)
        # Destination path to image
        dst = os.path.join(train_dir, label, fname)
        # Copy the image from the source to the destination
        print(dst)
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # Source path to image
        src = os.path.join('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_2', fname)
        # Destination path to image
        dst = os.path.join(train_dir, label, fname)
        # Copy the image from the source to the destination
        print(dst)
        shutil.copyfile(src, dst)

# print(len(os.listdir('base_dir/train_dir')))

base_dir\train_dir\nv\ISIC_0029224.jpg
base_dir\train_dir\nv\ISIC_0032177.jpg
base_dir\train_dir\nv\ISIC_0025045.jpg
base_dir\train_dir\nv\ISIC_0025664.jpg
base_dir\train_dir\bkl\ISIC_0031989.jpg
base_dir\train_dir\bkl\ISIC_0029288.jpg
base_dir\train_dir\mel\ISIC_0025766.jpg
base_dir\train_dir\nv\ISIC_0032281.jpg
base_dir\train_dir\nv\ISIC_0026425.jpg
base_dir\train_dir\nv\ISIC_0032980.jpg
base_dir\train_dir\nv\ISIC_0024642.jpg
base_dir\train_dir\nv\ISIC_0032101.jpg
base_dir\train_dir\nv\ISIC_0025349.jpg
base_dir\train_dir\nv\ISIC_0026898.jpg
base_dir\train_dir\nv\ISIC_0030802.jpg
base_dir\train_dir\bkl\ISIC_0026265.jpg
base_dir\train_dir\akiec\ISIC_0029460.jpg
base_dir\train_dir\nv\ISIC_0032738.jpg
base_dir\train_dir\nv\ISIC_0033233.jpg
base_dir\train_dir\nv\ISIC_0029330.jpg
base_dir\train_dir\nv\ISIC_0033926.jpg
base_dir\train_dir\mel\ISIC_0030382.jpg
base_dir\train_dir\mel\ISIC_0033479.jpg
base_dir\train_dir\nv\ISIC_0029904.jpg
base_dir\train_dir\nv\ISIC_0034063.jpg
base_dir\train_d

In [32]:
# Transfer the validation images

for image in val_list:

    fname = image + '.jpg'
    label = mdf.loc[image, 'dx']
    
    if fname in folder_1:
        # Source path to image
        src = os.path.join('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_1', fname)
        # Destination path to image
        dst = os.path.join(val_dir, label, fname)
        # Copy the image from the source to the destination
        print(dst)
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # Source path to image
        src = os.path.join('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_2', fname)
        # Destination path to image
        dst = os.path.join(val_dir, label, fname)
        # Copy the image from the source to the destination
        print(dst)
        shutil.copyfile(src, dst)

base_dir\val_dir\mel\ISIC_0028317.jpg
base_dir\val_dir\nv\ISIC_0024340.jpg
base_dir\val_dir\mel\ISIC_0033653.jpg
base_dir\val_dir\bcc\ISIC_0027976.jpg
base_dir\val_dir\nv\ISIC_0031856.jpg
base_dir\val_dir\nv\ISIC_0033434.jpg
base_dir\val_dir\bkl\ISIC_0030188.jpg
base_dir\val_dir\bkl\ISIC_0029302.jpg
base_dir\val_dir\nv\ISIC_0030503.jpg
base_dir\val_dir\mel\ISIC_0029021.jpg
base_dir\val_dir\bcc\ISIC_0032894.jpg
base_dir\val_dir\bkl\ISIC_0031352.jpg
base_dir\val_dir\nv\ISIC_0032601.jpg
base_dir\val_dir\nv\ISIC_0024820.jpg
base_dir\val_dir\mel\ISIC_0033655.jpg
base_dir\val_dir\nv\ISIC_0031549.jpg
base_dir\val_dir\bkl\ISIC_0027737.jpg
base_dir\val_dir\nv\ISIC_0027322.jpg
base_dir\val_dir\nv\ISIC_0029835.jpg
base_dir\val_dir\nv\ISIC_0032340.jpg
base_dir\val_dir\nv\ISIC_0030693.jpg
base_dir\val_dir\nv\ISIC_0025916.jpg
base_dir\val_dir\nv\ISIC_0030398.jpg
base_dir\val_dir\nv\ISIC_0032237.jpg
base_dir\val_dir\bkl\ISIC_0032306.jpg
base_dir\val_dir\bkl\ISIC_0034201.jpg
base_dir\val_dir\nv\ISIC_0

In [33]:
# Transfer the testing images

for image in test_list:

    fname = image + '.jpg'
    label = mdf.loc[image, 'dx']

    if fname in folder_1:
        # Source path to image
        src = os.path.join('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_1', fname)
        # Destination path to image
        dst = os.path.join(test_dir, label, fname)
        # Copy the image from the source to the destination
        shutil.copyfile(src, dst)

    if fname in folder_2:
        # Source path to image
        src = os.path.join('C:/Users/hp/Downloads/skin cancer/HAM10000_images_part_2', fname)
        # Destination path to image
        dst = os.path.join(test_dir, label, fname)
        # Copy the image from the source to the destination
        shutil.copyfile(src, dst)

In [19]:
# Check how many training images are in each folder

print(len(os.listdir('base_dir/train_dir/nv')))
print(len(os.listdir('base_dir/train_dir/mel')))
print(len(os.listdir('base_dir/train_dir/bkl')))
print(len(os.listdir('base_dir/train_dir/bcc')))
print(len(os.listdir('base_dir/train_dir/akiec')))
print(len(os.listdir('base_dir/train_dir/vasc')))
print(len(os.listdir('base_dir/train_dir/df')))

# Check how many validation images are in each folder

print(len(os.listdir('base_dir/val_dir/nv')))
print(len(os.listdir('base_dir/val_dir/mel')))
print(len(os.listdir('base_dir/val_dir/bkl')))
print(len(os.listdir('base_dir/val_dir/bcc')))
print(len(os.listdir('base_dir/val_dir/akiec')))
print(len(os.listdir('base_dir/val_dir/vasc')))
print(len(os.listdir('base_dir/val_dir/df')))

# Check how many test images are in each folder

print(len(os.listdir('base_dir/test_dir/nv')))
print(len(os.listdir('base_dir/test_dir/mel')))
print(len(os.listdir('base_dir/test_dir/bkl')))
print(len(os.listdir('base_dir/test_dir/bcc')))
print(len(os.listdir('base_dir/test_dir/akiec')))
print(len(os.listdir('base_dir/test_dir/vasc')))
print(len(os.listdir('base_dir/test_dir/df')))

2669
429
464
195
138
65
46
679
106
107
60
28
13
8
657
136
111
49
25
12
12


In [20]:
# custom rotation function by increments of 90 degrees
import random
def myFunc(image):
    image = np.array(image)
    return np.rot90(image,random.randint(1,4))

# Augment the data in the training set to 4000 images

class_list = ['nv', 'mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df']

for item in class_list:

    # Create a temporary directory for the augmented images
    aug_dir = 'aug_dir'
    os.mkdir(aug_dir)

    # Create a directory within the base dir to store images of the same class
    img_dir = os.path.join(aug_dir, 'img_dir')
    os.mkdir(img_dir)

    # Choose a class
    img_class = item

    # List all the images in the directory
    img_list = os.listdir('base_dir/train_dir/' + img_class)

    # Copy images from the class train dir to the img_dir
    for fname in img_list:
        # Source path to image
        src = os.path.join('base_dir/train_dir/' + img_class, fname)
        # Destination path to image
        dst = os.path.join(img_dir, fname)
        # Copy the image from the source to the destination
        shutil.copyfile(src, dst)

    # Point to a dir containing the images and not to the images themselves
    path = aug_dir
    save_path = 'base_dir/train_dir/' + img_class

    # Create a data generator to augment the images in real time
    datagen = ImageDataGenerator(
        preprocessing_function=myFunc,
        zoom_range=0.1,
        horizontal_flip=True,
        vertical_flip=True,
        brightness_range=(0.9,1.1))

    batch_size = 5

    aug_datagen = datagen.flow_from_directory(path,
                                              save_to_dir=save_path,
                                              save_prefix=fname[:-4],
                                              save_format='jpg',
                                              target_size=(224, 224),
                                              batch_size=batch_size)

    # Generate the augmented images and add them to the training folders
    num_aug_images_wanted = 4000  # total number of images we want to have in each class
    num_files = len(os.listdir(img_dir))
    num_batches = int(np.ceil((num_aug_images_wanted - num_files) / batch_size))

    # Run the generator and create about 6000 augmented images
    for i in range(0, num_batches):
        imgs, labels = next(aug_datagen)

    # Delete temporary directory with the raw image files
    shutil.rmtree('aug_dir')

Found 2669 images belonging to 1 classes.
Found 429 images belonging to 1 classes.
Found 464 images belonging to 1 classes.
Found 195 images belonging to 1 classes.
Found 138 images belonging to 1 classes.
Found 65 images belonging to 1 classes.
Found 46 images belonging to 1 classes.


In [34]:
# Resizing
class_list = ['mel', 'bkl', 'bcc', 'akiec', 'vasc', 'df','nv']
cwd = os.getcwd()
for item in class_list:
    for fname in os.listdir(os.path.join(cwd, 'base_dir', 'train_dir', item)):
        if len(fname) == 16:
            img = Image.open(os.path.join(os.path.join(cwd, 'base_dir', 'train_dir', item), fname))
            img = img.resize((224,224))
            img.save(os.path.join(os.path.join(cwd, 'base_dir', 'train_dir', item), fname), 'JPEG')
                     
for item in class_list:
    for fname in os.listdir(os.path.join(cwd, 'base_dir', 'val_dir', item)):
        if len(fname) == 16:
            img = Image.open(os.path.join(os.path.join(cwd, 'base_dir', 'val_dir', item), fname))
            img = img.resize((224,224))
            img.save(os.path.join(os.path.join(cwd, 'base_dir', 'val_dir', item), fname), 'JPEG')

for item in class_list:
    for fname in os.listdir(os.path.join(cwd, 'base_dir', 'test_dir', item)):
        if len(fname) == 16:
            img = Image.open(os.path.join(os.path.join(cwd, 'base_dir', 'test_dir', item), fname))
            img = img.resize((224,224))
            img.save(os.path.join(os.path.join(cwd, 'base_dir', 'test_dir', item), fname), 'JPEG')