In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from IPython.display import display # Allows the use of display() for DataFrames
from time import time
import matplotlib.pyplot as plt
import seaborn as sns # Plotting library
import keras
from keras.preprocessing import image
from keras.preprocessing.image import ImageDataGenerator, img_to_array
from keras.utils import np_utils
from sklearn.datasets import load_files   
from tqdm import tqdm
from collections import Counter


# List the entries found directly under the ../input directory.
input_dir_entries = os.listdir("../input")
print(input_dir_entries)

There are far more nevus samples than samples of the other two classes (melanoma and seborrheic keratosis). This imbalance can bias the network: training tries to minimize the error function, and classifying everything as nevus is an easy way to reach a low error.

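For this problem we need to be careful with the accuracy metric, because a model that always predicts nevus would still score a high accuracy. I will therefore try to balance the training data before fitting the model.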

# Root folder of the dataset; each split lives under <root>/<split>/<split>.
# Keeping the root in one place means a relocated dataset needs a single edit.
DATA_ROOT = '../input/skin-lesion-analysis-towards-melanoma-detection'

data_train_path = DATA_ROOT + '/train/train'
data_valid_path = DATA_ROOT + '/valid/valid'
data_test_path = DATA_ROOT + '/test/test'

EDA

Let's find out how many samples we have for each category.

In [None]:
# define function to load train, test, and validation datasets
def load_data_raw(path, n_classes=3):
    """Collect file paths and one-hot labels from a class-per-folder directory.

    Parameters
    ----------
    path : str
        Directory handed to ``sklearn.datasets.load_files``; its sub-folders
        are treated as the classes.
    n_classes : int, default 3
        Number of columns of the one-hot encoding built by ``to_categorical``.

    Returns
    -------
    files : np.ndarray
        File paths, in the order returned by ``load_files``.
    targets : np.ndarray
        One-hot label matrix with ``n_classes`` columns, row-aligned with
        ``files``.
    """
    # Only the paths and integer labels are used below, so do not let
    # load_files read the content of every image into memory (its default).
    data = load_files(path, load_content=False)
    files = np.array(data['filenames'])
    targets = np_utils.to_categorical(np.array(data['target']), n_classes)

    return files, targets

# Training split: file paths plus their one-hot class labels.
train_filenames, train_targets = load_data_raw(path=data_train_path)

In [None]:
# The class of an image is the name of the folder that directly contains it.
# os.path is used instead of splitting on '/' so other path separators work too.
filenames_trimmed = [os.path.basename(os.path.dirname(filename)) for filename in train_filenames]
classes_count = Counter(filenames_trimmed)

# Plot the classes in alphabetical order: a Counter iterates in first-seen
# order, which depends on how the files happen to be listed, so without the
# sort a class could get a different position and colour from plot to plot.
class_names = sorted(classes_count)
class_sizes = [classes_count[name] for name in class_names]
plt.bar(class_names, class_sizes, color=['blue', 'orange', 'green'])
plt.xlabel('Class')
plt.ylabel('Number of images')
plt.title('Training samples per class')
plt.show()

Upsampling function for imbalanced data

Using scikit-learn's `resample` function, I will create additional samples of the under-represented classes by drawing from their existing samples with replacement.

In [None]:
def plot_n_samples(filenames, title='Samples per class'):
    """Draw a bar chart of how many files belong to each class.

    The class of a file is taken to be the name of the folder that directly
    contains it.

    Parameters
    ----------
    filenames : iterable of str
        File paths of the form ``.../<class_name>/<file>``.
    title : str, default 'Samples per class'
        Title shown above the chart.
    """
    # os.path is used instead of splitting on '/' so other separators work too.
    class_of_file = [os.path.basename(os.path.dirname(filename)) for filename in filenames]
    classes_count = Counter(class_of_file)

    # Plot the classes in alphabetical order: a Counter iterates in first-seen
    # order, so without the sort the same class could get a different position
    # and colour depending on the order of `filenames`.
    class_names = sorted(classes_count)
    class_sizes = [classes_count[name] for name in class_names]
    plt.bar(class_names, class_sizes, color=['blue', 'orange', 'green'])
    plt.xlabel('Class')
    plt.ylabel('Number of images')
    plt.title(title)

In [None]:
from sklearn.utils import resample, shuffle

# Lookup of the three class folder names by integer key; pass one of the
# values as `feature_name` to upsample().
feature_names = dict(enumerate(('melanoma', 'nevus', 'seborrheic_keratosis')))

def upsample(filenames, targets, feature_name, n_samples = 1372, random_state = 0):
    """Resample one class so that it ends up with exactly ``n_samples`` entries.

    Every sample whose path contains ``feature_name`` as a whole path
    component is removed and replaced by ``n_samples`` draws from those same
    samples (``sklearn.utils.resample``, which samples with replacement by
    default). All other samples are kept unchanged and come first in the
    result; the resampled ones are appended after them.

    Parameters
    ----------
    filenames : array-like of str
        File paths; the class is identified by a path component.
    targets : array-like
        Labels, row-aligned with ``filenames``.
    feature_name : str
        Path component (class folder name) of the class to resample.
    n_samples : int, default 1372
        Number of samples the class has after resampling.
    random_state : int, default 0
        Seed forwarded to ``resample`` so the draw is reproducible.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The new file paths and their row-aligned labels.

    Raises
    ------
    ValueError
        If no path contains ``feature_name`` as a component.
    """
    filenames = np.asarray(filenames)
    targets = np.asarray(targets)

    # Boolean mask of the samples belonging to the requested class. A mask
    # replaces the repeated `i not in list` look-ups, which were quadratic.
    # os.sep is normalised to '/' so paths using the platform separator match.
    is_selected = np.array(
        [feature_name in path.replace(os.sep, '/').split('/') for path in filenames],
        dtype=bool,
    )
    if not is_selected.any():
        # Fail early with a clear message instead of an obscure error from
        # resampling an empty selection.
        raise ValueError(
            "No file path contains a '%s' component; nothing to upsample." % feature_name
        )

    # Upsample
    resampled_x, resampled_y = resample(
        filenames[is_selected], targets[is_selected],
        n_samples=n_samples, random_state=random_state,
    )

    # Keep the untouched classes first, then append the resampled class
    new_filenames = np.concatenate([filenames[~is_selected], resampled_x])
    new_targets = np.concatenate([targets[~is_selected], resampled_y])

    return new_filenames, new_targets
    
# Resample the two non-nevus classes one after the other; each pass feeds the
# output of the previous one back into upsample().
upsample_train_x, upsample_train_y = train_filenames, train_targets
for minority_key in (0, 2):
    upsample_train_x, upsample_train_y = upsample(
        upsample_train_x, upsample_train_y, feature_names[minority_key]
    )

plot_n_samples(upsample_train_x)

Downsampling function for imbalanced data

In [None]:
# Use only if not using the up-sampling function
def downsample(filenames, targets, n_samples = 370, feature_name = 'nevus', random_state = 0):
    """Keep at most ``n_samples`` randomly chosen samples of one class.

    Samples whose path contains ``feature_name`` as a whole path component are
    thinned out to ``n_samples`` entries; all other samples are kept. The
    relative order of the surviving samples is preserved.

    Parameters
    ----------
    filenames : array-like of str
        File paths; the class is identified by a path component.
    targets : array-like
        Labels, row-aligned with ``filenames``.
    n_samples : int, default 370
        Maximum number of samples of the class to keep.
    feature_name : str, default 'nevus'
        Path component (class folder name) of the class to thin out.
    random_state : int, default 0
        Seed of the permutation that picks the survivors.

    Returns
    -------
    (np.ndarray, np.ndarray)
        The kept file paths and their row-aligned labels.
    """
    filenames = np.asarray(filenames)
    targets = np.asarray(targets)

    # Find all the samples of the class to thin out
    is_selected = np.array(
        [feature_name in path.replace(os.sep, '/').split('/') for path in filenames],
        dtype=bool,
    )
    selected_idx = np.flatnonzero(is_selected)

    # A seeded NumPy permutation makes the kept subset reproducible.
    rng = np.random.RandomState(random_state)
    kept_idx = rng.permutation(selected_idx)[:n_samples]

    # Downsample: keep every other class plus the chosen survivors
    keep = ~is_selected
    keep[kept_idx] = True

    return filenames[keep], targets[keep]

# Usage (left disabled on purpose, see the note above the function):
# downsample_train_x, downsample_train_y = downsample(train_filenames, train_targets)
# plot_n_samples(downsample_train_x)

In [None]:
from keras.preprocessing import image   

# Convert the image paths to tensors Manually
def path_to_tensor(img_path, target_size=(224, 224)):
    """Load one image file as a 4D array with a leading axis of size 1.

    Parameters
    ----------
    img_path : str
        Path of the image file to load.
    target_size : tuple of int, default (224, 224)
        Size forwarded to ``image.load_img`` as ``target_size``.

    Returns
    -------
    np.ndarray
        The image with one extra leading axis; shape (1, 224, 224, 3) with the
        default ``target_size``.
    """
    # loads RGB image as PIL.Image.Image type
    img = image.load_img(img_path, target_size=target_size)
    # convert PIL.Image.Image type to 3D tensor
    x = image.img_to_array(img)
    # add the leading axis so single images can later be stacked into a batch
    return np.expand_dims(x, axis=0)

def paths_to_tensor(img_paths):
    """Load every image in ``img_paths`` and stack the results along the first axis."""
    per_image = []
    for single_path in tqdm(img_paths):
        per_image.append(path_to_tensor(single_path))
    # Each entry already has a leading axis of size 1, so stacking vertically
    # yields one row per image.
    return np.vstack(per_image)


# NOTE(review): from here on `train_filenames` no longer holds file paths; it is
# rebound to the stacked image tensor. Cells above that expect paths must not
# be re-run without first re-running the loading cell.
train_filenames, train_targets = paths_to_tensor(upsample_train_x), upsample_train_y