In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
import glob
import h5py
import shutil


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from pathlib import Path
import matplotlib.pylab as pl
import matplotlib.pyplot as plt
from itertools import product, starmap
from multiprocessing import Pool, Manager
from functools import partial

from skimage.io import imread, imsave
from skimage.transform import resize
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.applications import resnet50, mobilenet
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dense, Dropout
from tensorflow.keras.layers import Input, Flatten, Activation
from tensorflow.keras.optimizers import Adam, SGD, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.metrics import top_k_categorical_accuracy
from tensorflow.keras.preprocessing.image import ImageDataGenerator

from tensorflow.keras import backend as K
import tensorflow as tf


color = sns.color_palette()
%matplotlib inline
%config InlineBackend.figure_format="svg"

In [None]:
# Request a fixed seed for hash based operations in python.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = '0'

# Single seed value reused by the numpy / tensorflow seeding below
seed=1234

# Set the numpy seed
np.random.seed(seed)

# Set the global random seed in tensorflow
tf.random.set_seed(seed)

In [None]:
# List every jpg image found (recursively) in the two image folders.
# The folder paths are assigned here because this cell runs before the later
# cell that defines `folder1`/`folder2`; without them a fresh
# "Restart Kernel -> Run All" stops with a NameError.
folder1 = Path("cancer_data/HAM10000_images_part_1/")
folder2 = Path("cancer_data/HAM10000_images_part_2/")

folder1_images = list(folder1.rglob("*.jpg"))
folder2_images = list(folder2.rglob("*.jpg"))

print("Number of images in folder 1: ", len(folder1_images))
print("Number of images in folder 2: ", len(folder2_images))

In [None]:
# Read the given metadata csv into a dataframe, report its row count and
# column names, and display the first rows
data = pd.read_csv("cancer_data/HAM10000_metadata.csv")
print("Total number of data samples: ", len(data))
print("Columns: ", data.columns)
data.head()

In [None]:
# Check for duplicated entries: count the rows whose lesion_id already
# appeared in an earlier row
data.duplicated(['lesion_id']).sum()

In [None]:
# Check whether the duplicates also match on all of the listed columns
# (not only on lesion_id)
data.duplicated(['lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization']).sum()

In [None]:
# Select the rows flagged as duplicates on the listed columns and keep
# their index labels so they can be dropped
duplicates = data[data.duplicated(['lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization'])]
idx_to_drop = duplicates.index

In [None]:
# Drop the duplicates.
# drop_duplicates keeps the first row of every group, i.e. it removes exactly
# the rows that data.duplicated(...) flags on these columns. Unlike dropping
# by a previously computed index, it stays correct when this cell is re-run
# after the index has been reset.
data = data.drop_duplicates(
    subset=['lesion_id', 'dx', 'dx_type', 'age', 'sex', 'localization']
).reset_index(drop=True)
data.shape, data.head()

A random `train_test_split` with a ratio such as `80-20` or `85-15` works well in many situations. However, that is not the case here. Columns like `age`, `sex` and `localization` provide concrete information about a lesion, so we need to split the data more carefully. We will take the following approach:
* Define the split for each category
* Stratify the split according to localization

Why are we stratifying the split using `localization`? 
**This is to make sure that the distribution is the same across the training and validation sets.** A lesion on the ear looks different from one on the back. If your training data contains lesions on the back but none on the ears, and your validation data contains lesions on the ears, then your model will perform very poorly on the validation set.

In [None]:
# Define the split keeping in mind the number of training examples we have for each class.
# Maps each `dx` category to the training fraction used for that category.
split_dict = {'nv': 0.9, 'bkl': 0.85, 'mel': 0.85, 'bcc': 0.85, 'akiec': 0.85, 'vasc': 0.85, 'df': 0.85}

In [None]:
# Get stratified split for each category
def get_stratified_samples(df, 
                           cls, 
                           train_size,
                           min_samples=5,
                           sample=False, 
                           sample_count=None):
    """
    Create a stratified training/validation split for a single category.

    Rows of `df` whose `dx` equals `cls` are selected, localizations that
    occur fewer than `min_samples` times within that category are removed,
    and the remaining rows are split with `train_test_split`, stratified
    on `localization` and seeded with the notebook-level `seed`.

    Args:
        df          : dataframe with (at least) `dx` and `localization` columns
        cls         : category (value of `dx`) to consider
        train_size  : forwarded to `train_test_split` as `train_size`
        min_samples : min samples for a particular localization
                      within the category; rarer localizations are dropped
        sample      : when True, `sample_count` is mandatory
        sample_count: when truthy, the training split is resampled with
                      replacement to this many rows (this happens whenever a
                      count is given, regardless of `sample`)

    Returns:
        train_df   : training split set
        valid_df   : validation split set

    Raises:
        ValueError : if `sample` is True and `sample_count` is None
    """
    # Fail fast: reject inconsistent arguments before doing any filtering
    # or splitting work.
    if sample and sample_count is None:
        raise ValueError("Please provide an integer for sample count")

    cls_df = df[df["dx"]==cls]

    # Drop localizations that are too rare within this category
    counts = cls_df['localization'].value_counts()
    cat_to_remove = list(counts[counts < min_samples].keys())
    cls_df = cls_df[~(cls_df['localization'].isin(cat_to_remove))]
    cls_df = cls_df.reset_index(drop=True)
    
    train_df, test_df = train_test_split(cls_df, 
                                         train_size=train_size, 
                                         stratify=cls_df['localization'], 
                                         random_state=seed)

    # Optional resampling (with replacement) of the training split only
    if sample_count:
        train_df = train_df.sample(n=sample_count, replace=True, random_state=seed)
    
    train_df = train_df.reset_index(drop=True)
    test_df = test_df.reset_index(drop=True)
    return train_df, test_df

In [None]:
# Make a dataframe for training and validation
# that will contain all train and valid sets 
# for diff categories respectively.
# Both start empty, with the same columns as the metadata dataframe.
final_train_df = pd.DataFrame(columns=data.columns)
final_valid_df = pd.DataFrame(columns=data.columns)

In [None]:
# Do a split for each category.
# The per-category frames are collected in lists and concatenated once, so
# re-running this cell rebuilds the final frames instead of appending the
# same rows a second time, and the column dtypes of the splits are kept.
train_parts, valid_parts = [], []
for cls, train_size in split_dict.items():
    train_df, valid_df = get_stratified_samples(data, cls, train_size)
    train_parts.append(train_df)
    valid_parts.append(valid_df)

# Combine the per-category splits into the final train / validation frames
final_train_df = pd.concat(train_parts, ignore_index=True)
final_valid_df = pd.concat(valid_parts, ignore_index=True)
del train_df, valid_df, train_parts, valid_parts

In [None]:
# Count the samples (rows, columns) in the training and validation sets
final_train_df.shape, final_valid_df.shape

In [None]:
# Category count for training set (number of rows per `dx` value)
final_train_df.dx.value_counts()

In [None]:
# Category count for validation set (number of rows per `dx` value)
final_valid_df.dx.value_counts()

In [None]:
# Get the path where all the images are stored
# (the images are spread over two folders)
folder1 = Path("cancer_data/HAM10000_images_part_1/")
folder2 = Path("cancer_data/HAM10000_images_part_2/")

In [None]:
# Make directories for training and validation samples.
# NOTE: this rebinds `data`, which held the metadata DataFrame in the cells
# above, to the output directory path; re-run the loading cells before
# re-running anything that needs the DataFrame.
data = Path("data")
train = data / "train"
valid = data / "valid"

# exist_ok=True keeps this cell re-runnable when the directories are
# already on disk; parents=True creates any missing parent directory.
data.mkdir(exist_ok=True)
train.mkdir(parents=True, exist_ok=True)
valid.mkdir(parents=True, exist_ok=True)

## Segregation of images class-wise into different folders

In [None]:
def separate_images(df, cat="train"):
    """
    Copy the images listed in a dataframe into one sub-folder per class.

    For every class in `split_dict`, the `image_id` values of the matching
    rows are looked up as `<image_id>.jpg` in `folder1` first and `folder2`
    second, and the file is copied to `<train or valid>/<class>/`.
    Images found in neither folder are reported and skipped.
    
    Args:
        df   : dataframe containing data info (`dx` and `image_id` columns)
        cat  : "train" copies into the training directory; any other
               value copies into the validation directory
    """
    main_dir = train if cat == "train" else valid

    # Build the lookups once: testing membership against the original lists
    # is a linear scan that would be repeated for every single image.
    folder1_lookup = set(folder1_images)
    folder2_lookup = set(folder2_images)
        
    for cls in split_dict: 
        subdir = main_dir / cls
        # exist_ok lets the function be called again without failing on
        # the class directories created by a previous call.
        subdir.mkdir(parents=True, exist_ok=True)
        print(f"Separating image data for {cls} class")
        
        images_list = df[df['dx']==cls]['image_id'].values
        print(f"Found {len(images_list)} images")
        print(f"Saving images in {str(subdir)}")
        
        for img in images_list:
            img_name = img + ".jpg"
            img1 = folder1 / img_name
            img2 = folder2 / img_name
            if img1 in folder1_lookup:
                shutil.copyfile(img1, subdir/img_name)
            elif img2 in folder2_lookup:
                shutil.copyfile(img2, subdir/img_name)
            else:
                print(f"{img_name} not found anywhere on the disk")
        print("="*50)

In [None]:
# Arrange training images category wise (copies them into the training directory)
separate_images(final_train_df, cat="train")

In [None]:
# Arrange validation images category-wise (copies them into the validation directory)
separate_images(final_valid_df, cat="valid")

## Augmentation

The dataset is pretty small and also **highly imbalanced**, hence we need to do augmentation. But this isn't so straightforward in this case. There are two strategies for augmentation:

* **On the fly augmentation:** This is normally done by applying random augmentation to each batch inside your data generator (or whatever your data generation pipeline is). The advantage of this approach is that it is super easy to set up and you are generating samples on the fly, hence no disk read-write overhead occurs. On the other hand, it has a pretty big disadvantage when the dataset is skewed: because you are augmenting randomly, there is no guarantee that you generate an equal number of samples for each class. Plus, the augmentation also happens for categories that might not need it.

* **Off-line augmentation:** This is the kind of augmentation where you generate augmented samples before training and save them on disk for later use. The read-write overhead is certainly an issue here, but the advantage is much bigger: you now have control over which categories to augment and how many augmented samples to generate for a particular class, so that the classes are balanced after augmentation.

Here we will use offline augmentation because we have a huge imbalance between classes. For augmentation, we will use the `ImageDataGenerator` class in `tf.keras`. 

In [None]:
# Define data generators 
def get_data_generator():
    """Return a fresh `ImageDataGenerator` configured with the augmentation settings below."""
    # Every augmentation setting in one place, keyed by the
    # ImageDataGenerator argument it is passed to.
    augmentation_options = {
        "brightness_range": (0.3, 1.1),
        "rotation_range": 60,
        "shear_range": 0.2,
        "width_shift_range": 0.2,
        "height_shift_range": 0.2,
        "horizontal_flip": True,
        "vertical_flip": True,
        "zoom_range": 0.2,
        "fill_mode": "reflect",
    }
    return ImageDataGenerator(**augmentation_options)

In [None]:
# classes to augment (every category of `split_dict` except 'nv')
classes_to_aug = ['akiec', 'bcc', 'bkl', 'df', 'mel', 'vasc']

# some constants
# image height / width are used as the target size when generating augmented images
img_height, img_width, img_channels = 224,224,3
batch_size = 32
# number of categories listed in `split_dict`
nb_classes = 7

In [None]:
def augment_and_save(df, to_generate=5000, batch_size=16):
    """
    Generate augmented images offline for every class in `classes_to_aug`.

    For each class, the original training images are copied into a temporary
    staging directory (`aug_images/<cls>`), the generator returned by
    `get_data_generator()` is pointed at it, and batches are drawn until at
    least `to_generate - <number of originals>` augmented images have been
    written to `final_aug/<cls>`. The staging directory is removed after
    every class, also when an error occurs.

    Args:
        df          : training dataframe with `dx` and `image_id` columns
        to_generate : target number of images per class
                      (originals + augmented)
        batch_size  : number of images requested from the generator per batch
    """
    aug_dir = Path("aug_images")
    final_aug_dir = Path("final_aug")
    
    for cls in classes_to_aug:
        aug_path = aug_dir / cls
        save_path = final_aug_dir / cls

        # The generator reads every sub-folder of aug_dir, so the staging
        # directory must hold the current class only; clear anything left
        # behind by an interrupted run before creating it.
        if aug_dir.exists():
            shutil.rmtree(aug_dir)
        aug_path.mkdir(parents=True)
        save_path.mkdir(parents=True, exist_ok=True)
        
        orig_images = df[df["dx"]==cls]['image_id'].tolist()
        orig_count = len(orig_images)
        # Files already written by an earlier run count towards the target,
        # so calling the function again tops up instead of duplicating.
        # (assumes the generator names its files starting with save_prefix)
        already_generated = sum(1 for _ in save_path.glob("aug_*"))
        nb_images_to_gen = to_generate - orig_count - already_generated
        print(f"Category: {cls}  Images found: {orig_count}  nb_images_to_gen: {nb_images_to_gen}")

        try:
            # Nothing to augment from, or target already reached
            if orig_count == 0 or nb_images_to_gen <= 0:
                continue

            for img in orig_images:
                img = img + ".jpg"
                img_path = train / cls / img
                img_name = img_path.name
                shutil.copyfile(img_path, aug_path / img_name) 

            # get data generator
            data_gen = get_data_generator()
            image_gen = data_gen.flow_from_directory(aug_dir,
                                        save_to_dir=save_path,
                                        save_format='jpg',
                                        save_prefix = "aug_",
                                        target_size=(img_height, img_width),
                                        batch_size=batch_size)

            # Count what is actually produced: the last batch of each pass
            # over the directory can hold fewer than batch_size images, so a
            # fixed number of batches would generate too few images.
            generated = 0
            while generated < nb_images_to_gen:
                batch, _ = next(image_gen)
                if len(batch) == 0:
                    # Defensive stop: an empty batch would never advance the count
                    break
                generated += len(batch)
        finally:
            # Always remove the staging copy, even if copying/generation failed
            shutil.rmtree(aug_dir, ignore_errors=True)

In [None]:
# augment and save to disk, using the training split and the notebook-level batch size
augment_and_save(df=final_train_df, to_generate=5000, batch_size=batch_size)