In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# basics
import os
import time
import numpy as np

# EDA
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image

# Data preprocessing
import tensorflow as tf
import tensorflow_io as tfio
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

# Convolutional neural network
from keras.models import Sequential
from tensorflow.keras import layers, models

# helper functions
from sklearn.metrics import roc_auc_score
from tensorflow.keras.models import load_model


In [None]:
# Print list of files and directories in folder
input_dir = '/kaggle/input/histopathologic-cancer-detection'
list_l = [os.path.join(input_dir, x) for x in os.listdir(input_dir)]
list_l


In [None]:
# Set datasets and directory names
sample_data = pd.read_csv(list_l[0])
train_data = pd.read_csv(list_l[1])
train_dir = list_l[3] + '/'
test_dir = list_l[2] + '/'


In [None]:
# Cleaning
del list_l


In [None]:
def print_short_summary(name, data):
    """
    Prints data head, shape and info.
    Args:
        name (str): name of dataset
        data (dataframe): dataset in a pd.DataFrame format
    """
    print(name)
    print('\n1. Data head:')
    print(data.head())
    print('\n2. Data shape: {}'.format(data.shape))
    print('\n3. Data info:')
    data.info()
    
def print_number_files(dirpath):
    print('{}: {} files'.format(dirpath, len(os.listdir(dirpath))))


In [None]:
print_short_summary('Train data', train_data)


In [None]:
print_short_summary('Sample data', sample_data)


In [None]:
print_number_files(train_dir)


In [None]:
print_number_files(test_dir)


In [None]:
# Cleaning
del print_short_summary, print_number_files


In [None]:
# Plot horizontal barplot of number of records per label
plt.figure(figsize=(16, 9))
tmp = train_data['label'].value_counts()
sns.barplot(y=['No Cancer', 'Cancer'], x=tmp.values, orient='h')
plt.xlabel('Number of records')
plt.ylabel('Label')
plt.title('Number of records per label')
plt.show()


In [None]:
# Cleaning
del tmp


In [None]:
def get_images_to_plot(file_names):
    """
    Returns list of images
    Args:
        file_names: list of filenames
    Returns:
        list of image objects
    """
    return [Image.open(f) for f in file_names]

def get_image_label(dirname, data, labels, n = 5):
    """
    Return dictionary with label-imagepath
    Args:
        dirname: name of the directory
        data: dataset of file names
        labels: list of labels
        n (opt): number of images per label
    Returns:
        dict_img: dictionary with label-imagepath pairs
    """
    dict_img = {}
    for l in labels:
        indexes = data['label'] == l
        tmp = data[indexes][:n]
        tmp = dirname + tmp['id'] + '.tif'
        tmp = tmp.values
        tmp = get_images_to_plot(tmp)
        dict_img[l] = tmp
        
    return dict_img


In [None]:
# Print original image size
img_path = train_dir + train_data['id'][0] + '.tif'
img = Image.open(img_path)
print('Original image size: {}'.format(img.size))


In [None]:
# Get 5 filenames per label
data = get_image_label(train_dir,train_data, [0,1])


In [None]:
# Initialize subplots with 2 rows and 5 columns
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(16, 9))

# Loop through selected images and display in the respective rows
labels = ['No Cancer', 'Cancer']
for i in range(10):
    row = i // 5
    col = i % 5
    axes[row, col].imshow(data[row][col])
    axes[row, col].set_title(labels[row])
    axes[row, col].axis('off')

plt.tight_layout()
plt.show()


In [None]:
# Cleaning
del get_images_to_plot, get_image_label, img_path, img
del data, fig, axes, labels, row, col


In [None]:
# Majority class
no_cancer = train_data[train_data['label'] == 0]
# Minority class
cancer = train_data[train_data['label'] == 1]

# Downsample majority class to match minority class
no_cancer_downsampled = resample(no_cancer,
                              replace=False, 
                              n_samples=len(cancer),
                              random_state=0)

balanced_train_data = pd.concat([no_cancer_downsampled, cancer])

# Shuffle train data for training
balanced_train_data = balanced_train_data.sample(frac=1, random_state=0).reset_index(drop=True)


In [None]:
# Cleaning
del no_cancer, cancer, no_cancer_downsampled


In [None]:
# Get full path to image including extension
image_paths = train_dir + balanced_train_data['id'] + '.tif'
image_paths = image_paths.values

labels = balanced_train_data['label'].values

X_train, X_test, y_train, y_test = train_test_split(image_paths
                                                    , labels
                                                    , test_size = 0.25
                                                    , shuffle = True
                                                    , random_state = 0)


In [None]:
# Cleaning
del image_paths, labels


In [None]:
def get_decoded_image(image_path, label=None):
    """
    Load and preprocess images using TensorFlow I/O.
    Decode image with 4 channels RGBA.
    Resize image to 32x32px.
    Scale pixels from 0 to 1.
    Args:
        image_path: path to TIFF image
        label (optional): true label from train data
    Returns:
        (img, label): for train data
        img: for test data
    """
    img = tf.io.read_file(image_path)
    img = tfio.experimental.image.decode_tiff(img)
    img = tf.image.resize(img, [32, 32])
    img = tf.cast(img, tf.float32) / 255.0
    
    return img if label is None else (img, label)

def get_prefetched_data(data, batch_size):
    """
    Create a TensorFlow dataset from image paths and labels.
    Execution in parallel.
    Load, preprocess images and batch the data.
    Prefetch batches to improve training performance.
    Args:
        data (tuple): image paths and corresponding labels
        batch_size (int): number of samples per batch
    Returns:
        tf.data.Dataset: preprocessed and preloaded TensorFlow dataset for keras CNN
    """
    # Autotune the degree of parallelism during training
    AUTOTUNE = tf.data.experimental.AUTOTUNE
    
    # Create dataset from image paths and labels
    dataset = tf.data.Dataset.from_tensor_slices(data)
    
    # Apply parallel processing to load and preprocess images
    dataset = dataset.map(get_decoded_image, num_parallel_calls=AUTOTUNE)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=AUTOTUNE)
    
    return dataset


In [None]:
# Set 128 samples to be processed in each training step
BATCH_SIZE = 128

train_dataset = get_prefetched_data((X_train, y_train)
                                    , BATCH_SIZE)
test_dataset = get_prefetched_data((X_test, y_test)
                                   , BATCH_SIZE)


In [None]:
del X_train, y_train, X_test, y_test


In [None]:
def get_model_base():
    """
    Return base model architecture
    """
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 4))

        , layers.Flatten()

        , layers.Dense(32, activation='relu')

        , layers.Dense(1, activation='sigmoid')
    ])
    
    return model


In [None]:
def get_model_base_deep():
    """
    Return deeper model architecture
    """
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 4))
        # Add new convolutional layer
        , layers.Conv2D(32, (3, 3), activation='relu')

        , layers.Flatten()

        , layers.Dense(32, activation='relu')
        # Add new dense layer
        , layers.Dense(32, activation='relu')

        , layers.Dense(1, activation='sigmoid')
    ])
    
    return model


In [None]:
def get_model_base_wide():
    """
    Return wider model architecture
    """
    model_drop_bn = models.Sequential([
        # Increase number of units from 32 to 64
        layers.Conv2D(64, (3, 3), activation='relu', input_shape=(32, 32, 4))

        , layers.Flatten()
        
        # Increase number of units from 32 to 64
        , layers.Dense(64, activation='relu')
 
        , layers.Dense(1, activation='sigmoid')
    ])
    
    return model_drop_bn


In [None]:
def get_model_base_maxpool():
    """
    Return maxpool model architecture
    """
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 4))
        # Add new layer of max pooling
        , layers.MaxPooling2D((2, 2), strides = (2,2))

        , layers.Flatten()

        , layers.Dense(32, activation='relu')
        
        , layers.Dense(1, activation='sigmoid')
    ])
    
    return model


In [None]:
def get_model_base_dropout():
    """
    Return dropout model architecture
    """
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=(32, 32, 4))

        , layers.Flatten()

        , layers.Dense(32, activation='relu')
        
        # Add new dropout layer 
        , layers.Dropout(0.25)
        
        , layers.Dense(1, activation='sigmoid')
    ])
    
    return model


In [None]:
def get_compiled_model(func):
    """
    Create model to be trained with a multi-GPU strategy.
    Args:
        func: function to get model architecture
    Returns:
        compiled_model: tensorflow model that performs data parallelism
                            by copying all of the model's variables
                            to each processor
    """
    # Check if GPU is available
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        # Create a MirroredStrategy.
        strategy = tf.distribute.MirroredStrategy()

        print('Number of devices: {}'.format(strategy.num_replicas_in_sync))
    else:
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
        print('No GPU available, falling back to CPU.')

    with strategy.scope():
        compiled_model = func()
        compiled_model.compile(optimizer = tf.keras.optimizers.Adam()
                              , loss = tf.keras.losses.BinaryCrossentropy()
                              , metrics = [tf.keras.metrics.AUC()])

    return compiled_model


In [None]:
def plot_model_scores(scores, model_name):
    """
    Plot train and test ROC AUC scores of a model by epoch
    """
    train_scores, test_scores = scores
    epochs = range(1, len(train_scores) + 1)

    # Plot train and test scores
    plt.figure(figsize=(16, 9))
    plt.plot(epochs, train_scores, label='Train score')
    plt.plot(epochs, test_scores, label='Test score')
    plt.title('Train and test ROC AUC scores of the {}'.format(model_name))
    plt.xlabel('Epoch')
    plt.ylabel('ROC AUC Score')
    plt.legend()
    plt.grid(True)
    plt.show()

    
def get_model_results(model_name, model):
    """
    Return tuple of runtime, train and test scores.
    Compile, fit and save model along the way.
    Args:
        model: fitted model
    Returns:
        (runtime, (train_scores, test_scores) )
    """
    model = get_compiled_model(model)
    
    st = time.time()
    model.fit(train_dataset, epochs=5, validation_data=test_dataset)
    runtime = time.time() - st
    
    model.save('{}.h5'.format(model_name))
    
    train_scores = model.history.history['auc']
    test_scores = model.history.history['val_auc']
    
    tf.keras.backend.clear_session()
    
    return (runtime, (train_scores, test_scores))


In [None]:
# Get train and test scores of every epoch
runtime_base, scores_base = get_model_results('model_base',get_model_base)


In [None]:
# Plot scores
plot_model_scores(scores_base, 'base model')


In [None]:
# Get train and test scores of every epoch
runtime_base_deep, scores_base_deep = get_model_results('model_base_deep'
                                                          ,get_model_base_deep)


In [None]:
# Plot scores
plot_model_scores(scores_base_deep, 'base + additional layers model')


In [None]:
# Get train and test scores of every epoch
runtime_base_wide, scores_base_wide = get_model_results('model_base_wide'
                                                          ,get_model_base_wide)


In [None]:
# Plot scores
plot_model_scores(scores_base_wide, 'base + wider layers model')


In [None]:
# Get train and test scores of every epoch
runtime_base_maxpool, scores_base_maxpool = get_model_results('model_base_maxpool'
                                                                ,get_model_base_maxpool)


In [None]:
# Plot scores
plot_model_scores(scores_base_maxpool, 'base + max pooling model')


In [None]:
# Get train and test scores of every epoch
runtime_base_dropout, scores_base_dropout = get_model_results('model_base_dropout'
                                                                ,get_model_base_dropout)


In [None]:
# Plot scores
plot_model_scores(scores_base_dropout, 'base + dropout model')


In [None]:
# Print table results comparison
results = [('Base', runtime_base, scores_base)
          ,('Base + Add. layers', runtime_base_deep, scores_base_deep)
          ,('Base + Wider layers', runtime_base_wide, scores_base_wide)
          ,('Base + Max pooling', runtime_base_maxpool, scores_base_maxpool)
          ,('Base + Dropout', runtime_base_dropout, scores_base_dropout)]
table = []
for i in range(len(results)):
    tmp = {
            'model': results[i][0]
            , 'runtime (sec)': results[i][1]
            , 'train_roc_auc_score': results[i][2][0][-1]
            , 'test_roc_auc_score': results[i][2][1][-1]
        }
    table.append(tmp)


pd.DataFrame(table).sort_values(by = ['test_roc_auc_score'
                                      ,'runtime (sec)']
                                , ascending = [False, True])


In [None]:
# Cleaning
del results, tmp, table, train_dataset, test_dataset, train_data, train_dir, test_dir
del get_model_results, plot_model_scores, get_compiled_model


In [None]:
# Load top performed model
model = load_model('model_base_deep.h5')


In [None]:
# Create prefethed dataset of images to classify
submis_data = test_dir + sample_data['id'] + '.tif'
submis_data = submis_data.values

submis_dataset = get_prefetched_data((submis_data)
                                    , BATCH_SIZE)


In [None]:
# Set results
results = model.predict(submis_dataset)


In [None]:
# Create table of ids and labels like sample_submission
sample_data['label'] = np.ravel(np.round(results))


In [None]:
# Print submission table
sample_data


In [None]:
# Make submission
sample_data.to_csv('submission.csv', index=False)


In [None]:
# Cleaning
del submis_data, submis_dataset, sample_data
del get_decoded_image, get_prefetched_data
