# Introduction
This paper's dataset is taken from the Kaggle competition on Histopathologic Cancer Detection. It uses the PatchCamelyon (PCam) dataset, around 300k fixed-size histopathology (the study of tissue disease)
colored scans of lymph nodes all around the body. 

The specific challenge in the original dataset and competition is to train a model that can most accurately detect metastatic cancer.

The overall .zip file contains pictures and train-test csv files. The .csv files contains only two columns: id, and label, where the id contains the unique id or name of the picture, and the label determines whether the picture is indeed indicative of metastatic cancer.

**Note: This code was written in colab so installing dependencies and structuring directories might be different if you use, say, a local jupyter notebook.**

#Importing Packages

In [1]:
import pandas as pd
import numpy as np
import os
from google.colab import files
import cv2
from sklearn.utils import shuffle
import itertools
import shutil


np.random.seed(101)

#Downloading Dataset (Guide)

In [2]:
# Install a kaggle package to download the dataset
! pip install -q kaggle
! pip install --upgrade --force-reinstall --no-deps kaggle

Collecting kaggle
[?25l  Downloading https://files.pythonhosted.org/packages/fe/52/3d13208c0f24c72b886c400e94748076222d5ffa4913fb410af50cb09219/kaggle-1.5.9.tar.gz (58kB)
[K     |█████▋                          | 10kB 14.2MB/s eta 0:00:01[K     |███████████▎                    | 20kB 1.9MB/s eta 0:00:01[K     |████████████████▉               | 30kB 2.2MB/s eta 0:00:01[K     |██████████████████████▌         | 40kB 2.5MB/s eta 0:00:01[K     |████████████████████████████▏   | 51kB 2.1MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 1.9MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.9-cp36-none-any.whl size=73265 sha256=1f7e21c2db8550e69bc8d701d2645914f56f11ee1768deeee680e9d33b780198
  Stored in directory: /root/.cache/pip/wheels/68/6d/9b/7a98271454edcba3b56328cbc78c037286e787d004c8afee71
Successfully built kaggle
Installing collected packages: k

In [3]:
# Run this cell, then upload your "kaggle.json" file when prompted.
# This is basically a password linked to your account to allow you to download
# the dataset

files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"abdulqadiruneeb","key":"5ff8fc1364da100ae220f8324362b423"}'}

In [4]:
# Below is code to gain permission to download the dataset

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

In [5]:
# Download the desired dataset (in the default zip format)

! kaggle competitions download -c histopathologic-cancer-detection

Downloading histopathologic-cancer-detection.zip to /content
100% 6.30G/6.31G [01:31<00:00, 60.8MB/s]
100% 6.31G/6.31G [01:31<00:00, 73.7MB/s]


In [6]:
# Unzip and load the dataset onto your current directory
import zipfile
zip = zipfile.ZipFile('histopathologic-cancer-detection.zip')
zip.extractall()

In [7]:
# See image count in each folder
import os
print(len(os.listdir('../content/train')))
print(len(os.listdir('../content/test')))

220025
57458


In [8]:
# Create a Dataframe containing all images
# One column has the image ids, while the other has the label (0 for no cancer, and 1 for cancer)

data = pd.read_csv('../content/train_labels.csv')

# Removing this image because it causes a training error
data[data['id'] != 'dd6dfed324f9fcb6f93f46f32fc800f2ec196be2']

# Removing this image because it's black
data[data['id'] != '9369c7278ec8bcc6c880d99194de09fc2bd4efbe']

data.head(5)

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


#Image Augmentation

In [11]:
#Import packages
from skimage.transform import rotate, AffineTransform
import cv2
from skimage.util import random_noise
import random
import os
from skimage import io
from skimage import img_as_ubyte


def readCroppedImage(path, augmentations = True,ORIGINAL_SIZE=96, CROP_SIZE=90, RANDOM_ROTATION=3,
                     RANDOM_SHIFT = 2, RANDOM_BRIGHTNESS = 7, RANDOM_CONTRAST = 5,
                     RANDOM_90_DEG_TURN=1):
    '''
    This is a custom function to convert an input image, augment it through
    random rotation, random x or y shift, random cropping, random flipping, 
    random changes in brightness and contrast, and returning it as an rgb tensor.

    # AUGMENTATION VARIABLES
    ORIGINAL_SIZE       Original size of the images 
    CROP_SIZE           Final size after crop
    RANDOM_ROTATION     Range (0-180), 180 allows all rotation variations, 0=no change
    RANDOM_SHIFT        Center crop shift in x and y axes, 0=no change. This cannot be more than (ORIGINAL_SIZE - CROP_SIZE)//2 
    RANDOM_BRIGHTNESS   Range (0-100), 0=no change
    RANDOM_CONTRAST     Range (0-100), 0=no change
    RANDOM_90_DEG_TURN  0 or 1= random turn to left or right

    '''
    # augmentations parameter is included for counting statistics from images, where we don't want augmentations
    
    # OpenCV reads the image in bgr format by default
    bgr_img = cv2.imread(path)
    # We flip it to rgb for visualization purposes
    b,g,r = cv2.split(bgr_img)
    rgb_img = cv2.merge([r,g,b])
    
    if(not augmentations):
        return rgb_img / 255
    
    #random rotation
    rotation = random.randint(-RANDOM_ROTATION,RANDOM_ROTATION)
    if(RANDOM_90_DEG_TURN == 1):
        rotation += random.randint(-1,1) * 90
    M = cv2.getRotationMatrix2D((48,48),rotation,1)   # the center point is the rotation anchor
    rgb_img = cv2.warpAffine(rgb_img,M,(96,96))
    
    #random x,y-shift
    x = random.randint(-RANDOM_SHIFT, RANDOM_SHIFT)
    y = random.randint(-RANDOM_SHIFT, RANDOM_SHIFT)
    
    # crop to center and normalize to 0-1 range
    start_crop = (ORIGINAL_SIZE - CROP_SIZE) // 2
    end_crop = start_crop + CROP_SIZE
    rgb_img = rgb_img[(start_crop + x):(end_crop + x), (start_crop + y):(end_crop + y)] / 255
    
    # Random flip
    flip_hor = bool(random.getrandbits(1))
    flip_ver = bool(random.getrandbits(1))
    if(flip_hor):
        rgb_img = rgb_img[:, ::-1]
    if(flip_ver):
        rgb_img = rgb_img[::-1, :]
        
    # Random brightness
    br = random.randint(-RANDOM_BRIGHTNESS, RANDOM_BRIGHTNESS) / 100.
    rgb_img = rgb_img + br
    
    # Random contrast
    cr = 1.0 + random.randint(-RANDOM_CONTRAST, RANDOM_CONTRAST) / 100.
    rgb_img = rgb_img * cr
    
    # clip values to 0-1 range
    rgb_img = np.clip(rgb_img, 0, 1.0)
    
    return img_as_ubyte(rgb_img)




In [12]:
#Augment test images randomly

images_path="train" #path to original images
augmented_path="train" # path to store augmented images
images=[] # to store paths of images from folder

for im in os.listdir(images_path):  # read image name from folder and append its path into "images" array     
    images.append(os.path.join(images_path,im))

images_to_generate=10000 #you can change this value according to your requirement

for i in range(images_to_generate):    
    image=random.choice(images)
    id = image[6:-4]
    label = data[data['id'] == id].iloc[0]['label']
    data = data.append({"id":'augmented_'+id,'label':label},ignore_index=True)
    transformed_image= readCroppedImage(image)
    new_image_path= "train/augmented_%s.tif" %(id)
    cv2.imwrite(new_image_path, transformed_image) # save transformed image to path

#Save new label file which has the augmented images
data.to_csv('new_train_labels.csv')

In [13]:
# Load the new csv that now includes the augmented images 
df_data = pd.read_csv('../content/new_train_labels.csv')

# Check the class distribution
print(df_data['label'].value_counts())


0    136811
1     93215
Name: label, dtype: int64


#Balance the target distribution
We will subset our original data into 160000 images half labelled 0, the other labelled 1.

In [14]:
IMAGE_SIZE = 96 #we have 96x96 images
IMAGE_CHANNELS = 3 #rgb images
SAMPLE_SIZE = 80000 # the number of images we use from each of the two classes

# take a random sample of class 0 with size equal to num samples in class 1
df_0 = df_data[df_data['label'] == 0].sample(SAMPLE_SIZE, random_state = 101)
# filter out class 1
df_1 = df_data[df_data['label'] == 1].sample(SAMPLE_SIZE, random_state = 101)

# concat the dataframes
df_data = pd.concat([df_0, df_1], axis=0).reset_index(drop=True)
# shuffle
df_data = shuffle(df_data)

df_data['label'].value_counts()

1    80000
0    80000
Name: label, dtype: int64

Train-Test-Validation Split

In [15]:
# train_test_split

df_train, df_val, df_test = np.split(df_data.sample(frac=1), [int(.6*len(df_data)), int(.8*len(df_data))])

print(df_train.shape)
print(df_test.shape)
print(df_val.shape)


(96000, 3)
(32000, 3)
(32000, 3)


In [16]:
# Check the training set counts
df_train['label'].value_counts()

1    48002
0    47998
Name: label, dtype: int64

In [17]:
#Check the test set counts
df_test['label'].value_counts()

1    16001
0    15999
Name: label, dtype: int64

In [18]:
# Check the validation set counts
df_val['label'].value_counts()

0    16003
1    15997
Name: label, dtype: int64

#Directory Structure for the Keras Model


In [19]:
# Create a new directory
base_dir = 'base_dir'
os.mkdir(base_dir)


#[CREATE FOLDERS INSIDE THE BASE DIRECTORY]

# now we create 2 folders inside 'base_dir':

# train_dir
    # a_no_tumor_tissue
    # b_has_tumor_tissue

# test_dir
    # a_no_tumor_issue
    # b_has_tumor_issue

# val_dir
    # a_no_tumor_tissue
    # b_has_tumor_tissue




# create a path to 'base_dir' to which we will join the names of the new folders
# train_dir
train_dir = os.path.join(base_dir, 'train_dir')
os.mkdir(train_dir)

# test_dir
test_dir = os.path.join(base_dir, 'test_dir')
os.mkdir(test_dir)

# val_dir
val_dir = os.path.join(base_dir, 'val_dir')
os.mkdir(val_dir)



# [CREATE FOLDERS INSIDE THE TRAIN AND VALIDATION FOLDERS]
# Inside each folder we create seperate folders for each class

# create new folders inside train_dir
no_tumor_tissue = os.path.join(train_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(train_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)


#create new folders inside test_dir
no_tumor_tissue = os.path.join(test_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(test_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)


# create new folders inside val_dir
no_tumor_tissue = os.path.join(val_dir, 'a_no_tumor_tissue')
os.mkdir(no_tumor_tissue)
has_tumor_tissue = os.path.join(val_dir, 'b_has_tumor_tissue')
os.mkdir(has_tumor_tissue)

In [20]:
# check that the folders have been created
print(os.listdir('base_dir/train_dir'))
print(os.listdir('base_dir/test_dir'))
print(os.listdir('base_dir/val_dir'))

['a_no_tumor_tissue', 'b_has_tumor_tissue']
['a_no_tumor_tissue', 'b_has_tumor_tissue']
['a_no_tumor_tissue', 'b_has_tumor_tissue']


#Transferring Images into new directories

In [21]:
# Set the id as the index in df_data
df_data.set_index('id', inplace=True)

In [None]:
# Get a list of train and val images
train_list = list(df_train['id'])
test_list = list(df_test['id'])
val_list = list(df_val['id'])


# Transfer the train images
count = 0
for image in train_list:
    count += 1
    # the id in the csv file does not have the .tif extension therefore we add it here
    fname = image + '.tif'
    # get the label for a certain image
    target = df_data.loc[image,'label']

    # these must match the folder names
    if target.any() == 0:
        label = 'a_no_tumor_tissue'
    if target.any() == 1:
        label = 'b_has_tumor_tissue'
    
    # source path to image
    src = os.path.join('../content/train', fname)
    # destination path to image
    dst = os.path.join(train_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)


# Transfer the test images
for image in test_list:
    
    # the id in the csv file does not have the .tif extension therefore we add it here
    fname = image + '.tif'
    # get the label for a certain image
    target = df_data.loc[image,'label']
    
    # these must match the folder names
    if target.any() == 0:
        label = 'a_no_tumor_tissue'
    if target.any() == 1:
        label = 'b_has_tumor_tissue'
    

    # source path to image
    src = os.path.join('../content/train', fname)
    # destination path to image
    dst = os.path.join(test_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)
# Transfer the val images

for image in val_list:
    
    # the id in the csv file does not have the .tif extension therefore we add it here
    fname = image + '.tif'
    # get the label for a certain image
    target = df_data.loc[image,'label']
    
    # these must match the folder names
    if target.any() == 0:
        label = 'a_no_tumor_tissue'
    if target.any() == 1:
        label = 'b_has_tumor_tissue'
    

    # source path to image
    src = os.path.join('../content/train', fname)
    # destination path to image
    dst = os.path.join(val_dir, label, fname)
    # copy the image from the source to the destination
    shutil.copyfile(src, dst)

In [None]:
# check how many train images we have in each folder
print(len(os.listdir('base_dir/train_dir/a_no_tumor_tissue')))
print(len(os.listdir('base_dir/train_dir/b_has_tumor_tissue')))


In [None]:
# check how many test images we have in each folder
print(len(os.listdir('base_dir/test_dir/a_no_tumor_tissue')))
print(len(os.listdir('base_dir/test_dir/b_has_tumor_tissue')))


In [None]:
# check how many val images we have in each folder
print(len(os.listdir('base_dir/val_dir/a_no_tumor_tissue')))
print(len(os.listdir('base_dir/val_dir/b_has_tumor_tissue')))