# EDA Cervix Cancer 

In [None]:
# Dependencies
import numpy as np 
import pandas as pd
import seaborn as sns 
import cv2
import skimage.io as io
import keras
import os
import glob
import matplotlib.pyplot as plt
%matplotlib inline
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls 
py.init_notebook_mode(connected=True)
import random 
import shutil

from skimage.io import imread, imshow
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense, Reshape
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras import optimizers

from subprocess import check_output
# Print the output of `ls` on the training directory as a quick look at
# what it contains.
print(check_output(["ls", "/home/kray/Practicum/train"]).decode("utf8"))

from keras.models import Sequential
from keras.layers.core import Dense, Activation, Flatten, Dropout
from keras.layers.convolutional import Convolution2D, MaxPooling2D
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.optimizers import SGD
from sklearn.utils import shuffle
from keras.utils.np_utils import to_categorical
from keras import backend as K

In [None]:
# Number of images for each cervix type in train with plot.
# Code modified from Poonam Ligade, Intel cervical cancer EDA https://www.kaggle.com/poonaml/intel-cervical-cancer-eda 
train_dir = "/home/kray/Practicum/train"

# os.listdir replaces parsing the text output of `ls`: splitting the output
# for an empty directory gives [''], which used to be counted as one file.
# Dot-files are skipped so the counts match what `ls` would have listed.
sub_folders = sorted(
    name for name in os.listdir(train_dir)
    if not name.startswith('.') and os.path.isdir(os.path.join(train_dir, name))
)

count_dict = {}
for sub_folder in sub_folders:
    folder_path = os.path.join(train_dir, sub_folder)
    num_of_files = len([f for f in os.listdir(folder_path) if not f.startswith('.')])
    print("{0} photos of cervix type {1}".format(num_of_files, sub_folder))
    count_dict[sub_folder] = num_of_files

plt.figure(figsize=(12,4))
# x= / y= keywords are accepted by every seaborn release, whereas positional
# data arguments are rejected by newer ones.
sns.barplot(x=list(count_dict.keys()), y=list(count_dict.values()), alpha = 0.8)
plt.xlabel('Cervix types', fontsize = 11)
plt.ylabel('Number of images in train', fontsize = 11)
plt.title('train dataset')

plt.show()

In [None]:
# Number of test images.
test_dir = "/home/kray/Practicum/test/"

# Count directory entries directly: parsing `ls` output reports 1 for an
# empty directory because ''.split('\n') == ['']. Dot-files are skipped to
# match what `ls` lists.
num_test_files = len([f for f in os.listdir(test_dir) if not f.startswith('.')])
print("Number of test images present:", num_test_files)

In [None]:
# Show the first image of every cervix type found under the training folder.
# The previous version indexed the path string itself with ['type'], which
# raises TypeError; the class folders on disk are used instead.
train_dir = "/home/kray/Practicum/train"
cervix_types = sorted(
    name for name in os.listdir(train_dir)
    if not name.startswith('.') and os.path.isdir(os.path.join(train_dir, name))
)

fig = plt.figure(figsize = (12, 8))

for i, t in enumerate(cervix_types, start=1):
    image_paths = sorted(glob.glob(os.path.join(train_dir, t, '*')))
    if not image_paths:
        # Nothing to display for an empty class folder.
        continue
    # One column per class, all on a single row.
    ax = fig.add_subplot(1, len(cervix_types), i)
    f = image_paths[0]
    plt.imshow(plt.imread(f))
    plt.title('sample for cervix {}'.format(t))

In [None]:
# Image size distribution, barplot. 
# Code modified from Poonam Ligade, Intel cervical cancer EDA https://www.kaggle.com/poonaml/intel-cervical-cancer-eda 
import pprint
from collections import Counter

# Tally how many training images have each array shape. The name
# `different_file_sizes` was previously used here without ever being
# assigned in this notebook, so it is built in this cell.
# NOTE: every image is decoded in full, so this can take a while.
different_file_sizes = Counter()
for image_path in glob.glob('/home/kray/Practicum/train/*/*'):
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread returns None for files it cannot decode; skip those.
        continue
    different_file_sizes[str(im.shape)] += 1

pprint.pprint(dict(different_file_sizes))

plt.figure(figsize=(12,4))
if different_file_sizes:
    # Most common shape first; counts on x and shape labels on y.
    shape_labels, shape_counts = zip(*different_file_sizes.most_common())
    sns.barplot(x=list(shape_counts), y=list(shape_labels), alpha = 0.8)
plt.ylabel('Image size', fontsize = 11)
plt.xlabel('Number of images in train', fontsize = 11)
plt.title("Image sizes present in train dataset")
plt.show()

Split dataset into train and validation sets

In [None]:
#Split data into training and validation sets, keeping file organization. 
SPLIT_SEED = 42

# Sorting plus a seeded shuffle makes the split identical on every run.
# Because the copies below land in persistent directories, a different
# random split on each re-run would leave the same image in both the
# training and the validation folders.
allFiles = sorted(glob.glob('/home/kray/Practicum/train/*/*'))

split_point = int(round(0.7*len(allFiles)))
random.Random(SPLIT_SEED).shuffle(allFiles)

train_list = allFiles[:split_point]
valid_list = allFiles[split_point:]
print('Train images: {}'.format(len(train_list)))
print('Validation images {}'. format(len(valid_list)))

# Divide them into their own directories.
train_data_path = '/home/kray/Practicum/train_data'
validation_data_path = '/home/kray/Practicum/valid_data'


def _copy_into_split(file_paths, split_root):
    """Copy each file to <split_root>/<class folder>/<file name>."""
    for fpath in file_paths:
        class_dir = os.path.join(split_root, os.path.basename(os.path.dirname(fpath)))
        # exist_ok avoids the separate exists() check before creating.
        os.makedirs(class_dir, exist_ok=True)
        shutil.copyfile(fpath, os.path.join(class_dir, os.path.basename(fpath)))


# Create the split roots even when a split is empty.
os.makedirs(train_data_path, exist_ok=True)
os.makedirs(validation_data_path, exist_ok=True)

_copy_into_split(train_list, train_data_path)
_copy_into_split(valid_list, validation_data_path)

In [None]:
K.set_image_dim_ordering('tf')

SAMPLE_SIZE = 10        # only a small sample of files is loaded in this cell
SMALL_SHAPE = (32, 32)  # (width, height) handed to cv2.resize


def _load_resized(paths, size):
    """Read the images at `paths`, resize them to `size`, return (images, labels).

    The label of an image is the name of the folder that contains it.
    Files that cv2 cannot decode are skipped.
    """
    images, labels = [], []
    for path in paths:
        im = cv2.imread(path)
        if im is None:
            continue
        images.append(cv2.resize(im, size))
        labels.append(os.path.basename(os.path.dirname(path)))
    return images, labels


# Create numpy array
train_image_list, train_label_list = _load_resized(train_list[:SAMPLE_SIZE], SMALL_SHAPE)
valid_image_list, valid_label_list = _load_resized(valid_list[:SAMPLE_SIZE], SMALL_SHAPE)
print (train_label_list)

# Reshape
# train_list / valid_list are plain lists of file paths, so they have no
# .reshape(), and keras' Reshape is a layer, not an array function. The
# resized images are stacked into float32 arrays under new names, leaving
# the path lists intact for the cells that still read them.
train_array = np.array(train_image_list, dtype='float32').reshape(-1, SMALL_SHAPE[1], SMALL_SHAPE[0], 3)
valid_array = np.array(valid_image_list, dtype='float32').reshape(-1, SMALL_SHAPE[1], SMALL_SHAPE[0], 3)
print(train_array.shape)
print(valid_array.shape)


# Model

In [None]:
# Image dimensions in pixels, shared by the data generators' target_size
# and the model's input_shape.
img_width = 255
img_height = 255

In [None]:
# used to rescale the pixel values from [0, 255] to [0, 1] interval
datagen = ImageDataGenerator(rescale=1./255)

# automagically retrieve images and their classes for train and validation sets
# flow_from_directory expects a directory that holds one sub-folder per
# class, not a list of file paths, so it is pointed at the split
# directories. class_mode='categorical' yields one-hot labels, which is what
# the categorical_crossentropy loss used at compile time expects.
train_generator = datagen.flow_from_directory(
        train_data_path,
        target_size=(img_height, img_width),
        batch_size=16,
        class_mode='categorical')

validation_generator = datagen.flow_from_directory(
        validation_data_path,
        target_size=(img_height, img_width),
        batch_size=32,
        class_mode='categorical')

In [None]:
# Three convolution / pooling stages followed by a small dense classifier.
NUM_CLASSES = 3  # one output unit per cervix type

model = Sequential()
model.add(Convolution2D(32, 3, 3, input_shape=(img_width, img_height,3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(32, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Convolution2D(64, 3, 3))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# A single sigmoid unit cannot be trained with categorical_crossentropy on
# three classes; emit one softmax probability per class instead.
model.add(Dense(NUM_CLASSES))
model.add(Activation('softmax'))

In [None]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as a metric.
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [None]:
# Training settings: how many epochs to run and how many samples are drawn
# per epoch for training and for validation.
nb_validation_samples = 444
nb_train_samples = 1036
nb_epoch = 5

In [None]:
# Train from the directory generators. fit_generator needs a generator as
# its first argument (not the list of file paths), and the validation source
# is passed through the `validation_data` keyword; `valid_list` is not an
# argument it accepts.
model.fit_generator(
        train_generator,
        samples_per_epoch = nb_train_samples,
        nb_epoch = nb_epoch,
        validation_data = validation_generator,
        nb_val_samples = nb_validation_samples)

In [None]:
# second CNN model
# batch generator 

def batch_generator_train(files, batch_size, image_shape=None):
    """Yield (images, one-hot labels) batches forever, reshuffling each pass.

    Args:
        files: sequence of image paths. The name of the folder containing
            each file is assumed to end with the class digit (e.g. 'Type_1').
        batch_size: maximum number of images per batch.
        image_shape: (width, height) passed to cv2.resize. Defaults to the
            notebook-level (img_width, img_height).

    Yields:
        A tuple (image_array, mask_array). Each mask is a 3-element one-hot
        list: class '1', class '2', and everything else.

    Raises:
        ValueError: if `files` is empty, because no batch could ever be
            produced and the loop would otherwise yield empty arrays forever.
    """
    if len(files) == 0:
        raise ValueError("files must contain at least one image path")
    if image_shape is None:
        image_shape = (img_width, img_height)

    # Shuffle a private copy so the caller's list keeps its order.
    files = list(files)
    number_of_batches = int(np.ceil(len(files) / float(batch_size)))
    one_hot_by_type = {'1': [1, 0, 0], '2': [0, 1, 0]}
    counter = 0
    random.shuffle(files)
    while True:
        batch_files = files[batch_size*counter:batch_size*(counter+1)]
        image_list = []
        mask_list = []
        for f in batch_files:
            image = cv2.imread(f)
            if image is None:
                # cv2.imread returns None for files it cannot decode.
                continue
            image = cv2.resize(image, image_shape)

            # Take the class from the parent folder name rather than from a
            # fixed character offset, which breaks as soon as the path moves.
            cancer_type = os.path.basename(os.path.dirname(f))[-1:]
            mask = one_hot_by_type.get(cancer_type, [0, 0, 1])

            image_list.append(image)
            mask_list.append(mask)
        counter += 1
        image_list = np.array(image_list)
        mask_list = np.array(mask_list)

        yield image_list, mask_list

        if counter == number_of_batches:
            # A full pass is done: reshuffle and start over.
            random.shuffle(files)
            counter = 0

In [None]:
# used to rescale the pixel values from [0, 255] to [0, 1] interval
# Code modified from Siraj Raval, Intro to Deep Learning https://github.com/llSourcell/how_to_make_an_image_classifier/blob/master/demo.ipynb

datagen = ImageDataGenerator(rescale = 1./255)

# automagically retrieve images and their classes for train and validation sets
# Code modified from Siraj Raval, Intro to Deep Learning https://github.com/llSourcell/how_to_make_an_image_classifier/blob/master/demo.ipynb

# flow_from_directory expects a directory with one sub-folder per class, so
# the split directories are used; `validation_list` was never defined.
# class_mode='categorical' yields one-hot labels to match the
# categorical_crossentropy loss the models are compiled with.
train_generator = datagen.flow_from_directory(
        train_data_path,
        target_size=(img_height, img_width),
        batch_size=16,
        class_mode='categorical')

validation_generator = datagen.flow_from_directory(
        validation_data_path,
        target_size=(img_height, img_width),
        batch_size=32,
        class_mode='categorical')

In [None]:
# Simple CNN model.
# Code modified from libphy https://github.com/libphy/dldev/blob/master/MNIST-Keras.ipynb

def simple_cnn_model(num_classes=3):
    """Build and compile a one-convolution-layer CNN classifier.

    Args:
        num_classes: number of output classes; defaults to 3.

    Returns:
        A compiled Sequential model whose last layer is a softmax over
        `num_classes` units.
    """
    model = Sequential()
    model.add(Convolution2D(32, 3, 3, input_shape=(img_width, img_height,3), activation = 'relu'))
    model.add(MaxPooling2D(pool_size = (2, 2)))
    model.add(Dropout(0.2))
    model.add(Flatten())
    model.add(Dense(64, activation = 'relu'))
    # Output layer: without it the network ended in a 64-unit relu layer,
    # which categorical_crossentropy cannot be trained against.
    model.add(Dense(num_classes, activation = 'softmax'))
    # Compile model
    # Code modified from libphy https://github.com/libphy/dldev/blob/master/MNIST-Keras.ipynb

    model.compile(loss = "categorical_crossentropy", optimizer = 'adam', metrics = ['accuracy']) 
    return model

# Build model
model = simple_cnn_model()

In [None]:
# Create numpy array
# Build arrays that model.fit can consume: every image is resized to the
# model input size and scaled to [0, 1], and labels become one-hot rows.
class_index_by_digit = {'1': 0, '2': 1}  # any other folder suffix -> index 2

sample_images = []
sample_labels = []
for image_path in train_list[:10]:
    im = cv2.imread(image_path)
    if im is None:
        # cv2.imread returns None for files it cannot decode.
        continue
    sample_images.append(cv2.resize(im, (img_width, img_height)))
    # Assumes class folders are named like 'Type_1' - TODO confirm.
    im_type = os.path.basename(os.path.dirname(image_path))
    sample_labels.append(class_index_by_digit.get(im_type[-1:], 2))

train_image_list = np.array(sample_images, dtype='float32') / 255.0
train_label_list = to_categorical(sample_labels, 3)
print (train_label_list)
# Print the shape rather than the raw pixel values, which flood the output.
print(train_image_list.shape)

In [None]:
# Fit the model
model.fit(train_image_list, train_label_list, batch_size = 32, nb_epoch = 5, verbose = 1)

# Final evaluation of the model
# model.evaluate needs input and label arrays; it was being given two
# directory path strings. Score the model on the validation generator
# instead. scores[1] is the accuracy metric requested at compile time.
scores = model.evaluate_generator(validation_generator, nb_validation_samples)
print("CNN Error: %.2f%%" % (100-scores[1]*100))

In [None]:
# Save the trained weights, creating the target folder first so the write
# does not fail when 'models/' does not exist yet.
weights_path = 'models/basic_cnn_1_epoch.h5'
os.makedirs(os.path.dirname(weights_path), exist_ok=True)
model.save_weights(weights_path)

In [None]:
#Evaluating on validation set

In [None]:
# Computing loss and accuracy
# Scores the current model on batches drawn from validation_generator;
# nb_validation_samples is passed as the second positional argument.
# The call is left as the last expression so the notebook displays its result.
model.evaluate_generator(validation_generator, nb_validation_samples)

In [None]:
# Data augmentation: improve the model by applying random transformations to the training set.
# This reduces overfitting.

# Confusion Matrix

# Next Steps

1. Create an ensemble with pre-trained models: VGG16, ResNet_V2, and Inception_V3, then fine-tune the top layers of the pre-trained networks. 
https://github.com/tensorflow/models/blob/master/slim/README.md#Pretrained

2. Experiment with more k-folds on the training images

3. Explore bounding box annotations 

4. Train and test the improved model on the additional images and second release images

5. Incorporate additional visualizations