In [None]:
import pandas as pd
from pathlib import Path

In [None]:
import numpy as np
import pandas as pd
import os
import cv2
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import random
from sklearn.utils import shuffle
from tqdm import tqdm_notebook
#https://pythonhosted.org/keras-tqdm/
import math
from keras_preprocessing.image import ImageDataGenerator
import keras
from keras.models import Sequential
from keras.layers import *
from keras.optimizers import RMSprop,Adam
import shutil


In [None]:
# Image folders of the competition data, relative to the working directory.
train_path = "../input/histopathologic-cancer-detection/train/"
test_path = "../input/histopathologic-cancer-detection/test/"

# Count the directory entries in each folder as a quick sanity check.
n_train_files = len(os.listdir(train_path))
print('Training Images:', n_train_files)
n_test_files = len(os.listdir(test_path))
print('Testing Images: ', n_test_files)


In [None]:
# Load the training labels. The path is relative to the working directory,
# like every other input path in this notebook, rather than pinned to the
# absolute /kaggle mount point.
train_data = pd.read_csv('../input/histopathologic-cancer-detection/train_labels.csv')

# Number of rows per value of the 'label' column (displayed as the cell output).
train_data['label'].value_counts()


In [None]:
# Read the sample submission to obtain the test image ids. dtype=str keeps
# every column as text, so the ids can later be string-concatenated with a
# file extension.
test_data = pd.read_csv("../input/histopathologic-cancer-detection/sample_submission.csv", dtype=str)


In [None]:
train_data.info()


In [None]:
test_data.info()


In [None]:
train_data.head()


In [None]:
test_data.head()


In [None]:
# Give every id a '.tif' extension; the 'id' column is what the generators
# later use as x_col to look up image files.
# The suffix is added only where it is missing, so the cell is idempotent:
# re-running it no longer produces ids that end in '.tif.tif'.
train_data['id'] = train_data['id'].where(
    train_data['id'].str.endswith('.tif'), train_data['id'] + '.tif')
test_data['id'] = test_data['id'].where(
    test_data['id'].str.endswith('.tif'), test_data['id'] + '.tif')
print(train_data.head())


In [None]:
print(test_data.head())


In [None]:
train_data.shape


In [None]:
SAMPLE_SIZE = 10000

# Draw the same number of rows from each label so the subset is balanced.
# A fixed random_state makes the draw repeatable.
df_normal = train_data[train_data['label'] == 0].sample(SAMPLE_SIZE, random_state=42)
df_cancer = train_data[train_data['label'] == 1].sample(SAMPLE_SIZE, random_state=42)

# Stack both classes into a single frame with a fresh 0..N-1 index.
df_subset = pd.concat([df_normal, df_cancer], axis=0).reset_index(drop=True)

# Shuffle so the rows are no longer grouped by class. The shuffle is seeded as
# well; without random_state the row order differed on every run.
# (`shuffle` comes from the notebook's import cell, so it is not re-imported here.)
train_data_subset = shuffle(df_subset, random_state=42)

train_data_subset.head()


In [None]:
train_data_subset.info()


In [None]:
#### We can now split the dataset into train and validation ####

### Here we split the data into TRAIN and VALIDATION ###
from sklearn.model_selection import train_test_split

def split_data(df_train, test_size=0.02, random_state=42):
    """Split a labelled frame into stratified training and validation parts.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Frame holding an 'id' column and a 'label' column.
    test_size : float or int, default 0.02
        Forwarded to ``train_test_split`` as the size of the validation part.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(df_train, df_valid, train_list, valid_list)``: the two frames
        followed by the lists of their 'id' values.

    Notes
    -----
    The input frame is left untouched. The previous version also called
    ``set_index('id', inplace=True)`` on the global ``train_data_subset``,
    which silently changed notebook state and made a second run of the cell
    fail because the 'id' column was gone.
    """
    # Separate names for the results keep the argument from being shadowed.
    train_part, valid_part = train_test_split(
        df_train,
        test_size=test_size,
        random_state=random_state,
        stratify=df_train['label'],  # keep the label ratio equal in both parts
    )

    train_list = list(train_part['id'])
    valid_list = list(valid_part['id'])

    return train_part, valid_part, train_list, valid_list
# Split the balanced subset and report the shape of each part.
df_train, df_valid, train_list, valid_list = split_data(train_data_subset)
for caption, part in (('df_train_shape', df_train), ('df_validation_shape', df_valid)):
    print(caption, part.shape)


In [None]:
# Cast every column to str.
# NOTE(review): presumably required because flow_from_dataframe with
# class_mode="categorical" expects string values in y_col; confirm against
# the Keras version in use.
df_train=df_train.astype(str)


In [None]:
# Cast every column to str, mirroring what is done for df_train.
# NOTE(review): presumably required because flow_from_dataframe with
# class_mode="categorical" expects string values in y_col; confirm against
# the Keras version in use.
df_valid=df_valid.astype(str)


In [None]:
df_valid.info()


In [None]:
df_train.info()


In [None]:
# Augmenting generator for the training images: rescale factor 1/255 plus
# random flips, rotation, shear, zoom and brightness changes, with
# fill_mode='reflect' for pixels introduced by the transforms.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=15,
    shear_range=0.2,
    zoom_range=0.2,
    brightness_range=[0.5, 1.5],
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='reflect',
)


In [None]:
# Validation images are only rescaled; no augmentation is configured.
validation_datagen = ImageDataGenerator(rescale=1./255)


In [None]:
# Test images are only rescaled; no augmentation is configured.
test_datagen = ImageDataGenerator(rescale=1./255)


In [None]:
bs = 64

# Take the sample counts from the frames that feed the generators instead of
# hard-coding 19600 / 400, so the step counts stay correct if the subset size
# or the split ratio is changed.
tr_size = len(df_train)
va_size = len(df_valid)

# math.ceil rounds up, so the final, partially filled batch is still counted.
tr_steps = math.ceil(tr_size / bs)
va_steps = math.ceil(va_size / bs)

# Shuffled batches of augmented training images, with labels encoded by
# class_mode="categorical".
train_generator = train_datagen.flow_from_dataframe(
    dataframe=df_train,
    directory=train_path,
    x_col="id",
    y_col="label",
    target_size=(96, 96),
    class_mode="categorical",
    batch_size=bs,
    shuffle=True,
    seed=1,
)


In [None]:
# Batches of rescaled validation images, read from the training image folder
# using the ids in df_valid; labels are encoded by class_mode="categorical".
valid_generator = validation_datagen.flow_from_dataframe(
    dataframe=df_valid,
    directory=train_path,
    x_col="id",
    y_col="label",
    target_size=(96, 96),
    class_mode="categorical",
    batch_size=bs,
    shuffle=True,
    seed=1,
)


In [None]:
# Unlabelled batches of rescaled test images. shuffle=False keeps the batches
# in a fixed order so predictions can be matched back to file names.
test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_data,
    directory=test_path,
    x_col="id",
    y_col=None,
    target_size=(96, 96),
    class_mode=None,
    batch_size=32,
    shuffle=False,
    seed=1,
)


In [None]:
def training_images(seed):
    """Plot up to 16 images from one training batch with their class captions.

    Parameters
    ----------
    seed : int
        Value passed to ``np.random.seed`` before the batch is drawn.

    Side effects
    ------------
    Resets the global ``train_generator``, consumes one batch from it and
    shows a matplotlib figure. Nothing is returned.
    """
    np.random.seed(seed)
    train_generator.reset()
    imgs, labels = next(train_generator)

    # Each label row is one-hot; argmax gives the class index per image.
    tr_labels = np.argmax(labels, axis=1)

    # The grid has 16 cells; never index past the end of a smaller batch.
    n_shown = min(16, len(imgs))

    plt.figure(figsize=(12,12))
    for i in range(n_shown):
        plt.subplot(4,4,i+1)
        plt.imshow(imgs[i,:,:,:])
        # Class index 1 is captioned 'Positive'. For two-class one-hot rows
        # this equals the earlier check that the first entry is 0.
        if tr_labels[i] == 1:
            plt.text(0, -5, 'Positive', color='r')
        else:
            plt.text(0, -5, 'Negative', color='b')
        plt.axis('off')
    plt.show()

training_images(1)


In [None]:
def _add_conv_stage(net, filters, dropout_rate, pool=True, input_shape=None):
    """Append one convolution stage to `net`.

    A stage is three 3x3 'same'-padded ReLU convolutions with `filters`
    channels, a Dropout(dropout_rate), an optional MaxPooling2D(pool_size=3)
    and a BatchNormalization, in that order. `input_shape`, when given, is
    passed to the first convolution only.
    """
    for position in range(3):
        first_layer_args = {}
        if position == 0 and input_shape is not None:
            first_layer_args['input_shape'] = input_shape
        net.add(Conv2D(filters=filters, kernel_size=3, padding='same',
                       activation='relu', **first_layer_args))
    net.add(Dropout(dropout_rate))
    if pool:
        net.add(MaxPooling2D(pool_size=3))
    net.add(BatchNormalization())


model = Sequential()

# Four stages with growing channel counts; the last stage is not pooled.
_add_conv_stage(model, 16, 0.2, input_shape=(96, 96, 3))
_add_conv_stage(model, 32, 0.2)
_add_conv_stage(model, 64, 0.2)
_add_conv_stage(model, 128, 0.3, pool=False)

# Classifier head.
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.2))
# The generators deliver one-hot labels for two mutually exclusive classes,
# so the two outputs should form a probability distribution. softmax gives
# that; two independent sigmoids did not, which also made the reported
# accuracy hard to interpret.
model.add(Dense(2, activation='softmax'))
model.summary()


In [None]:
epochs = 5


In [None]:
%%time

# Adam with an explicit (very small) learning rate.
# NOTE(review): 1e-6 is unusually low for a five-epoch run; confirm it is intended.
optimizer = Adam(learning_rate=0.000001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)

model.compile(optimizer=optimizer, loss=['binary_crossentropy'], metrics=['accuracy'])

# Use the notebook-level `epochs` setting instead of a second hard-coded 5, so
# changing it in one place actually changes the training length.
# NOTE(review): fit_generator is deprecated in newer Keras/TensorFlow releases
# in favour of fit; kept here to match the rest of the notebook.
h4 = model.fit_generator(
    train_generator,
    steps_per_epoch=tr_steps,
    epochs=epochs,
    validation_data=valid_generator,
    validation_steps=va_steps,
    verbose=1,
)


In [None]:
# Save the trained model. '.h4' was a typo for the HDF5 extension '.h5';
# Keras uses the file extension to choose the save format, and newer releases
# reject extensions they do not recognise.
model.save('cnn_v01.h5')


In [None]:
# Run the trained model over every batch of the test generator.
# test_generator was built with shuffle=False, so the prediction rows should
# line up with test_generator.filenames.
test_pred = model.predict_generator(test_generator)


In [None]:
print(test_pred[:5])


In [None]:
test_filenames = test_generator.filenames
test_filenames[ :5]


In [None]:
# Keep only the part of each file name before its first '.', i.e. the bare id.
test_filenames = [fname.partition(".")[0] for fname in test_filenames]


In [None]:
test_filenames[ :5]


In [None]:
len(test_filenames)


In [None]:
# Index of the highest-scoring output per test image, as a plain list.
classes = list(test_pred.argmax(axis=1))


In [None]:
classes[:5]


In [None]:
# Submission table: one row per test image id with its predicted class index.
submission_columns = {'id': test_filenames, 'label': classes}
submission = pd.DataFrame(submission_columns)
submission.head()


In [None]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
submission.to_csv("submission.csv", index = False)
