In [None]:
import pandas as pd
from pathlib import Path

In [None]:
#import necessary packages and training labels
import numpy as np
import pandas as pd

# Read the id/label table; dtype=str keeps every column as text at load time.
train_labels = pd.read_csv(
    '../input/histopathologic-cancer-detection/train_labels.csv',
    dtype=str,
)
print(train_labels.shape)


In [None]:
# Preview the first five rows of the label table.
train_labels.head(5)


In [None]:
# Show the dtype of every column in the label table.
train_labels.dtypes


In [None]:
# Convert the label column to floating point so it can be compared against 0 and 1 numerically.
train_labels['label'] = train_labels['label'].astype('float64')


In [None]:
import os
# Number of directory entries in the train folder, then in the test folder.
for image_dir in (
    '../input/histopathologic-cancer-detection/train/',
    '../input/histopathologic-cancer-detection/test/',
):
    print(len(os.listdir(image_dir)))


In [None]:
# Number of labelled rows.
train_labels.shape[0]


In [None]:
# Count how many rows carry each distinct label value.
train_labels['label'].value_counts()


In [None]:
# Pie chart of the label distribution in the full training table.
train_labels['label'].value_counts().plot.pie()


In [None]:
# Partition the table by label value: rows labelled 1 and rows labelled 0.
train_labels_pos = train_labels.loc[train_labels['label'].eq(1)]
train_labels_neg = train_labels.loc[train_labels['label'].eq(0)]


In [None]:
# Down-sample the negative rows to the same count as the positive rows.
# random_state pins the draw so the balanced set (and every split and model trained
# on it) is the same after a kernel restart; without it each run trains on different data.
train_labels_neg = train_labels_neg.sample(
    n=train_labels_pos.shape[0],
    random_state=12345,
)


In [None]:
# Sanity check: print the negative count, then the positive count.
print(len(train_labels_neg))
print(len(train_labels_pos))


In [None]:
# Stack the negative and positive rows, shuffle all of them with a fixed seed,
# and renumber the index from zero.
train_labels_balanced = (
    pd.concat([train_labels_neg, train_labels_pos])
    .sample(frac=1, random_state=12345)
    .reset_index(drop=True)
)
train_labels_balanced.head()


In [None]:
# Confirm the balanced set has the expected size: (number of rows, number of columns).
train_labels_balanced.shape


In [None]:
# Confirm the per-label row counts of the balanced set.
train_labels_balanced['label'].value_counts()


In [None]:
# Pie chart of the label distribution in the balanced set.
train_labels_balanced['label'].value_counts().plot.pie()


In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Load one training image (row 47, first column holds the id) and display it.
img = mpimg.imread(
    f'../input/histopathologic-cancer-detection/train/{train_labels_balanced.iloc[47,0]}.tif'
)
imgplot = plt.imshow(img)


In [None]:
# Dimensions of the image array loaded above.
print(np.shape(img))


In [None]:
# Pick 15 row labels from the balanced set to preview.
# A dedicated seeded generator makes the preview reproducible after a kernel restart,
# and replace=False stops the same image from appearing twice in the grid.
sample_imgs = np.random.RandomState(1234).choice(
    train_labels_balanced.index, 15, replace=False
)


In [None]:
# Draw the sampled images in a 5x3 grid, each titled with its label.
# The axes created by plt.subplots are used directly; the original re-requested every
# cell with plt.subplot() and overwrote `ax`, discarding the grid it had just built.
fig, axes = plt.subplots(5, 3, figsize=(20, 20))

for ax, row_label in zip(axes.flat, sample_imgs):
    # sample_imgs holds index labels, so look rows up by label and columns by name
    # rather than by position.
    image_id = train_labels_balanced.loc[row_label, 'id']
    img = mpimg.imread(f'../input/histopathologic-cancer-detection/train/{image_id}.tif')
    ax.imshow(img)
    lab = train_labels_balanced.loc[row_label, 'label']
    ax.set_title('Label: %s' % lab)

plt.tight_layout()


In [None]:
from sklearn.model_selection import train_test_split


In [None]:
# Hold out 25% of the balanced set for validation, stratified on the label,
# with a fixed seed so the split is repeatable.
train_df, valid_df = train_test_split(
    train_labels_balanced,
    test_size=0.25,
    random_state=1234,
    stratify=train_labels_balanced['label'],
)


In [None]:
#import tensorflow and keras as well as any necessary packages
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow import keras
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras import regularizers, optimizers
from keras.layers import PReLU
from keras.initializers import Constant

from tensorflow.keras.preprocessing.image import ImageDataGenerator


In [None]:
# Give every id the '.tif' extension so it matches the image file names on disk.
# An existing '.tif' suffix is stripped before appending, so re-running this cell is
# harmless (the original appended unconditionally and produced '....tif.tif').
# assign() returns new frames instead of writing a column into the frames handed back
# by train_test_split, which can trigger pandas' SettingWithCopyWarning.
train_df = train_df.assign(
    id=train_df['id'].str.replace(r'\.tif$', '', regex=True) + '.tif'
)
valid_df = valid_df.assign(
    id=valid_df['id'].str.replace(r'\.tif$', '', regex=True) + '.tif'
)


In [None]:
# Turn the labels into strings in both splits (they are passed as y_col with
# class_mode="binary" further down).
for split_df in (train_df, valid_df):
    split_df['label'] = split_df['label'].astype(str)


In [None]:
# Build the training and validation image streams.
# Pixel values are rescaled by 1/255; both streams share every option except the
# dataframe they read from.
train_datagen = ImageDataGenerator(rescale=1/255)

flow_options = dict(
    directory="../input/histopathologic-cancer-detection/train/",
    x_col="id",
    y_col="label",
    batch_size=64,
    seed=1234,
    shuffle=True,
    class_mode="binary",
    target_size=(96,96),
)

train_generator = train_datagen.flow_from_dataframe(dataframe=train_df, **flow_options)

valid_generator = train_datagen.flow_from_dataframe(dataframe=valid_df, **flow_options)


In [None]:
# First model: four blocks of two 3x3 convolutions (32, 64, 128, 256 filters), each
# block closed by max-pooling and batch normalisation, followed by a dense head with
# dropout before every dense layer. Optimised with Adam.
model = Sequential()

for stage, filters in enumerate((32, 64, 128, 256)):
    if stage == 0:
        # Only the very first convolution declares the input shape and pads to 'same'.
        model.add(Conv2D(filters, (3, 3), padding='same', input_shape=(96,96,3)))
    else:
        model.add(Conv2D(filters, (3, 3)))
    model.add(Activation('relu'))
    model.add(Conv2D(filters, (3, 3)))
    model.add(Activation('relu'))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(BatchNormalization())

model.add(Flatten())
for units in (512, 256, 64):
    model.add(Dropout(0.25))
    model.add(Dense(units))
    model.add(Activation('relu'))

# Single sigmoid unit for the binary decision.
model.add(Dropout(0.25))
model.add(Dense(1, activation='sigmoid'))
opt = tf.keras.optimizers.Adam(0.001)
model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])


In [None]:
# Print the architecture summary of the first model.
model.summary()


In [None]:
# Whole batches per epoch for each stream (floor division drops a trailing partial batch).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

# Train the first model for 30 epochs. Model.fit accepts the generators directly;
# fit_generator is deprecated in TF 2.x and removed in recent releases.
history = model.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=30,
    verbose=1,
)


In [None]:
# Second model: three blocks of five 3x3 convolutions (32, 64, 128 filters), each block
# closed by max-pooling and batch normalisation, followed by a dense head with dropout
# before every dense layer. Optimised with Adam.
model2 = Sequential()

for stage, filters in enumerate((32, 64, 128)):
    for depth in range(5):
        if stage == 0 and depth == 0:
            # Only the very first convolution declares the input shape and pads to 'same'.
            model2.add(Conv2D(filters, (3, 3), padding='same', input_shape=(96,96,3)))
        else:
            model2.add(Conv2D(filters, (3, 3)))
        model2.add(Activation('relu'))
    model2.add(MaxPooling2D(pool_size=(2, 2)))
    model2.add(BatchNormalization())

model2.add(Flatten())
for units in (512, 256, 64):
    model2.add(Dropout(0.25))
    model2.add(Dense(units))
    model2.add(Activation('relu'))

# Single sigmoid unit for the binary decision.
model2.add(Dropout(0.25))
model2.add(Dense(1, activation='sigmoid'))
opt = tf.keras.optimizers.Adam(0.001)
model2.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])


In [None]:
# Print the architecture summary of the second model.
model2.summary()


In [None]:
# Whole batches per epoch for each stream (floor division drops a trailing partial batch).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

# Train the second model for 30 epochs. Model.fit accepts the generators directly;
# fit_generator is deprecated in TF 2.x and removed in recent releases.
history2 = model2.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=30,
    verbose=1,
)


In [None]:
# Training vs. validation accuracy per epoch for the first model.
# The curves are accuracy, so the title and y-axis say so (they previously read "loss").
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
# Training vs. validation accuracy per epoch for the second model.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(history2.history[metric])
plt.title('model2 accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
# Third model: same layout as the second one (three blocks of five 3x3 convolutions
# with 32, 64, 128 filters, then the dense head), but optimised with RMSprop.
model3 = Sequential()

for stage, filters in enumerate((32, 64, 128)):
    for depth in range(5):
        if stage == 0 and depth == 0:
            # Only the very first convolution declares the input shape and pads to 'same'.
            model3.add(Conv2D(filters, (3, 3), padding='same', input_shape=(96,96,3)))
        else:
            model3.add(Conv2D(filters, (3, 3)))
        model3.add(Activation('relu'))
    model3.add(MaxPooling2D(pool_size=(2, 2)))
    model3.add(BatchNormalization())

model3.add(Flatten())
for units in (512, 256, 64):
    model3.add(Dropout(0.25))
    model3.add(Dense(units))
    model3.add(Activation('relu'))

# Single sigmoid unit for the binary decision.
model3.add(Dropout(0.25))
model3.add(Dense(1, activation='sigmoid'))
opt = tf.keras.optimizers.RMSprop(0.001)
model3.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])


In [None]:
# Print the architecture summary of the third model.
model3.summary()


In [None]:
# Whole batches per epoch for each stream (floor division drops a trailing partial batch).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

# Train the third model for 30 epochs. Model.fit accepts the generators directly;
# fit_generator is deprecated in TF 2.x and removed in recent releases.
history3 = model3.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=30,
    verbose=1,
)


In [None]:
# Training vs. validation accuracy per epoch for the third model.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(history3.history[metric])
plt.title('model3 accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
# Fourth model: three blocks of five 3x3 convolutions (32, 64, 128 filters) and the same
# dense head, with every ReLU replaced by a PReLU whose slope starts at 0.25.
# Optimised with RMSprop.
model4 = Sequential()

for stage, filters in enumerate((32, 64, 128)):
    for depth in range(5):
        if stage == 0 and depth == 0:
            # Only the very first convolution declares the input shape and pads to 'same'.
            model4.add(Conv2D(filters, (3, 3), padding='same', input_shape=(96,96,3)))
        else:
            model4.add(Conv2D(filters, (3, 3)))
        # A fresh PReLU layer (and initializer) per position, as in the unrolled version.
        model4.add(PReLU(alpha_initializer=Constant(value=0.25)))
    model4.add(MaxPooling2D(pool_size=(2, 2)))
    model4.add(BatchNormalization())

model4.add(Flatten())
for units in (512, 256, 64):
    model4.add(Dropout(0.25))
    model4.add(Dense(units))
    model4.add(PReLU(alpha_initializer=Constant(value=0.25)))

# Single sigmoid unit for the binary decision.
model4.add(Dropout(0.25))
model4.add(Dense(1, activation='sigmoid'))
opt = tf.keras.optimizers.RMSprop(0.001)
model4.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy'])


In [None]:
# Print the architecture summary of the fourth model.
model4.summary()


In [None]:
# Whole batches per epoch for each stream (floor division drops a trailing partial batch).
STEP_SIZE_TRAIN = train_generator.n // train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n // valid_generator.batch_size

# Train the fourth model for 30 epochs. Model.fit accepts the generators directly;
# fit_generator is deprecated in TF 2.x and removed in recent releases.
history4 = model4.fit(
    train_generator,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=30,
    verbose=1,
)


In [None]:
# Training vs. validation accuracy per epoch for the fourth model.
# The title names model4 (it previously said "model3", a copy-paste leftover).
plt.plot(history4.history['accuracy'])
plt.plot(history4.history['val_accuracy'])
plt.title('model4 accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'val'], loc='upper left')
plt.show()


In [None]:
# List the test image file names.
# os.listdir returns entries in arbitrary order, so sort them to make previews and the
# submission row order repeatable. Only '.tif' entries are kept because the submission
# step strips a 4-character extension from every name.
test_set = sorted(
    name
    for name in os.listdir('../input/histopathologic-cancer-detection/test/')
    if name.endswith('.tif')
)


In [None]:
# One-column frame of test file names, with the column called 'id'.
test_df = pd.DataFrame(test_set, columns=['id'])
test_df.head()


In [None]:
# Unlabelled, unshuffled stream over the test images with the same 1/255 rescaling;
# shuffle=False keeps predictions in the row order of test_df.
test_datagen = ImageDataGenerator(rescale=1/255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory="../input/histopathologic-cancer-detection/test/",
    x_col="id",
    batch_size=64,
    seed=1234,
    shuffle=False,
    class_mode=None,
    target_size=(96,96),
)


In [None]:
# Number of batches needed to visit every test image exactly once: ceil(n / batch_size),
# computed with integer arithmetic. The original used n/2, a float that ignores the
# generator's batch size and asks for far more batches than one pass provides, so the
# number of predictions could not match the number of test rows.
STEP_SIZE_TEST = (test_generator.n + test_generator.batch_size - 1) // test_generator.batch_size

# Model.predict accepts the generator directly; predict_generator is deprecated in
# TF 2.x and removed in recent releases.
preds = model4.predict(test_generator, steps=STEP_SIZE_TEST, verbose=1)

# One prediction per test image is required to build the submission.
assert len(preds) == test_generator.n, "prediction count does not match test set size"


In [None]:
# Threshold each predicted probability at 0.5 to get a hard 0/1 label.
predictions = [1 if pred >= 0.5 else 0 for pred in preds]

predictions[:10]


In [None]:
# Submission table: file name without its 4-character extension, plus the predicted label.
submission = test_df.assign(
    id=test_df['id'].str.slice(stop=-4),
    label=predictions,
)
submission.head()


In [None]:
# Write the submission file without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)
