In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

data = pd.read_csv('../input/train_labels.csv')
sub = pd.read_csv('../input/sample_submission.csv')
train_path = '../input/train/'
test_path = '../input/test/'

In [2]:
# Identify Outliers 
import os
from scipy import ndimage
from tqdm import tqdm_notebook # display a progress bar 

dark_thres = 10 / 255      # If no pixel reaches this threshold, image is considered too dark 
bright_thres = 245 / 255   # If no pixel is under this threshold, image is considerd too bright
too_dark_idx = []
too_bright_idx = []

for i, idx in tqdm_notebook(enumerate(data['id']), '({} images)'.format(len(data))):
    img_path = os.path.join(train_path, idx)
    imagearray = (ndimage.imread(img_path + '.tif')/255).reshape(-1,3) # Normalized to 0~1
    # is this too dark
    if(imagearray.max() < dark_thres):
        too_dark_idx.append(idx)
    # is this too bright
    if(imagearray.min() > bright_thres):
        too_bright_idx.append(idx)

HBox(children=(IntProgress(value=1, bar_style='info', description='(220025 images)', max=1, style=ProgressStyl…

`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
  del sys.path[0]





In [3]:
# Remove outliers
data = data.set_index('id')
data = data.drop(labels=too_dark_idx, axis=0)
data = data.drop(labels=too_bright_idx, axis=0)

train_names = data.index.values
train_labels = np.asarray(data['label'].values)
train_dict = {'id':  train_names, 'label': train_labels}
train_data = pd.DataFrame(data=train_dict)

In [4]:
# Splitting data into train and validation
train_samples, validation_samples = train_test_split(train_data, stratify=train_data.label, test_size=0.2)
len(train_samples), len(validation_samples)

(176014, 44004)

In [5]:
### Image Augmentation ###########
import random
import cv2
from sklearn.utils import shuffle


cols = 96
rows = 96

#    RANDOM_ROTATION = 5 # range (0-180), 180 allows all rotation variations, 0=no change

def random_rotation(img,RANDOM_ROTATION = 5):
    #random rotation
    rotation = np.random.randint(-RANDOM_ROTATION,RANDOM_ROTATION)
    M = cv2.getRotationMatrix2D((48,48),rotation,1)   # the center point is the rotation anchor
    img = cv2.warpAffine(img,M,(cols,rows))
    return img
def random_translation(img, RANDOM_TRANS = 5):
    #random x, y translation
    transX, transY = np.random.randint(-RANDOM_TRANS, RANDOM_TRANS, 2)
    M = np.float32([[1,0,transX],[0,1,transY]])
    img = cv2.warpAffine(img,M,(cols,rows))
    return img
def random_contrast_and_brightness(img, RANDOM_CONTRAST = 0.02, RANDOM_BRIGHTNESS = 0.03):
    # random contrast and brightness 
    random_contrast = 1+np.random.uniform(-RANDOM_CONTRAST,RANDOM_CONTRAST)
    random_bright = np.random.uniform(-RANDOM_BRIGHTNESS,RANDOM_BRIGHTNESS)
    img = img*random_contrast +random_bright
    img= np.clip(img,0,1.0)
    return img
    
    

In [6]:
### Python generator function
def generator(samples, batch_size=64, aug_flag=True):
    num_samples = len(samples)
    while 1: # Loop forever so the generator never terminates
        shuffle(samples)
        samples_df = samples.set_index('id')
        img_ids = samples_df.index.values
        img_labels = np.asarray(samples_df['label'].values)
        for offset in range(0, num_samples, batch_size):
            batch_samples = img_ids[offset:offset+batch_size]
            batch_labels = img_labels[offset:offset+batch_size]

            images = []
            labels = []
            for batch_sample, batch_label in zip(batch_samples,batch_labels):
                image = ndimage.imread(train_path+batch_sample+'.tif')
                image = image /255.0
                images.append(image)
                labels.append(batch_label)
                if aug_flag:
                    # flip horizontally
                    image = np.fliplr(image)
                    images.append(image)
                    labels.append(batch_label)
                    # flip vertically
                    image = np.flipud(image)
                    images.append(image)
                    labels.append(batch_label)
                    # random rotation
                    image =random_rotation(image)
                    images.append(image)
                    labels.append(batch_label)
                    # random translation
                    image = random_translation(image)
                    images.append(image)
                    labels.append(batch_label)
                    # random contrast and brightness
                    image = random_contrast_and_brightness(image)
                    images.append(image)
                    labels.append(batch_label)
                
            X_train = np.array(images)
            y_train = np.array(labels)
            yield shuffle(X_train, y_train)                

            

In [7]:
# Train and validation generator
batch_size = 64
train_generator = generator(train_samples, batch_size,aug_flag=True)
validation_generator = generator(validation_samples, batch_size,aug_flag=True)

In [8]:
# Import keras functions
from keras.models import Sequential
from keras.models import model_from_json
from keras.models import load_model
from keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger
from keras.utils.vis_utils import plot_model
from keras.layers import Dense, Dropout, Flatten, BatchNormalization, Activation, Lambda
from keras.layers import Conv2D, MaxPool2D

Using TensorFlow backend.


In [9]:
######### CNN from Scratch #######################

model = Sequential()

## Convolutional Layer 1 
model.add(Conv2D(32,kernel_size = (3,3), strides=(1,1),padding='same', input_shape =(96,96,3)))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPool2D((2,2)))
## Convolutional Layer 2 
model.add(Conv2D(64,kernel_size = (3,3), strides=(1,1),padding='same'))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPool2D((2,2)))
## Convolutional Layer 3 
model.add(Conv2D(128,kernel_size = (3,3), strides=(1,1),padding='same'))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPool2D((2,2)))
## Convolutional Layer 4 
model.add(Conv2D(256,kernel_size = (3,3), strides=(1,1),padding='same'))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPool2D((2,2)))
## Convolutional Layer 5
model.add(Conv2D(512,kernel_size = (3,3), strides=(1,1),padding='same'))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(MaxPool2D((2,2)))
model.add(Flatten())
## Full-connected Layer 1
model.add(Dense(256))
model.add(BatchNormalization())
model.add(Activation("relu"))
model.add(Dropout(0.3))
## Output Layer 
model.add(Dense(1, activation = "sigmoid"))





Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [10]:
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
print(model.summary()) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 96, 96, 32)        896       
_________________________________________________________________
batch_normalization_1 (Batch (None, 96, 96, 32)        128       
_________________________________________________________________
activation_1 (Activation)    (None, 96, 96, 32)        0         
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 48, 48, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 48, 48, 64)        18496     
_________________________________________________________________
batch_normalization_2 (Batch (None, 48, 48, 64)        256       
_________________________________________________________________
activation_2 (Activation)    (None, 48, 48, 64)        0         
__________

In [11]:
# Specify loss functions, optimizer, and metrics
import keras
model.compile(loss=keras.losses.binary_crossentropy,
              optimizer=keras.optimizers.Adam(0.001), 
              metrics=['accuracy'])

In [12]:
### Save the best model 
checkpoint = ModelCheckpoint('model.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=False, mode='auto', period=1)

### Stop training when val_loss has stopped improving
earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=2, verbose=1, mode='auto')

### Stream epoch results to a csv file
csv_logger = CSVLogger('training.log')

In [13]:
# Training
model.fit_generator(train_generator, steps_per_epoch=len(train_samples)//batch_size, \
                    validation_data=validation_generator,validation_steps=len(validation_samples)//batch_size, \
                    nb_epoch=8,verbose=1,callbacks = [checkpoint,earlystop,csv_logger])

Instructions for updating:
Use tf.cast instead.


  
  


Epoch 1/8


`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
  app.launch_new_instance()



Epoch 00001: val_loss improved from inf to 0.28803, saving model to model.h5
Epoch 2/8

Epoch 00002: val_loss improved from 0.28803 to 0.27239, saving model to model.h5
Epoch 3/8

Epoch 00003: val_loss did not improve from 0.27239
Epoch 4/8

Epoch 00004: val_loss did not improve from 0.27239
Epoch 00004: early stopping


<keras.callbacks.History at 0x7f092eb6f898>

In [14]:
# Predict training labels
train_samples_df = train_samples.set_index('id')
img_ids = train_samples_df.index.values
y_train_true = np.asarray(train_samples_df['label'].values)
y_train_pred = []
for idx in img_ids:
    image = ndimage.imread(train_path+idx+'.tif')
    predicted_label = model.predict(np.expand_dims(image/255.0, axis=0))[0][0]
    y_train_pred.append(predicted_label)
y_train_pred = np.asarray(y_train_pred)

`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
  import sys


In [15]:
from sklearn.metrics import roc_curve, auc

# Training auc-roc score
# Compute false positive rate and  true positive rate
fpr, tpr, _ = roc_curve(y_train_true, y_train_pred)

# Compute ROC-AUC score
roc_auc = auc(fpr, tpr)
print('ROC-AUC score for training set is {}'.format(roc_auc))

ROC-AUC score for training set is 0.9037307389404272


In [16]:
# Predict validation labels

val_samples_df = validation_samples.set_index('id')
img_ids = val_samples_df.index.values
y_val_true = np.asarray(val_samples_df['label'].values)
y_val_pred = []
for idx in img_ids:
    image = ndimage.imread(train_path+idx+'.tif')
    predicted_label = model.predict(np.expand_dims(image/255.0, axis=0))[0][0]
    y_val_pred.append(predicted_label)
y_val_pred = np.asarray(y_val_pred)

`imread` is deprecated in SciPy 1.0.0.
Use ``matplotlib.pyplot.imread`` instead.
  


In [17]:
# Validation auc-roc score
# Compute false positive rate and  true positive rate
fpr, tpr, _ = roc_curve(y_val_true, y_val_pred)

# Compute ROC-AUC score
roc_auc = auc(fpr, tpr)
print('ROC-AUC score for validation set is {}'.format(roc_auc))

ROC-AUC score for validation set is 0.9004382701138233


In [18]:
from glob import glob 
import os

# Test set submissions
test_files = glob(os.path.join(test_path,'*.tif')) #find the test file names
submission = pd.DataFrame() #create a dataframe to hold results
file_batch = 5000 #we will predict 5000 images at a time
max_idx = len(test_files) #last index to use
for idx in range(0, max_idx, file_batch): #iterate over test image batches
    print("Indexes: %i - %i"%(idx, idx+file_batch))
    test_df = pd.DataFrame({'path': test_files[idx:idx+file_batch]}) #add the filenames to the dataframe
    test_df['id'] = test_df.path.map(lambda x: x.split('/')[3].split(".")[0]) #add the ids to the dataframe
    test_df['image'] = test_df['path'].map(ndimage.imread) #read the batch
    images = np.stack(test_df.image, axis=0) #convert to numpy array
    predicted_labels = [model.predict(np.expand_dims(image/255.0, axis=0))[0][0] for image in images]
    predictions = np.array(predicted_labels)
    test_df['label'] = predictions
    submission = pd.concat([submission, test_df[["id", "label"]]])  
submission.to_csv("submission.csv", index=False, header=True)

Indexes: 0 - 5000
Indexes: 5000 - 10000
Indexes: 10000 - 15000
Indexes: 15000 - 20000
Indexes: 20000 - 25000
Indexes: 25000 - 30000
Indexes: 30000 - 35000
Indexes: 35000 - 40000
Indexes: 40000 - 45000
Indexes: 45000 - 50000
Indexes: 50000 - 55000
Indexes: 55000 - 60000


In [19]:
submission.head(5) #display first 5 lines    

Unnamed: 0,id,label
0,d3a0e75dd14a12773d7ad53bda6f1e1c5ba97f5c,0.516965
1,bdb69de941bb1dedf3d15564b39a67dec276f701,0.000176
2,371816c763c118a62ac1f4139f45806167c7e88b,0.022199
3,d18e5df26368164b4cd531941e489f2f19a5302d,0.010909
4,d57c22c04cd9c20540edff394de0f50fcdf55d0d,0.900185
