In [None]:
import os
import sys
import random
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(color_codes=True)

from tqdm import tqdm
from skimage.io import imread, imshow
from skimage.transform import resize

from sklearn.cluster import KMeans

from keras.models import Model, load_model
from keras.layers import Input
from keras.layers.core import Dropout, Lambda
from keras.layers.convolutional import Conv2D, Conv2DTranspose
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K

import tensorflow as tf

# Dataset roots, relative to the notebook's working directory.
TRAIN_PATH = 'data-science-bowl-2018/stage1_train/'
TEST_PATH = 'data-science-bowl-2018/stage1_test/'

# Suppress UserWarning messages that originate from the skimage package.
warnings.filterwarnings('ignore', category=UserWarning, module='skimage')

# Seed both global RNGs by *calling* the seed functions. Assigning to them
# (`random.seed = seed`) only replaces the function with an int and leaves
# the generators unseeded.
seed = 30
random.seed(seed)
np.random.seed(seed)


# Get train and test IDs
train_ids = next(os.walk(TRAIN_PATH))[1]
test_ids = next(os.walk(TEST_PATH))[1]
%matplotlib inline

In [None]:
# Accumulators filled by the loading loops: the raw pixel arrays, plus one
# metadata row per image and per mask file.
train_img, train_masks = [], []
train_img_data, train_mask_data = [], []
test_img, test_img_data = [], []

print("Reading train images and masks and getting their metadata")
sys.stdout.flush()
for n, id_ in tqdm(enumerate(train_ids), total=len(train_ids)):
    path = TRAIN_PATH + id_
    img = imread('{}/images/{}.png'.format(path, id_))
    train_img.append(img)
    img_height, img_width = img.shape[0], img.shape[1]

    # Every file in masks/ is counted as one nucleus of this image.
    mask_files = next(os.walk('{}/masks/'.format(path)))[2]
    for mask_file in mask_files:
        mask = imread('{}/masks/{}'.format(path, mask_file))
        train_masks.append(mask)
        mask_height, mask_width = mask.shape

        # Pixel sum / 255 is used as the nucleus area
        # (assumes foreground pixels are stored as 255 -- TODO confirm).
        nucleus_area = np.sum(mask) / 255
        mask_to_img_ratio = nucleus_area / (mask_height * mask_width)

        # `n` links the mask row back to its parent image's position.
        train_mask_data.append([n, mask_height, mask_width, mask_to_img_ratio])

    nucleus_count = len(mask_files)
    train_img_data.append([id_, img_height, img_width, nucleus_count])

print("Reading test images and getting metadata")
sys.stdout.flush()
for n, id_ in tqdm(enumerate(test_ids), total=len(test_ids)):
    # Only the image itself and its dimensions are recorded for test samples.
    path = TEST_PATH + id_
    img = imread('{}/images/{}.png'.format(path, id_))
    test_img.append(img)

    img_height, img_width = img.shape[0], img.shape[1]
    test_img_data.append([id_, img_height, img_width])

In [None]:
df_train_img = pd.DataFrame(train_img_data, columns=['id', 'height', 'width', 'nuclei'])
df_train_img.head()

In [None]:
df_train_img.describe(include=[np.number])

In [None]:
df_train_mask = pd.DataFrame(train_mask_data, 
                             columns=['img_index', 'height', 'width', 
                                      'mask_to_img_ratio'])
df_train_mask.describe()

In [None]:
df_train_mask.head()

In [None]:
df_test_img = pd.DataFrame(test_img_data, 
                           columns=['id', 'height', 'width'])
df_test_img.head()

In [None]:
df_test_img.describe(include=[np.number])

In [None]:
# Side-by-side distributions of the training images' widths and heights.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

width_plt = sns.distplot(df_train_img['width'].values, ax=ax[0])
width_plt.set(xlabel='width (px)', ylim=(0, 0.01))

height_plt = sns.distplot(df_train_img['height'].values, ax=ax[1])
height_plt.set(xlabel='height (px)', ylim=(0, 0.015))

plt.tight_layout();

In [None]:
# Distribution of the number of mask files (nuclei) per training image.
nuclei_counts = df_train_img['nuclei'].values
sns.distplot(nuclei_counts)
plt.gca().set_xlabel("nuclei")
plt.show();

In [None]:
# Show the training image with the highest nucleus count and print that count.
plt.figure(figsize=(18, 18))
much_nuclei = df_train_img['nuclei'].argmax()
print(df_train_img['nuclei'][much_nuclei])
# grid(None) only *toggles* the grid; grid(False) switches it off explicitly,
# matching the sample-grid cell further down.
plt.grid(False)
plt.imshow(train_img[much_nuclei]);

In [None]:
# Show the training image with the lowest nucleus count and print that count.
plt.figure(figsize=(18, 18))
not_much_nuclei = df_train_img['nuclei'].argmin()
print(df_train_img['nuclei'][not_much_nuclei])
# grid(None) only *toggles* the grid; grid(False) switches it off explicitly,
# matching the sample-grid cell further down.
plt.grid(False)
plt.imshow(train_img[not_much_nuclei]);


In [None]:
# Locate the mask with the smallest area ratio and show it next to the image
# it belongs to.
smallest_mask_index = df_train_mask['mask_to_img_ratio'].argmin()
# 'img_index' holds the position of the parent image inside `train_img`.
smallest_mask_img_index = df_train_mask['img_index'].iloc[smallest_mask_index]

fig, ax = plt.subplots(1, 2, figsize=(16, 16))
for axis in ax:
    # grid(None) only toggles the grid; grid(False) disables it explicitly.
    axis.grid(False)
ax[0].imshow(train_masks[smallest_mask_index])
ax[1].imshow(train_img[smallest_mask_img_index])
plt.tight_layout();

In [None]:
# Resize the smallest mask to three square resolutions, print the pixel sum
# that remains at each size, and show the three results side by side.
smallest_mask = train_masks[smallest_mask_index]
smallest_mask_resized_128 = resize(smallest_mask, (128, 128))
smallest_mask_resized_256 = resize(smallest_mask, (256, 256))
smallest_mask_resized_512 = resize(smallest_mask, (512, 512))
resized_masks = (
    smallest_mask_resized_128,
    smallest_mask_resized_256,
    smallest_mask_resized_512,
)

for resized in resized_masks:
    print(np.sum(resized))

fig, ax = plt.subplots(1, len(resized_masks), figsize=(14, 14))
for axis, resized in zip(ax, resized_masks):
    # grid(None) only toggles the grid; grid(False) disables it explicitly.
    axis.grid(False)
    axis.imshow(resized)

In [None]:
# Locate the mask with the largest area ratio and show it next to the image
# it belongs to.
biggest_mask_index = df_train_mask['mask_to_img_ratio'].argmax()
# 'img_index' holds the position of the parent image inside `train_img`.
biggest_mask_img_index = df_train_mask['img_index'].iloc[biggest_mask_index]

fig, ax = plt.subplots(1, 2, figsize=(12, 12))
for axis in ax:
    # grid(None) only toggles the grid; grid(False) disables it explicitly.
    axis.grid(False)
ax[0].imshow(train_masks[biggest_mask_index])
ax[1].imshow(train_img[biggest_mask_img_index])
plt.tight_layout()

In [None]:
# Overlay every mask that belongs to the image holding the biggest mask.
big_nuclei = df_train_mask.index[df_train_mask['img_index'] == biggest_mask_img_index]
plt.figure(figsize=(18, 18))
# Disable the grid once, up front. Calling grid(None) inside the loop toggled
# it on every iteration, so the final state depended on how many masks there were.
plt.grid(False)
for mask_id in big_nuclei:
    # Low alpha so that the stacked masks remain distinguishable.
    plt.imshow(train_masks[mask_id], interpolation='none', alpha=0.1)

In [None]:
# Draw 20 randomly sampled training images on a 5x4 grid, filled row by row.
sample_nuclei = df_train_img.sample(20).index
fig, ax = plt.subplots(5, 4, figsize=(16, 16))
for i, img_id in enumerate(sample_nuclei):
    # With 4 columns per row, divmod maps the running index to (row, col).
    row, col = divmod(i, 4)
    cell = ax[row, col]
    cell.grid(False)
    cell.imshow(train_img[img_id])
plt.tight_layout()

In [None]:
def get_color_state1(imgs):
    """Summarise each image's colour content as three scalar features.

    For every image the features are, in order:
      * the mean of channel 0,
      * the mean of (channel 1 - channel 0),
      * the standard deviation of (channel 1 - channel 0).

    Both channels are converted to float before subtracting. On unsigned
    integer images (e.g. uint8) ``img[:, :, 1] - img[:, :, 0]`` wraps around
    modulo 256 wherever channel 1 is smaller than channel 0, which corrupts
    both difference statistics.

    Parameters
    ----------
    imgs : iterable of array-like
        Images indexable as ``img[row, col, channel]`` with at least two
        channels.

    Returns
    -------
    list of [float, float, float]
        One ``[mean_ch0, mean_diff, std_diff]`` entry per image, in input order.
    """
    color_state = []
    for img in imgs:
        img = np.asarray(img)
        ch0 = img[:, :, 0].astype(np.float64)
        ch1 = img[:, :, 1].astype(np.float64)
        diff = ch1 - ch0  # signed difference, no unsigned wraparound
        color_state.append([np.mean(ch0), np.mean(diff), np.std(diff)])
    return color_state

In [None]:
# Colour features for both image sets, computed by the same function
# (train first, then test).
train_cs1, test_cs1 = [
    get_color_state1(image_set) for image_set in (train_img, test_img)
]

In [None]:
# Cluster the images by their colour features: fit on the training features,
# then assign every train and test image to its nearest cluster centre.
X_tr = train_cs1
X_te = test_cs1

# A fixed random_state makes the cluster numbering reproducible between runs,
# so the "Cluster j" galleries plotted below stay comparable.
kmeans = KMeans(n_clusters=4, random_state=seed).fit(X_tr)
# transform() returns the distance to every centre; argmin picks the nearest.
train_cl1 = np.argmin(kmeans.transform(X_tr), axis=-1)
test_cl1 = np.argmin(kmeans.transform(X_te), axis=-1)

In [None]:
def visualizer(imgs, n=4, figsize=(16,16), title=''):
    """Show up to ``n * n`` randomly chosen images from ``imgs`` in one figure.

    Images are drawn without replacement and placed on an ``n`` x ``n``
    subplot grid in row-major order with the axes hidden. When ``imgs`` holds
    fewer than ``n * n`` items the remaining grid cells are simply left empty.

    Parameters
    ----------
    imgs : sequence
        Images accepted by ``Axes.imshow``.
    n : int
        Number of rows and columns of the subplot grid.
    figsize : tuple
        Figure size forwarded to ``plt.figure``.
    title : str
        Figure-level title.
    """
    fig = plt.figure(figsize=figsize)

    # Bound the draw explicitly instead of catching the IndexError raised by
    # choosing from an exhausted list; a blanket `except IndexError` would
    # also hide genuine indexing errors raised while plotting.
    n_cells = min(int(n**2), len(imgs))
    chosen = random.sample(range(len(imgs)), n_cells)

    for i, rsample in enumerate(chosen):
        ax = fig.add_subplot(n, n, i+1)
        ax.imshow(imgs[rsample])
        ax.axis('off')
    fig.suptitle(title)

In [None]:
# One gallery per colour cluster, built from the training images.
for j in range(4):
    train_img_cl = [
        train_img[i] for i, label in enumerate(train_cl1) if label == j
    ]
    visualizer(train_img_cl, title='Cluster '+str(j))

In [None]:
# One gallery per colour cluster, built from the test images.
for j in range(4):
    test_img_cl = [
        test_img[i] for i, label in enumerate(test_cl1) if label == j
    ]
    visualizer(test_img_cl, title='Cluster '+str(j))

In [None]:
# Spatial size the images and masks are resized to, and the number of
# leading colour channels kept from each image.
IMG_HEIGHT, IMG_WIDTH = 128, 128
IMG_CHANNELS = 3

In [None]:
# Build the training tensors: X_train holds the resized images, Y_train holds
# one boolean mask per image obtained by merging all of its nucleus masks.
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy releases
# no longer provide.
X_train = np.zeros((len(train_img), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
Y_train = np.zeros((len(train_img), IMG_HEIGHT, IMG_WIDTH, 1), dtype=bool)
sys.stdout.flush()
# The progress total follows the data instead of a hard-coded image count.
for n in tqdm(range(len(train_img)), total=len(train_img)):
    # Keep only the first IMG_CHANNELS channels of the image.
    img = train_img[n][:, :, :IMG_CHANNELS]
    X_train[n] = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)
    mask = np.zeros((IMG_HEIGHT, IMG_WIDTH, 1), dtype=bool)
    # Row labels of this image's masks; they index `train_masks` directly.
    masks = df_train_mask[df_train_mask['img_index'] == n].index.values
    for mask_num in masks:
        mask_ = np.expand_dims(resize(train_masks[mask_num],
                                      (IMG_HEIGHT, IMG_WIDTH), mode='constant',
                                      preserve_range=True), axis=-1)
        # Element-wise maximum merges this nucleus into the combined mask.
        mask = np.maximum(mask, mask_)
    # Storing into the boolean array turns every non-zero value into True.
    Y_train[n] = mask

In [None]:
# Sanity check: show one random resized training image and its merged mask.
# random.randint includes its upper bound, so randint(0, len(train_img)) could
# return an out-of-range index; randrange excludes it.
ix = random.randrange(len(train_img))
imshow(X_train[ix])
# grid(None) only toggles the grid; grid(False) disables it explicitly.
plt.grid(False)
plt.show()
imshow(np.squeeze(Y_train[ix]))
plt.grid(False)
plt.show()

In [None]:
# Build the test tensor: every test image resized to the common input size.
X_test = np.zeros((len(test_img), IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS), dtype=np.uint8)
sys.stdout.flush()
# The progress total follows the data instead of a hard-coded image count.
for n in tqdm(range(len(test_img)), total=len(test_img)):
    # Keep only the first IMG_CHANNELS channels of the image.
    img = test_img[n][:, :, :IMG_CHANNELS]
    X_test[n] = resize(img, (IMG_HEIGHT, IMG_WIDTH), mode='constant', preserve_range=True)

In [None]:
# Sanity check: show one random resized test image.
# random.randint includes its upper bound, so randint(0, len(test_img)) could
# return an out-of-range index; randrange excludes it.
ix = random.randrange(len(test_img))
imshow(X_test[ix])
# grid(None) only toggles the grid; grid(False) disables it explicitly.
plt.grid(False)
plt.show();

In [None]:
# Additive smoothing term shared by the numerator and the denominator; it
# keeps the ratio defined (and equal to 1) when both inputs sum to zero.
smooth = 1.
def dice_coef(y_true, y_pred):
    """Smoothed Dice coefficient of two tensors.

    Both inputs are flattened with the Keras backend and the result is
    ``(2 * sum(y_true * y_pred) + smooth) / (sum(y_true) + sum(y_pred) + smooth)``.
    """
    truth, pred = K.flatten(y_true), K.flatten(y_pred)
    overlap = K.sum(truth * pred)
    numerator = 2. * overlap + smooth
    denominator = K.sum(truth) + K.sum(pred) + smooth
    return numerator / denominator

In [None]:
# Second pass over the training set: load every image and every mask file
# into flat lists. Note that this rebinds `train_masks`, replacing the list
# built by the earlier loading cell; the images go to a new list,
# `train_images`.
size_train_set = len(train_ids)
train_images = []
train_masks = []

print('Reading train images and masks:')
sys.stdout.flush()
for n, id_ in tqdm(enumerate(train_ids), total=size_train_set):
    # Each sample directory holds images/<id>.png plus a masks/ folder.
    path = TRAIN_PATH + id_
    img = imread('{}/images/{}.png'.format(path, id_))
    train_images.append(img)
    
    # Masks of all images are appended to one flat list, in directory order.
    path_to_masks = '{}/masks/'.format(path)
    for mask_file in next(os.walk(path_to_masks))[2]:
        mask = imread(path_to_masks + mask_file)
        train_masks.append(mask)