# Kaggle TGS Salt Identification Challenge U-Net

This notebook prepares the dataset for the Kaggle TGS Salt Identification Challenge and trains a simple U-Net segmentation model on it.

Some code from: https://www.kaggle.com/dingli/seismic-data-analysis-with-u-net

## Setup

In [32]:
# Standard python packages
import os
import sys

# Other package imports
import cv2
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

from tqdm import tqdm_notebook, tnrange
from itertools import chain
from skimage.io import imread, imshow, concatenate_images
from skimage.transform import resize
from skimage.morphology import label

from sklearn.model_selection import train_test_split

from keras import backend as K
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.layers import Input, BatchNormalization, Concatenate
from keras.layers.core import Dropout
from keras.layers.convolutional import Conv2D, Conv2DTranspose, MaxPooling2D, UpSampling2D
from keras.layers.pooling import MaxPooling2D
from keras.layers.merge import concatenate
from keras.models import Model, load_model
from keras.optimizers import Adam

import tensorflow as tf

from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

Set up some global settings and configuration.

In [17]:
# --- Project layout ---------------------------------------------------------
# The project root is taken to be two directories above the current working
# directory (i.e. above the folder this notebook is run from).
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir, os.pardir))
src_folder = os.path.join(project_root, 'src')
data_folder = os.path.join(project_root, 'data')
data_folder_raw = os.path.join(project_root, 'data', 'raw')

# --- Raw competition files --------------------------------------------------
train_file = os.path.join(data_folder_raw, 'train.csv')
depth_file = os.path.join(data_folder_raw, 'depths.csv')

train_path = os.path.join(data_folder_raw, 'train')
train_images_path = os.path.join(data_folder_raw, 'train', 'images')
train_masks_path = os.path.join(data_folder_raw, 'train', 'masks')

test_path = os.path.join(data_folder_raw, 'test')
test_images_path = os.path.join(data_folder_raw, 'test', 'images')

# --- Other parameters / shared functions -------------------------------------
img_size_ori = 101     # side length (pixels) of the raw images and masks
img_size_target = 128  # side length (pixels) of the arrays fed to the network

def upsample(img):
    """Resize an image from the original size to the network input size.

    When ``img_size_ori`` equals ``img_size_target`` the input is returned
    untouched. Otherwise the image is interpolated to
    ``(img_size_target, img_size_target)`` with ``resize`` using
    ``mode='constant'`` and ``preserve_range=True``.

    An alternative that was considered (and is not used) is to copy the image
    into the top-left corner of a zero array of the target size instead of
    interpolating.
    """
    if img_size_ori != img_size_target:
        target_shape = (img_size_target, img_size_target)
        return resize(img, target_shape, mode='constant', preserve_range=True)
    return img
    
def downsample(img):
    """Resize an image from the network input size back to the original size.

    When ``img_size_ori`` equals ``img_size_target`` the input is returned
    untouched. Otherwise the image is interpolated to
    ``(img_size_ori, img_size_ori)`` with ``resize`` using ``mode='constant'``
    and ``preserve_range=True``.

    An alternative that was considered (and is not used) is to crop the
    top-left ``img_size_ori`` x ``img_size_ori`` corner instead of
    interpolating.
    """
    if img_size_ori != img_size_target:
        original_shape = (img_size_ori, img_size_ori)
        return resize(img, original_shape, mode='constant', preserve_range=True)
    return img

This notebook uses the shared package. First, however, we need to ensure that it is available; otherwise you get an error about the module not being found. You can either run setup.py, as discussed in the readme, to install the package, or modify the path to include the src folder.

In [6]:
# Explicitly put the src folder at the front of the import path so we don't
# need to run setup.py - if we have multiple copies of the code we would
# otherwise need to set up a separate environment for each to ensure the code
# pointers are correct.
# Any existing entry is dropped first so that re-running this cell does not
# accumulate duplicate entries, while src_folder still ends up first in line.
if src_folder in sys.path:
    sys.path.remove(src_folder)
sys.path.insert(0, src_folder)

from examplepackage import examplemodule

## Load data
Load the train file, which contains image IDs along with a mask of salt regions, and join it with the depth recorded for each image.

In [7]:
# Depth table from depths.csv, indexed by image id.
depths_df = pd.read_csv(depth_file, index_col="id")

# Only the first column (the id) of train.csv is read; the depth columns are
# attached through an index join.
train_df = pd.read_csv(train_file, usecols=[0], index_col="id").join(depths_df)

# Every id in the depth table that is not a training id is treated as test data.
test_df = depths_df.loc[~depths_df.index.isin(train_df.index)]

In [11]:
# Load every training image as a grayscale array and divide it by 255.
# The file name is formatted *before* it is joined to the directory, so braces
# in the directory path can no longer be misread as str.format placeholders.
train_df["images"] = [
    np.array(load_img(os.path.join(train_images_path, "{}.png".format(idx)), color_mode="grayscale")) / 255
    for idx in tqdm_notebook(train_df.index)
]

HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))

In [13]:
# Load every training mask as a grayscale array and divide it by 255.
# The file name is formatted *before* it is joined to the directory, so braces
# in the directory path can no longer be misread as str.format placeholders.
train_df["masks"] = [
    np.array(load_img(os.path.join(train_masks_path, "{}.png".format(idx)), color_mode="grayscale")) / 255
    for idx in tqdm_notebook(train_df.index)
]

HBox(children=(IntProgress(value=0, max=4000), HTML(value='')))

## Calculating the salt coverage and salt coverage classes
Counting the number of salt pixels in the masks and dividing them by the image size. Also create 11 coverage classes, from 0 (no salt at all) to 10 (salt only). Plotting the distribution of coverages and coverage classes, and the class against the raw coverage.

In [19]:
# Salt coverage per image: sum of the (0..1 scaled) mask values divided by the
# number of pixels in an original-size image.
train_df["coverage"] = train_df["masks"].map(np.sum) / (img_size_ori ** 2)

In [20]:
def cov_to_class(val):
    """Map a salt coverage fraction to one of 11 integer classes (0..10).

    The class is the smallest integer ``i`` in ``0..10`` for which
    ``val * 10 <= i``: coverage ``<= 0`` gives 0, ``(0, 0.1]`` gives 1, ...,
    ``(0.9, 1.0]`` gives 10.

    Args:
        val: coverage fraction, expected to lie in ``[0, 1]``.

    Returns:
        int: the coverage class. Values above 1.0 are clamped to class 10
        rather than silently falling through and returning ``None``.

    Raises:
        ValueError: if ``val`` is NaN (no comparison can succeed).
    """
    scaled = val * 10
    for i in range(0, 11):
        if scaled <= i:
            return i
    if scaled > 10:
        # Out-of-range coverage: clamp to the top class.
        return 10
    raise ValueError("coverage must be a number, got {!r}".format(val))
        
# Bucket every coverage value into its class; used as the stratification key
# for the train/validation split.
train_df["coverage_class"] = train_df["coverage"].map(cov_to_class)

## Create train/validation split stratified by salt coverage
Using the salt coverage as a stratification criterion. Also show an image to check for correct upsampling.

In [21]:
# 80/20 train/validation split, stratified by coverage class with a fixed seed.
# Images and masks are upsampled to the target size and given a trailing
# channel axis; ids, coverage and depth (column "z") are split alongside them.
(ids_train, ids_valid,
 x_train, x_valid,
 y_train, y_valid,
 cov_train, cov_test,
 depth_train, depth_test) = train_test_split(
    train_df.index.values,
    np.array([upsample(image) for image in train_df.images]).reshape(-1, img_size_target, img_size_target, 1),
    np.array([upsample(mask) for mask in train_df.masks]).reshape(-1, img_size_target, img_size_target, 1),
    train_df.coverage.values,
    train_df.z.values,
    stratify=train_df.coverage_class,
    test_size=0.2,
    random_state=1337)

  warn("Anti-aliasing will be enabled by default in skimage 0.15 to "


## Data augmentation
Add a left-right flipped copy of every training image and its mask.

In [37]:
# Double the training set by appending a left-right mirrored copy of every
# image and of its mask (axis 2 is the column axis of the 4-D arrays).
# NOTE: this cell is not idempotent - re-running it doubles the arrays again.
x_train = np.concatenate([x_train, x_train[:, :, ::-1]], axis=0)
y_train = np.concatenate([y_train, y_train[:, :, ::-1]], axis=0)

## Build Model

In [33]:
def conv_block(m, dim, acti, bn, res, do=0):
    """Apply two 3x3 'same'-padded convolutions to ``m``.

    Args:
        m: input tensor.
        dim: number of filters of both convolutions.
        acti: activation passed to both convolutions.
        bn: if truthy, add batch normalisation after each convolution.
        res: if truthy, return the input concatenated with the block output
            instead of the block output alone.
        do: dropout rate applied between the two convolutions; a falsy value
            (the default) adds no dropout layer.

    Returns:
        The output tensor of the block.
    """
    out = Conv2D(dim, 3, activation=acti, padding='same')(m)
    if bn:
        out = BatchNormalization()(out)
    if do:
        out = Dropout(do)(out)
    out = Conv2D(dim, 3, activation=acti, padding='same')(out)
    if bn:
        out = BatchNormalization()(out)
    if res:
        return Concatenate()([m, out])
    return out

def level_block(m, dim, depth, inc, acti, do, bn, mp, up, res):
    """Recursively build one encoder/decoder level of the U-Net.

    Args:
        m: input tensor of this level.
        dim: number of filters used at this level.
        depth: number of levels still to build below this one; at 0 only the
            bottom conv block is created.
        inc: factor by which ``dim`` grows for the next (deeper) level.
        acti: activation for the convolutions.
        do: dropout rate, used only in the bottom conv block.
        bn: whether the conv blocks use batch normalisation.
        mp: downsample with max pooling if truthy, otherwise with a
            stride-2 convolution.
        up: upsample with ``UpSampling2D`` followed by a 2x2 convolution if
            truthy, otherwise with a stride-2 transposed convolution.
        res: forwarded to ``conv_block`` (concatenate input with output).

    Returns:
        The output tensor of this level.
    """
    if depth <= 0:
        # Bottom of the "U": a single conv block, the only one with dropout.
        return conv_block(m, dim, acti, bn, res, do)

    # Encoder side: convolve, keep the result for the skip connection, shrink.
    skip = conv_block(m, dim, acti, bn, res)
    if mp:
        down = MaxPooling2D()(skip)
    else:
        down = Conv2D(dim, 3, strides=2, padding='same')(skip)

    # Everything below this level.
    deeper = level_block(down, int(inc * dim), depth - 1, inc, acti, do, bn, mp, up, res)

    # Decoder side: grow back, merge with the skip connection, convolve.
    if up:
        deeper = UpSampling2D()(deeper)
        deeper = Conv2D(dim, 2, activation=acti, padding='same')(deeper)
    else:
        deeper = Conv2DTranspose(dim, 3, strides=2, activation=acti, padding='same')(deeper)
    merged = Concatenate()([skip, deeper])
    return conv_block(merged, dim, acti, bn, res)

def UNet(img_shape, out_ch=1, start_ch=64, depth=4, inc_rate=2., activation='relu', 
         dropout=0.5, batchnorm=False, maxpool=True, upconv=True, residual=False):
    """Build a U-Net as a Keras ``Model``.

    Args:
        img_shape: shape of one input sample, passed to ``Input``.
        out_ch: number of output channels of the final 1x1 convolution.
        start_ch: number of filters at the top level.
        depth: number of down/up-sampling levels.
        inc_rate: factor by which the filter count grows per level.
        activation: activation of the convolutions inside the levels.
        dropout: dropout rate used in the bottom block.
        batchnorm: whether to add batch normalisation after convolutions.
        maxpool: max pooling (True) or stride-2 convolution (False) for
            downsampling.
        upconv: ``UpSampling2D`` + convolution (True) or transposed
            convolution (False) for upsampling.
        residual: whether conv blocks concatenate their input to their output.

    Returns:
        An uncompiled ``Model`` whose output uses a sigmoid activation.
    """
    inputs = Input(shape=img_shape)
    features = level_block(inputs, start_ch, depth, inc_rate, activation,
                           dropout, batchnorm, maxpool, upconv, residual)
    outputs = Conv2D(out_ch, 1, activation='sigmoid')(features)
    return Model(inputs=inputs, outputs=outputs)

In [35]:
# Single-channel square input, 16 filters at the top level, 5 levels deep,
# with batch normalisation; trained as per-pixel binary classification.
model = UNet(
    (img_size_target, img_size_target, 1),
    start_ch=16,
    depth=5,
    batchnorm=True,
)
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [36]:
# Print a layer-by-layer overview of the network (output shapes, parameter
# counts and layer connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 128, 128, 1)  0                                            
__________________________________________________________________________________________________
conv2d_30 (Conv2D)              (None, 128, 128, 16) 160         input_3[0][0]                    
__________________________________________________________________________________________________
batch_normalization_23 (BatchNo (None, 128, 128, 16) 64          conv2d_30[0][0]                  
__________________________________________________________________________________________________
conv2d_31 (Conv2D)              (None, 128, 128, 16) 2320        batch_normalization_23[0][0]     
__________________________________________________________________________________________________
batch_norm

conv2d_45 (Conv2D)              (None, 16, 16, 128)  131200      up_sampling2d_7[0][0]            
__________________________________________________________________________________________________
concatenate_7 (Concatenate)     (None, 16, 16, 256)  0           batch_normalization_30[0][0]     
                                                                 conv2d_45[0][0]                  
__________________________________________________________________________________________________
conv2d_46 (Conv2D)              (None, 16, 16, 128)  295040      concatenate_7[0][0]              
__________________________________________________________________________________________________
batch_normalization_37 (BatchNo (None, 16, 16, 128)  512         conv2d_46[0][0]                  
__________________________________________________________________________________________________
conv2d_47 (Conv2D)              (None, 16, 16, 128)  147584      batch_normalization_37[0][0]     
__________

In [45]:
# Sanity check: shape of the training array that is passed to model.fit.
print(x_train.shape)

(6400, 128, 128, 1)


## Training

In [52]:
# Callbacks (each uses its default monitored quantity):
# - stop training after 10 epochs without improvement,
# - keep only the best model so far on disk at ./keras.model,
# - multiply the learning rate by 0.1 after 5 epochs without improvement,
#   never going below 5e-6.
early_stopping = EarlyStopping(patience=10, verbose=1)
model_checkpoint = ModelCheckpoint("./keras.model", save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(factor=0.1, patience=5, min_lr=0.000005, verbose=1)

epochs = 200
batch_size = 32

# validation_data is passed as a tuple, which is the form Keras documents;
# a list is not accepted by newer Keras versions.
history = model.fit(x_train, y_train,
                    validation_data=(x_valid, y_valid),
                    epochs=epochs,
                    batch_size=batch_size,
                    callbacks=[early_stopping, model_checkpoint, reduce_lr],
                    verbose=2)

Train on 6400 samples, validate on 800 samples
Epoch 1/200
 352/6400 [>.............................] - ETA: 1:47 - loss: 0.4527 - acc: 0.779 - ETA: 1:42 - loss: 0.4052 - acc: 0.829 - ETA: 1:41 - loss: 0.4148 - acc: 0.823 - ETA: 1:40 - loss: 0.4029 - acc: 0.837 - ETA: 1:39 - loss: 0.3922 - acc: 0.846 - ETA: 1:39 - loss: 0.4135 - acc: 0.845 - ETA: 1:38 - loss: 0.4150 - acc: 0.843 - ETA: 1:38 - loss: 0.4238 - acc: 0.841 - ETA: 1:38 - loss: 0.4170 - acc: 0.844 - ETA: 1:38 - loss: 0.4093 - acc: 0.849 - ETA: 1:37 - loss: 0.4004 - acc: 0.8553

KeyboardInterrupt: 

In [None]:
# Learning curves: loss on the left, accuracy on the right.
# Depending on the Keras version the accuracy metric is recorded either as
# "acc"/"val_acc" or as "accuracy"/"val_accuracy"; use whichever is present.
acc_key = "acc" if "acc" in history.history else "accuracy"

fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(15,5))

ax_loss.plot(history.epoch, history.history["loss"], label="Train loss")
ax_loss.plot(history.epoch, history.history["val_loss"], label="Validation loss")
ax_loss.set(title="Loss per epoch", xlabel="Epoch", ylabel="Loss")
ax_loss.legend()

ax_acc.plot(history.epoch, history.history[acc_key], label="Train accuracy")
ax_acc.plot(history.epoch, history.history["val_" + acc_key], label="Validation accuracy")
ax_acc.set(title="Accuracy per epoch", xlabel="Epoch", ylabel="Accuracy")
ax_acc.legend();

## Predict the validation set

In [53]:
# Uncomment to reload the model saved at ./keras.model (the path the
# ModelCheckpoint callback writes to) instead of using the in-memory model.
#model = load_model("./keras.model")

## Appendix 1 - Environment Configuration

In [10]:
# Record the working directory, the interpreter version and location, and the
# module search path this notebook was run with - one item per line.
for env_detail in (os.getcwd(), sys.version, sys.executable, sys.path):
    print(env_detail)

D:\Development\DataScience\Learning\Kaggle TGS Salt Identification Challenge\notebooks\eda
3.6.5 |Anaconda, Inc.| (default, Mar 29 2018, 13:32:41) [MSC v.1900 64 bit (AMD64)]
c:\applications\miniconda3\envs\tensorflow-gpu\python.exe
['D:\\Development\\DataScience\\Learning\\Kaggle TGS Salt Identification Challenge\\src', '', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu\\python36.zip', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu\\DLLs', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu\\lib', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu\\lib\\site-packages', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu\\lib\\site-packages\\object_detection-0.1-py3.6.egg', 'c:\\applications\\miniconda3\\envs\\tensorflow-gpu\\lib\\site-packages\\IPython\\extensions', 'C:\\Users\\mark_\\.ipython']
