In [1]:
import tensorflow as tf
import pandas as pd 
import numpy as np
import cv2
import matplotlib.pyplot as plt
import os

%matplotlib inline

In [2]:
#CONSTANTS
IMAGE_SHAPE = (768, 768)
MASKS, IMAGES = [], []
path = ''
IMAGE_PATH = os.path.join(path, '/content/Images/')
IMAGES_LIST = os.listdir(IMAGE_PATH)
IMAGES_LIST = sorted([IMAGE_PATH+i for i in IMAGES_LIST])

In [4]:
# Connecting to Kaggle API
!pip install -q kaggle
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"volodymyrromanovych","key":"b70dec9c6634e5623401a05773bac828"}'}

In [5]:
# Getting dataset from kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle datasets download -d mikaelstrauhs/airbus-ship-detection-train-set-70

Downloading airbus-ship-detection-train-set-70.zip to /content
100% 18.5G/18.5G [02:30<00:00, 157MB/s]
100% 18.5G/18.5G [02:30<00:00, 132MB/s]


In [None]:
# Unzipping dataset
!unzip airbus-ship-detection-train-set-70.zip
!mv /content/train_v3/train_v3/Images /content/Images
!rm -r /content/train_v3
!rm /content/airbus-ship-detection-train-set-70.zip

In [3]:
df = pd.read_csv('/content/train_ship_segmentations_v3.csv')
df

Unnamed: 0,ImageId,EncodedPixels
0,0001124c7.jpg,
1,000155de5.jpg,264661 17 265429 33 266197 33 266965 33 267733...
2,000194a2d.jpg,360486 1 361252 4 362019 5 362785 8 363552 10 ...
3,000194a2d.jpg,51834 9 52602 9 53370 9 54138 9 54906 9 55674 ...
4,000194a2d.jpg,198320 10 199088 10 199856 10 200624 10 201392...
...,...,...
162086,fffedbb6b.jpg,
162087,ffff2aa57.jpg,
162088,ffff6e525.jpg,
162089,ffffc50b4.jpg,


We can see that we have a folder with input images and csv file with corresponding EncodedPixels for each ship.
Firstly let's make masks from the Encoded Pixels of ships 

In [4]:
# Making each image have coordinates of all its ships in EncodedPixels(right now each ship is new row)
df['EncodedPixels'] += ' ' 
df = df.groupby(['ImageId']).sum()
df['EncodedPixels'] = df['EncodedPixels'].replace(0, '')

In [5]:
def rle_to_mask(rle):
  """
  Generating masks from Encoded Pixels

  Arguments:
  string representation of Encoded Pixels

  Returns:
  (768,768,1) np.ndarray representation of a mask
  """
  shape_x = IMAGE_SHAPE[0]
  shape_y = IMAGE_SHAPE[1]
  if rle == '':
    return np.zeros((shape_x, shape_y, 1), dtype=np.float32)
    
  else:
    mask=np.zeros(shape_x*shape_y, dtype=np.float32)
    array = np.asarray([int(x) for x in rle.split()])
    starts = array[::2]
    lengths = array[1::2]

    current_position = 0
    for index, start in enumerate(starts):
        mask[int(start):int(start+lengths[index])] = 1
        current_position += lengths[index]
    return np.flipud(np.rot90(mask.reshape(shape_y, shape_x, 1),k=1))
    

Now let's make a tensorflow dataset for all (image, mask) pairs of training set

In [14]:
def image_mask_gen():
  """
  Generator for a (image, mask) set item

  Reads image from images folder and converts it to a normalized tensor
  Gets Encoded Pixels for the corresponding image from csv file and converts it to a mask
  Warning: Make sure IMAGES_LIST is sorted the same way as the csv file
  """
  length = len(df)
  for i in range(length):
    img = tf.io.read_file(IMAGES_LIST[i])
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.convert_image_dtype(img, tf.float32)
    
    mask = rle_to_mask(df['EncodedPixels'].iloc[i])
    yield img, tf.cast(mask, dtype=tf.float32)


training=tf.data.Dataset.from_generator(image_mask_gen, output_types=(tf.float32, tf.float32), output_shapes=([768,768,3], [768,768,1]))

And now let's build our model. Since the task is semantic segmetation, we are going to use Unet. And for this task Cross Entropy Loss doesn't work very well, so we are going to use dice score loss function.

In [7]:
def conv_block(inputs=None, n_filters=32, dropout_prob=0, max_pooling=True):
    """
    Convolutional downsampling block
    
    Arguments:
        inputs -- Input tensor
        n_filters -- Number of filters for the convolutional layers
        dropout_prob -- Dropout probability
        max_pooling -- Use MaxPooling2D to reduce the spatial dimensions of the output volume
    Returns: 
        next_layer, skip_connection --  Next layer and skip connection outputs
    """

    conv = tf.keras.layers.Conv2D(n_filters, # Number of filters
                  kernel_size=(3,3),   # Kernel size   
                  activation='relu',
                  padding='same',
                  kernel_initializer='he_normal')(inputs)
    conv = tf.keras.layers.Conv2D(n_filters, # Number of filters
                  kernel_size=(3,3),   # Kernel size
                  activation='relu',
                  padding='same',
                  kernel_initializer='he_normal')(conv)
    # If dropout_porb is not 0 using a Dropout layer
    if dropout_prob > 0:
        conv = tf.keras.layers.Dropout(dropout_prob)(conv)
         
        
    # if max_pooling is True add a MaxPooling2D with 2x2 pool_size
    if max_pooling:
        next_layer = tf.keras.layers.MaxPool2D()(conv)
        
    else:
        next_layer = conv
        
    skip_connection = conv
    
    return next_layer, skip_connection


def upsampling_block(prev_input, skipped_input, n_filters=32):
    """
    Convolutional upsampling block
    
    Arguments:
        prev_input -- Input tensor from previous layer
        skipped_input -- Input tensor from previous skip layer
        n_filters -- Number of filters for the convolutional layers
    Returns: 
        conv -- Tensor output
    """

    up = tf.keras.layers.Conv2DTranspose(n_filters, kernel_size=(3,3),
                                         strides=(2,2),
                                         padding='same')(prev_input)
    
    # Merge the previous output and the skipped_input
    merge = tf.keras.layers.concatenate([up, skipped_input], axis=3)
    
    conv = tf.keras.layers.Conv2D(n_filters, kernel_size=(3,3), 
                                  activation='relu',padding='same',
                                  kernel_initializer='he_normal')(merge)
    conv = tf.keras.layers.Conv2D(n_filters, kernel_size=(3,3),
                                  activation='relu', padding='same',
                                  kernel_initializer='he_normal')(conv)

    return conv


def unet_model(input_size=(IMAGE_SHAPE[0], IMAGE_SHAPE[1], 3), n_filters=32, n_classes=1):
    """
    Unet model
    
    Arguments:
        input_size -- Input shape 
        n_filters -- Number of filters for the convolutional layers
        n_classes -- Number of output classes
    Returns: 
        model -- tf.keras.Model
    """
    inputs = tf.keras.layers.Input(input_size)
    # Encoding
    cblock1 = conv_block(inputs, n_filters)
    cblock2 = conv_block(cblock1[0], n_filters*2)
    cblock3 = conv_block(cblock2[0], n_filters*4)
    cblock4 = conv_block(cblock3[0], n_filters*8, max_pooling=False)

    # Decoding
    ublock6 = upsampling_block(cblock4[0], cblock3[1],  n_filters*4)
    ublock7 = upsampling_block(ublock6, cblock2[1],  n_filters*2)
    ublock8 = upsampling_block(ublock7, cblock1[1],  n_filters)

    conv9 = tf.keras.layers.Conv2D(n_filters,
                 3,
                 activation='relu',
                 padding='same',
                 kernel_initializer='he_normal')(ublock8)

    # Output Conv2D layer with n_classes filter, kernel size of 1 and a 'same' padding
    conv10 = tf.keras.layers.Conv2D(n_classes, (1,1), padding='same')(conv9)

    
    model = tf.keras.Model(inputs=inputs, outputs=conv10)

    return model


def dice_score_loss(y_true, y_pred, smooth=1):
    """
    Loss function using Dice score
    """  
    y_true_f = tf.keras.backend.flatten(y_true)
    y_pred_f = tf.keras.backend.flatten(y_pred)
    intersection = tf.keras.backend.sum(y_true_f * y_pred_f)
    dice = (2. * intersection + smooth) / (tf.keras.backend.sum(y_true_f) + tf.keras.backend.sum(y_pred_f) + smooth)
    return 1 - dice

In [15]:
unet = unet_model()

In [16]:
unet.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 768, 768, 3  0           []                               
                                )]                                                                
                                                                                                  
 conv2d_16 (Conv2D)             (None, 768, 768, 32  896         ['input_2[0][0]']                
                                )                                                                 
                                                                                                  
 conv2d_17 (Conv2D)             (None, 768, 768, 32  9248        ['conv2d_16[0][0]']              
                                )                                                           

In [17]:
unet.compile(optimizer='adam',
              loss=dice_score_loss,
              metrics=([tf.keras.metrics.BinaryIoU()]))

In [None]:
EPOCHS = 10
BATCH_SIZE = 16
train_dataset = training.cache().batch(BATCH_SIZE)
print(training.element_spec)
model_history = unet.fit(train_dataset, epochs=EPOCHS)

(TensorSpec(shape=(768, 768, 3), dtype=tf.float32, name=None), TensorSpec(shape=(768, 768, 1), dtype=tf.float32, name=None))
Epoch 1/10


Unfortunately our training crushes due to lack of RAM unless there is really small dataset size, so we can't train our model properly :'(