In [1]:
import json
from glob import glob
from itertools import chain
from os import makedirs
from os.path import splitext, split, join
from shutil import move

import cv2 as cv
import numpy as np
import pandas as pd
from sahi.slicing import slice_image
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [2]:
# Load the ship table; the EncodedPixels column holds JSON text, which is decoded row by row.
df_ships = (
    pd.read_csv('../data/train_unique.csv')
    .assign(EncodedPixels=lambda frame: frame['EncodedPixels'].map(json.loads))
)
print(len(df_ships))
df_ships.head()

15912


Unnamed: 0,ImageId,EncodedPixels
0,000155de5.jpg,"[264661, 17, 265429, 33, 266197, 33, 266965, 3..."
1,000194a2d.jpg,"[360486, 1, 361252, 4, 362019, 5, 362785, 8, 3..."
2,000194a2d.jpg,"[51834, 9, 52602, 9, 53370, 9, 54138, 9, 54906..."
3,000194a2d.jpg,"[198320, 10, 199088, 10, 199856, 10, 200624, 1..."
4,000194a2d.jpg,"[55683, 1, 56451, 1, 57219, 1, 57987, 1, 58755..."


In [3]:
# Distinct image file names; df_ships can hold several rows for the same ImageId.
image_name_list = df_ships['ImageId'].unique()
len(image_name_list)

8635

# Crop

In [6]:
def slice_wrapper(source_image_path, output_dir_path='../data/train_unique_cropped/positive'):
    """Cut one image file into 128x128 tiles without overlap using ``slice_image``.

    Args:
        source_image_path: Path of the image file to slice.
        output_dir_path: Directory handed to ``slice_image`` as ``output_dir``.

    The base name and extension of the source file are forwarded to
    ``slice_image`` as ``output_file_name`` and ``out_ext``.
    """
    _, source_file_name = split(source_image_path)
    image_name_base, ext = splitext(source_file_name)

    # Fixed tiling geometry: 128x128 windows, zero overlap in both directions.
    tiling_options = dict(
        slice_height=128,
        slice_width=128,
        overlap_height_ratio=0,
        overlap_width_ratio=0,
    )
    slice_image(
        image=source_image_path,
        output_file_name=image_name_base,
        output_dir=output_dir_path,
        out_ext=ext,
        **tiling_options,
    )


def run():
    """Slice every image named in ``image_name_list`` together with its PNG label.

    Both files are searched recursively under ``../data/train``; the first match
    of each search is used. An ``IndexError`` is raised when a search finds nothing.
    """
    for image_name in tqdm(image_name_list):
        image_matches = glob(f'../data/train/**/{image_name}', recursive=True)
        image_path = image_matches[0]

        # The label shares the image's base name but carries a .png extension.
        image_name_base, _ = splitext(image_name)
        label_matches = glob(f'../data/train/**/{image_name_base}.png', recursive=True)
        label_path = label_matches[0]

        for source_path in (image_path, label_path):
            slice_wrapper(source_path)


run()

  0%|          | 0/8635 [00:00<?, ?it/s]

# Move empty

In [12]:
destination_path = '../data/train_unique_cropped/negative/'
# The target folder must exist before files can be moved into it; create it so the
# cell also works on a fresh checkout.
makedirs(destination_path, exist_ok=True)
for label_path in tqdm(glob('../data/train_unique_cropped/positive/*.png')):
    label_image = cv.imread(label_path, 0)
    # cv.imread signals an unreadable file by returning None instead of raising.
    if label_image is None:
        raise IOError(f'Could not read label image: {label_path}')
    # A crop whose label reaches 255 somewhere is kept in the positive folder.
    if label_image.max() == 255:
        continue

    # Otherwise move the label and the .jpg crop of the same base name out of the positive folder.
    img_path = splitext(label_path)[0] + '.jpg'
    move(label_path, destination_path)
    move(img_path, destination_path)

  0%|          | 0/274644 [00:00<?, ?it/s]

# Convert labels to binary format

In [44]:
source_directory_path = '../data/train_unique_cropped/positive_binary'
for label_path in tqdm(glob(join(source_directory_path, '*.png'))):
    img = cv.imread(label_path, 0)
    # cv.imread signals an unreadable file by returning None instead of raising.
    if img is None:
        raise IOError(f'Could not read label image: {label_path}')
    # Map every non-zero pixel to 1 and keep 0 as 0. Unlike dividing by 255, this gives the
    # same result when the cell is re-run on labels that are already 0/1, so the conversion
    # does not have to be commented out after its first run.
    img = (img > 0).astype(np.uint8)
    cv.imwrite(label_path, img)

  0%|          | 0/24718 [00:00<?, ?it/s]

# Split dataset

In [4]:
# Group the ship rows by ImageId so that each group holds all rows of one image.
df_ship_groups = df_ships.groupby(by='ImageId')
len(df_ship_groups)

8635

In [5]:
# Build two parallel lists: the image ids and, for each image, one size value per ship row.
# A ship's size is the sum of every second EncodedPixels value starting at index 1
# (presumably the run lengths of a run-length encoded mask - confirm against the data source).
image_name_list = []
ship_size_image_list = []
for img_id, df_group in tqdm(df_ship_groups):
    image_name_list.append(img_id)
    ship_size_image_list.append(
        [sum(encoded_pixels[1::2]) for encoded_pixels in df_group['EncodedPixels']]
    )

  0%|          | 0/8635 [00:00<?, ?it/s]

In [6]:
# We shouldn't split the image names purely at random, because images can contain ships of very different sizes.
# We also shouldn't split individual crops by the ship area they contain, because crops of the same image
# must not end up in different sets (that would be a form of leakage).

# Exclusive upper bounds of the ship-size classes, in ascending order.
# Sizes at or above the last bound fall into one extra, final class.
ship_class_boundaries = [25, 125, 2500, 7500]


def get_ship_class(ship_size):
    """Return the index of the size class that ``ship_size`` belongs to.

    The class is the position of the first entry of ``ship_class_boundaries``
    that is strictly greater than ``ship_size``. When no entry is greater,
    ``len(ship_class_boundaries)`` is returned.
    """
    matching_classes = (
        class_idx
        for class_idx, class_upper_bound in enumerate(ship_class_boundaries)
        if ship_size < class_upper_bound
    )
    return next(matching_classes, len(ship_class_boundaries))


image_name_list = np.asarray(image_name_list)
# Each image is labelled with the size class of its largest ship.
image_stratification_class = np.asarray(
    list(map(get_ship_class, map(max, ship_size_image_list)))
)
set(image_stratification_class)

{0, 1, 2, 3, 4}

In [7]:
# Sanity check on the first 100 images: for each one, count the files in the positive folder
# whose name starts with the image's base name (any extension), then average the counts.
crop_count_list = [
    len(glob(f'../data/train_unique_cropped/positive/{splitext(img_name)[0]}*'))
    for img_name in tqdm(image_name_list[:100])
]
np.average(crop_count_list)

  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [7]:
# Split at the image level (positions into image_name_list), stratified by image_stratification_class.
index_list = list(range(len(image_name_list)))
# Step 1: hold out 1500 images; everything else becomes the training set.
image_index_list_train, image_index_list_test = train_test_split(
    index_list,
    test_size=1500,
    random_state=42,
    shuffle=True,
    stratify=image_stratification_class
)
# Step 2: take 100 validation images out of the held-out pool; the remainder stays as the test set.
# The stratification labels are re-indexed so they line up with the held-out indices.
image_index_list_val, image_index_list_test = train_test_split(
    image_index_list_test,
    train_size=100,
    random_state=42,
    shuffle=True,
    stratify=image_stratification_class[image_index_list_test]
)

# Translate the index lists back into image file names.
image_base_name_list_train = image_name_list[image_index_list_train]
image_base_name_list_val = image_name_list[image_index_list_val]
image_base_name_list_test = image_name_list[image_index_list_test]
len(image_base_name_list_train), len(image_base_name_list_val), len(image_base_name_list_test)

(7135, 100, 1400)

In [8]:
# One row per source image: its file name and the split it was assigned to,
# ordered train, then val, then test.
df_base_file_name_split = pd.DataFrame({
    'file_name': np.concatenate((image_base_name_list_train, image_base_name_list_val, image_base_name_list_test)),
    'dataset': [
        split_name
        for split_name, split_file_names in (
            ('train', image_base_name_list_train),
            ('val', image_base_name_list_val),
            ('test', image_base_name_list_test),
        )
        for _file_name in split_file_names
    ],
})
print(len(df_base_file_name_split))
df_base_file_name_split.head()

8635


Unnamed: 0,file_name,dataset
0,5e4389b28.jpg,train
1,47d99bdca.jpg,train
2,8d998e90f.jpg,train
3,86c9db6ce.jpg,train
4,087606e41.jpg,train


In [9]:
# Persist the image-level split (without the DataFrame index) so later sections can reload it.
df_base_file_name_split.to_csv('../data/train_base_file_name_split.csv', index=False)

# Generate meta

In [10]:
# Reload the image-level split table (columns: file_name, dataset) from disk.
df_file_name_split = pd.read_csv('../data/train_base_file_name_split.csv')
df_file_name_split.head()

Unnamed: 0,file_name,dataset
0,5e4389b28.jpg,train
1,47d99bdca.jpg,train
2,8d998e90f.jpg,train
3,86c9db6ce.jpg,train
4,087606e41.jpg,train


In [11]:
# Pull the file names of each split out of the reloaded table (as pandas Series).
image_base_name_list_train = df_file_name_split.loc[df_file_name_split['dataset'] == 'train', 'file_name']
image_base_name_list_val = df_file_name_split.loc[df_file_name_split['dataset'] == 'val', 'file_name']
image_base_name_list_test = df_file_name_split.loc[df_file_name_split['dataset'] == 'test', 'file_name']
len(image_base_name_list_train), len(image_base_name_list_val), len(image_base_name_list_test)

(7135, 100, 1400)

In [12]:
def image_base_name_to_image_name_list(image_base_name, image_folder_path):
    """List the file names in a folder that belong to one source image.

    Args:
        image_base_name: Source image file name, including its extension.
        image_folder_path: Folder to search (not recursive).

    Returns:
        File names (without the folder part) that start with the base name's
        stem and end with its extension, in the order ``glob`` yields them.
    """
    stem, extension = splitext(image_base_name)
    search_pattern = join(image_folder_path, f'{stem}*{extension}')

    matching_names = []
    for matching_path in glob(search_pattern):
        _, file_name = split(matching_path)
        matching_names.append(file_name)
    return matching_names


def base_name_list_to_name_list(image_base_name_list, image_folder_path):
    """Collect, in one flat list, the folder's file names for every given source image.

    Each entry of ``image_base_name_list`` is expanded with
    ``image_base_name_to_image_name_list``; the results are concatenated in input order.
    A progress bar tracks the iteration over the source images.
    """
    all_names = []
    for base_name in tqdm(image_base_name_list):
        all_names.extend(image_base_name_to_image_name_list(base_name, image_folder_path))
    return all_names


def image_path_to_descriptor_line(img_name, path_prefix):
    """Build one "<image path> <label path>" line for an image file name.

    Both paths are ``path_prefix`` joined with a file name; the label file name
    is the image's name with its extension replaced by ``.png``.
    """
    stem, _ = splitext(img_name)
    image_and_label_paths = (
        join(path_prefix, img_name),
        join(path_prefix, f'{stem}.png'),
    )
    return ' '.join(image_and_label_paths)

def string_list_to_file(string_list, file_path):
    """Write each item of ``string_list`` to ``file_path`` on its own line.

    The file is created or truncated; every item is followed by a newline.
    """
    content = ''.join(f'{s}\n' for s in string_list)
    with open(file_path, mode='w+') as out_file:
        out_file.write(content)

In [15]:
# PaddlePaddle format: one "<image path> <label path>" line per crop, one list file per split.
# Each split's descriptor lines are built from that split's own crop names.
image_name_list_test = base_name_list_to_name_list(image_base_name_list_test, '../data/train_unique_cropped/positive')
image_descriptor_list = [image_path_to_descriptor_line(image_name, 'positive') for image_name in image_name_list_test]
string_list_to_file(image_descriptor_list, '../data/train_unique_cropped/test.txt')

image_name_list_val = base_name_list_to_name_list(image_base_name_list_val, '../data/train_unique_cropped/positive')
image_descriptor_list = [image_path_to_descriptor_line(image_name, 'positive') for image_name in image_name_list_val]
string_list_to_file(image_descriptor_list, '../data/train_unique_cropped/val.txt')

image_name_list_train = base_name_list_to_name_list(image_base_name_list_train, '../data/train_unique_cropped/positive')
image_descriptor_list = [image_path_to_descriptor_line(image_name, 'positive') for image_name in image_name_list_train]
string_list_to_file(image_descriptor_list, '../data/train_unique_cropped/train.txt')

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/7135 [00:00<?, ?it/s]

In [16]:
# FastAI format
# Expand each split's source-image names into the names of the matching files in the positive folder.
image_name_list_test = base_name_list_to_name_list(image_base_name_list_test, '../data/train_unique_cropped/positive')
image_name_list_val = base_name_list_to_name_list(image_base_name_list_val, '../data/train_unique_cropped/positive')
image_name_list_train = base_name_list_to_name_list(image_base_name_list_train, '../data/train_unique_cropped/positive')
# NOTE(review): this displays the number of source images per split (the *base* lists), not the
# number of crop names just collected - confirm whether the image_name_list_* lengths were intended.
len(image_base_name_list_train), len(image_base_name_list_val), len(image_base_name_list_test)

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/7135 [00:00<?, ?it/s]

(7135, 100, 1400)

In [27]:
def add_prefix(path_list, prefix):
    """Return a new list in which ``prefix`` is path-joined in front of every entry."""
    prefixed_paths = []
    for relative_path in path_list:
        prefixed_paths.append(join(prefix, relative_path))
    return prefixed_paths

# Put the 'positive/' folder in front of every crop file name of each split.
image_path_list_test = add_prefix(image_name_list_test, 'positive/')
image_path_list_val = add_prefix(image_name_list_val, 'positive/')
image_path_list_train = add_prefix(image_name_list_train, 'positive/')

In [28]:
# One row per crop: its path and the split it belongs to, ordered train, then val, then test.
df_file_path_split = pd.DataFrame({
    'file_path': np.concatenate((image_path_list_train, image_path_list_val, image_path_list_test)),
    'dataset': [
        split_name
        for split_name, split_paths in (
            ('train', image_path_list_train),
            ('val', image_path_list_val),
            ('test', image_path_list_test),
        )
        for _crop_path in split_paths
    ],
})
df_file_path_split.head()

Unnamed: 0,file_path,dataset
0,positive/5e4389b28_640_128_768_256.jpg,train
1,positive/47d99bdca_384_512_512_640.jpg,train
2,positive/47d99bdca_512_512_640_640.jpg,train
3,positive/47d99bdca_384_384_512_512.jpg,train
4,positive/47d99bdca_512_384_640_512.jpg,train


In [29]:
# Persist the crop-level split (without the DataFrame index) next to the cropped data.
df_file_path_split.to_csv('../data/train_unique_cropped/file_name_split.csv', index=False)

# Copy train and val files

In [31]:
import pandas as pd
from tqdm.notebook import tqdm
from shutil import copy

In [32]:
# Reload the crop-level split table (columns: file_path, dataset) from disk.
df_file_path_split = pd.read_csv('../data/train_unique_cropped/file_name_split.csv')
df_file_path_split.head()

Unnamed: 0,file_path,dataset
0,positive/5e4389b28_640_128_768_256.jpg,train
1,positive/47d99bdca_384_512_512_640.jpg,train
2,positive/47d99bdca_512_512_640_640.jpg,train
3,positive/47d99bdca_384_384_512_512.jpg,train
4,positive/47d99bdca_512_384_640_512.jpg,train


In [33]:
dataset_path = '../data/train_unique_cropped/'
destination_path = '../data/train_unique_cropped/positive_binary/'
# The target folder must exist before files can be copied into it; create it so the
# cell also works on a fresh checkout.
makedirs(destination_path, exist_ok=True)
# NOTE(review): every row of the table is copied, including rows whose dataset is 'test',
# although the section heading mentions only train and val - confirm which is intended.
for relative_path in tqdm(df_file_path_split['file_path']):
    image_path = join(dataset_path, relative_path)
    # Copy the crop and the .png label that shares its base name.
    copy(image_path, destination_path)
    copy(splitext(image_path)[0] + '.png', destination_path)

  0%|          | 0/24718 [00:00<?, ?it/s]