# Base preprocessing of data

Suppress all warnings to keep the notebook output clean (note that this also hides deprecation notices).

In [1]:
import warnings
warnings.filterwarnings('ignore')

### Imports

In [2]:
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

%matplotlib inline

### Paths:
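* to the raw data, which should contain:
  * `./depths.csv`
  * `./sample_submission.csv`
  * `./test/` -> with all test images (read from `./test/images/`)
  * `./train.csv`
  * `./train/` -> with all train images and masks (read from `./train/images/` and `./train/masks/`)
* to the folder where the preprocessed data will be written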

In [3]:
# Root folders of the raw and of the preprocessed data.
# Each can be overridden through an environment variable so the notebook can run
# outside the original cluster without editing this cell; defaults are unchanged.
RAW_DATA_PATH = os.environ.get('SALT_RAW_DATA_PATH',
                               '/cobrain/groups/ml_group/data/dustpelt/salt_raw/')
PREP_DATA_PATH = os.environ.get('SALT_PREP_DATA_PATH',
                                '/cobrain/groups/ml_group/data/dustpelt/salt_prep/')

In [4]:
# Output folders for the preprocessed train / test samples.
prep_train_path = os.path.join(PREP_DATA_PATH, 'train')
prep_test_path = os.path.join(PREP_DATA_PATH, 'test')

# makedirs also creates PREP_DATA_PATH itself when it does not exist yet
# (os.mkdir would raise FileNotFoundError), and exist_ok=True keeps the cell
# safe to re-run.
for _prep_dir in (prep_train_path, prep_test_path):
    os.makedirs(_prep_dir, exist_ok=True)

Preview of `depths.csv`: the depth `z` for each image `id`.

In [5]:
# Depth table; `z` is looked up per image id by the preprocessing loops.
depths_csv = os.path.join(RAW_DATA_PATH, 'depths.csv')
depths = pd.read_csv(depths_csv)
depths.head()

Unnamed: 0,id,z
0,4ac19fb269,306
1,1825fadf99,157
2,f59821d067,305
3,5b435fad9d,503
4,e340e7bfca,783


Preview of `train.csv`: train image ids and their run-length-encoded (RLE) masks.

In [6]:
# Train index: one row per train image id with its RLE mask string.
train_csv = os.path.join(RAW_DATA_PATH, 'train.csv')
train = pd.read_csv(train_csv)
train.head()

Unnamed: 0,id,rle_mask
0,575d24d81d,
1,a266a2a9df,5051 5151
2,75efad62c1,9 93 109 94 210 94 310 95 411 95 511 96 612 96...
3,34e51dba6a,48 54 149 54 251 53 353 52 455 51 557 50 659 4...
4,4875705fb0,1111 1 1212 1 1313 1 1414 1 1514 2 1615 2 1716...


Array of all train ids:

In [7]:
# Plain numpy array of the train ids, in the order they appear in train.csv.
train_ids = train.loc[:, 'id'].values
train_ids

array(['575d24d81d', 'a266a2a9df', '75efad62c1', ..., '1306fcee4c',
       '48d81e93d9', 'edf1e6ac00'], dtype=object)

`sample_submission.csv` contains all the test ids (I've checked), so it is used as the list of test ids.

In [8]:
# The sample submission is only used here as the source of the test ids.
sample_csv = os.path.join(RAW_DATA_PATH, 'sample_submission.csv')
sample = pd.read_csv(sample_csv)

test_ids = sample.loc[:, 'id'].values
test_ids

array(['155410d6fa', '78b32781d1', '63db2a476a', ..., '07c3553ef7',
       '9c2e45bf79', '41d0f0703c'], dtype=object)

### Preprocess all the train data into appropriate structure

In [9]:
from collections import defaultdict

from keras.preprocessing.image import load_img
from skimage.transform import resize

# to import something from saltsegm "lib", add symlink to your conda or similar env
# example:
# ln -s ~/salt-challenge/saltsegm ~/anaconda3/lib/python3.6/site-packages/
from saltsegm.utils import id2png

Using TensorFlow backend.


In [10]:
# Convert every raw train sample into
#   <prep_train_path>/<id>/{image, image128, target, target128}.npy
# and collect one metadata row per sample, saved as <prep_train_path>/metadata.csv.
# File paths stored in the metadata are relative to prep_train_path.
imgs_path = os.path.join(RAW_DATA_PATH, 'train/images')
masks_path = os.path.join(RAW_DATA_PATH, 'train/masks')

# id -> depth lookup, built once instead of scanning `depths` for every sample.
# keep='first' mirrors the previous "first matching row" behaviour; an id that is
# missing from depths.csv raises KeyError here (previously IndexError).
depth_by_id = depths.drop_duplicates(subset='id', keep='first').set_index('id')['z']

# Explicit column order so metadata.csv does not depend on dict/pandas ordering.
metadata_columns = ['id', 'image', 'target', 'image-128', 'target-128',
                    'z', 'target_ratio', 'is_not_empty']
metadata = []  # one record (dict) per sample

for _id in tqdm(train_ids):
    # exist_ok=True keeps the cell safe to re-run over an already populated folder
    os.makedirs(os.path.join(prep_train_path, _id), exist_ok=True)

    # raw image: loaded as grayscale, divided by 255 and stored as float32
    img = np.array(load_img(os.path.join(imgs_path, id2png(_id)), grayscale=True)) / 255.
    img = np.array(img, dtype='float32')
    img_name = os.path.join(_id, 'image.npy')

    # image resized to 128x128 (bicubic interpolation)
    img128 = resize(img, output_shape=(128, 128), order=3, preserve_range=True)
    img128_name = os.path.join(_id, 'image128.npy')

    # raw mask: same loading / scaling as the image
    mask = np.array(load_img(os.path.join(masks_path, id2png(_id)), grayscale=True)) / 255.
    mask = np.array(mask, dtype='float32')
    mask_name = os.path.join(_id, 'target.npy')

    # mask resized to 128x128
    # NOTE(review): order=3 (bicubic) yields non-binary mask values; kept as-is so the
    # saved arrays do not change -- confirm that consumers threshold target128.
    mask128 = resize(mask, output_shape=(128, 128), order=3, preserve_range=True)
    mask128_name = os.path.join(_id, 'target128.npy')

    # depth of this sample
    z = depth_by_id[_id]

    # ratio of the mask sum to the number of pixels in the image
    # (mask.size replaces np.product(mask.shape); np.product was removed in NumPy 2.0)
    target_ratio = float(np.sum(mask)) / mask.size

    # False only when the image is completely black (all pixels sum to zero)
    is_not_empty = bool(np.sum(img) != 0)

    metadata.append({'id': _id,
                     'image': img_name,
                     'target': mask_name,
                     'image-128': img128_name,
                     'target-128': mask128_name,
                     'z': z,
                     'target_ratio': target_ratio,
                     'is_not_empty': is_not_empty})

    np.save(os.path.join(prep_train_path, img_name), img)
    np.save(os.path.join(prep_train_path, img128_name), img128)
    np.save(os.path.join(prep_train_path, mask_name), mask)
    np.save(os.path.join(prep_train_path, mask128_name), mask128)

# Build the frame once from the collected records; the index is 0..n-1 as before.
metadata_csv = pd.DataFrame(metadata, columns=metadata_columns)
metadata_csv.to_csv(path_or_buf=os.path.join(prep_train_path, 'metadata.csv'))

100%|██████████| 4000/4000 [03:09<00:00, 21.09it/s]


Overview of the train metadata:

In [11]:
# First five rows of the metadata table that was just written.
metadata_csv.head(n=5)

Unnamed: 0,id,image,target,image-128,target-128,z,target_ratio,is_not_empty
0,575d24d81d,575d24d81d/image.npy,575d24d81d/target.npy,575d24d81d/image128.npy,575d24d81d/target128.npy,843,0.0,True
1,a266a2a9df,a266a2a9df/image.npy,a266a2a9df/target.npy,a266a2a9df/image128.npy,a266a2a9df/target128.npy,794,0.50495,True
2,75efad62c1,75efad62c1/image.npy,75efad62c1/target.npy,75efad62c1/image128.npy,75efad62c1/target128.npy,468,0.993334,True
3,34e51dba6a,34e51dba6a/image.npy,34e51dba6a/target.npy,34e51dba6a/image128.npy,34e51dba6a/target128.npy,727,0.149201,True
4,4875705fb0,4875705fb0/image.npy,4875705fb0/target.npy,4875705fb0/image128.npy,4875705fb0/target128.npy,797,0.042839,True


### Preprocess all the test data into appropriate structure

In [13]:
# Convert every raw test sample into
#   <prep_test_path>/<id>/{image, image128}.npy
# and collect one metadata row per sample, saved as <prep_test_path>/metadata.csv.
# File paths stored in the metadata are relative to prep_test_path.
imgs_path = os.path.join(RAW_DATA_PATH, 'test/images')

# id -> depth lookup, built once instead of scanning `depths` for every sample.
# keep='first' mirrors the previous "first matching row" behaviour; an id that is
# missing from depths.csv raises KeyError here (previously IndexError).
depth_by_id = depths.drop_duplicates(subset='id', keep='first').set_index('id')['z']

# Explicit column order so metadata.csv does not depend on dict/pandas ordering.
metadata_columns = ['id', 'image', 'image-128', 'z', 'is_not_empty']
metadata = []  # one record (dict) per sample

for _id in tqdm(test_ids):
    # exist_ok=True keeps the cell safe to re-run over an already populated folder
    os.makedirs(os.path.join(prep_test_path, _id), exist_ok=True)

    # raw image: loaded as grayscale, divided by 255 and stored as float32
    img = np.array(load_img(os.path.join(imgs_path, id2png(_id)), grayscale=True)) / 255.
    img = np.array(img, dtype='float32')
    img_name = os.path.join(_id, 'image.npy')

    # image resized to 128x128 (bicubic interpolation)
    img128 = resize(img, output_shape=(128, 128), order=3, preserve_range=True)
    img128_name = os.path.join(_id, 'image128.npy')

    # depth of this sample
    z = depth_by_id[_id]

    # False only when the image is completely black (all pixels sum to zero)
    is_not_empty = bool(np.sum(img) != 0)

    metadata.append({'id': _id,
                     'image': img_name,
                     'image-128': img128_name,
                     'z': z,
                     'is_not_empty': is_not_empty})

    np.save(os.path.join(prep_test_path, img_name), img)
    np.save(os.path.join(prep_test_path, img128_name), img128)

# Build the frame once from the collected records; the index is 0..n-1 as before.
metadata_csv = pd.DataFrame(metadata, columns=metadata_columns)
metadata_csv.to_csv(path_or_buf=os.path.join(prep_test_path, 'metadata.csv'))

100%|██████████| 18000/18000 [08:00<00:00, 37.48it/s]


Overview of the test metadata:

In [14]:
# First five rows of the test metadata table that was just written.
metadata_csv.head(n=5)

Unnamed: 0,id,image,image-128,z,is_not_empty
0,155410d6fa,155410d6fa/image.npy,155410d6fa/image128.npy,559,True
1,78b32781d1,78b32781d1/image.npy,78b32781d1/image128.npy,298,True
2,63db2a476a,63db2a476a/image.npy,63db2a476a/image128.npy,392,True
3,17bfcdb967,17bfcdb967/image.npy,17bfcdb967/image128.npy,698,True
4,7ea0fd3c88,7ea0fd3c88/image.npy,7ea0fd3c88/image128.npy,837,True


To add a new preprocessed image or feature, add a column to the metadata and/or save the new object into the corresponding image's folder.