TFRecords for speeding up data loading

### References:  
https://medium.com/@moritzkrger/speeding-up-keras-with-tfrecord-datasets-5464f9836c36  
https://medium.com/ymedialabs-innovation/how-to-use-tfrecord-with-datasets-and-iterators-in-tensorflow-with-code-samples-ffee57d298af  
https://www.tensorflow.org/tutorials/load_data/tf_records#read_the_tfrecord_file  
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/how_tos/reading_data/convert_to_records.py  

In [140]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import numpy as np
from skimage.io import imread

from preprocess.pre_process import multi_rle_encode, rle_encode, rle_decode, masks_as_image, masks_as_color, balancing_train
from preprocess.pre_process import make_image_gen, create_aug_gen

import tensorflow as tf

import time
from IPython.display import clear_output

In [133]:
# Scratch check of the elapsed-time pattern: sleep for 2 s and confirm that
# the difference between two time.time() readings is about 2 seconds.
start = time.time()
time.sleep(2)
timing = time.time() - start
timing

2.0024585723876953

In [27]:
def _int64_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` backed by an ``Int64List``.

    The value is placed in a one-element list, so pass a scalar, not a sequence.
    """
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)


def _bytes_feature(value):
    """Wrap a single value in a ``tf.train.Feature`` backed by a ``BytesList``.

    The value is placed in a one-element list, so pass one bytes object,
    not a sequence of them.
    """
    byte_list = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=byte_list)

In [2]:
# Root folder of the dataset; every other path below is derived from it.
ship_dir = '../../data/airbus_ship_detection/'
# Directory holding the training images.
train_image_dir = os.path.join(ship_dir, 'train')
# Directory holding the test images.
test_image_dir = os.path.join(ship_dir, 'test')
# Path of the training segmentation CSV (a file path, despite the `_dir` name).
label_dir = os.path.join(ship_dir, 'train_ship_segmentations_v2.csv')
# Ship segmentation annotations loaded from that CSV.
masks = pd.read_csv(label_dir, engine="python")

In [4]:
# Build two image-level tables from the annotations with balancing_train:
# one requested at a 0.5 has_ship rate and one with rate_of_has_ship=0.0.
# NOTE(review): 0.0 presumably means "no rebalancing" (the rate printed for this
# table further down is 0.21) -- confirm against balancing_train.
data_link_balanced = balancing_train(masks, rate_of_has_ship=0.5, ship_dir_train=train_image_dir)
data_link_unbalanced = balancing_train(masks, rate_of_has_ship=0.0, ship_dir_train=train_image_dir)

In [5]:
# Report, for each table, the fraction of rows flagged has_ship and the row count.
# Only the unbalanced rate is rounded (to 2 decimals), as in the original output.
balanced_rate = data_link_balanced.has_ship.sum() / len(data_link_balanced)
print("data_link_balanced rate:{0}, lenght: {1}".format(balanced_rate, len(data_link_balanced)))

unbalanced_rate = round(data_link_unbalanced.has_ship.sum() / len(data_link_unbalanced), 2)
print("data_link_unbalanced rate:{0}, lenght: {1}".format(unbalanced_rate, len(data_link_unbalanced)))

data_link_balanced rate:0.5, lenght: 80432
data_link_unbalanced rate:0.21, lenght: 187099


In [7]:
# Preview the first rows of the balanced table (head() defaults to 5 rows).
data_link_balanced.head()

Unnamed: 0,ImageId,ships,has_ship,file_size_kb
0,000155de5.jpg,1,1,147.625977
1,00021ddc3.jpg,9,1,242.910156
2,0002756f7.jpg,2,1,287.620117
3,00031f145.jpg,1,1,232.898438
4,000532683.jpg,2,1,166.852539


80432

In [135]:
%%time
# Decode a 10-image sample to measure how long plain JPEG loading takes.
# skimage's imread raises on a missing or unreadable file instead of returning
# None, so failures are caught explicitly; a None check would never trigger.
images_list = []
for filename in data_link_unbalanced.ImageId[:10]:
    try:
        img = imread(os.path.join(train_image_dir, filename))
    except OSError:
        # FileNotFoundError is a subclass of OSError.
        print("Error image missing")
    else:
        images_list.append(img)
print(np.array(images_list).shape)

(10, 768, 768, 3)
CPU times: user 332 ms, sys: 0 ns, total: 332 ms
Wall time: 330 ms


In [114]:
# Pull the image file names and the has_ship labels out of the unbalanced table
# as two parallel NumPy arrays; these feed the train/validation split below.
images_link = np.array(data_link_unbalanced.ImageId)
labels = np.array(data_link_unbalanced.has_ship)
print("shape of images :", images_link.shape)
print("shape of labels :", labels.shape)

shape of images : (187099,)
shape of labels : (187099,)


In [161]:
def convert_to(images_link, labels, name):
    """Serialize image files and their labels into a single TFRecord file.

    Each record is a ``tf.train.Example`` with two features: ``label``
    (int64) and ``image_raw`` (the bytes of the image file exactly as read
    from disk in binary mode, i.e. not a decoded pixel array).

    Relies on the notebook globals ``train_image_dir`` (directory the image
    files are read from) and ``ship_dir`` (directory the output is written to).

    Args:
        images_link: 1-D array of image file names, relative to
            ``train_image_dir``.
        labels: 1-D array with one label per image; each is cast with ``int``.
        name: Base name of the output; the file written is
            ``<ship_dir>/<name>.tfrecords``.

    Raises:
        ValueError: If ``images_link`` and ``labels`` differ in length.
    """
    num_examples = labels.shape[0]

    if images_link.shape[0] != num_examples:
        raise ValueError('Images size %d does not match label size %d.' %
                         (images_link.shape[0], num_examples))

    filename = os.path.join(ship_dir, name + '.tfrecords')
    print('Writing', filename)
    start = time.time()
    with tf.python_io.TFRecordWriter(filename) as writer:
        for index in range(num_examples):
            if index % 100 == 0:
                clear_output(wait=True)
                if index:
                    # Average time per file so far, extrapolated to the files
                    # left. Skipped at index 0, where nothing has been timed
                    # yet and the estimate would be a meaningless ~0.
                    seconds_per_file = (time.time() - start) / index
                    eta = seconds_per_file * (num_examples - index)
                    print("ETA:", round(eta, 2), end=" seconds ==>  ")
                print(index, "/", num_examples)
            image_path = os.path.join(train_image_dir, images_link[index])
            # tf.gfile.GFile replaces the deprecated tf.gfile.FastGFile.
            with tf.gfile.GFile(image_path, 'rb') as fid:
                image_data = fid.read()
            example = tf.train.Example(
                features=tf.train.Features(
                    feature={
                        'label': _int64_feature(int(labels[index])),
                        'image_raw': _bytes_feature(image_data)
                    }))
            writer.write(example.SerializeToString())
    # Replace the last partial progress line with a completion message.
    clear_output(wait=True)
    print('Wrote', num_examples, 'examples to', filename)

In [162]:
from sklearn.model_selection import train_test_split

In [163]:
# Hold out 10% of the images for validation. The fixed seed makes the split
# reproducible across kernel restarts, and stratifying on the labels keeps the
# has_ship ratio the same in both subsets.
images_train, images_val, labels_train, labels_val = train_test_split(
    images_link, labels, test_size=0.1, random_state=42, stratify=labels)

In [None]:
# Write the training split to <ship_dir>/training.tfrecords.
# Slow: every training image file is read from disk.
convert_to(images_train, labels_train, "training")

ETA: 236.16 seconds ==>  69700 / 168389


In [None]:
# Write the validation split to <ship_dir>/validation.tfrecords.
convert_to(images_val, labels_val, "validation")

In [None]:
def create_dataset(filepath, image_shape=(256, 256, 1)):
    """Build a shuffled, batched, endlessly repeating input pipeline from TFRecords.

    Relies on names defined elsewhere in the notebook: ``_parse_function``
    (expected to map one serialized record to an ``(image, label)`` pair),
    ``SHUFFLE_BUFFER``, ``BATCH_SIZE`` and ``NUM_CLASSES``.

    Args:
        filepath: Path, or array of paths, of the TFRecord file(s) to read.
        image_shape: Per-image shape the batch is reshaped to, without the
            batch dimension. Defaults to ``(256, 256, 1)``.
            NOTE(review): the images loaded earlier in this notebook are
            768x768x3, so confirm that this matches what ``_parse_function``
            emits.

    Returns:
        A tuple ``(image, label)`` of tensors for one batch: ``image`` reshaped
        to ``[-1] + image_shape`` and ``label`` one-hot encoded over
        ``NUM_CLASSES``.
    """
    # This works with arrays as well
    dataset = tf.data.TFRecordDataset(filepath)

    # Maps the parser on every record. You can set the number of parallel loaders here
    dataset = dataset.map(_parse_function, num_parallel_calls=8)

    # Shuffle BEFORE repeating: each epoch is then fully consumed before the
    # next one starts, instead of mixing records of neighbouring epochs inside
    # the shuffle buffer.
    dataset = dataset.shuffle(SHUFFLE_BUFFER)

    # This dataset will go on forever
    dataset = dataset.repeat()

    # Set the batchsize
    dataset = dataset.batch(BATCH_SIZE)

    # Create an iterator
    iterator = dataset.make_one_shot_iterator()

    # Create your tf representation of the iterator
    image, label = iterator.get_next()

    # Bring your picture back in shape
    image = tf.reshape(image, [-1] + list(image_shape))

    # Create a one hot array for your labels
    label = tf.one_hot(label, NUM_CLASSES)

    return image, label