In [1]:
import numpy as np
import tensorflow as tf
tf.enable_eager_execution()
tf.__version__

'1.13.1'

## Parameters

In [2]:
import pathlib
#img_path = "/home/Data/CharactersTrimPad28/"
img_path = "./s3mnt/ChineseNumbers/"
img_path = "/home/Data/ChineseNumbers/"
data_root = pathlib.Path(img_path)
AUTOTUNE = tf.data.experimental.AUTOTUNE
NUM_EPOCHS = 1
BATCH_SIZE = 32

## Mapping function

In [3]:
def preprocess_image(image):
  image = tf.image.decode_image(image, channels=3)
  image = tf.cast(image, tf.float32)
  image /= 255.0  # normalize to [0,1] range
  return image

def load_and_preprocess_image(path):
  image = tf.read_file(path)
  return preprocess_image(image)

# The tuples are unpacked into the positional arguments of the mapped function
def load_and_preprocess_from_path_label(path, label):
  return load_and_preprocess_image(path), label

## Generate paths and labels

In [4]:
# tf.data.Dataset.from_tensor_slices
all_image_paths = [str(path) for path in list(data_root.glob('*/*'))]
print(all_image_paths[:10])

label_names = sorted(item.name for item in data_root.glob('*/') if item.is_dir())
label_to_index = dict((name, index) for index,name in enumerate(label_names))
all_image_labels = [label_to_index[pathlib.Path(path).parent.name]
                    for path in all_image_paths]

image_count = len(all_image_paths)

['/home/Data/ChineseNumbers/二/Han yi Cu yuan ti Font-Traditional Chinese ttf.png', '/home/Data/ChineseNumbers/二/JiaShang Liu Xing kai 5500 Font- Simplified Chinesettf.png', '/home/Data/ChineseNumbers/二/Japan hengshan writing brush Font-Traditional Chinesettf.png', '/home/Data/ChineseNumbers/二/Classic Cu hei Fontttf.png', '/home/Data/ChineseNumbers/二/Chinese New Year(DFGirl-dospy-fei) font-Simplified Chinesettf.png', '/home/Data/ChineseNumbers/二/Han yi Fang die Fontttf.png', '/home/Data/ChineseNumbers/二/Classic Kong die hei Fontttf.png', '/home/Data/ChineseNumbers/二/Childhood amusement park Font-Simplified Chinesettf.png', '/home/Data/ChineseNumbers/二/Snow World  Butterfly Font-Simplified Chinesettf.png', '/home/Data/ChineseNumbers/二/Hypocrite Youth v 20 Font-Simplified ChineseTTF.png']


## Test iterate time

In [18]:
import time

def timeit(dataset):
    overall_start = time.time()
    n_image = 0

    start = time.time()
    for n_batch,(images,labels) in enumerate(dataset):
        n_image += int(images.shape[0])
        if n_image%100 == 0:
            #print('.',end='')
            #print("\r{} images in {} batches with BATCH_SIZE {}: {:.2f} s".format(n_image, n_batch, BATCH_SIZE, time.time()-start), end='', flush=True)
            print("\r{} images: {:.2f} s".format(n_image, time.time()-start), end='', flush=True)
    print()
    end = time.time()
    duration = end-start
    
    print("{} images: {} s".format(n_image, duration))
    print("{:0.5f} Images/s".format(n_image/float(duration)))
    print("Total time: {}s".format(end-overall_start))

## Input pipeline experiment

### 1. Original pipeline

In [19]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.shuffle(buffer_size=image_count)
path_label_ds = path_label_ds.repeat(NUM_EPOCHS)

image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label)
image_label_ds = image_label_ds.batch(BATCH_SIZE)

# Load
timeit(image_label_ds)

12000 images: 3.17 s
12607 images: 3.327425956726074 s
3788.81459 Images/s
Total time: 3.3274269104003906s


### 2. Map with num_parallel_calls

In [21]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.shuffle(buffer_size=image_count)
path_label_ds = path_label_ds.repeat(NUM_EPOCHS)
path_label_ds = path_label_ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count, count=NUM_EPOCHS))


image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label, num_parallel_calls=4)
image_label_ds = image_label_ds.batch(BATCH_SIZE)

# Load
timeit(image_label_ds)

12000 images: 1.08 s
12607 images: 1.127851963043213 s
11177.88541 Images/s
Total time: 1.1278529167175293s


### 3. tf.data.experimental.shuffle_and_repeat

See more about [tf.data.experimental.shuffle_and_repeat](https://www.tensorflow.org/api_docs/python/tf/data/experimental/shuffle_and_repeat)
```
tf.data.experimental.shuffle_and_repeat(
    buffer_size,
    count=None,
    seed=None
)
```

In [22]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count, count=NUM_EPOCHS))


image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label, num_parallel_calls=4)
image_label_ds = image_label_ds.batch(BATCH_SIZE)

# Load
timeit(image_label_ds)

12000 images: 1.05 s
12607 images: 1.105530023574829 s
11403.57994 Images/s
Total time: 1.1055309772491455s


### 4. tf.data.experimental.map_and_batch

See more about [tf.data.experimental.map_and_batch](https://www.tensorflow.org/api_docs/python/tf/data/experimental/map_and_batch)
```
tf.data.experimental.map_and_batch(
    map_func,
    batch_size,
    num_parallel_batches=None,
    drop_remainder=False,
    num_parallel_calls=None
)
```

In [23]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.shuffle(buffer_size=image_count)
path_label_ds = path_label_ds.repeat(NUM_EPOCHS)

image_label_ds = path_label_ds.apply(tf.data.experimental.map_and_batch(load_and_preprocess_from_path_label, BATCH_SIZE, num_parallel_calls=4))

# Load
timeit(image_label_ds)

12000 images: 1.04 s
12607 images: 1.0948739051818848 s
11514.56797 Images/s
Total time: 1.0948748588562012s


### 5. shuffle_and_repeat + map_and_batch

In [24]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count, count=NUM_EPOCHS))
image_label_ds = path_label_ds.apply(tf.data.experimental.map_and_batch(load_and_preprocess_from_path_label, BATCH_SIZE, num_parallel_calls=4))

# Load
timeit(image_label_ds)

12000 images: 1.05 s
12607 images: 1.0960001945495605 s
11502.73518 Images/s
Total time: 1.096001148223877s


### 6. Prefetch

See more about [tf.data.experimental.prefetch_to_device](https://www.tensorflow.org/api_docs/python/tf/data/experimental/prefetch_to_device)
```
tf.data.experimental.prefetch_to_device(
    device,
    buffer_size=None
)
```

In [25]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count, count=NUM_EPOCHS))

image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label, num_parallel_calls=4)
image_label_ds = image_label_ds.batch(BATCH_SIZE)

# Load
image_label_ds = image_label_ds.prefetch(buffer_size=1) # Only on CPU
#image_label_ds = image_label_ds.apply(tf.data.experimental.prefetch_to_device(device="/gpu:0", buffer_size=1)) # Must be final Dataset in input pipeline
timeit(image_label_ds)

12000 images: 1.06 s
12607 images: 1.1160252094268799 s
11296.33981 Images/s
Total time: 1.1160261631011963s


### 7. Cache

See more about [tf.data.Dataset.cache](https://www.tensorflow.org/tutorials/load_data/images#cache)
                                       
Use tf.data.Dataset.cache to easily cache calculations across epochs. This is especially performant if the dataq fits in memory
```
ds = image_label_ds.cache()
```

One disadvantage to using an in memory cache is that the cache must be rebuilt on each run, giving the same startup delay each time the dataset is started:
If the data doesn't fit in memory, use a cache file. 
The cache file also has the advantage that it can be used to quickly restart the dataset without rebuilding the cache. Note how much faster it is the second time:


```
ds = image_label_ds.cache(filename='./cache.tf-data')
```

In [27]:
# Extract
ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
ds = ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count, count=NUM_EPOCHS))
#path_label_ds = path_label_ds.cache(filename='./cache.tf-path')

ds = ds.map(load_and_preprocess_from_path_label, num_parallel_calls=4)
ds = ds.batch(BATCH_SIZE)

# Load
ds = ds.cache(filename='./cache.tf-ds')
ds = ds.prefetch(buffer_size=1) # Only on CPU
#image_label_ds = image_label_ds.apply(tf.data.experimental.prefetch_to_device(device="/gpu:0", buffer_size=1)) # Must be final Dataset in input pipeline
timeit(ds)

12000 images: 0.11 s
12607 images: 0.11554718017578125 s
109106.94645 Images/s
Total time: 0.11554813385009766s


In [5]:
# Extract
path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

# Transform
path_label_ds = path_label_ds.apply(tf.data.experimental.shuffle_and_repeat(buffer_size=image_count, count=NUM_EPOCHS))
path_label_ds = path_label_ds.cache(filename='./cache.tf-path')
for epoch in range(3):
    for i, (path, label) in enumerate(path_label_ds):
        if i % 12607 == 0:
            print(i, label, path)
        elif i % 12607 == 1:
            print(i, label, path)
        elif i % 12607 == 2:
            print(i, label, path)

image_label_ds = path_label_ds.map(load_and_preprocess_from_path_label, num_parallel_calls=4)
image_label_ds = image_label_ds.batch(BATCH_SIZE)

# Load
image_label_ds = image_label_ds.cache(filename='./cache.tf-image')
image_label_ds = image_label_ds.prefetch(buffer_size=10) # Only on CPU
#image_label_ds = image_label_ds.apply(tf.data.experimental.prefetch_to_device(device="/gpu:0", buffer_size=1)) # Must be final Dataset in input pipeline
timeit(image_label_ds)

Instructions for updating:
Colocations handled automatically by placer.
0 tf.Tensor(2, shape=(), dtype=int32) tf.Tensor(b'/home/Data/ChineseNumbers/\xe4\xb8\x89/Take off&Good luck Elegant Bold Figure w6 Font-Simplified Chinesettf.png', shape=(), dtype=string)
1 tf.Tensor(8, shape=(), dtype=int32) tf.Tensor(b'/home/Data/ChineseNumbers/\xe5\x8d\x81/Fang zheng You xian Font-Traditional Chinesettf.png', shape=(), dtype=string)
2 tf.Tensor(3, shape=(), dtype=int32) tf.Tensor(b'/home/Data/ChineseNumbers/\xe4\xb9\x9d/Wen er guang gao Song ti Font-Traditional Chinesettf.png', shape=(), dtype=string)
12607 tf.Tensor(2, shape=(), dtype=int32) tf.Tensor(b'/home/Data/ChineseNumbers/\xe4\xb8\x89/Xuke Li Handwriting caricature (v10) Font-Simplified Chinesettf.png', shape=(), dtype=string)
12608 tf.Tensor(2, shape=(), dtype=int32) tf.Tensor(b'/home/Data/ChineseNumbers/\xe4\xb8\x89/Mini Zhan bi hei Font-Simplified Chinesettf.png', shape=(), dtype=string)
12609 tf.Tensor(6, shape=(), dtype=int32) tf.Te

KeyboardInterrupt: 

In [None]:
import time
import pathlib
img_path = "/home/Data/CharactersTrimPad28/"
img_path = "/home/Data/ChineseNumbers/"
data_root = pathlib.Path(img_path)

In [13]:
start = time.time()

# Extract
all_image_paths = [str(path) for path in list(data_root.glob('*/*'))]
image_count = len(all_image_paths)

label_names = sorted(item.name for item in data_root.glob('*/') if item.is_dir())
label_to_index = dict((name, index) for index,name in enumerate(label_names))
all_image_labels = [label_to_index[pathlib.Path(path).parent.name]
                    for path in all_image_paths]

path_label_ds = tf.data.Dataset.from_tensor_slices((all_image_paths, all_image_labels))

print("{:.2f} s".format(time.time()-start))
for i, (path, label) in enumerate(path_label_ds):
    if i % 100 == 0:
        print("\r{} paths: {:.2f} s".format(i, time.time()-start), end='', flush=True)
print("\r{} paths: {:.2f} s".format(i, time.time()-start), end='', flush=True)

3.23 s
12606 paths: 4.38 s

In [7]:
def get_files(dir_path, label):
    globbed = tf.string_join([dir_path, '/*.png'])
    files = tf.matching_files(globbed)
    
    num_files = tf.shape(files)[0] # in the directory
    labels = tf.tile([label], [num_files, ]) # expand label to all files
    return tf.data.Dataset.from_tensor_slices((files, labels))

In [12]:
start = time.time()

# Extract
classes = sorted(f.as_posix() for f in data_root.glob('*/') if f.is_dir())
num_classes = len(classes)
labels = np.arange(num_classes, dtype=np.int32)

class_label_ds = tf.data.Dataset.from_tensor_slices((classes, labels))
path_label_ds = class_label_ds.apply(tf.data.experimental.parallel_interleave(
    get_files, cycle_length=num_classes, block_length=4)) 

print("{:.2f} s".format(time.time()-start))
for i, (path, label) in enumerate(path_label_ds):
    if i % 100 == 0:
        print("\r{} paths: {:.2f} s".format(i, time.time()-start), end='', flush=True)
print("\r{} paths: {:.2f} s".format(i, time.time()-start), end='', flush=True)

0.10 s
12606 paths: 5.74 s

In [17]:
def get_files(dir_path, label):
    files = list(str(dir_path.glob('*/*.png')))
    labels = [label] * len(files)
    return tf.data.Dataset.from_tensor_slices((files, labels))   

In [None]:
#tf.gfile.ListDirectory(img_path)

In [None]:
# files = tf.data.Dataset.list_files("*.tfrecord")
# dataset = tf.data.TFRecordDataset(files)

In [None]:
# tf.data.Dataset.from_generator
def path_label_generator():
    char_to_label = dict()
    for path in data_root.glob('*/*'):
        char = path.parent.name
        if char not in char_to_label:
            char_to_label[char] = len(char_to_label)
        yield path, char_to_label[char]
        
path_label_ds = tf.data.Dataset.from_generator(path_label_generator, (tf.string, tf.int32))
print(path_label_ds)
print('shape: ', repr(path_label_ds.output_shapes))
print('type: ', path_label_ds.output_types)