In [None]:
# Caching with tf.data

import multiprocessing

import tensorflow as tf
import tensorflow_datasets as tfds


# Load the training split of the cats_vs_dogs dataset through TensorFlow Datasets.
dataset = tfds.load('cats_vs_dogs', split=tfds.Split.TRAIN)

# In-memory caching
# With no filename, cache() keeps the elements in memory after the first full
# pass, so later epochs do not have to read/produce them again.
train_dataset = dataset.cache()
# NOTE(review): `model` is not defined in the visible cells and `epochs=...` is a
# placeholder -- supply a compiled model and a concrete epoch count before running.
model.fit(train_dataset, epochs=...)

# Disk caching
# Passing a filename makes cache() store the elements on disk (files prefixed
# with 'cache') instead of holding them in memory.
train_dataset = dataset.cache(filename='cache')
model.fit(train_dataset, epochs=...)

# Parallelism with tf.data

In [1]:
def augment(features):
  """Randomly augment one example and rescale its pixel values.

  The image goes through random flips (left/right, then up/down), random
  brightness, saturation, hue and contrast changes, a resize to 224x224 and
  finally a division by 255.0.

  Args:
    features: Mapping with an 'image' entry (the picture to augment) and a
      'label' entry (returned untouched).

  Returns:
    An `(image, label)` tuple: the augmented, resized and rescaled image
    together with `features['label']`.
  """
  picture = features['image']

  # Geometric augmentation: random mirroring along both axes.
  picture = tf.image.random_flip_left_right(picture)
  picture = tf.image.random_flip_up_down(picture)

  # Photometric augmentation: the order of these ops is kept as-is because
  # each one operates on the output of the previous one.
  picture = tf.image.random_brightness(picture, max_delta=0.1)
  picture = tf.image.random_saturation(picture, lower=0.75, upper=1.5)
  picture = tf.image.random_hue(picture, max_delta=0.15)
  picture = tf.image.random_contrast(picture, lower=0.75, upper=1.5)

  # Bring every example to the same spatial size before rescaling.
  picture = tf.image.resize(picture, (224, 224))

  return picture / 255.0, features['label']

In [None]:
# What happens when you map that transformation?

# Load the training split again so this cell starts from the raw examples.
dataset = tfds.load('cats_vs_dogs', split=tfds.Split.TRAIN)

# No num_parallel_calls is passed, so map() applies `augment` to the elements
# sequentially (the tf.data default) -- this is the baseline the following
# cells improve on.
augmented_dataset = dataset.map(augment)

In [None]:
# Parallelizing data transformations

# num_parallel_calls controls how many elements map() processes concurrently;
# a value of 1 still handles one element at a time.
augmented_dataset = dataset.map(augment, num_parallel_calls=1)

# Maximizing the utilization of CPU cores

# Get the number of available cpu cores
# (requires `import multiprocessing` in the notebook's import cell)
num_cores = multiprocessing.cpu_count()

# Set num_parallel_calls with 'num_cores'
# so that `augment` can run on as many elements at once as there are cores.
augmented_dataset = dataset.map(augment, num_parallel_calls=num_cores)

# Autotuning

* tf.data.experimental.AUTOTUNE
* Tunes the value dynamically at runtime
* Decides on the level of parallelism
* Tweaks values of parameters in transformations (tf.data)
  * Buffer size (map, prefetch, shuffle, ...)
  * CPU budget (num_parallel_calls)
  * I/O (num_parallel_reads)

In [None]:
# Autotune in practice
# Alias the AUTOTUNE sentinel through the already-imported `tf` module, using
# the same tf.data.experimental.AUTOTUNE spelling as the rest of the notebook
# (this also avoids a second import in the middle of the notebook).
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Let tf.data decide the level of parallelism for the map at runtime.
# (The original read `datset.map`, which raises NameError.)
augmented_dataset = dataset.map(
    augment,
    num_parallel_calls=AUTOTUNE
)

# Parallelizing data loading

# Load the training split of cats_vs_dogs.
dataset = tfds.load('cats_vs_dogs', split=tfds.Split.TRAIN)

# With prefetch
# prefetch() prepares upcoming elements while the current one is being
# consumed; AUTOTUNE lets tf.data choose the buffer size at runtime.
# NOTE(review): `format_image` is not defined in the visible cells -- define it
# (or substitute a preprocessing function such as `augment`) before running.
train_dataset = dataset.map(format_image).prefetch(tf.data.experimental.AUTOTUNE)

In [None]:
# Parallelizing data extraction

# Directory holding the TFRecord shards. '<dataset-version>' is a placeholder:
# replace it with the version directory that actually exists on disk.
TFRECORDS_DIR = '/root/tensorflow_datasets/cats_vs_dogs/<dataset-version>/'
# Dataset of file names matching the training-shard glob pattern.
files = tf.data.Dataset.list_files(TFRECORDS_DIR + "cats_vs_dogs-train.tfrecord-*")

# Number of shard files to read from at the same time (used as cycle_length below).
num_parallel_reads = 4

# Read several shard files concurrently and interleave their records into a
# single dataset.
dataset = files.interleave(
    tf.data.TFRecordDataset, # map function: turns each file name into a dataset of its records
    cycle_length=num_parallel_reads, # how many input files are processed concurrently
    num_parallel_calls=tf.data.experimental.AUTOTUNE # let tf.data choose the parallelism
)

# Best practices for code improvements

## Performance considerations
* The Dataset APIs are designed to be flexible
* Most operations are commutative
* Order transformations accordingly
  * e.g., map, batch, shuffle, repeat, interleave, prefetch, etc.

# Map and Batch
## The map transformation has overhead in terms of
* Scheduling
* Executing the user-defined function

# Solution : Vectorize the user-defined function
```python
dataset = dataset.batch(BATCH_SIZE).map(func)
```
or 

```python
options = tf.data.Options()
options.experimental_optimization.map_vectorization.enabled = True
dataset = dataset.with_options(options)

```

# Use map before cache when the transformation is expensive
```python
transformed_dataset = dataset.map(transforms_func).cache()
```


## Shuffle and Repeat

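* Shuffling the dataset before applying repeat can cause slowdowns at the start of each epoch, while the shuffle buffer refills
* `shuffle().repeat()` for ordering guarantees (every element is seen once per epoch)
* `repeat().shuffle()` for better performance (at the cost of blurred epoch boundaries)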

## Map and (Interleave / Prefetch / Shuffle)

* All transformations maintain an internal buffer
* The memory footprint is affected if map changes the size of the elements
* Generally, choose the order of transformations that results in the lowest memory usage