# Load and parse data with TensorFlow 2.0 (tf.data)

A TensorFlow 2.0 example showing how to build efficient input pipelines with the `tf.data` API. It covers loading data from:


- Numpy Arrays
- Images
- CSV file
- Custom data from a Generator

For more information about creating and loading TensorFlow's `TFRecords` data format, see: [tfrecords.ipynb](tfrecords.ipynb)

- Author: Aymeric Damien
- Project: https://github.com/aymericdamien/TensorFlow-Examples/

In [1]:
from __future__ import absolute_import, division, print_function

import numpy as np
import random
import requests
import string
import tarfile
import tensorflow as tf

### Load Numpy Arrays

Build a data pipeline over numpy arrays.

In [2]:
# Toy dataset: the integers 0..99, where even numbers get label 0 and odd
# numbers get label 1.
evens = np.arange(0, 100, step=2, dtype=np.int32)
odds = np.arange(1, 100, step=2, dtype=np.int32)
# One label per number, with the same shape and dtype as the numbers.
evens_label = np.zeros_like(evens)
odds_label = np.ones_like(odds)
# Stack evens first, then odds (features and labels stay aligned).
features = np.concatenate([evens, odds])
labels = np.concatenate([evens_label, odds_label])

# Input pipeline over the in-memory arrays:
#   from_tensor_slices -> one (feature, label) record per array entry
#   repeat             -> restart from the beginning indefinitely
#   shuffle            -> draw records at random from a 100-record buffer
#   batch              -> group records 4 at a time
#   prefetch           -> keep 1 batch ready ahead of the consumer
data = (
    tf.data.Dataset.from_tensor_slices((features, labels))
    .repeat()
    .shuffle(buffer_size=100)
    .batch(batch_size=4)
    .prefetch(buffer_size=1)
)

In [3]:
# Print the features and labels of the first 5 batches.
for batch in data.take(5):
    print(*batch)

tf.Tensor([24 81 85 96], shape=(4,), dtype=int32) tf.Tensor([0 1 1 0], shape=(4,), dtype=int32)
tf.Tensor([18 37 83 47], shape=(4,), dtype=int32) tf.Tensor([0 1 1 1], shape=(4,), dtype=int32)
tf.Tensor([25 58 18  6], shape=(4,), dtype=int32) tf.Tensor([1 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([45 32  2 14], shape=(4,), dtype=int32) tf.Tensor([1 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([38 30 98 68], shape=(4,), dtype=int32) tf.Tensor([0 0 0 0], shape=(4,), dtype=int32)


In [4]:
# Note: If you plan to fetch batches across several separate calls,
# you can use an explicit iterator instead:
ite_data = iter(data)
# First call: consume 5 batches from the iterator.
for i in range(5):
    batch_x, batch_y = next(ite_data)
    print(batch_x, batch_y)

# Second call: the same iterator carries on and yields the next 5 batches.
for i in range(5):
    batch_x, batch_y = next(ite_data)
    print(batch_x, batch_y)

tf.Tensor([72 12  5 64], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([ 6  0  2 93], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)
tf.Tensor([79 37  6 94], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([62 16 17 28], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([11 48  0 70], shape=(4,), dtype=int32) tf.Tensor([1 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([28 22 20 71], shape=(4,), dtype=int32) tf.Tensor([0 0 0 1], shape=(4,), dtype=int32)
tf.Tensor([ 8  1 57 52], shape=(4,), dtype=int32) tf.Tensor([0 1 1 0], shape=(4,), dtype=int32)
tf.Tensor([44  7 44  2], shape=(4,), dtype=int32) tf.Tensor([0 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([48 34 32  4], shape=(4,), dtype=int32) tf.Tensor([0 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([51 95 65 76], shape=(4,), dtype=int32) tf.Tensor([1 1 1 0], shape=(4,), dtype=int32)


### Load CSV files

Build a data pipeline from features stored in a CSV file. For this example, the Titanic dataset, stored in CSV format, is used as a toy dataset.

#### Titanic Dataset



survived|pclass|name|sex|age|sibsp|parch|ticket|fare
--------|------|----|---|---|-----|-----|------|----
1|1|"Allen, Miss. Elisabeth Walton"|female|29|0|0|24160|211.3375
1|1|"Allison, Master. Hudson Trevor"|male|0.9167|1|2|113781|151.5500
0|1|"Allison, Miss. Helen Loraine"|female|2|1|2|113781|151.5500
0|1|"Allison, Mr. Hudson Joshua Creighton"|male|30|1|2|113781|151.5500
...|...|...|...|...|...|...|...|...

In [5]:
# Download Titanic dataset (in csv format).
d = requests.get("https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv")
# Fail loudly on an HTTP error instead of silently saving the error page
# as the csv file.
d.raise_for_status()
with open("titanic_dataset.csv", "wb") as f:
    f.write(d.content)

In [6]:
# Load Titanic dataset.
# Columns in the file: survived,pclass,name,sex,age,sibsp,parch,ticket,fare
# Columns read here:   survived,pclass,name,sex,age,fare
column_to_use = [0, 1, 2, 3, 4, 8]
# One dtype per selected column, in the same order as `column_to_use`.
record_defaults = [tf.int32, tf.int32, tf.string, tf.string, tf.float32, tf.float32]

# Input pipeline over the csv file:
#   CsvDataset -> one record per line (header skipped, selected columns only)
#   repeat     -> restart from the beginning indefinitely
#   shuffle    -> draw records at random from a 1000-record buffer
#   batch      -> group records 2 at a time
#   prefetch   -> keep 1 batch ready ahead of the consumer
data = (
    tf.data.experimental.CsvDataset(
        "titanic_dataset.csv",
        record_defaults,
        header=True,
        select_cols=column_to_use,
    )
    .repeat()
    .shuffle(buffer_size=1000)
    .batch(batch_size=2)
    .prefetch(buffer_size=1)
)

In [7]:
# Show one batch, one line per selected column.
for survived, pclass, name, sex, age, fare in data.take(1):
    for column in (survived, pclass, name, sex, age, fare):
        print(column.numpy())

[1 1]
[1 3]
[b'Ostby, Miss. Helene Ragnhild' b'Asplund, Master. Edvin Rojj Felix']
[b'female' b'male']
[22.  3.]
[61.9792 31.3875]


### Load Images

Build a data pipeline by loading images from disk. For this example, the Oxford 17 Flowers dataset is used.

In [8]:
# Download Oxford 17 flowers dataset
# Stream the archive to disk in chunks instead of holding the whole file in
# memory, and fail loudly on an HTTP error instead of saving an error page.
with requests.get("http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz", stream=True) as d:
    d.raise_for_status()
    with open("17flowers.tgz", "wb") as f:
        for chunk in d.iter_content(chunk_size=1024 * 1024):
            f.write(chunk)
# Extract archive.
# NOTE(review): `extractall` trusts the member paths stored in the archive;
# only use it on archives from a source you trust.
with tarfile.open("17flowers.tgz") as t:
    t.extractall()

In [9]:
# Write the "image_path,label" index file: 1360 images numbered from 1,
# where each run of 80 consecutive images shares one label (0, 1, 2, ...).
with open('jpg/dataset.csv', 'w') as f:
    f.writelines(
        "jpg/image_%04i.jpg,%i\n" % (idx + 1, idx // 80)
        for idx in range(1360)
    )

In [10]:
# Load Images
# Read the index file into a list with one "image_path,label" string per line.
with open("jpg/dataset.csv") as f:
    dataset_file = f.read().splitlines()

# One record per index line, repeated indefinitely, then drawn at random
# from a 1000-record shuffle buffer.
data = (
    tf.data.Dataset.from_tensor_slices(dataset_file)
    .repeat()
    .shuffle(buffer_size=1000)
)

# Load and pre-process images.
def load_image(path):
    """Read a JPEG file and return it as a 256x256 RGB float image in [-1, 1].

    Args:
        path: Path of the JPEG file to load.

    Returns:
        The decoded image, resized to 256x256 with 3 channels and rescaled
        so that pixel value 0 maps to -1 and 255 maps to 1.
    """
    # Read the raw file content from path.
    image = tf.io.read_file(path)
    # Decode the jpeg image to an array in [0, 255]. Forcing 3 channels gives
    # every image the same shape, so grayscale files can still be batched.
    image = tf.image.decode_jpeg(image, channels=3)
    # Resize images to a common size of 256x256.
    image = tf.image.resize(image, [256, 256])
    # Rescale values to [-1, 1]: 0 -> -1 and 255 -> 1. (`1. - image / 127.5`
    # covers the same range but inverts the image intensities.)
    image = image / 127.5 - 1.
    return image
# Decode each line from the dataset file.
def parse_records(line):
    """Turn one "image_path,label_id" csv line into an (image, label) pair.

    Args:
        line: One line of the dataset index file.

    Returns:
        A tuple of the image produced by `load_image` and its label.
    """
    # decode_csv needs a default per column ("" for the path, 0 for the
    # label); the defaults also fix the column types and are never used here.
    fields = tf.io.decode_csv(line, ["", 0])
    path, label = fields
    # Replace the path with the loaded, pre-processed image.
    return load_image(path), label
# Finish the image pipeline:
#   map      -> parse each index line and load its image, 4 calls in parallel
#   batch    -> group (image, label) records 2 at a time
#   prefetch -> keep 1 batch ready ahead of the consumer
data = (
    data.map(parse_records, num_parallel_calls=4)
    .batch(batch_size=2)
    .prefetch(buffer_size=1)
)

In [11]:
# Print the images and labels of a single batch.
for batch in data.take(1):
    print(*batch)

tf.Tensor(
[[[[-4.83272076e-01  7.79840708e-01  6.97303891e-01]
   [-4.34444427e-01  7.91131496e-01  7.23598838e-01]
   [-3.99359941e-01  7.99816191e-01  7.40073562e-01]
   ...
   [-3.79255176e-01  7.44573832e-01  7.55208313e-01]
   [-3.82046580e-01  7.39522099e-01  7.70894587e-01]
   [-3.96341205e-01  7.25227356e-01  7.56599903e-01]]

  [[-4.34366822e-01  7.83341467e-01  7.04113543e-01]
   [-3.69448423e-01  8.15980196e-01  7.47757256e-01]
   [-3.01225543e-01  8.44281375e-01  7.89583325e-01]
   ...
   [-2.63991117e-01  8.12275469e-01  8.31985295e-01]
   [-3.33664179e-01  7.81948447e-01  7.96691179e-01]
   [-3.78308773e-01  7.38725483e-01  7.52757370e-01]]

  [[-4.11390305e-01  7.59252429e-01  6.91087902e-01]
   [-3.26000929e-01  8.03288221e-01  7.50183821e-01]
   [-2.16925621e-01  8.42109442e-01  8.12928915e-01]
   ...
   [-2.23175645e-01  8.07734489e-01  8.35458279e-01]
   [-3.13827991e-01  7.88786769e-01  7.98815191e-01]
   [-3.92708302e-01  7.21017122e-01  7.28860259e-01]]

  ...

 

### Load data from a Generator

In [12]:
# Create a dummy generator.
def generate_features():
    """Yield one random sample: a string, a vector and an integer.

    The generator produces a single tuple and then finishes.

    Yields:
        A tuple `(text, vector, number)` where `text` is made of 4 random
        ASCII letters, `vector` is a NumPy array of 4 floats drawn uniformly
        from [0, 1), and `number` is a random integer between 0 and 10
        (both included).
    """
    # Function to generate a random string.
    def random_string(length):
        # Use `range`: the Python 2-only `xrange` raises NameError on
        # Python 3 unless some other code defines it first.
        return ''.join(random.choice(string.ascii_letters) for _ in range(length))
    # Return a random string, a random vector, and a random int.
    yield random_string(4), np.random.uniform(size=4), random.randint(0, 10)

In [13]:
# Input pipeline fed by the Python generator:
#   from_generator -> records of (string, float32 vector, int32) from `generate_features`
#   repeat         -> run the generator again each time it is exhausted
#   shuffle        -> draw records at random from a 100-record buffer
#   batch          -> group records 4 at a time
#   prefetch       -> keep 1 batch ready ahead of the consumer
data = (
    tf.data.Dataset.from_generator(
        generate_features,
        output_types=(tf.string, tf.float32, tf.int32),
    )
    .repeat()
    .shuffle(buffer_size=100)
    .batch(batch_size=4)
    .prefetch(buffer_size=1)
)

In [14]:
# Display data.
# Python 3 compatibility shim: alias the Python 2 name `xrange` to `range`
# for any code in this notebook that still calls `xrange`.
xrange = range
# Print the string, vector and int components of the first 5 batches.
for batch_str, batch_vector, batch_int in data.take(5):
    print(batch_str, batch_vector, batch_int)

tf.Tensor([b'rjlG' b'trSb' b'Odgh' b'QTDL'], shape=(4,), dtype=string) tf.Tensor(
[[0.7968671  0.15037513 0.7494175  0.7291446 ]
 [0.45667616 0.49803782 0.00560279 0.8808681 ]
 [0.59517807 0.887676   0.6485402  0.3936121 ]
 [0.64150053 0.6131235  0.4755623  0.49115276]], shape=(4, 4), dtype=float32) tf.Tensor([3 1 8 6], shape=(4,), dtype=int32)
tf.Tensor([b'kVJb' b'ulfP' b'GUVm' b'OMjg'], shape=(4,), dtype=string) tf.Tensor(
[[0.81523997 0.42112964 0.68366605 0.30692655]
 [0.8287681  0.44852808 0.23755868 0.5468942 ]
 [0.6098704  0.21439378 0.6970029  0.84742945]
 [0.13671751 0.7290514  0.29603773 0.9415665 ]], shape=(4, 4), dtype=float32) tf.Tensor([5 1 5 7], shape=(4,), dtype=int32)
tf.Tensor([b'XIEn' b'dZjY' b'xwLI' b'RHUY'], shape=(4,), dtype=string) tf.Tensor(
[[0.6253043  0.75604796 0.04800427 0.8301756 ]
 [0.07147692 0.9510518  0.81060547 0.6188102 ]
 [0.35208228 0.52762914 0.9284128  0.76412773]
 [0.09099609 0.7309415  0.55512774 0.30464917]], shape=(4, 4), dtype=float32) tf.Te