In [48]:
import tensorflow as tf
import time

## Create tf dataset from list

In [49]:
# Sample sales figures; the negative entries are there so a later cell can filter them out.
sales_numbers = [21, 22, -108, 31, -1, 32, 34, 31]

## Build the tf dataset and iterate through it

In [50]:
# Build a dataset that yields each entry of sales_numbers as its own scalar element.
tf_dataset = tf.data.Dataset.from_tensor_slices(sales_numbers)

In [51]:
tf_dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

## Iterate through elements as numpy elements 

In [52]:
# Iterating the dataset yields tensors; .numpy() converts each one to a plain value for printing.
for sales in tf_dataset:
    print(sales.numpy())

21
22
-108
31
-1
32
34
31


## Iterate through first n elements in tf dataset

In [53]:
# take(3) restricts the iteration to the first three elements.
for sales in tf_dataset.take(3):
    print(sales.numpy())

21
22
-108


##  Filter out invalid sales numbers (keep only values > 0)

In [54]:
# Keep only strictly positive sales figures; zero and negative entries are dropped.
# NOTE: tf_dataset is rebound here, so every later cell works on the filtered dataset.
tf_dataset = tf_dataset.filter(lambda x: x>0)
# as_numpy_iterator() yields NumPy values directly, so no .numpy() call is needed.
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

21
22
31
32
34
31


##  Convert US dollars (USD) to Indian rupees (INR), assuming a conversion rate of 1 USD = 72 INR

In [55]:
# Multiply every element by 72 (the assumed USD -> INR rate).
# NOTE: tf_dataset is rebound to a mapping of itself, so re-running this cell
# multiplies by 72 again; re-create the dataset before re-executing it.
tf_dataset = tf_dataset.map(lambda x: x*72)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

1512
1584
2232
2304
2448
2232


## Shuffle

In [56]:
# Shuffle with a buffer of two elements.
# NOTE(review): a buffer this small gives only a weak, local shuffle; per the
# tf.data docs a uniform shuffle needs a buffer at least as large as the dataset.
tf_dataset = tf_dataset.shuffle(2)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

1512
2232
2304
2448
2232
1584


## Batching 

In [57]:
# batch(2) groups consecutive elements into tensors of two values each.
# The order differs from the previous cell's output because the shuffled
# dataset is iterated again here.
for sales_batch in tf_dataset.batch(2):
    print(sales_batch.numpy())

[1584 2232]
[2304 1512]
[2232 2448]


## Perform all of the above operations in one shot

In [58]:
# The same pipeline as the previous cells, rebuilt from the raw list in a single
# expression: keep positives -> convert to INR -> shuffle (buffer of 2) -> batch in pairs.
tf_dataset = (
    tf.data.Dataset.from_tensor_slices(sales_numbers)
    .filter(lambda amount: amount > 0)
    .map(lambda amount: amount * 72)
    .shuffle(2)
    .batch(2)
)
for sales in tf_dataset.as_numpy_iterator():
    print(sales)

[1584 2232]
[1512 2304]
[2448 2232]


# Images 

In [59]:
# Collect every path matching image/<sub-directory>/<file>; shuffle=False keeps
# the listing in a fixed order instead of a random one.
images_ds = tf.data.Dataset.list_files("image/*/*", shuffle = False)

In [60]:
# Total number of matched files; used below to size the train/test split.
image_count = len(images_ds)
image_count

606

In [61]:
type(images_ds)

tensorflow.python.data.ops.from_tensor_slices_op._TensorSliceDataset

In [62]:
# Peek at the first five file paths (each element is a bytes string).
for file in images_ds.take(5):
    print(file.numpy())

b'image\\cats\\cat.1.jpg'
b'image\\cats\\cat.10.jpg'
b'image\\cats\\cat.100.jpg'
b'image\\cats\\cat.101.jpg'
b'image\\cats\\cat.102.jpg'


In [63]:
# Shuffle the file paths with a buffer covering the whole listing, so the
# shuffle is uniform (the previous buffer of 600 was smaller than the file count).
# reshuffle_each_iteration=False freezes the order after the first pass: by
# default the dataset is reshuffled every time it is iterated, and the
# take()/skip() train/test split below would then select different files on
# every epoch, leaking test images into the training set.
images_ds = images_ds.shuffle(image_count, reshuffle_each_iteration=False)
for file in images_ds.take(4):
    print(file.numpy())

b'image\\cats\\cat.21.jpg'
b'image\\dogs\\dog.138.jpg'
b'image\\dogs\\dog.113.jpg'
b'image\\dogs\\dog.6.jpg'


In [64]:
# NOTE(review): these names are singular, but the labels derived from the
# directory names are plural (the printed labels are b'cats', b'dogs',
# b'horses') — confirm which spelling downstream code expects before comparing
# labels against this list.
class_names = ["cat","dog","horse"]

In [65]:
# 80/20 split: the first train_size elements form the training set, the rest the test set.
# NOTE(review): take()/skip() only give disjoint sets if images_ds yields the
# same order on every iteration — verify the shuffle applied to images_ds does
# not reshuffle per iteration.
train_size = int(image_count*0.8)
train_ds = images_ds.take(train_size)
test_ds = images_ds.skip(train_size)

In [66]:
len(train_ds)

484

In [67]:
len(test_ds)

122

In [68]:
def get_label(file_path):
    """Return the class label of an image, i.e. the name of its parent directory.

    Args:
        file_path: Path to the image as a Python string or a scalar string
            tensor, e.g. ``image/dogs/dog.75.jpg``. Forward slashes and
            backslashes are both accepted as separators on any platform.

    Returns:
        Scalar ``tf.string`` tensor holding the second-to-last path component
        (``b'dogs'`` for the example above).
    """
    # Normalise backslashes to "/" first so the split does not depend on the
    # host's os.path.sep: a path written with the other separator would
    # otherwise come back as a single component and make parts[-2] fail.
    normalized = tf.strings.regex_replace(file_path, r"\\", "/")
    parts = tf.strings.split(normalized, "/")
    return parts[-2]

In [69]:
# Pass the plain path: the previous argument was the repr of a bytes object
# pasted into a str (leading b' and trailing '), which only worked because the
# first and last path components are ignored.
get_label("image\\dogs\\dog.75.jpg")

<tf.Tensor: shape=(), dtype=string, numpy=b'dogs'>

In [70]:
# Pass the plain path rather than the repr of a bytes object (b'...') embedded in a str.
get_label("image\\cats\\cat.101.jpg")

<tf.Tensor: shape=(), dtype=string, numpy=b'cats'>

In [71]:
def process_image(file_path):
    """Load one JPEG image, resize it to 128x128 and pair it with its class label.

    Args:
        file_path: Path of the image file (Python string or scalar string tensor).

    Returns:
        Tuple ``(img, label)``. ``img`` is the decoded image resized to
        128x128 with three colour channels; pixel values are not rescaled
        here. ``label`` is whatever ``get_label`` returns for ``file_path``.
    """
    label = get_label(file_path)
    raw_bytes = tf.io.read_file(file_path)
    # Force three channels: without `channels`, a grayscale JPEG decodes to a
    # single channel, producing elements whose shapes differ from the RGB
    # images and cannot be batched together with them.
    img = tf.image.decode_jpeg(raw_bytes, channels=3)
    img = tf.image.resize(img, [128, 128])
    return img, label

In [72]:
# Smoke-test process_image on a single file and show the first two pixel rows.
# NOTE: the path is hard-coded with Windows-style separators.
img,label = process_image("image\\dogs\\dog.75.jpg")
img.numpy()[:2]

array([[[200.69073 , 221.71655 , 142.6391  ],
        [187.69073 , 208.67645 , 139.1914  ],
        [152.49738 , 173.73175 , 114.028625],
        [132.24976 , 155.25836 ,  98.51825 ],
        [134.26904 , 159.84717 , 100.11279 ],
        [133.58636 , 161.85315 , 105.3598  ],
        [123.21875 , 148.04688 , 101.45282 ],
        [128.17633 , 145.58258 , 105.37164 ],
        [136.3747  , 149.83423 , 108.755005],
        [124.23108 , 141.04974 ,  89.86163 ],
        [152.9289  , 174.94995 , 111.818726],
        [167.88666 , 189.7851  , 121.32037 ],
        [139.29944 , 162.41547 ,  97.87158 ],
        [133.50385 , 156.76947 ,  99.70697 ],
        [138.64575 , 163.22131 , 113.78381 ],
        [147.68188 , 173.68188 , 128.68188 ],
        [133.42023 , 158.16675 , 123.74402 ],
        [ 94.45941 , 107.560974,  80.78235 ],
        [153.91266 , 156.81854 , 122.14276 ],
        [158.06067 , 173.2638  , 122.31665 ],
        [145.14868 , 170.52771 , 111.21515 ],
        [167.1977  , 183.40161 , 1

In [73]:
# Turn each file path into an (image, label) pair via process_image, for both splits.
train_ds = train_ds.map(process_image)
test_ds = test_ds.map(process_image)

In [74]:
# Print two (image, label) pairs to check the mapped elements.
for image, label in train_ds.take(2):
    print("****",image)
    print("****",label)

**** tf.Tensor(
[[[185.21484  204.21484  237.21484 ]
  [185.21484  204.21484  237.21484 ]
  [185.21484  204.21484  237.21484 ]
  ...
  [188.78516  207.78516  237.78516 ]
  [188.29005  207.29005  237.29005 ]
  [187.       206.       236.      ]]

 [[186.       205.       238.      ]
  [186.       205.       238.      ]
  [186.       205.       238.      ]
  ...
  [188.64453  207.64453  237.64453 ]
  [188.36719  207.36719  237.36719 ]
  [187.64453  206.64453  236.64453 ]]

 [[187.       206.       238.92578 ]
  [187.       206.       238.92578 ]
  [187.       206.       238.92578 ]
  ...
  [189.07422  208.07422  238.07422 ]
  [188.79688  207.79688  237.79688 ]
  [188.07422  207.07422  237.07422 ]]

 ...

 [[176.29112  125.29112   79.142685]
  [189.69756  138.42021   86.80693 ]
  [157.54965  105.420746  47.93637 ]
  ...
  [205.33322  148.9465    90.07541 ]
  [218.0407   157.76335   96.93132 ]
  [220.12996  156.12996   92.12996 ]]

 [[207.36998  155.36998  108.16586 ]
  [205.24313  153.965

In [75]:
def scale(image, label):
    """Divide the image values by 255 and pass the label through unchanged.

    Args:
        image: Pixel data (anything that supports division by an int).
        label: Label associated with the image; returned as-is.

    Returns:
        Tuple ``(image / 255, label)``.
    """
    normalized = image / 255
    return normalized, label

In [76]:
# Scale pixel values in BOTH splits: previously only train_ds was divided by
# 255, so anything trained on train_ds would be evaluated on test images whose
# values are on a different (0-255) range.
train_ds = train_ds.map(scale)
test_ds = test_ds.map(scale)

In [77]:
# Show the top-left pixel and the label of five scaled training samples.
for image, label in train_ds.take(5):
    print("**Image : ", image.numpy()[0][0])
    print("**Label : ", label.numpy())

**Image :  [0.23529412 0.23921569 0.21568628]
**Label :  b'cats'
**Image :  [0.10278799 0.20747548 0.15376838]
**Label :  b'dogs'
**Image :  [0.8234069  0.7680112  0.25527307]
**Label :  b'horses'
**Image :  [0.8387359  0.8314798  0.87432355]
**Label :  b'cats'
**Image :  [0.46491078 0.33157745 0.18497817]
**Label :  b'cats'


## Prefetching

In [83]:
class FileDataset(tf.data.Dataset):
    """Toy dataset that simulates slow file reading, for benchmarking input pipelines.

    Calling ``FileDataset(num_samples)`` does not create a ``FileDataset``
    instance: ``__new__`` returns a generator-backed ``tf.data.Dataset`` that
    yields ``num_samples`` one-element int64 tensors.
    """

    @staticmethod
    def read_file_in_batches(num_samples):
        """Yield ``num_samples`` one-element tuples, sleeping to mimic I/O latency.

        Declared as a static method because it takes no ``self``/``cls``; it is
        only ever handed to ``from_generator`` as a plain function.
        """
        # Simulate the cost of opening the file.
        time.sleep(0.03)

        for sample_idx in range(num_samples):
            # Simulate the cost of reading one record (line) from the file.
            time.sleep(0.015)

            yield (sample_idx,)

    def __new__(cls, num_samples=3):
        """Build the generator-backed dataset that yields ``num_samples`` elements."""
        return tf.data.Dataset.from_generator(
            cls.read_file_in_batches,
            output_signature=tf.TensorSpec(shape=(1,), dtype=tf.int64),
            args=(num_samples,),
        )

In [84]:
def benchmark(dataset, num_epochs=2):
    """Iterate over ``dataset`` ``num_epochs`` times, sleeping 10 ms per element.

    The sleep stands in for a training step, so timing this function measures
    how long a pipeline takes to feed such a loop.

    Args:
        dataset: Any iterable; it is iterated once per epoch.
        num_epochs: Number of full passes over ``dataset``.
    """
    step_seconds = 0.01
    for _epoch in range(num_epochs):
        for _sample in dataset:
            # Simulated training step.
            time.sleep(step_seconds)

In [85]:
%%timeit
# Baseline: time the pipeline without any prefetching.
benchmark(FileDataset())

338 ms ± 6.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [87]:
%%timeit
# Same benchmark with a prefetch buffer of one element.
benchmark(FileDataset().prefetch(1))

317 ms ± 9.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [92]:
%%timeit
# Same benchmark, letting tf.data choose the prefetch buffer size (AUTOTUNE).
benchmark(FileDataset().prefetch(tf.data.AUTOTUNE))

333 ms ± 20 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Caching

In [93]:
# Square the numbers 0..4, then cache the mapped elements in the file "mycache.txt".
# NOTE(review): a file-backed cache stays on disk between runs — presumably a
# stale cache file is reused on a later run; confirm, and delete it whenever
# the pipeline in front of cache() changes.
dataset = tf.data.Dataset.range(5)
dataset = dataset.map(lambda x: x**2)
dataset = dataset.cache("mycache.txt")
# First full pass over the cached dataset.
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [94]:
list(dataset.as_numpy_iterator())

[0, 1, 4, 9, 16]

In [95]:
def mapped_function(s):
    """Return ``s`` unchanged after a simulated 30 ms of per-element work.

    The delay runs through ``tf.py_function`` and stands in for an expensive
    transformation applied inside ``Dataset.map``.
    """
    def _simulate_work():
        time.sleep(0.03)

    tf.py_function(_simulate_work, [], ())
    return s

In [96]:
%%timeit -r1 -n1
# Five epochs over the mapped pipeline without caching.
benchmark(FileDataset().map(mapped_function),5)

1.32 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


In [98]:
%%timeit -r1 -n1
# Same five epochs with cache() placed after the map step.
benchmark(FileDataset().map(mapped_function).cache(), 5)

461 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)
