# 0. Imports

In [3]:
import numpy as np
import tensorflow as tf

# 1. Setup Dummy Data for Demonstration
## 1.1 create data in numpy arrays

In [4]:
# seven consecutive dummy frame names; inputs_1 is inputs_0 shifted by one timestep
frame_names = ['rgb_{}'.format(idx) for idx in range(7)]
inputs_0 = np.array(frame_names[:-1])   # 'rgb_0' ... 'rgb_5'
inputs_1 = np.array(frame_names[1:])    # 'rgb_1' ... 'rgb_6'

# one row of six dummy label strings per frame pair; the two-digit suffix names
# the pair, e.g. 'tx01' belongs to the pair ('rgb_0', 'rgb_1')
label_prefixes = ('tx', 'ty', 'tz', 'r', 'p', 'y')
labels = np.array([['{}{}{}'.format(prefix, idx, idx + 1) for prefix in label_prefixes]
                   for idx in range(6)])

## 1.2 put numpy data into tf.data.Dataset objects

In [5]:
## wrap every numpy array in its own dataset (one dataset element per array entry / row)
ds_inputs_0, ds_inputs_1, ds_labels = [
    tf.data.Dataset.from_tensor_slices(array) for array in (inputs_0, inputs_1, labels)
]

## pair the two image streams s.t. each element is a tuple (image_0, image_1)
ds_inputs = tf.data.Dataset.zip((ds_inputs_0, ds_inputs_1))

## zip together the input images and the labels s.t. each element is a tuple
## ((image_0, image_1), labels)
ds_zip = tf.data.Dataset.zip((ds_inputs, ds_labels))

# show the element structure (shapes and types) of ds_zip
print(ds_zip)

<ZipDataset shapes: (((), ()), (6,)), types: ((tf.string, tf.string), tf.string)>


If we wanted to train on non-sequence data, ```ds_zip``` could now be batched and shuffled to obtain a finalized dataset that can be used for training with the ```tf.keras.Model.fit()``` training loop, i.e. the output shapes of ```ds_zip``` already match what the ```tf.keras``` module expects.
But since we want to learn from sequences we first need to do further processing of the dataset pipeline.

# 2. Slice Dataset into Subsequences
What we wish to come up with is a transformation that takes an array ```[1,2,3,4]``` and outputs, for example, an array of the form ```[[1,2], [2,3], [3,4]]```. Such a transformation can be achieved with the ```tf.data.Dataset.window()``` function:  

In [6]:
# sliding windows of 3 consecutive elements, each window starting 1 element after the
# previous one; incomplete windows at the end of the dataset are dropped
ds_window_unmapped = ds_zip.window(size=3, shift=1, stride=1, drop_remainder=True)
print(ds_window_unmapped)

<WindowDataset shapes: ((DatasetSpec(TensorSpec(shape=(), dtype=tf.string, name=None), TensorShape([])), DatasetSpec(TensorSpec(shape=(), dtype=tf.string, name=None), TensorShape([]))), DatasetSpec(TensorSpec(shape=(6,), dtype=tf.string, name=None), TensorShape([]))), types: ((DatasetSpec(TensorSpec(shape=(), dtype=tf.string, name=None), TensorShape([])), DatasetSpec(TensorSpec(shape=(), dtype=tf.string, name=None), TensorShape([]))), DatasetSpec(TensorSpec(shape=(6,), dtype=tf.string, name=None), TensorShape([])))>


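1. Why ```flat_map``` is needed: ```window()``` returns a dataset of datasets. As the output above shows, every component of an element is a ```DatasetSpec```, i.e. a nested dataset holding the entries of one window, not a tensor. ```flat_map``` applies a function that returns a dataset to every window and flattens the results back into one ordinary dataset.
2. Why the mapper function uses ```batch```: batching each nested dataset with a batch size equal to the window size collects all entries of a window into a single tensor, so every window becomes exactly one sequence element.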

In [7]:
## define class that holds parameterized function to map on dataset
class mapper():
    """Holds a parameterized function that converts one window into one sequence element.

    ``tf.data.Dataset.window()`` yields elements whose components are nested
    datasets. ``map_to_batch`` is meant to be passed to ``flat_map`` so that
    every window is collapsed into a single element of stacked tensors.

    Args:
        window_size: number of consecutive entries per window; used as the
            batch size for every component of the window.
    """
    def __init__(self, window_size):
        self.window_size = window_size

    def map_to_batch(self, *sub):
        """Batch all components of one window with the configured window size.

        Args:
            *sub: the unpacked window element ``((inputs_0, inputs_1), labels)``,
                where each component is a nested dataset.

        Returns:
            A zipped dataset with the structure ``((inputs_0, inputs_1), labels)``
            in which every component has been batched with ``self.window_size``.
        """
        batched_inputs = tf.data.Dataset.zip(
            (sub[0][0].batch(self.window_size), sub[0][1].batch(self.window_size)))
        # labels must use the same window size as the inputs (was hard-coded to 3),
        # otherwise inputs and labels get out of sync for any other window size
        batched_labels = sub[1].batch(self.window_size)
        return tf.data.Dataset.zip((batched_inputs, batched_labels))
    
## flatten the windowed dataset: every window becomes one element holding whole sequences
sequence_mapper = mapper(3)
ds_window = ds_window_unmapped.flat_map(sequence_mapper.map_to_batch)
print(ds_window)

<FlatMapDataset shapes: (((None,), (None,)), (None, 6)), types: ((tf.string, tf.string), tf.string)>


# 3. Further process Dataset to get finalized Dataset
## 3.1 Map Inputs to Layernames
TODO
explain why it is better to use inputs that are mapped to layernames --> safety

In [8]:
## define mapping of input images to input layernames s.t. dataset returns dictionaries as inputs
def map_to_dict(*sub):
    """Re-key the inputs of one dataset element by input-layer name.

    Args:
        *sub: the unpacked dataset element; ``sub[0]`` is the iterable of input
            sequences and ``sub[1]`` the labels.

    Returns:
        A tuple ``(inputs_by_name, labels)`` where ``inputs_by_name`` maps the
        i-th layer name to the i-th input sequence and ``labels`` is ``sub[1]``
        passed through unchanged.
    """
    layernames = ['in_t0', 'in_t1'] # dummy layernames
    inputs_by_name = {}
    for position, sequence in enumerate(sub[0]):
        inputs_by_name[layernames[position]] = sequence
    return (inputs_by_name, sub[1])

## apply the remapping to every element of the dataset
ds_final = ds_window.map(map_to_dict)
print(ds_final)

## walk through the dataset and show every element
print("iterate dataset:")
for data in ds_final:
    print("next data:", data, sep="\n")

<MapDataset shapes: ({in_t0: (None,), in_t1: (None,)}, (None, 6)), types: ({in_t0: tf.string, in_t1: tf.string}, tf.string)>
iterate dataset:
next data:
({'in_t0': <tf.Tensor: id=46, shape=(3,), dtype=string, numpy=array([b'rgb_0', b'rgb_1', b'rgb_2'], dtype=object)>, 'in_t1': <tf.Tensor: id=47, shape=(3,), dtype=string, numpy=array([b'rgb_1', b'rgb_2', b'rgb_3'], dtype=object)>}, <tf.Tensor: id=48, shape=(3, 6), dtype=string, numpy=
array([[b'tx01', b'ty01', b'tz01', b'r01', b'p01', b'y01'],
       [b'tx12', b'ty12', b'tz12', b'r12', b'p12', b'y12'],
       [b'tx23', b'ty23', b'tz23', b'r23', b'p23', b'y23']], dtype=object)>)
next data:
({'in_t0': <tf.Tensor: id=49, shape=(3,), dtype=string, numpy=array([b'rgb_1', b'rgb_2', b'rgb_3'], dtype=object)>, 'in_t1': <tf.Tensor: id=50, shape=(3,), dtype=string, numpy=array([b'rgb_2', b'rgb_3', b'rgb_4'], dtype=object)>}, <tf.Tensor: id=51, shape=(3, 6), dtype=string, numpy=
array([[b'tx12', b'ty12', b'tz12', b'r12', b'p12', b'y12'],
       [b

## 3.2 Batch Dataset
TODO
write text that now we can batch and shuffle the final dataset

In [9]:
# group consecutive sequence elements into batches of `batch_size`
batch_size = 2
ds_final_batched = ds_final.batch(batch_size=batch_size)
print(ds_final_batched)

<BatchDataset shapes: ({in_t0: (None, None), in_t1: (None, None)}, (None, None, 6)), types: ({in_t0: tf.string, in_t1: tf.string}, tf.string)>


## 3.3 Investigate Elements of Dataset

In [10]:
# show every batch of the finalized dataset
for data in ds_final_batched:
    print("next data:", data, sep="\n")

next data:
({'in_t0': <tf.Tensor: id=66, shape=(2, 3), dtype=string, numpy=
array([[b'rgb_0', b'rgb_1', b'rgb_2'],
       [b'rgb_1', b'rgb_2', b'rgb_3']], dtype=object)>, 'in_t1': <tf.Tensor: id=67, shape=(2, 3), dtype=string, numpy=
array([[b'rgb_1', b'rgb_2', b'rgb_3'],
       [b'rgb_2', b'rgb_3', b'rgb_4']], dtype=object)>}, <tf.Tensor: id=68, shape=(2, 3, 6), dtype=string, numpy=
array([[[b'tx01', b'ty01', b'tz01', b'r01', b'p01', b'y01'],
        [b'tx12', b'ty12', b'tz12', b'r12', b'p12', b'y12'],
        [b'tx23', b'ty23', b'tz23', b'r23', b'p23', b'y23']],

       [[b'tx12', b'ty12', b'tz12', b'r12', b'p12', b'y12'],
        [b'tx23', b'ty23', b'tz23', b'r23', b'p23', b'y23'],
        [b'tx34', b'ty34', b'tz34', b'r34', b'p34', b'y34']]],
      dtype=object)>)
next data:
({'in_t0': <tf.Tensor: id=69, shape=(2, 3), dtype=string, numpy=
array([[b'rgb_2', b'rgb_3', b'rgb_4'],
       [b'rgb_3', b'rgb_4', b'rgb_5']], dtype=object)>, 'in_t1': <tf.Tensor: id=70, shape=(2, 3), dtype=st

# 4. Further Reading
If you want to use such a sequenced dataset to train a model similar to DeepVO (https://www.cs.ox.ac.uk/files/9026/DeepVO.pdf), this Jupyter Notebook will be interesting: http://www.cs.virginia.edu/~vicente/recognition/2016/notebooks/kerasLSTM.html .
In that notebook they show how to train an LSTM-based RNN on sequenced data in general. They train on sequences of characters in order to generate English sentences. They do not use the ```tf.data.Dataset``` interface; instead they use plain numpy arrays. But the example shows well how RNNs need to be trained on sequences and can later be used to infer from single instances (which is, in principle, the way it is done in DeepVO).