# Datasets
* The `tf.data` module provides classes to feed data into a model and to manipulate data
* In particular they can
    - read data from in memory
    - read data from a csv
    - apply transformations
* In particular they are designed to deal with a large amount of data

- A dataset can be accessed via 
    - looping
    - creating a Python iterator `iter(dataset)`

### Loading from memory
There are two methods that create datasets:
    - `from_tensors`
    - `from_tensor_slices`

In [10]:
import tensorflow as tf
from tensorflow.data import Dataset
import numpy as np
xs = np.array([[1,2,3,4], [4,5,6,7], [1, 1, 1,1]])

# from_tensors wraps the whole array as one single dataset element
dataset =Dataset.from_tensors(xs)

# access via iteration (the loop body runs once, for the single element)
for x in dataset:
    print(x)
    
# access via an explicit Python iterator; next() returns the first element
ts = iter(dataset)
next(ts)

tf.Tensor(
[[1 2 3 4]
 [4 5 6 7]
 [1 1 1 1]], shape=(3, 4), dtype=int64)


<tf.Tensor: shape=(3, 4), dtype=int64, numpy=
array([[1, 2, 3, 4],
       [4, 5, 6, 7],
       [1, 1, 1, 1]])>

In [11]:
# from_tensor_slices slices along axis 0: every row of xs becomes one dataset element
dataset = Dataset.from_tensor_slices(xs)
print(type(dataset))
for x in dataset:
    print(x)

<class 'tensorflow.python.data.ops.dataset_ops.TensorSliceDataset'>
tf.Tensor([1 2 3 4], shape=(4,), dtype=int64)
tf.Tensor([4 5 6 7], shape=(4,), dtype=int64)
tf.Tensor([1 1 1 1], shape=(4,), dtype=int64)


In [12]:
# the iterator over the sliced dataset yields the first row first
ts = iter(dataset)
next(ts)

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([1, 2, 3, 4])>

### Inspecting the shape of a dataset:
- using the property `element_spec` 
### Transformations
- combine slices into a batch `batch` 
- `map`
- `flat_map`
- `repeat`
- `shuffle` 
- `window` creates a window (e.g., for time series)


### Another example

In [13]:
# toy time series: the integers 0..101
timeseries = tf.range(102)
dataset = Dataset.from_tensor_slices(timeseries)
# element_spec describes one element: here a scalar, so the shape is ()
dataset.element_spec

TensorSpec(shape=(), dtype=tf.int32, name=None)

In [14]:
# take(5) restricts the dataset to its first 5 elements
for x in dataset.take(5):
    print(x)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


In [15]:
# group 10 consecutive elements into one tensor;
# drop_remainder=False keeps the final, smaller batch (here: 2 elements)
batch = dataset.batch(10, drop_remainder=False)
for x in batch:
    print(x) # the batch transformation returns a dataset whose elements are tensors

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)
tf.Tensor([10 11 12 13 14 15 16 17 18 19], shape=(10,), dtype=int32)
tf.Tensor([20 21 22 23 24 25 26 27 28 29], shape=(10,), dtype=int32)
tf.Tensor([30 31 32 33 34 35 36 37 38 39], shape=(10,), dtype=int32)
tf.Tensor([40 41 42 43 44 45 46 47 48 49], shape=(10,), dtype=int32)
tf.Tensor([50 51 52 53 54 55 56 57 58 59], shape=(10,), dtype=int32)
tf.Tensor([60 61 62 63 64 65 66 67 68 69], shape=(10,), dtype=int32)
tf.Tensor([70 71 72 73 74 75 76 77 78 79], shape=(10,), dtype=int32)
tf.Tensor([80 81 82 83 84 85 86 87 88 89], shape=(10,), dtype=int32)
tf.Tensor([90 91 92 93 94 95 96 97 98 99], shape=(10,), dtype=int32)
tf.Tensor([100 101], shape=(2,), dtype=int32)


### Example Windows for Time Series
- Use the `window` method
- Returns a dataset of datasets

In [16]:
xs = Dataset.range(10)
# windows of up to 5 elements; shift=1 starts each window one element after the previous one
xs = xs.window(5, shift=1)
# every window is itself a dataset, hence the nested loop;
# without drop_remainder the trailing windows contain fewer than 5 elements
for window in xs:
    for val in window:
        print(val.numpy(), end=" ")
    print()

0 1 2 3 4 
1 2 3 4 5 
2 3 4 5 6 
3 4 5 6 7 
4 5 6 7 8 
5 6 7 8 9 
6 7 8 9 
7 8 9 
8 9 
9 


In [17]:
xs = Dataset.range(10)
# drop_remainder=True discards the incomplete windows at the end of the series
xs = xs.window(5, shift=1, drop_remainder=True)
# each window is a dataset of 5 scalars; batch(5) collapses it into one tensor of 5 timesteps
# and flat_map flattens the dataset of datasets into a dataset of these tensors
xs = xs.flat_map(lambda window: window.batch(5))
for window in xs:
    print(window.numpy())

[0 1 2 3 4]
[1 2 3 4 5]
[2 3 4 5 6]
[3 4 5 6 7]
[4 5 6 7 8]
[5 6 7 8 9]


In [18]:
xs = Dataset.range(10)
xs = xs.window(5, shift=1, drop_remainder=True)
# collapse every window dataset into a single tensor of 5 timesteps
xs = xs.flat_map(lambda window: window.batch(5))
# take the first 4 values as features and the last value as target
xs = xs.map(lambda window: (window[:-1], window[-1:]))
for x, y in xs:
    print(x.numpy(), y.numpy())

[0 1 2 3] [4]
[1 2 3 4] [5]
[2 3 4 5] [6]
[3 4 5 6] [7]
[4 5 6 7] [8]
[5 6 7 8] [9]


- So far we have only worked with the inner window. 
- That is we arranged the data for one time series we would like to predict
- In a model we want to insert "training" batches. That is more time series in one training step
- So we have to batch the outer dataset
- ... and randomize it

In [19]:
xs = Dataset.range(10)
xs = xs.window(5, shift=1, drop_remainder=True)
# collapse every window dataset into a single tensor of 5 timesteps
xs = xs.flat_map(lambda window: window.batch(5))
# take the first 4 values as features and the last value as target
xs = xs.map(lambda window: (window[:-1], window[-1:]))
# 1. shuffle the data in order to avoid sequence bias
# 2. set the outer batch size (number of sequences per training step)
# 3. prefetch data: allow the next batch to be prepared while another piece of code
#    (training) is being executed. This should always be done in order to speed things up.
xs = xs.shuffle(buffer_size=20).batch(2).prefetch(1)
for x, y in xs:
    print(f"Input: \n {x.numpy()}")
    print(f"Output: \n {y.numpy()}")    

Input: 
 [[2 3 4 5]
 [4 5 6 7]]
Output: 
 [[6]
 [8]]
Input: 
 [[5 6 7 8]
 [3 4 5 6]]
Output: 
 [[9]
 [7]]
Input: 
 [[1 2 3 4]
 [0 1 2 3]]
Output: 
 [[5]
 [4]]


In [20]:
#xs.shuffle?
#xs.prefetch?

### Time series data preparation using `tf.keras.preprocessing.timeseries_dataset_from_array`  

In [21]:
from tensorflow.keras.preprocessing import timeseries_dataset_from_array

xs = timeseries_dataset_from_array(timeseries, 
                                   sequence_length=5, 
                                   targets=None, 
                                   batch_size=2, # 'outer' batch size going into the model
                                   sequence_stride=1, # offset between the starts of consecutive sequences
                                   sampling_rate=1, # step between successive timesteps within one sequence
                                   shuffle=False)

# x has shape (batch, 5): use the first 4 timesteps of every sequence as input
# and the last timestep as target
xs = xs.map(lambda x: (x[:, :-1], x[:, -1:]))

for x, y in xs.take(5):
    print(f"Input: \n {x}")
    print(f"Output: \n {y}")

Input: 
 [[0 1 2 3]
 [1 2 3 4]]
Output: 
 [[4]
 [5]]
Input: 
 [[2 3 4 5]
 [3 4 5 6]]
Output: 
 [[6]
 [7]]
Input: 
 [[4 5 6 7]
 [5 6 7 8]]
Output: 
 [[8]
 [9]]
Input: 
 [[ 6  7  8  9]
 [ 7  8  9 10]]
Output: 
 [[10]
 [11]]
Input: 
 [[ 8  9 10 11]
 [ 9 10 11 12]]
Output: 
 [[12]
 [13]]


**Discussion:**
- It comes kind of handy because there is no inner flat map as with the window function
- And additionally allows stride and sampling rates.
- One could also directly use targets, but these need to be prepared as tensors beforehand. So we do this after dataset generation via the `map` step. This seems more natural.

- However, it is not immediately clear how to incorporate data from a csv (in contrast to the plain dataset API). 
- So, there does not seem to be much of a benefit in using this wrapper.
- In the future we will rather use window functions together with appropriate `flat_map` and `map` operations

### Time Series with features
- So far we considered time series where the features were the last 4 timestep values and the target was the 5th timestep in our toy example. 
- Now we would like to incorporate more features (covariates). 
- Let's look at the shape of the data: 
- **Without covariates** we have shape
    - Training: `(batch, time)`,  i.e (2, 4) in the toy example
    - Targets: `(batch, 1)`, i.e. (2,1) in our toy example because we arranged our labels to predict only one step ahead in the future
    
- **With covariates** we expect the shape
    - Training: `(batch, time, features)`
    - Targets: `(batch, 1)`
    
- Note that this is compatible with the case without covariates, since in that case `(batch_size, time_steps) ~ (batch_size, time_steps, 1) `

- **Generalization for more complex Targets**
    - `(batch, ahead_time)` If we want to predict more timesteps ahead
    - `(batch, ahead_time, features_to_predict)` If we want to predict more timesteps ahead for multiple targets 

### A more complex time series example
- We would like to use datasets to prepare time series data for an example containing features. 
- We follow this [tensorflow tutorial](https://www.tensorflow.org/tutorials/structured_data/time_series)

In [3]:
import pandas as pd
import tensorflow as tf
import os


def get_weather_data():
    """Download the Jena climate data set and return it as a DataFrame.

    The archive is fetched (and extracted) through ``tf.keras.utils.get_file``;
    the csv path is derived by stripping the archive's last extension.
    Every 6th row starting at position 5 is kept (10 min -> 1 hour intervals)
    and the parsed 'Date Time' column becomes the index.

    Source: tensorflow tutorial
    https://www.tensorflow.org/tutorials/structured_data/time_series?hl=en
    """
    archive_path = tf.keras.utils.get_file(
        fname='jena_climate_2009_2016.csv.zip',
        origin='https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip',
        extract=True)
    # 'xyz.csv.zip' -> 'xyz.csv'
    csv_file = os.path.splitext(archive_path)[0]
    # subsample 10 min -> 1 hour intervals
    hourly = pd.read_csv(csv_file)[5::6]
    # move the timestamp column out of the data and use it as index
    timestamps = hourly.pop('Date Time')
    hourly.index = pd.to_datetime(timestamps, format='%d.%m.%Y %H:%M:%S')
    return hourly

df = get_weather_data()

In [22]:
df

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-01-01 01:00:00,996.50,-8.05,265.38,-8.78,94.40,3.33,3.14,0.19,1.96,3.15,1307.86,0.21,0.63,192.7
2009-01-01 02:00:00,996.62,-8.88,264.54,-9.77,93.20,3.12,2.90,0.21,1.81,2.91,1312.25,0.25,0.63,190.3
2009-01-01 03:00:00,996.84,-8.81,264.59,-9.66,93.50,3.13,2.93,0.20,1.83,2.94,1312.18,0.18,0.63,167.2
2009-01-01 04:00:00,996.99,-9.05,264.34,-10.02,92.60,3.07,2.85,0.23,1.78,2.85,1313.61,0.10,0.38,240.0
2009-01-01 05:00:00,997.46,-9.63,263.72,-10.65,92.20,2.94,2.71,0.23,1.69,2.71,1317.19,0.40,0.88,157.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2016-12-31 19:10:00,1002.18,-0.98,272.01,-5.36,72.00,5.69,4.09,1.59,2.54,4.08,1280.70,0.87,1.36,190.6
2016-12-31 20:10:00,1001.40,-1.40,271.66,-6.84,66.29,5.51,3.65,1.86,2.27,3.65,1281.87,1.02,1.92,225.4
2016-12-31 21:10:00,1001.19,-2.75,270.32,-6.90,72.90,4.99,3.64,1.35,2.26,3.63,1288.02,0.71,1.56,158.7
2016-12-31 22:10:00,1000.65,-2.89,270.22,-7.15,72.30,4.93,3.57,1.37,2.22,3.57,1288.03,0.35,0.68,216.7


In [23]:
class Offsets:
    """Window geometry used to cut a time series into inputs and labels.

    A full window spans ``sequence_width = input_width + shift`` time steps.

    Args:
        input_width: Number of time steps that, together with ``shift``,
            make up the window width.
        label_width: Number of time steps used as labels.
        shift: Offset added to ``input_width`` to obtain the window width.
            Defaults to ``label_width`` when omitted (``None``).

    Raises:
        AssertionError: If ``label_width`` is larger than ``shift``.
    """

    def __init__(self, 
                 input_width: int, 
                 label_width: int, 
                 shift: int=None):
        self.input_width = input_width
        self.label_width = label_width
        # Only an omitted shift falls back to label_width. An explicit 0 is
        # kept as given, so that an invalid value is reported by check()
        # instead of being silently replaced.
        self.shift = label_width if shift is None else shift
        self.sequence_width = self.input_width + self.shift
        self.check()

    def check(self):
        """Ensure the labels fit into the shifted part of the window."""
        if self.label_width > self.shift:
            # Raised explicitly instead of using `assert` so the validation is
            # not stripped under `python -O`; the exception type is unchanged.
            msg = "Label width must not exceed shift in order to make future predictions"
            raise AssertionError(msg)

In [24]:
from pandas import DataFrame

class SplitInTime:
    """Callable that splits one window along its first axis into (features, labels)."""

    def __init__(self, offsets: Offsets):
        # geometry object providing `shift` and `label_width`
        self.offsets = offsets

    def __call__(self, t: tf.Tensor):
        """Return everything before the last `shift` steps and the last `label_width` steps."""
        shift = self.offsets.shift
        label_width = self.offsets.label_width
        return (t[:-shift], t[-label_width:])
    
    
def get_label_indices(df: DataFrame, labels: list):
    """Return the positions of those columns of `df` whose name occurs in `labels`.

    The positions are reported in column order of `df`, not in the order of `labels`.
    """
    positions = []
    for position, column in enumerate(df.columns):
        if column in labels:
            positions.append(position)
    return positions

    
class PickLabels:
    """Callable that reduces the labels of a (features, labels) pair to selected columns."""

    def __init__(self, label_indices: list):
        # positions along axis 1 of `labels` that are kept
        self.label_indices = label_indices

    def __call__(self, features, labels):
        """Return `features` unchanged together with the selected label columns."""
        # tf indexing is less flexible than numpy's, so every column is sliced
        # separately and the slices are stacked again along axis 1
        columns = []
        for index in self.label_indices:
            columns.append(labels[:, index])
        return (features, tf.stack(columns, axis=1))
    
        
class BatchWindow:
    """Callable that batches a window dataset with a fixed batch size."""

    def __init__(self, sequence_width: int):
        # batch size handed to `window.batch`
        self.sequence_width = sequence_width

    def __call__(self, window: Dataset):
        """Return `window` batched by `sequence_width`."""
        width = self.sequence_width
        batched = window.batch(width)
        return batched
        
        
        
class MakeDatasetFromDataFrame:
    """Callable that turns a DataFrame into a batched dataset of (features, labels) windows.

    Args:
        offsets: Window geometry (provides `sequence_width`, `shift`, `label_width`).
        batch_size: Number of windows per batch of the resulting dataset.
        labels: Names of the DataFrame columns that are kept as labels.
    """

    def __init__(self, offsets: Offsets, batch_size: int, labels: list):
        self.offsets = offsets
        self.batch_size = batch_size
        self.labels = labels

    def __call__(self, df: DataFrame):
        """Build the windowed dataset for `df`.

        Returns:
            A dataset yielding `(features, labels)` tuples, batched by
            `batch_size` and prefetched by one batch.
        """
        sequence_width = self.offsets.sequence_width
        label_indices = get_label_indices(df, self.labels)
        return (Dataset.from_tensor_slices(df)
                # sliding windows, each starting one row after the previous one
                .window(sequence_width, shift=1, drop_remainder=True)
                # collapse each window dataset into a single tensor
                .flat_map(BatchWindow(sequence_width))
                # use the offsets of this instance, not a module-level global
                .map(SplitInTime(self.offsets))
                .map(PickLabels(label_indices))
                .batch(self.batch_size)
                .prefetch(1))

In [27]:
# 4 input steps and 1 label step; shift is omitted and therefore defaults to the label width
offsets = Offsets(4, 1)
# build the windowed dataset, keeping only the temperature column as label
ds = MakeDatasetFromDataFrame(offsets=offsets, batch_size=1, labels=['T (degC)'])(df)
ds.element_spec

(TensorSpec(shape=(None, None, 14), dtype=tf.float64, name=None),
 TensorSpec(shape=(None, None, 1), dtype=tf.float64, name=None))

In [28]:
# materialize the first batch to inspect concrete shapes and values
for x, y in ds.take(1):
    print(f"Input with shape {x.shape}: \n {x}")
    print(f"Output with shape {y.shape}: \n {y}")

Input with shape (1, 4, 14): 
 [[[ 9.96500e+02 -8.05000e+00  2.65380e+02 -8.78000e+00  9.44000e+01
    3.33000e+00  3.14000e+00  1.90000e-01  1.96000e+00  3.15000e+00
    1.30786e+03  2.10000e-01  6.30000e-01  1.92700e+02]
  [ 9.96620e+02 -8.88000e+00  2.64540e+02 -9.77000e+00  9.32000e+01
    3.12000e+00  2.90000e+00  2.10000e-01  1.81000e+00  2.91000e+00
    1.31225e+03  2.50000e-01  6.30000e-01  1.90300e+02]
  [ 9.96840e+02 -8.81000e+00  2.64590e+02 -9.66000e+00  9.35000e+01
    3.13000e+00  2.93000e+00  2.00000e-01  1.83000e+00  2.94000e+00
    1.31218e+03  1.80000e-01  6.30000e-01  1.67200e+02]
  [ 9.96990e+02 -9.05000e+00  2.64340e+02 -1.00200e+01  9.26000e+01
    3.07000e+00  2.85000e+00  2.30000e-01  1.78000e+00  2.85000e+00
    1.31361e+03  1.00000e-01  3.80000e-01  2.40000e+02]]]
Output with shape (1, 1, 1): 
 [[[-9.63]]]


In [29]:
df.head(10)

Unnamed: 0_level_0,p (mbar),T (degC),Tpot (K),Tdew (degC),rh (%),VPmax (mbar),VPact (mbar),VPdef (mbar),sh (g/kg),H2OC (mmol/mol),rho (g/m**3),wv (m/s),max. wv (m/s),wd (deg)
Date Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2009-01-01 01:00:00,996.5,-8.05,265.38,-8.78,94.4,3.33,3.14,0.19,1.96,3.15,1307.86,0.21,0.63,192.7
2009-01-01 02:00:00,996.62,-8.88,264.54,-9.77,93.2,3.12,2.9,0.21,1.81,2.91,1312.25,0.25,0.63,190.3
2009-01-01 03:00:00,996.84,-8.81,264.59,-9.66,93.5,3.13,2.93,0.2,1.83,2.94,1312.18,0.18,0.63,167.2
2009-01-01 04:00:00,996.99,-9.05,264.34,-10.02,92.6,3.07,2.85,0.23,1.78,2.85,1313.61,0.1,0.38,240.0
2009-01-01 05:00:00,997.46,-9.63,263.72,-10.65,92.2,2.94,2.71,0.23,1.69,2.71,1317.19,0.4,0.88,157.0
2009-01-01 06:00:00,997.71,-9.67,263.66,-10.62,92.7,2.93,2.71,0.21,1.69,2.72,1317.71,0.05,0.5,146.0
2009-01-01 07:00:00,998.33,-9.17,264.12,-10.1,92.9,3.04,2.83,0.22,1.76,2.83,1315.98,2.08,2.88,348.8
2009-01-01 08:00:00,999.17,-8.1,265.12,-9.05,92.8,3.31,3.07,0.24,1.92,3.08,1311.65,0.72,1.25,213.9
2009-01-01 09:00:00,999.69,-7.66,265.52,-8.84,91.2,3.43,3.13,0.3,1.95,3.13,1310.14,0.34,0.63,202.2
2009-01-01 10:00:00,1000.27,-7.04,266.1,-8.17,91.6,3.6,3.3,0.3,2.05,3.29,1307.76,1.45,3.0,292.6
