In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import tensorflow as tf

In [4]:

# Remote location of the source CSVs; `TSFDataLoader._read_data` copies
# `<data>.csv` from here when it is not cached locally yet.
DATA_DIR = 'gs://time_series_datasets'
# Local directory where the copied CSV is stored and read from.
LOCAL_CACHE_DIR = './data_loader/dataset/'


class TSFDataLoader:
  """Build windowed `tf.data` datasets for time-series forecasting from a CSV.

  The file `<data>.csv` is copied from `DATA_DIR` into `LOCAL_CACHE_DIR` when it
  is not cached yet, split chronologically into train/validation/test parts and
  standardized with statistics fitted on the training part only.

  Attributes:
    data: Dataset name; `<data>.csv` is the file that gets loaded.
    batch_size: Number of windows per batch.
    seq_len: Number of time steps in each input window.
    pred_len: Number of time steps in each label window.
    feature_type: 'S' (univariate -> univariate), 'M' (multivariate ->
      multivariate) or 'MS' (multivariate -> univariate).
    target: Name of the target column, used for 'S' and 'MS'.
    target_slice: Slice over the feature axis selecting the label columns.
    scaler: `StandardScaler` fitted on the training split.
    train_df: Standardized training split.
    val_df: Standardized validation split (prefixed with `seq_len` rows of
      history from the preceding split).
    test_df: Standardized test split (prefixed the same way).
    n_feature: Number of columns in the standardized frames.
  """

  def __init__(
      self, data, batch_size, seq_len, pred_len, feature_type, target='OT'
  ):
    """Store the configuration and immediately load, split and scale the data.

    Args:
      data: Dataset name (CSV file name without the `.csv` extension).
      batch_size: Number of windows per batch.
      seq_len: Length of the input window.
      pred_len: Length of the label window.
      feature_type: One of 'S', 'M' or 'MS' (see class docstring).
      target: Target column name for the 'S' and 'MS' modes.
    """
    self.data = data
    self.batch_size = batch_size
    self.seq_len = seq_len
    self.pred_len = pred_len
    self.feature_type = feature_type
    self.target = target
    # By default every feature column is also a label column.
    self.target_slice = slice(0, None)

    self._read_data()

  def _read_data(self):
    """Load raw data and split datasets.

    Populates `scaler`, `train_df`, `val_df`, `test_df` and `n_feature`, and
    narrows `target_slice` to the target column when `feature_type == 'MS'`.
    """

    # copy data from cloud storage if not exists
    # `makedirs` creates missing parent directories too ('./data_loader/' may
    # not exist on a fresh checkout, which made `os.mkdir` raise
    # FileNotFoundError) and tolerates the directory already being there.
    os.makedirs(LOCAL_CACHE_DIR, exist_ok=True)

    file_name = self.data + '.csv'
    cache_filepath = os.path.join(LOCAL_CACHE_DIR, file_name)
    if not os.path.isfile(cache_filepath):
      tf.io.gfile.copy(
          os.path.join(DATA_DIR, file_name), cache_filepath, overwrite=True
      )

    df_raw = pd.read_csv(cache_filepath)

    # S: univariate-univariate, M: multivariate-multivariate, MS:
    # multivariate-univariate
    df = df_raw.set_index('date')
    if self.feature_type == 'S':
      # Keep only the target column as both input and label.
      df = df[[self.target]]
    elif self.feature_type == 'MS':
      # Keep all columns as input but predict only the target column.
      target_idx = df.columns.get_loc(self.target)
      self.target_slice = slice(target_idx, target_idx + 1)

    # split train/valid/test
    n = len(df)
    if self.data.startswith('ETTm'):
      # Fixed 12/4/4 blocks of 30 days x 24 hours x 4 rows per hour.
      train_end = 12 * 30 * 24 * 4
      val_end = train_end + 4 * 30 * 24 * 4
      test_end = val_end + 4 * 30 * 24 * 4
    elif self.data.startswith('ETTh'):
      # Fixed 12/4/4 blocks of 30 days x 24 rows per day.
      train_end = 12 * 30 * 24
      val_end = train_end + 4 * 30 * 24
      test_end = val_end + 4 * 30 * 24
    else:
      # First 70% for training, last 20% for testing, the rest for validation.
      train_end = int(n * 0.7)
      val_end = n - int(n * 0.2)
      test_end = n
    train_df = df[:train_end]
    # Validation/test start `seq_len` rows early so that their first label
    # window still has a full input window in front of it.
    val_df = df[train_end - self.seq_len : val_end]
    test_df = df[val_end - self.seq_len : test_end]

    # standardize by training set
    self.scaler = StandardScaler()
    self.scaler.fit(train_df.values)

    def scale_df(df, scaler):
      """Return `df` transformed by `scaler`, keeping index and columns."""
      data = scaler.transform(df.values)
      return pd.DataFrame(data, index=df.index, columns=df.columns)

    self.train_df = scale_df(train_df, self.scaler)
    self.val_df = scale_df(val_df, self.scaler)
    self.test_df = scale_df(test_df, self.scaler)
    self.n_feature = self.train_df.shape[-1]

  def _split_window(self, data):
    """Split a batch of windows into `(inputs, labels)`.

    Args:
      data: Batch of windows indexed as `[batch, time, feature]`, where the
        time axis spans `seq_len + pred_len` steps.

    Returns:
      Tuple `(inputs, labels)`: the first `seq_len` steps with every feature,
      and the remaining steps restricted to `target_slice`.
    """
    inputs = data[:, : self.seq_len, :]
    labels = data[:, self.seq_len :, self.target_slice]
    # Slicing doesn't preserve static shape information, so set the shapes
    # manually. This way the `tf.data.Datasets` are easier to inspect.
    inputs.set_shape([None, self.seq_len, None])
    labels.set_shape([None, self.pred_len, None])
    return inputs, labels

  def _make_dataset(self, data, shuffle=True):
    """Turn a standardized frame into a batched dataset of `(inputs, labels)`.

    Args:
      data: Array-like of rows ordered in time; converted to `float32`.
      shuffle: Whether the windows are shuffled.

    Returns:
      Dataset of stride-1 windows of length `seq_len + pred_len`, batched by
      `batch_size` and split with `_split_window`.
    """
    data = np.array(data, dtype=np.float32)
    ds = tf.keras.utils.timeseries_dataset_from_array(
        data=data,
        targets=None,
        sequence_length=(self.seq_len + self.pred_len),
        sequence_stride=1,
        shuffle=shuffle,
        batch_size=self.batch_size,
    )
    ds = ds.map(self._split_window)
    return ds

  def inverse_transform(self, data):
    """Map standardized values back to the original data scale.

    Args:
      data: Array-like whose last axis holds either all `n_feature`
        standardized columns or only the columns selected by `target_slice`
        (the label layout in 'MS' mode). Any number of leading axes is
        accepted, e.g. `(batch, pred_len, features)`.

    Returns:
      `np.ndarray` of the same shape as `data`, in original units.

    Raises:
      ValueError: Propagated from the scaler when the last axis matches
        neither layout.
    """
    values = np.asarray(data)
    if values.ndim < 2:
      # Nothing to flatten; keep the scaler's own handling of such input.
      return self.scaler.inverse_transform(data)

    n_cols = values.shape[-1]
    # The scaler only accepts 2-D input, so fold all leading axes into one.
    flat = values.reshape(-1, n_cols)
    if n_cols == self.n_feature:
      restored = self.scaler.inverse_transform(flat)
    else:
      # Target-only arrays: undo the scaling with the statistics of just the
      # label columns.
      mean = np.asarray(self.scaler.mean_)[self.target_slice]
      scale = np.asarray(self.scaler.scale_)[self.target_slice]
      if n_cols != len(mean):
        # Unknown layout: defer to the scaler so the caller gets its error.
        return self.scaler.inverse_transform(data)
      restored = flat * scale + mean
    return np.asarray(restored).reshape(values.shape)

  def get_train(self, shuffle=True):
    """Return the training dataset (shuffled unless `shuffle` is False)."""
    return self._make_dataset(self.train_df, shuffle=shuffle)

  def get_val(self):
    """Return the validation dataset in chronological (unshuffled) order."""
    return self._make_dataset(self.val_df, shuffle=False)

  def get_test(self):
    """Return the test dataset in chronological (unshuffled) order."""
    return self._make_dataset(self.test_df, shuffle=False)

In [5]:
  # Build the loader for the 'weather' CSV: 336-step input windows, 96-step
  # label windows, multivariate-to-multivariate ('M') mode.
  data_loader = TSFDataLoader(
      data='weather',
      feature_type='M',
      target='OT',
      seq_len=336,
      pred_len=96,
      batch_size=32,
  )
  # Create the three datasets; only the training one is shuffled.
  train_data, val_data, test_data = (
      data_loader.get_train(),
      data_loader.get_val(),
      data_loader.get_test(),
  )

In [8]:
# Show the training dataset's repr; the recorded element_spec lists inputs of
# shape (None, 336, 21) and labels of shape (None, 96, 21).
train_data

<_MapDataset element_spec=(TensorSpec(shape=(None, 336, 21), dtype=tf.float32, name=None), TensorSpec(shape=(None, 96, 21), dtype=tf.float32, name=None))>

In [9]:
# Show the validation dataset's repr; the recorded element_spec matches the
# training dataset's.
val_data 

<_MapDataset element_spec=(TensorSpec(shape=(None, 336, 21), dtype=tf.float32, name=None), TensorSpec(shape=(None, 96, 21), dtype=tf.float32, name=None))>

In [20]:
import numpy as np

# Sample 3D NumPy array with shape (4, 3, 21): 4 sequences of 3 time steps
# with 21 values each. A fixed seed keeps the displayed values reproducible
# across kernel restarts.
rng = np.random.default_rng(0)
data = rng.random((4, 3, 21))

# Assuming seq_len = 2 and pred_len = 1 for illustration purposes
seq_len = 2
pred_len = 1


def make_windows(sequences, seq_len, pred_len):
    """Cut every sequence into consecutive (input, output) window pairs.

    Args:
        sequences: Array-like indexed as [sequence, time, ...].
        seq_len: Number of time steps in each input window.
        pred_len: Number of time steps in each output window, taken right
            after the input window.

    Returns:
        Tuple `(inputs, outputs)` of arrays shaped
        `(n_windows, seq_len, ...)` and `(n_windows, pred_len, ...)`. When a
        sequence is shorter than `seq_len + pred_len` it contributes no
        windows; if nothing fits at all, both arrays are empty but keep their
        trailing dimensions instead of collapsing to shape `(0,)`.
    """
    sequences = np.asarray(sequences)
    trailing_shape = sequences.shape[2:]
    inputs, outputs = [], []
    for sequence in sequences:
        # One window per start position that leaves room for the output part.
        for start in range(len(sequence) - seq_len - pred_len + 1):
            split = start + seq_len
            inputs.append(sequence[start:split])
            outputs.append(sequence[split:split + pred_len])

    n_windows = len(inputs)
    # The explicit reshape is a no-op when windows exist and gives the empty
    # case a well-defined shape.
    inputs = np.array(inputs, dtype=sequences.dtype).reshape(
        (n_windows, seq_len) + trailing_shape
    )
    outputs = np.array(outputs, dtype=sequences.dtype).reshape(
        (n_windows, pred_len) + trailing_shape
    )
    return inputs, outputs


# Creating input and output sequences based on seq_len and pred_len
input_sequences, output_sequences = make_windows(data, seq_len, pred_len)

print("Input Sequences (shape):", input_sequences.shape)
print("Output Sequences (shape):", output_sequences.shape)

Input Sequences (shape): (4, 2, 21)
Output Sequences (shape): (4, 1, 21)


In [25]:
# Second input window. Each 3-step sequence yields exactly one window here, so
# this is the first `seq_len` rows of data[1].
input_sequences[1]

array([[0.00316875, 0.4573602 , 0.01919774, 0.67595503, 0.77027054,
        0.77914655, 0.08831018, 0.42250614, 0.55844842, 0.59594697,
        0.28111921, 0.98381549, 0.96248126, 0.54498142, 0.79147503,
        0.617044  , 0.55314048, 0.72282026, 0.96695314, 0.72625082,
        0.82753557],
       [0.44732178, 0.25866765, 0.24713372, 0.65167408, 0.93386181,
        0.51421089, 0.73082274, 0.30311735, 0.85468899, 0.60882467,
        0.51449664, 0.10788424, 0.11369338, 0.64138292, 0.81037767,
        0.3346115 , 0.89007985, 0.97929528, 0.90432804, 0.08742955,
        0.65426042]])

In [26]:
# Full second source sequence (3 rows of 21 values) for comparison with the
# input and output windows cut from it.
data[1]

array([[0.00316875, 0.4573602 , 0.01919774, 0.67595503, 0.77027054,
        0.77914655, 0.08831018, 0.42250614, 0.55844842, 0.59594697,
        0.28111921, 0.98381549, 0.96248126, 0.54498142, 0.79147503,
        0.617044  , 0.55314048, 0.72282026, 0.96695314, 0.72625082,
        0.82753557],
       [0.44732178, 0.25866765, 0.24713372, 0.65167408, 0.93386181,
        0.51421089, 0.73082274, 0.30311735, 0.85468899, 0.60882467,
        0.51449664, 0.10788424, 0.11369338, 0.64138292, 0.81037767,
        0.3346115 , 0.89007985, 0.97929528, 0.90432804, 0.08742955,
        0.65426042],
       [0.23035467, 0.90070498, 0.84934115, 0.61073124, 0.88099909,
        0.1539288 , 0.35662183, 0.73488949, 0.97908236, 0.46522614,
        0.93876556, 0.8944651 , 0.91011926, 0.84441351, 0.89958414,
        0.09752477, 0.90620487, 0.59878132, 0.12212723, 0.39242131,
        0.10898099]])

In [27]:
# Output window paired with input_sequences[1]: the `pred_len` rows of data[1]
# that follow the input window, i.e. its last row.
output_sequences[1]

array([[0.23035467, 0.90070498, 0.84934115, 0.61073124, 0.88099909,
        0.1539288 , 0.35662183, 0.73488949, 0.97908236, 0.46522614,
        0.93876556, 0.8944651 , 0.91011926, 0.84441351, 0.89958414,
        0.09752477, 0.90620487, 0.59878132, 0.12212723, 0.39242131,
        0.10898099]])

In [19]:
# Iterating over `data` yields one sequence at a time; print how many time
# steps each one has.
for sequence in data:
    print(len(sequence))

3
3
3
3


In [14]:
# NOTE(review): the recorded output below (an empty array) comes from an
# earlier execution (In [14]) and does not match the (4, 2, 21) shape printed
# by the windowing cell; re-run the notebook top to bottom to refresh it.
input_sequences

array([], dtype=float64)