# Dataset - ETT Dataset

## 0. Imports

In [1]:
%load_ext jupyter_black

In [2]:
import os
import re
import glob

import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
from gluonts.time_feature import time_features_from_frequency_str

## 1. Data Preprocessing

### 1.1 data load

In [3]:
def load_data(data_path: str, task: str, target: str) -> pd.DataFrame:
    """Read a CSV with a ``date`` column into a date-indexed DataFrame.

    Args:
        data_path: Path of the CSV file to read.
        task: Forecasting task code. When it equals ``"S"`` only the
            ``target`` column is kept; any other value keeps every column.
        target: Name of the column retained for the ``"S"`` task.

    Returns:
        DataFrame whose index is the parsed ``date`` column.
    """
    frame = pd.read_csv(data_path)
    frame = frame.assign(date=pd.to_datetime(frame["date"])).set_index("date")

    if task != "S":
        return frame
    # Double brackets keep the result a one-column DataFrame, not a Series.
    return frame[[target]]

### 1.2 data split

In [4]:
def trn_val_tst_split(
    df: pd.DataFrame, split_idx_dict: dict[str, list[int]], scaler=None
) -> dict[str, pd.DataFrame]:
    """Slice ``df`` into named stages, optionally scaling it first.

    Args:
        df: Full dataset; rows are selected by position.
        split_idx_dict: Maps a stage name to ``[start, end)`` row positions.
            Must contain a ``"train"`` entry when ``scaler`` is given.
        scaler: Optional object exposing ``fit`` and ``transform``. It is
            fitted on the ``"train"`` rows only and then applied to every row.

    Returns:
        Dict mapping each stage name to its slice of the (scaled) data.

    Raises:
        KeyError: If ``scaler`` is given but ``split_idx_dict`` has no
            ``"train"`` key.
    """
    if scaler is not None:
        train_start, train_end = split_idx_dict["train"]
        # Fit on the training rows only so val/test statistics do not leak in.
        scaler.fit(df.iloc[train_start:train_end])
        # Build a new frame rather than assigning into `df`, so the caller's
        # DataFrame is left unmodified.
        df = pd.DataFrame(scaler.transform(df), index=df.index, columns=df.columns)

    return {
        stage: df.iloc[start:end] for stage, (start, end) in split_idx_dict.items()
    }

### 1.3 get stamp data

In [22]:
def get_stamp_data(
    df: pd.DataFrame, use_time_enc: bool = True, freq: str = "h"
) -> np.ndarray:
    """Build a per-row array of time features from the index of ``df``.

    Args:
        df: DataFrame whose index holds the timestamps.
        use_time_enc: When True, stack the outputs of the feature functions
            returned by ``time_features_from_frequency_str(freq)``. When
            False, emit raw integer calendar fields instead.
        freq: Frequency code. With ``use_time_enc=False`` only ``"h"``
            (month, day, weekday, hour) and ``"t"`` (the same four plus the
            quarter-hour bucket ``minute // 15``) are supported.

    Returns:
        2-D array with one row per timestamp and one column per feature
        (assuming each encoder returns one value per timestamp).

    Raises:
        ValueError: If ``use_time_enc`` is False and ``freq`` is neither
            ``"h"`` nor ``"t"``.
    """
    if use_time_enc:
        dates = pd.to_datetime(df.index.values)
        stamp_data = np.vstack(
            [feat(dates) for feat in time_features_from_frequency_str(freq)]
        )
        # vstack yields one row per feature; flip to one row per timestamp.
        return stamp_data.transpose(1, 0)

    if freq not in ("h", "t"):
        raise ValueError(
            f"Unsupported freq {freq!r} when use_time_enc is False; "
            "expected 'h' or 't'."
        )

    dates = pd.DatetimeIndex(df.index)
    fields = [dates.month, dates.day, dates.weekday, dates.hour]
    if freq == "t":
        # Minutes are bucketed into quarter hours (0..3).
        fields.append(dates.minute // 15)

    return np.column_stack([np.asarray(field) for field in fields]).astype(np.int64)

In [23]:
# Forecasting task, one of:
#   "M"  - multivariate predicts multivariate
#   "S"  - univariate predicts univariate
#   "MS" - multivariate predicts univariate
task = "M"
target = "OT"
data_path = "../data/ETT-small/ETTh1.csv"

seq_len = 96
label_len = 48
pred_len = 96
stage = "train"
use_scaler = True
use_time_enc = True
freq = "h"

# Split boundaries as row positions: 12 / 4 / 4 blocks of 30 * 24 rows
# (presumably 30-day months of hourly samples - confirm against the CSV).
rows_per_month = 30 * 24
train_end = 12 * rows_per_month
val_end = train_end + 4 * rows_per_month
test_end = val_end + 4 * rows_per_month

# val and test start `seq_len` rows before their nominal boundary, so they
# overlap the tail of the preceding split by one input window.
split_idx_dict = {
    "train": [0, train_end],
    "val": [train_end - seq_len, val_end],
    "test": [val_end - seq_len, test_end],
}

scaler = StandardScaler() if use_scaler else None

In [24]:
# Load the CSV, split it into stages, then build time features for "train".
df = load_data(data_path=data_path, task=task, target=target)
data_dict = trn_val_tst_split(df=df, split_idx_dict=split_idx_dict, scaler=scaler)
train_stamp_data = get_stamp_data(
    df=data_dict["train"], use_time_enc=use_time_enc, freq=freq
)

## 2. Dataset

In [34]:
class ETTDataset(Dataset):
    """Sliding-window dataset over the rows of a time-indexed DataFrame.

    Each item is a dict of four float32 tensors:

    * ``past_values`` / ``past_time_features``: ``seq_len`` consecutive rows
      starting at the item index.
    * ``future_values`` / ``future_time_features``: ``label_len + pred_len``
      rows starting ``label_len`` rows before the end of the past window, so
      the first ``label_len`` rows overlap the past window.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        seq_len: int,
        label_len: int,
        pred_len: int,
        freq: str = "h",
        use_time_enc: bool = True,
    ):
        """Store the window sizes and precompute value and time-feature arrays.

        Args:
            df: Source rows; its index is passed to ``get_stamp_data``.
            seq_len: Number of rows in the past (input) window.
            label_len: Number of trailing past rows repeated at the start of
                the future window.
            pred_len: Number of rows after the past window.
            freq: Frequency code forwarded to ``get_stamp_data``.
            use_time_enc: Forwarded to ``get_stamp_data``.
        """
        super().__init__()
        self.seq_len = seq_len
        self.label_len = label_len
        self.pred_len = pred_len
        self.freq = freq
        self.use_time_enc = use_time_enc

        values = df.values
        # Inputs and targets are drawn from the same array.
        self.input_data = values
        self.target_data = values
        self.stamp_data = get_stamp_data(df, use_time_enc, freq)

    def __len__(self) -> int:
        """Return the number of complete windows (0 if the data is too short)."""
        return max(0, len(self.input_data) - self.seq_len - self.pred_len + 1)

    def __getitem__(self, idx: int) -> dict[str, torch.Tensor]:
        """Return the window starting at row ``idx``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
                Without this check slicing would silently return truncated
                windows and plain iteration over the dataset would not stop.
        """
        n_windows = len(self)
        if idx < 0:
            idx = idx + n_windows
        if not 0 <= idx < n_windows:
            raise IndexError(
                f"index out of range for ETTDataset with {n_windows} windows"
            )

        s_begin = idx
        s_end = s_begin + self.seq_len
        # The future window starts `label_len` rows before the past one ends.
        r_begin = s_end - self.label_len
        r_end = r_begin + self.label_len + self.pred_len

        return {
            "past_values": self._to_float(self.input_data[s_begin:s_end]),
            "past_time_features": self._to_float(self.stamp_data[s_begin:s_end]),
            "future_values": self._to_float(self.target_data[r_begin:r_end]),
            "future_time_features": self._to_float(self.stamp_data[r_begin:r_end]),
        }

    @staticmethod
    def _to_float(array) -> torch.Tensor:
        """Convert an array slice to a float32 tensor."""
        return torch.as_tensor(array, dtype=torch.float32)

In [35]:
# Windowed dataset over the "train" split.
dataset = ETTDataset(
    df=data_dict["train"],
    seq_len=seq_len,
    label_len=label_len,
    pred_len=pred_len,
    freq=freq,
    use_time_enc=use_time_enc,
)

In [36]:
# Shuffled batches of 32 windows; drop_last=True drops a trailing partial batch.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, drop_last=True)

In [37]:
# Pull a single batch so the tensor shapes can be inspected in the cells below.
batch = next(iter(dataloader))

In [38]:
# (batch_size, seq_len, n_columns) -> (32, 96, 7) in the recorded output.
batch["past_values"].shape

torch.Size([32, 96, 7])

In [39]:
# (batch_size, label_len + pred_len, n_columns) -> (32, 144, 7) in the recorded output.
batch["future_values"].shape

torch.Size([32, 144, 7])

In [42]:
# (batch_size, seq_len, n_time_features) -> (32, 96, 4) in the recorded output.
batch["past_time_features"].shape

torch.Size([32, 96, 4])

In [43]:
# (batch_size, label_len + pred_len, n_time_features) -> (32, 144, 4) in the recorded output.
batch["future_time_features"].shape

torch.Size([32, 144, 4])