In [22]:
import os
import pickle

import numpy as np
import pandas as pd

def transform(data, mean, std):
    """Standardize ``data`` by subtracting ``mean`` and dividing by ``std``.

    The operands are combined with plain ``-`` and ``/``, so scalars and
    NumPy arrays follow the usual broadcasting rules.
    """
    centered = data - mean
    return centered / std

def inverse_transform(data, mean, std):
    """Map standardized ``data`` back to the original scale.

    Multiplies by ``std`` and then adds ``mean``, i.e. the reverse of the
    subtract-then-divide standardization.
    """
    rescaled = data * std
    return rescaled + mean

In [23]:
# Sampling interval of the raw series, in minutes.
freq = 15
# Name of the ETT subset to preprocess; it selects the input CSV below.
subset_name = "ETTm1"
raw_data_path = "raw_dataset/ETT-small/{}.csv".format(subset_name)

In [24]:
raw_data = pd.read_csv(raw_data_path)

# Following many previous works (e.g., Informer, Autoformer), we use the first 20 months of data.
# Counting 30-day months and one sample every `freq` minutes, that is
# 20 * 30 * 24 * (60 // freq) rows, i.e. the first 57600 rows for 15-minute data.
raw_data = raw_data.iloc[:20 * 30 * 24 * (60 // freq)]
# Parse the "date" column into timestamps, keep every column after the first as a
# variate, and use the timestamps as the index.
df_index = pd.to_datetime(raw_data["date"].values)
raw_data = raw_data[raw_data.columns[1:]]
raw_data.index = df_index
raw_data.head()

Unnamed: 0,HUFL,HULL,MUFL,MULL,LUFL,LULL,OT
2016-07-01 00:00:00,5.827,2.009,1.599,0.462,4.203,1.34,30.531
2016-07-01 00:15:00,5.76,2.076,1.492,0.426,4.264,1.401,30.459999
2016-07-01 00:30:00,5.76,1.942,1.492,0.391,4.234,1.31,30.038
2016-07-01 00:45:00,5.76,1.942,1.492,0.426,4.234,1.31,27.013
2016-07-01 01:00:00,5.693,2.076,1.492,0.426,4.142,1.371,27.787001


In [25]:
# Sanity check: (number of rows, number of variates) after truncation.
raw_data.shape

(57600, 7)

In [26]:
# variate
L, N = raw_data.shape
train_len = int(L * 0.6)
test_len = int(L * 0.2)
val_len = L - train_len - test_len

train_data = raw_data.values[0:train_len]
mean, std = train_data.mean(axis=0), train_data.std(axis=0)
norm_var = transform(raw_data.values, mean, std)

train_len, val_len, test_len

(34560, 11520, 11520)

In [27]:
# time marker
tod_size = int((24 * 60) / freq) - 1
dow_size = 6
dom_size = 30
doy_size = 365

tod = np.array(list(map(lambda x: ((60 * x.hour + x.minute) / freq), raw_data.index))) # 0 ~ 287
dow = np.array(raw_data.index.dayofweek) # 0 ~ 6
dom = np.array(raw_data.index.day) - 1 # 0 ~ 30
doy = np.array(raw_data.index.dayofyear) - 1 # 0 ~ 181

time_marker = np.stack([tod, dow, dom, doy], axis=-1)
norm_time_marker = np.stack([tod / tod_size - 0.5, dow / dow_size - 0.5, dom / dom_size - 0.5, doy / doy_size - 0.5], axis=-1)

In [28]:
# np.savez does not create missing parent directories, so make sure the
# per-subset output directory exists before writing into it.
os.makedirs("../../dataset/{}".format(subset_name), exist_ok=True)
# Features: normalized variates plus the raw and normalized time markers.
np.savez("../../dataset/{}/feature.npz".format(subset_name), norm_var=norm_var, norm_time_marker=norm_time_marker, time_marker=time_marker)
# Scaler statistics needed to map normalized values back to the original scale.
np.savez("../../dataset/{}/var_scaler_info.npz".format(subset_name), mean=mean, std=std)

In [29]:
# Sanity check: shape of the normalized variates array that was saved above.
norm_var.shape

(57600, 7)