In [233]:
import os
import pickle

import numpy as np
import pandas as pd

def transform(data, mean, std):
    """Standardize ``data``: subtract ``mean``, then divide by ``std``.

    Plain ``-`` and ``/`` are applied to the arguments, so NumPy arrays
    broadcast in the usual way. There is no guard against a zero ``std``.
    """
    centered = data - mean
    return centered / std

def inverse_transform(data, mean, std):
    """Undo ``transform``: multiply ``data`` by ``std``, then add ``mean``.

    Plain ``*`` and ``+`` are applied to the arguments, so NumPy arrays
    broadcast in the usual way.
    """
    rescaled = data * std
    return rescaled + mean

In [234]:
# Input locations (relative paths, resolved against the notebook's working directory).
# raw_data_path is read with pd.read_hdf; adj_path is opened and unpickled further down.
raw_data_path = "raw_dataset/PEMSBAY/PEMS-BAY.h5"
adj_path = "raw_dataset/PEMSBAY/adj_PEMS-BAY.pkl"
freq = 5 # sampling interval in minutes; used to turn clock time into a time-of-day slot index

In [235]:
# Load the raw table. The preview shows a timestamp index (5-minute steps)
# and one column per sensor_id.
raw_data = pd.read_hdf(raw_data_path)
raw_data.head()

sensor_id,400001,400017,400030,400040,400045,400052,400057,400059,400065,400069,...,409525,409526,409528,409529,413026,413845,413877,413878,414284,414694
2017-01-01 00:00:00,71.4,67.8,70.5,67.4,68.8,66.6,66.8,68.0,66.8,69.0,...,68.8,67.9,68.8,68.0,69.2,68.9,70.4,68.8,71.1,68.0
2017-01-01 00:05:00,71.6,67.5,70.6,67.5,68.7,66.6,66.8,67.8,66.5,68.2,...,68.4,67.3,68.4,67.6,70.4,68.8,70.1,68.4,70.8,67.4
2017-01-01 00:10:00,71.6,67.6,70.2,67.4,68.7,66.1,66.8,67.8,66.2,67.8,...,68.4,67.4,68.4,67.5,70.2,68.3,69.8,68.4,70.5,67.9
2017-01-01 00:15:00,71.1,67.5,70.3,68.0,68.5,66.7,66.6,67.7,65.9,67.8,...,68.5,67.5,68.5,67.5,70.4,68.7,70.2,68.4,70.8,67.6
2017-01-01 00:20:00,71.7,67.8,70.2,68.1,68.4,66.9,66.1,67.7,66.1,67.8,...,68.5,67.7,68.5,67.4,69.6,69.1,70.0,68.4,71.0,67.9


In [236]:
# variate
# Chronological 70 / 10 / 20 split (train / val / test); val takes whatever
# rows remain after the two int() truncations.
L, N = raw_data.shape  # L rows (time steps), N columns (sensors)
train_len, test_len = int(L * 0.7), int(L * 0.2)
val_len = L - (train_len + test_len)

# Per-column statistics come from the training rows only, then the whole
# series is normalized with them.
train_data = raw_data.values[:train_len]
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
norm_var = transform(raw_data.values, mean, std)

train_len, val_len, test_len

(36481, 5212, 10423)

In [237]:
# time marker
# Largest value each calendar feature can take; dividing by it and subtracting
# 0.5 maps the feature into [-0.5, 0.5].
tod_size = int((24 * 60) / freq) - 1  # last time-of-day slot (287 when freq == 5)
dow_size = 6    # dayofweek runs 0..6
dom_size = 30   # day - 1 runs 0..30
doy_size = 365  # dayofyear - 1 reaches 365 only in a leap year

# Vectorized DatetimeIndex accessors instead of a per-timestamp Python lambda;
# the values are the same floats the element-wise version produced.
tod = np.asarray((60 * raw_data.index.hour + raw_data.index.minute) / freq, dtype=float) # 0 ~ 287
dow = np.array(raw_data.index.dayofweek) # 0 ~ 6
dom = np.array(raw_data.index.day) - 1 # 0 ~ 30
doy = np.array(raw_data.index.dayofyear) - 1 # 0 ~ 181

# Columns in both arrays are ordered [tod, dow, dom, doy], one row per timestamp.
time_marker = np.stack([tod, dow, dom, doy], axis=-1)
norm_time_marker = np.stack([tod / tod_size - 0.5, dow / dow_size - 0.5, dom / dom_size - 0.5, doy / doy_size - 0.5], axis=-1)

In [238]:
# Shape check: the normalized array keeps the (rows, columns) layout of raw_data.values.
norm_var.shape

(52116, 325)

In [239]:
# adj_mat
# WARNING: pickle.load can execute arbitrary code while unpickling; only open
# adjacency files that come from a trusted source.
# NOTE(review): encoding="bytes" suggests the file was pickled under Python 2 — confirm.
with open(adj_path, 'rb') as f:
    adj = pickle.load(f, encoding="bytes")

In [240]:
# np.savez / np.save do not create missing directories, so make sure the
# output directory exists before writing (a fresh checkout may not have it).
os.makedirs("../../dataset/PEMSBAY", exist_ok=True)

# feature.npz: normalized readings plus raw and normalized calendar features.
np.savez("../../dataset/PEMSBAY/feature.npz", norm_var=norm_var, norm_time_marker=norm_time_marker, time_marker=time_marker)
# var_scaler_info.npz: training-set mean/std needed to invert the normalization.
np.savez("../../dataset/PEMSBAY/var_scaler_info.npz", mean=mean, std=std)
# Only element 2 of the unpickled object is saved.
# NOTE(review): presumably that element is the adjacency matrix — confirm the pickle layout.
np.save("../../dataset/PEMSBAY/adj_mat.npy", adj[2])

In [241]:
# Show the split sizes again, in (train, val, test) order.
train_len, val_len, test_len

(36481, 5212, 10423)

In [242]:
# Read the feature archive back to confirm that it was written.
d = np.load("../../dataset/PEMSBAY/feature.npz")

In [243]:
# Names of the arrays stored in the reloaded archive.
d.files

['norm_var', 'norm_time_marker', 'time_marker']