In [1]:
import os
import pickle

import numpy as np
import pandas as pd

def transform(data, mean, std):
    """Standardize ``data``: subtract ``mean``, then divide by ``std``.

    The arithmetic is element-wise and relies on the operands' own
    broadcasting rules. Zero entries in ``std`` are not guarded against.
    """
    centered = data - mean
    return centered / std

def inverse_transform(data, mean, std):
    """Map standardized values back to the original scale.

    Multiplies ``data`` by ``std`` and then adds ``mean``, i.e. the
    algebraic inverse of ``(data - mean) / std``.
    """
    rescaled = data * std
    return rescaled + mean

In [2]:
# Input locations (relative paths, resolved against the current working directory).
raw_data_path = "raw_dataset/METRLA/METR-LA.h5"  # HDF5 file read with pd.read_hdf
adj_path = "raw_dataset/METRLA/adj_METR-LA.pkl"  # pickle file holding the adjacency data
freq = 5 # sampling interval in minutes; used to derive the time-of-day slot

In [3]:
# Load the raw table; the preview below shows a timestamp index and one numeric column per ID.
raw_data = pd.read_hdf(raw_data_path)
# NOTE(review): the preview contains all-zero rows (00:15, 00:20) — presumably missing
# readings stored as 0; they are not filtered before the statistics are computed. Confirm.
raw_data.head()

Unnamed: 0,773869,767541,767542,717447,717446,717445,773062,767620,737529,717816,...,772167,769372,774204,769806,717590,717592,717595,772168,718141,769373
2012-03-01 00:00:00,64.375,67.625,67.125,61.5,66.875,68.75,65.125,67.125,59.625,62.75,...,45.625,65.5,64.5,66.428571,66.875,59.375,69.0,59.25,69.0,61.875
2012-03-01 00:05:00,62.666667,68.555556,65.444444,62.444444,64.444444,68.111111,65.0,65.0,57.444444,63.333333,...,50.666667,69.875,66.666667,58.555556,62.0,61.111111,64.444444,55.888889,68.444444,62.875
2012-03-01 00:10:00,64.0,63.75,60.0,59.0,66.5,66.25,64.5,64.25,63.875,65.375,...,44.125,69.0,56.5,59.25,68.125,62.5,65.625,61.375,69.857143,62.0
2012-03-01 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-03-01 00:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# variate
# Split sizes: 70% for training, 20% for testing, and whatever remains for validation.
L, N = raw_data.shape
train_len = int(0.7 * L)
test_len = int(0.2 * L)
val_len = L - (train_len + test_len)

# Per-column statistics are taken from the leading training rows only,
# then applied to every row of the table.
train_data = raw_data.values[:train_len]
mean = train_data.mean(axis=0)
std = train_data.std(axis=0)
norm_var = transform(raw_data.values, mean, std)

(train_len, val_len, test_len)

(23990, 3428, 6854)

In [5]:
# time marker
# Divisors used to scale each calendar feature below.
# NOTE(review): tod_size is the largest slot index (287 when freq is 5), whereas
# dow_size / dom_size / doy_size are counts. After scaling, tod therefore spans
# [-0.5, 0.5] while the other features stop short of 0.5. Left unchanged so the
# saved features keep their current values — confirm whether this is intended.
tod_size = int((24 * 60) / freq) - 1
dow_size = 7
dom_size = 31
doy_size = 366

# Slot of the day, computed with vectorized DatetimeIndex accessors rather than
# a per-row Python lambda; the resulting float values are the same.
tod = np.array((60 * raw_data.index.hour + raw_data.index.minute) / freq) # 0 ~ 287 when freq is 5
dow = np.array(raw_data.index.dayofweek) # 0 ~ 6
dom = np.array(raw_data.index.day) - 1 # 0 ~ 30
# 0-based day of year (0 ~ 365 in general). The preview of this file starts on
# 2012-03-01, so the first value here is 60 rather than 0.
doy = np.array(raw_data.index.dayofyear) - 1

# Raw markers, one column per feature in the order [tod, dow, dom, doy].
time_marker = np.stack([tod, dow, dom, doy], axis=-1)
# Scaled markers: each feature divided by its size and shifted by -0.5
# (dropping the "- 0.5" terms gives the unshifted variant).
norm_time_marker = np.stack(
    [
        tod / tod_size - 0.5,
        dow / dow_size - 0.5,
        dom / dom_size - 0.5,
        doy / doy_size - 0.5,
    ],
    axis=-1,
)

In [6]:
# Sanity check: the normalized array should have the same (rows, columns) shape as the raw table.
norm_var.shape

(34272, 207)

In [7]:
# adj_mat
# SECURITY: pickle.load can execute arbitrary code while unpickling; only open
# files at adj_path that come from a trusted source.
# encoding="bytes" makes 8-bit strings pickled by Python 2 load as bytes objects
# (presumably the file was produced under Python 2 — confirm).
with open(adj_path, 'rb') as f:
    adj = pickle.load(f, encoding="bytes")

In [8]:
# Persist the processed arrays. The output directory is created first because
# np.savez / np.save raise FileNotFoundError when the target directory is missing.
out_dir = "../../dataset/METRLA"
os.makedirs(out_dir, exist_ok=True)

# Normalized values plus both the raw and the scaled time markers.
np.savez(out_dir + "/feature.npz", norm_var=norm_var, norm_time_marker=norm_time_marker, time_marker=time_marker)
# Training-set statistics, needed to undo the normalization later.
np.savez(out_dir + "/var_scaler_info.npz", mean=mean, std=std)
# Only the third element of the unpickled object is stored
# (presumably the adjacency matrix itself — confirm against the pickle's layout).
np.save(out_dir + "/adj_mat.npy", adj[2])

In [9]:
# Display the split sizes again for reference (same values as computed in the split cell).
train_len, val_len, test_len

(23990, 3428, 6854)