In [None]:
import numpy as np

# Path to the .npz archive to inspect
file_path = 'data/bike_drop/train.npz'

# Open the archive
data = np.load(file_path)

# Print the name and shape of every array stored in the archive
for name in data.files:
    stored = data[name]
    print(f"{name}: shape = {stored.shape}")
  

x: shape = (2606, 12, 250, 3)
y: shape = (2606, 12, 250, 1)
x_offsets: shape = (12, 1)
y_offsets: shape = (12, 1)


In [6]:
import os
import h5py
import numpy as np
import pandas as pd

def get_data_loader_to_npz(
    data_category: list,
    X_list: list,
    Y_list: list,
    _len: list,
    city,
    output_dir: str,
    add_time_in_day=True,
    add_day_in_week=True,
    start_time=None,
    freq='30min',
    input_dir='data/nogrid',
):
    """Build sliding-window samples from HDF5 series and save train/val/test .npz files.

    For each mode in ('pick', 'drop') the function reads the dataset
    ``{category}_{mode}`` from ``{input_dir}/{city}/{category}_data.h5`` for
    every category, concatenates them along axis 1, appends optional time
    features, cuts the result into (x, y) windows and writes
    ``train.npz``, ``val.npz`` and ``test.npz`` (keys ``x`` and ``y``) into
    ``{output_dir}/{data_category[0]}_{mode}``.

    Args:
        data_category: Category names; each selects one HDF5 file. All
            categories are concatenated, and the output folder is named after
            the first one.
        X_list: Backward offsets. For an anchor step ``i``,
            ``x[:, k]`` is the frame at ``i - X_list[k]``.
        Y_list: Forward offsets. For an anchor step ``i``,
            ``y[:, k]`` is the frame at ``i + Y_list[k]``.
        _len: ``[val_len, test_len]`` — number of samples taken from the end
            of the sample sequence for validation and test.
        city: City name; used in the input path and to pick the default
            start timestamp.
        output_dir: Root directory for the generated folders.
        add_time_in_day: Append the fraction of the day elapsed at each step.
        add_day_in_week: Append ``DatetimeIndex.dayofweek`` at each step.
        start_time: Timestamp of the first step. ``None`` keeps the built-in
            defaults ('2016-04-01 00:00:00' if 'NYC' is in ``city``,
            otherwise '2024-07-01 00:00:00').
        freq: Sampling frequency passed to ``pandas.date_range``.
        input_dir: Root directory that contains ``{city}/{category}_data.h5``.

    Raises:
        ValueError: If a split length is negative or val + test samples exceed
            the number of samples available.
    """
    val_len, test_len = _len[0], _len[1]
    if val_len < 0 or test_len < 0:
        raise ValueError(
            f"val/test lengths must be non-negative, got {val_len} and {test_len}"
        )

    if start_time is None:
        start_time = '2016-04-01 00:00:00' if 'NYC' in city else '2024-07-01 00:00:00'

    x_steps = np.asarray(X_list)
    y_steps = np.asarray(Y_list)

    for mode in ['pick', 'drop']:
        per_category = []
        for category in data_category:
            with h5py.File(f"{input_dir}/{city}/{category}_data.h5", 'r') as hf:
                per_category.append(hf[f'{category}_{mode}'][:])  # (T, N)
        series = np.concatenate(per_category, axis=1)  # (T, N_total)

        # Time index used to derive the time features
        num_steps, num_nodes = series.shape[0], series.shape[1]
        time_index = pd.date_range(start=start_time, periods=num_steps, freq=freq)

        # Feature engineering: feature 0 is always the raw series itself
        feature_list = [np.expand_dims(series, axis=-1)]  # (T, N, 1)

        if add_time_in_day:
            stamps = time_index.values
            day_fraction = (stamps - stamps.astype("datetime64[D]")) / np.timedelta64(1, "D")
            # Same value for every node at a given step
            feature_list.append(
                np.broadcast_to(day_fraction[:, None, None], (num_steps, num_nodes, 1))
            )

        if add_day_in_week:
            dow = np.asarray(time_index.dayofweek)
            feature_list.append(
                np.broadcast_to(dow[:, None, None], (num_steps, num_nodes, 1))
            )

        data_feat = np.concatenate(feature_list, axis=-1)  # (T, N, F)

        # Build the samples with one indexing operation per tensor instead of
        # a Python loop over every anchor step. Anchors are the steps for
        # which every backward and forward offset stays inside the series.
        anchors = np.arange(max(X_list), num_steps - max(Y_list))
        X_ = data_feat[anchors[:, None] - x_steps]  # (samples, T_in, N, F)
        Y_ = data_feat[anchors[:, None] + y_steps]  # (samples, T_out, N, F)

        # Chronological train/val/test split; val and test come from the end
        num_total = X_.shape[0]
        num_train = num_total - val_len - test_len
        if num_train < 0:
            raise ValueError(
                f"val_len + test_len = {val_len + test_len} exceeds the "
                f"{num_total} samples available for mode '{mode}'"
            )

        splits = {
            'train': (0, num_train),
            'val': (num_train, num_train + val_len),
            'test': (num_train + val_len, num_total)
        }

        # All categories were concatenated above, so each mode is saved once,
        # in a folder named after the first category (e.g. NYC/taxi_pick/).
        mode_folder = os.path.join(output_dir, f"{data_category[0]}_{mode}")
        os.makedirs(mode_folder, exist_ok=True)
        for split, (start, end) in splits.items():
            x_split = X_[start:end]
            y_split = Y_[start:end]
            np.savez_compressed(
                os.path.join(mode_folder, f"{split}.npz"),
                x=x_split,
                y=y_split
            )
            print(f"Saved {mode_folder}/{split}.npz: x={x_split.shape}, y={y_split.shape}")


In [7]:
# Export the NYC taxi pick/drop windows: 12 input offsets (0..11 steps back
# from the anchor), 12 target offsets (1..12 steps ahead), and the last
# 672 + 672 samples held out for validation and test.
# NOTE(review): the function indexes inputs as data[i - j] for j in X_list,
# so with offsets 0..11 x[:, 0] is the newest step and x[:, 11] the oldest —
# confirm the consumer expects newest-first ordering.
get_data_loader_to_npz(
    city="NYC",
    output_dir="NYC",
    data_category=["taxi"],
    X_list=list(range(0, 12)),
    Y_list=list(range(1, 12 + 1)),
    _len=[672, 672],
)

Saved NYC\taxi_pick/train.npz: x=(3001, 12, 266, 3), y=(3001, 12, 266, 3)
Saved NYC\taxi_pick/val.npz: x=(672, 12, 266, 3), y=(672, 12, 266, 3)
Saved NYC\taxi_pick/test.npz: x=(672, 12, 266, 3), y=(672, 12, 266, 3)
Saved NYC\taxi_drop/train.npz: x=(3001, 12, 266, 3), y=(3001, 12, 266, 3)
Saved NYC\taxi_drop/val.npz: x=(672, 12, 266, 3), y=(672, 12, 266, 3)
Saved NYC\taxi_drop/test.npz: x=(672, 12, 266, 3), y=(672, 12, 266, 3)


In [9]:
import numpy as np

# Load the NYC taxi pick test split
data1 = np.load('NYC/taxi_pick/test.npz')
x1 = data1['x']  # shape reported when the file was saved: (672, 12, 266, 3)
y1 = data1['y']  # shape reported when the file was saved: (672, 12, 266, 3)


# Take index 0 of the last axis (the raw series; time features follow it)
x1_feat1 = x1[:, :, :, 0]

# The label now names the slice that is actually printed (last index 0, not 1)
print("x1[0:5, 0, 0, 0]:")
print(x1_feat1[0:5, 0, 0])


x1[0:5, 0, 0, 1]:
[129. 140. 131. 103. 107.]
