# Thư viện

In [189]:
import os
import h5py
from tqdm import tqdm

from numpy.lib.stride_tricks import sliding_window_view
import numpy as np
import pandas as pd

import torch

In [190]:
PATH = "../../data/preprocessed"

# Load data

In [191]:
status_df = pd.read_csv(f"{PATH}/segment_status.csv")
segments_df = pd.read_csv(f"{PATH}/segments.csv")
train_df = pd.read_csv(f"{PATH}/train.csv")

In [192]:
status_df["updated_at"] = pd.to_datetime(status_df["updated_at"], format="mixed")
status_df.head()

Unnamed: 0,_id,updated_at,segment_id,velocity
0,0,2020-07-03 14:55:31.869000+00:00,24845,20
1,1,2020-07-03 15:02:56.048000+00:00,33923,10
2,2,2020-07-04 08:15:52.696000+00:00,33824,5
3,3,2020-07-04 08:15:59.903000+00:00,33824,5
4,4,2020-07-04 08:16:08.201000+00:00,33824,5


In [193]:
segments_df["created_at"] = pd.to_datetime(segments_df["created_at"], format="mixed")
segments_df["updated_at"] = pd.to_datetime(segments_df["updated_at"], format="mixed")
segments_df.head()

Unnamed: 0,segment_id,created_at,updated_at,s_node_id,e_node_id,length,street_id,max_velocity,street_level,street_name,street_type,length_z_score
0,0,2020-10-18 13:26:17.365000+00:00,2020-10-18 13:26:17.365000+00:00,373543511,5468660805,114,31096786,80.0,0,Quốc Lộ 1,trunk,1.369884
1,1,2020-10-18 13:26:17.400000+00:00,2020-10-18 13:26:17.400000+00:00,5468660805,5738158916,9,31096786,80.0,0,Quốc Lộ 1,trunk,-0.622828
2,2,2020-10-18 13:26:17.435000+00:00,2020-10-18 13:26:17.435000+00:00,5738158916,5738158918,23,31096786,80.0,0,Quốc Lộ 1,trunk,-0.357133
3,3,2020-10-18 13:26:17.444000+00:00,2020-10-18 13:26:17.444000+00:00,5738158918,5738158912,66,31096786,80.0,0,Quốc Lộ 1,trunk,0.45893
4,4,2020-10-18 13:26:17.452000+00:00,2020-10-18 13:26:17.452000+00:00,5738158912,5758104203,127,31096786,80.0,0,Quốc Lộ 1,trunk,1.6166


In [194]:
train_df["date"] = pd.to_datetime(train_df["date"])
train_df.head()

Unnamed: 0,_id,segment_id,date,period,LOS,s_node_id,e_node_id,length,street_id,max_velocity,...,weekday_4,weekday_5,weekday_6,street_type_primary,street_type_primary_link,street_type_secondary,street_type_tertiary,street_type_trunk,street_type_trunk_link,street_type_unclassified
0,614,2443,2020-08-02,period_23_30,E,5093747408,5778236991,7,32577460,,...,False,False,True,False,False,False,True,False,False,False
1,615,2443,2020-08-03,period_0_00,D,5093747408,5778236991,7,32577460,,...,False,False,False,False,False,False,True,False,False,False
2,616,2443,2020-11-26,period_1_00,E,5093747408,5778236991,7,32577460,,...,False,False,False,False,False,False,True,False,False,False
3,617,2443,2020-11-30,period_4_30,B,5093747408,5778236991,7,32577460,,...,False,False,False,False,False,False,True,False,False,False
4,618,2443,2020-11-30,period_5_00,B,5093747408,5778236991,7,32577460,,...,False,False,False,False,False,False,True,False,False,False


# Xây dựng mô hình không gian

## Danh sách các segments được sử dụng trong train

In [195]:
active_segments_list = train_df["segment_id"].unique()
print(len(active_segments_list))

202


## Lọc các segments được sử dụng

In [196]:
active_segments_df = segments_df[segments_df["segment_id"].isin(active_segments_list)]
N = active_segments_df.shape[0]
active_segments_df.shape

(202, 12)

## Hashmap từ segment id sang index và ngược lại

In [197]:
segment_id_2_node_idx = {segment_id: i for i, segment_id in enumerate(active_segments_list)}
node_idx_2_segment_id = {i: segment_id for i, segment_id in enumerate(active_segments_list)}

## Tạo danh sách liền kề

In [198]:
node_to_segments = {}

for _, row in active_segments_df.iterrows():
    seg_idx = segment_id_2_node_idx[row['segment_id']]
    
    s_node = row['s_node_id']
    e_node = row['e_node_id']
    
    if s_node not in node_to_segments: node_to_segments[s_node] = []
    if e_node not in node_to_segments: node_to_segments[e_node] = []
    
    node_to_segments[s_node].append(seg_idx)
    node_to_segments[e_node].append(seg_idx)

## Tạo danh sách cạnh

In [199]:
edges = set()
for node_id, segments in node_to_segments.items():
    if len(segments) > 1:
        # Kết nối tất cả các cặp segment tại nút giao này
        for i in range(len(segments)):
            for j in range(i + 1, len(segments)):
                seg_a = segments[i]
                seg_b = segments[j]
                
                edges.add((seg_a, seg_b))
                edges.add((seg_b, seg_a))

for i in range(N):
    edges.add((i, i))

In [200]:
edge_list_source = [src for src, dst in edges]
edge_list_target = [dst for src, dst in edges]

edge_index = torch.tensor([edge_list_source, edge_list_target], dtype=torch.long)
edge_index. shape

torch.Size([2, 300])

# Xây dựng mô hình thời gian

## Tạo timestamp

In [201]:
train_df['date'] = pd.to_datetime(train_df['date'])
train_df['timestamp'] = train_df['date'] + \
                                pd.to_timedelta(train_df['hour'], unit='h') + \
                                pd.to_timedelta(train_df['minute'], unit='m')
train_df['weekday'] = train_df['timestamp'].dt.weekday

In [202]:
all_segments_idx = list(range(N)) # N = 10026
min_time = train_df['timestamp'].min()
max_time = train_df['timestamp'].max()
full_time_index = pd.date_range(start=min_time, end=max_time, freq='30min') # N x T
T_total = len(full_time_index)
print(f"Tạo lưới dữ liệu: {N} segments x {T_total} chu kỳ thời gian.")

Tạo lưới dữ liệu: 202 segments x 14008 chu kỳ thời gian.


## Không gian đặc trưng động

### Danh sách đặc trưng động

In [203]:
dynamic_features = ['LOS_encoded', 'hour', 'minute', 'weekday', 'velocity'] 
F_dynamic = len(dynamic_features)

### Pivot table train

In [204]:
dynamic_source_df = train_df.copy()
pivot_base_df = dynamic_source_df[['timestamp', 'segment_id'] + ['LOS_encoded', 'hour', 'minute', 'weekday']].copy()
pivot_base_df['segment_idx'] = pivot_base_df['segment_id'].map(segment_id_2_node_idx)
pivot_base_df = pivot_base_df.dropna(subset=['segment_idx'])
pivot_base_df['segment_idx'] = pivot_base_df['segment_idx'].astype(int)
pivot_base_df.head()

Unnamed: 0,timestamp,segment_id,LOS_encoded,hour,minute,weekday,segment_idx
0,2020-08-02 23:30:00,2443,4,23,30,6,0
1,2020-08-03 00:00:00,2443,3,0,0,0,0
2,2020-11-26 01:00:00,2443,4,1,0,3,0
3,2020-11-30 04:30:00,2443,1,4,30,0,0
4,2020-11-30 05:00:00,2443,1,5,0,0,0


### Pivot table status

In [205]:
df_status_proc = status_df.copy()
df_status_proc['segment_idx'] = df_status_proc['segment_id'].map(segment_id_2_node_idx)
df_status_proc = df_status_proc.dropna(subset=['segment_idx']) # Chỉ giữ các segment có trong df_train
df_status_proc['segment_idx'] = df_status_proc['segment_idx'].astype(int)
df_status_proc.head()

Unnamed: 0,_id,updated_at,segment_id,velocity,segment_idx
29,29,2020-07-04 08:39:14.365000+00:00,56816,15,142
30,30,2020-07-04 08:39:16.873000+00:00,56816,15,142
31,31,2020-07-04 08:40:13.969000+00:00,56816,15,142
32,32,2020-07-04 08:40:15.470000+00:00,56816,15,142
33,33,2020-07-04 08:40:26.617000+00:00,56816,12,142


In [206]:
# Đặt 'updated_at' làm index để resample
df_status_proc = df_status_proc.set_index('updated_at')

In [207]:
df_velocity_agg = (
    df_status_proc
    .groupby('segment_idx')['velocity']
    .resample('30min')
    .mean()
    .unstack('segment_idx')
)

df_velocity_agg.index = df_velocity_agg.index.tz_convert(None)
df_velocity_agg = df_velocity_agg.reindex(index=full_time_index, columns=all_segments_idx)

In [208]:
df_velocity_agg.index.dtype

dtype('<M8[us]')

### Tạo không gian đặc trưng dynamic

In [209]:
data_grid_dynamic = np.zeros((N, F_dynamic, T_total), dtype=np.float32)
print(f"Đang pivot và điền vào lưới [N, F, T] (Shape: {data_grid_dynamic.shape})...")

Đang pivot và điền vào lưới [N, F, T] (Shape: (202, 5, 14008))...


In [None]:
for i, feature_name in enumerate(dynamic_features):
    print(f"  Đang xử lý: {feature_name} (feature {i+1}/{F_dynamic})...")
    
    if feature_name == 'velocity':
        # Dùng df_velocity_agg đã tạo
        df_pivot_feat = df_velocity_agg
    else:
        # Dùng logic cũ cho các feature từ df_train
        # Lọc df_pivot_base theo thời gian để pivot nhanh hơn
        df_pivot_feat_filtered = pivot_base_df[pivot_base_df['timestamp'].isin(full_time_index)]
        df_pivot_feat_filtered = df_pivot_feat_filtered.drop_duplicates(subset=['timestamp', 'segment_idx'])
        df_pivot_feat = df_pivot_feat_filtered.pivot(
            index='timestamp',
            columns='segment_idx',
            values=feature_name
        )
    
    # Căn chỉnh lưới (reindex) theo index thời gian và cột segment
    df_grid_feat = df_pivot_feat.reindex(index=full_time_index, columns=all_segments_idx)
    print(df_grid_feat.index.dtype)
    
    # Lấp đầy (fill)
    # Dùng bfill (quá khứ) rồi ffill (tương lai) trong trường hợp còn sót
    df_grid_filled = df_grid_feat.bfill().ffill().fillna(-1)
    
    grid_feat_np = df_grid_filled.values.T # [N, T]
    data_grid_dynamic[:, i, :] = grid_feat_np

  Đang xử lý: LOS_encoded (feature 1/5)...
datetime64[us]
  Đang xử lý: hour (feature 2/5)...
datetime64[us]
  Đang xử lý: minute (feature 3/5)...
datetime64[us]
  Đang xử lý: weekday (feature 4/5)...
datetime64[us]
  Đang xử lý: velocity (feature 5/5)...
datetime64[us]


In [211]:
data_grid_dynamic.shape

(202, 5, 14008)

In [212]:
data_grid_dynamic = np.transpose(data_grid_dynamic, [2, 0, 1]) # T, N, F
data_grid_dynamic.shape

(14008, 202, 5)

In [214]:
# Check for NaNs and print max velocity feature after transpose
print('NaN count:', np.isnan(data_grid_dynamic).sum())
print('Shape:', data_grid_dynamic.shape)
# Correct indexing for max velocity after transpose: axis 2 is feature
print('Max velocity:', np.nanmax(data_grid_dynamic[:, :, 4]))

NaN count: 0
Shape: (14008, 202, 5)
Max velocity: 198.0


In [215]:
print(df_velocity_agg.index[:5])
print(full_time_index[:5])

print(df_velocity_agg.index.equals(full_time_index))


DatetimeIndex(['2020-07-04 08:30:00', '2020-07-04 09:00:00',
               '2020-07-04 09:30:00', '2020-07-04 10:00:00',
               '2020-07-04 10:30:00'],
              dtype='datetime64[us]', freq='30min')
DatetimeIndex(['2020-07-04 08:30:00', '2020-07-04 09:00:00',
               '2020-07-04 09:30:00', '2020-07-04 10:00:00',
               '2020-07-04 10:30:00'],
              dtype='datetime64[us]', freq='30min')
True


## Tạo không gian đặc trưng tĩnh

### Danh sách đặc trưng tĩnh

In [216]:
# --- PHẦN ĐẶC TRƯNG TĨNH (STATIC) (giữ nguyên) ---
print("\n--- Bước 2B: Xây dựng Lưới Đặc Trưng Tĩnh (Static) ---")

# Sử dụng df_segments
df_segments_proc = segments_df.copy() 

# Lấy tên các cột OHE
ohe_street_cols = [col for col in df_segments_proc.columns if col.startswith('type_')]
static_features = ['length', 'street_level'] + ohe_street_cols 


--- Bước 2B: Xây dựng Lưới Đặc Trưng Tĩnh (Static) ---


### Xây dựng không gian đặc trưng tĩnh

In [217]:
F_static = len(static_features)
data_grid_static = np.zeros((N, F_static), dtype=np.float32) # N = 10026

df_segments_proc['segment_idx'] = df_segments_proc['segment_id'].map(segment_id_2_node_idx) # map 10026
df_segments_proc = df_segments_proc.dropna(subset=['segment_idx'])
df_segments_proc['segment_idx'] = df_segments_proc['segment_idx'].astype(int)

df_segments_sorted = df_segments_proc.set_index('segment_idx').reindex(all_segments_idx) # reindex 10026

for i, feature_name in enumerate(static_features):
    if feature_name in df_segments_sorted.columns:
        data_grid_static[:, i] = df_segments_sorted[feature_name].fillna(0).values
    else:
        print(f"Cảnh báo: Cột '{feature_name}' không tìm thấy, dùng giá trị 0.")

print(f"Lưới đặc trưng tĩnh có shape: {data_grid_static.shape} ---")

Lưới đặc trưng tĩnh có shape: (202, 2) ---


# Tạo window sliding

In [218]:
def seq_2_seq(data, past_steps=24, future_steps=24):
    """
    Faithful replacement for your offset-based seq_2_seq
    """
    T, N, F = data.shape
    P, Q = past_steps, future_steps

    # Window covers: [t-(P-1), ..., t, t+1, ..., t+Q]
    total_window = P + Q

    W = sliding_window_view(
        data,
        window_shape=total_window,
        axis=0
    )
    print(W.shape)
    # W shape: (T - (P+Q), P+Q+1, N, F)

    # X = past up to t (inclusive)
    X = W[:, :, :, :P]                      # (samples, P, N, F)

    # y = future after t (last feature only, like your code)
    y = W[:, :, -1, P:]              # (samples, Q, N)

    return X, y

In [219]:
X, y = seq_2_seq(data_grid_dynamic)
print(X.shape)
print(y.shape)

(13961, 202, 5, 48)
(13961, 202, 5, 24)
(13961, 202, 24)


# Lưu dữ liệu

In [220]:
with h5py.File(f"{PATH}/dynamic.h5", "w") as f:
    f.create_dataset("dynamic", data=data_grid_dynamic)

In [221]:
with h5py.File(f"{PATH}/static.h5", "w") as f:
    f.create_dataset("static", data=data_grid_static)

In [222]:
with h5py.File(f"{PATH}/past.h5", "w") as f:
    f.create_dataset("past", data=X)

In [223]:
with h5py.File(f"{PATH}/future.h5", "w") as f:
    f.create_dataset("future", data=y)