# Thư viện

In [1]:
import os

import numpy as np
import pandas as pd

import torch

In [2]:
PATH = "../../data/preprocessed"

# Load data

In [3]:
status_df = pd.read_csv(f"{PATH}/segment_status.csv")
segments_df = pd.read_csv(f"{PATH}/segments.csv")
train_df = pd.read_csv(f"{PATH}/train.csv")

In [44]:
status_df["updated_at"] = pd.to_datetime(status_df["updated_at"], format="mixed")
status_df.head()

Unnamed: 0,_id,updated_at,segment_id,velocity
0,0,2020-07-03 14:55:31.869000+00:00,24845,20
1,1,2020-07-03 15:02:56.048000+00:00,33923,10
2,2,2020-07-04 08:15:52.696000+00:00,33824,5
3,3,2020-07-04 08:15:59.903000+00:00,33824,5
4,4,2020-07-04 08:16:08.201000+00:00,33824,5


In [46]:
segments_df["created_at"] = pd.to_datetime(segments_df["created_at"], format="mixed")
segments_df["updated_at"] = pd.to_datetime(segments_df["updated_at"], format="mixed")
segments_df.head()

Unnamed: 0,segment_id,created_at,updated_at,s_node_id,e_node_id,length,street_id,max_velocity,street_level,street_name,street_type
0,0,2020-10-18 13:26:17.365000+00:00,2020-10-18 13:26:17.365000+00:00,373543511,5468660805,114,31096786,80.0,0,Quốc Lộ 1,trunk
1,1,2020-10-18 13:26:17.400000+00:00,2020-10-18 13:26:17.400000+00:00,5468660805,5738158916,9,31096786,80.0,0,Quốc Lộ 1,trunk
2,2,2020-10-18 13:26:17.435000+00:00,2020-10-18 13:26:17.435000+00:00,5738158916,5738158918,23,31096786,80.0,0,Quốc Lộ 1,trunk
3,3,2020-10-18 13:26:17.444000+00:00,2020-10-18 13:26:17.444000+00:00,5738158918,5738158912,66,31096786,80.0,0,Quốc Lộ 1,trunk
4,4,2020-10-18 13:26:17.452000+00:00,2020-10-18 13:26:17.452000+00:00,5738158912,5758104203,127,31096786,80.0,0,Quốc Lộ 1,trunk


In [49]:
train_df["date"] = pd.to_datetime(train_df["date"])
train_df.head()

Unnamed: 0,_id,segment_id,date,period,LOS,s_node_id,e_node_id,length,street_id,max_velocity,...,street_type_secondary,street_type_secondary_link,street_type_tertiary,street_type_tertiary_link,street_type_trunk,street_type_trunk_link,street_type_unclassified,street_type_university,timestamp,weekday
0,0,26,2021-04-16,period_0_30,A,366428456,366416066,116,32575820,,...,False,False,True,False,False,False,False,False,2021-04-16 00:30:00,4
1,1,33,2020-08-02,period_23_30,C,366469460,3792257828,26,32575862,,...,True,False,False,False,False,False,False,False,2020-08-02 23:30:00,6
2,2,33,2020-08-03,period_0_00,D,366469460,3792257828,26,32575862,,...,True,False,False,False,False,False,False,False,2020-08-03 00:00:00,0
3,3,67,2021-03-09,period_9_30,B,366403668,5755066033,7,32575862,,...,True,False,False,False,False,False,False,False,2021-03-09 09:30:00,1
4,4,67,2021-03-23,period_9_30,B,366403668,5755066033,7,32575862,,...,True,False,False,False,False,False,False,False,2021-03-23 09:30:00,1


# Xây dựng mô hình không gian

## Danh sách các segments được sử dụng trong train

In [50]:
active_segments_list = train_df["segment_id"].unique()
print(len(active_segments_list))

10027


## Lọc các segments được sử dụng

In [51]:
active_segments_df = segments_df[segments_df["segment_id"].isin(active_segments_list)]
N = active_segments_df.shape[0]
active_segments_df.shape

(10027, 11)

## Hashmap từ segment id sang index và ngược lại

In [52]:
segment_id_2_node_idx = {segment_id: i for i, segment_id in enumerate(active_segments_list)}
node_idx_2_segment_id = {i: segment_id for i, segment_id in enumerate(active_segments_list)}

## Tạo danh sách liền kề

In [53]:
node_to_segments = {}

for _, row in active_segments_df.iterrows():
    seg_idx = segment_id_2_node_idx[row['segment_id']]
    
    s_node = row['s_node_id']
    e_node = row['e_node_id']
    
    if s_node not in node_to_segments: node_to_segments[s_node] = []
    if e_node not in node_to_segments: node_to_segments[e_node] = []
    
    node_to_segments[s_node].append(seg_idx)
    node_to_segments[e_node].append(seg_idx)

## Tạo danh sách cạnh

In [54]:
edges = set()
for node_id, segments in node_to_segments.items():
    if len(segments) > 1:
        # Kết nối tất cả các cặp segment tại nút giao này
        for i in range(len(segments)):
            for j in range(i + 1, len(segments)):
                seg_a = segments[i]
                seg_b = segments[j]
                
                edges.add((seg_a, seg_b))
                edges.add((seg_b, seg_a))

for i in range(N):
    edges.add((i, i))

In [55]:
edge_list_source = [src for src, dst in edges]
edge_list_target = [dst for src, dst in edges]

edge_index = torch.tensor([edge_list_source, edge_list_target], dtype=torch.long)
edge_index. shape

torch.Size([2, 34297])

# Xây dựng mô hình thời gian

## Tạo timestamp

In [56]:
train_df['date'] = pd.to_datetime(train_df['date'])
train_df['timestamp'] = train_df['date'] + \
                                pd.to_timedelta(train_df['hour'], unit='h') + \
                                pd.to_timedelta(train_df['minute'], unit='m')
train_df['weekday'] = train_df['timestamp'].dt.weekday

In [57]:
train_df['weekday']

0        4
1        6
2        0
3        1
4        1
        ..
33436    1
33437    5
33438    3
33439    5
33440    5
Name: weekday, Length: 33441, dtype: int32

In [58]:
all_segments_idx = list(range(N)) # N = 10026
min_time = train_df['timestamp'].min()
max_time = train_df['timestamp'].max()
full_time_index = pd.date_range(start=min_time, end=max_time, freq='30min')
T_total = len(full_time_index)
print(f"Tạo lưới dữ liệu: {N} segments x {T_total} chu kỳ thời gian.")

Tạo lưới dữ liệu: 10027 segments x 14049 chu kỳ thời gian.


## Không gian đặc trưng động

### Danh sách đặc trưng động

In [92]:
dynamic_features = ['LOS_encoded', 'hour', 'minute', 'weekday', 'velocity'] 
F_dynamic = len(dynamic_features)

### Pivot table train

In [93]:
dynamic_source_df = train_df.copy()
pivot_base_df = dynamic_source_df[['timestamp', 'segment_id'] + ['LOS_encoded', 'hour', 'minute', 'weekday']].copy()
pivot_base_df['segment_idx'] = pivot_base_df['segment_id'].map(segment_id_2_node_idx)
pivot_base_df = pivot_base_df.dropna(subset=['segment_idx'])
pivot_base_df['segment_idx'] = pivot_base_df['segment_idx'].astype(int)
pivot_base_df.head()

Unnamed: 0,timestamp,segment_id,LOS_encoded,hour,minute,weekday,segment_idx
0,2021-04-16 00:30:00,26,0,0,30,4,0
1,2020-08-02 23:30:00,33,2,23,30,6,1
2,2020-08-03 00:00:00,33,3,0,0,0,1
3,2021-03-09 09:30:00,67,1,9,30,1,2
4,2021-03-23 09:30:00,67,1,9,30,1,2


### Pivot table status

In [69]:
df_status_proc = status_df.copy()
df_status_proc['segment_idx'] = df_status_proc['segment_id'].map(segment_id_2_node_idx)
df_status_proc = df_status_proc.dropna(subset=['segment_idx']) # Chỉ giữ các segment có trong df_train
df_status_proc['segment_idx'] = df_status_proc['segment_idx'].astype(int)
df_status_proc.head()

Unnamed: 0,_id,updated_at,segment_id,velocity,segment_idx
0,0,2020-07-03 14:55:31.869000+00:00,24845,20,2888
1,1,2020-07-03 15:02:56.048000+00:00,33923,10,3978
2,2,2020-07-04 08:15:52.696000+00:00,33824,5,3947
3,3,2020-07-04 08:15:59.903000+00:00,33824,5,3947
4,4,2020-07-04 08:16:08.201000+00:00,33824,5,3947


In [70]:
# Đặt 'updated_at' làm index để resample
df_status_proc = df_status_proc.set_index('updated_at')

In [71]:
df_velocity_agg = df_status_proc.groupby('segment_idx')['velocity'].resample('30min').mean()

In [73]:
df_velocity_pivot = df_velocity_agg.unstack(level=0)

### Tạo không gian đặc trưng dynamic

In [79]:
data_grid_dynamic = np.zeros((N, F_dynamic, T_total), dtype=np.float32)
print(f"Đang pivot và điền vào lưới [N, F, T] (Shape: {data_grid_dynamic.shape})...")

Đang pivot và điền vào lưới [N, F, T] (Shape: (10027, 5, 14049))...


In [81]:
for i, feature_name in enumerate(dynamic_features):
    print(f"  Đang xử lý: {feature_name} (feature {i+1}/{F_dynamic})...")
    
    if feature_name == 'velocity':
        # Dùng df_velocity_pivot đã tạo
        df_pivot_feat = df_velocity_pivot
    else:
        # Dùng logic cũ cho các feature từ df_train
        # Lọc df_pivot_base theo thời gian để pivot nhanh hơn
        df_pivot_feat_filtered = pivot_base_df[pivot_base_df['timestamp'].isin(full_time_index)]
        df_pivot_feat_filtered = df_pivot_feat_filtered.drop_duplicates(subset=['timestamp', 'segment_idx'])
        df_pivot_feat = df_pivot_feat_filtered.pivot(
            index='timestamp',
            columns='segment_idx',
            values=feature_name
        )
    
    # Căn chỉnh lưới (reindex) theo index thời gian và cột segment
    df_grid_feat = df_pivot_feat.reindex(index=full_time_index, columns=all_segments_idx)
    
    # Lấp đầy (fill)
    # Dùng ffill (lấp về phía trước) rồi bfill (lấp về phía sau)
    # fillna(-1) cho bất kỳ cái gì còn sót (ví dụ: segment không bao giờ có dữ liệu)
    df_grid_filled = df_grid_feat.ffill().bfill().fillna(-1) 
    #df_grid_filled = df_grid_feat.fillna(-1)
    
    grid_feat_np = df_grid_filled.values.T # [N, T]
    data_grid_dynamic[:, i, :] = grid_feat_np

  Đang xử lý: LOS_encoded (feature 1/5)...
  Đang xử lý: hour (feature 2/5)...
  Đang xử lý: minute (feature 3/5)...
  Đang xử lý: weekday (feature 4/5)...
  Đang xử lý: velocity (feature 5/5)...


## Tạo không gian đặc trưng tĩnh

### Danh sách đặc trưng tĩnh

In [88]:
# --- PHẦN ĐẶC TRƯNG TĨNH (STATIC) (giữ nguyên) ---
print("\n--- Bước 2B: Xây dựng Lưới Đặc Trưng Tĩnh (Static) ---")

# Sử dụng df_segments (đã xử lý OHE từ Cell 14)
df_segments_proc = segments_df.copy() 

# Lấy tên các cột OHE đã được tạo ở Cell 14
ohe_street_cols = [col for col in df_segments_proc.columns if col.startswith('type_')]
static_features = ['length', 'street_level'] + ohe_street_cols 


--- Bước 2B: Xây dựng Lưới Đặc Trưng Tĩnh (Static) ---


### Xây dựng không gian đặc trưng tĩnh

In [89]:
F_static = len(static_features)
data_grid_static = np.zeros((N, F_static), dtype=np.float32) # N = 10026

df_segments_proc['segment_idx'] = df_segments_proc['segment_id'].map(segment_id_2_node_idx) # map 10026
df_segments_proc = df_segments_proc.dropna(subset=['segment_idx'])
df_segments_proc['segment_idx'] = df_segments_proc['segment_idx'].astype(int)

df_segments_sorted = df_segments_proc.set_index('segment_idx').reindex(all_segments_idx) # reindex 10026

for i, feature_name in enumerate(static_features):
    if feature_name in df_segments_sorted.columns:
        data_grid_static[:, i] = df_segments_sorted[feature_name].fillna(0).values
    else:
        print(f"Cảnh báo: Cột '{feature_name}' không tìm thấy, dùng giá trị 0.")

print(f"--- Bước 2B: Hoàn tất! Lưới đặc trưng tĩnh có shape: {data_grid_static.shape} ---")

--- Bước 2B: Hoàn tất! Lưới đặc trưng tĩnh có shape: (10027, 2) ---
