#### Load Data

In [12]:
import numpy as np
import pandas as pd
import torch

# Read the processed traffic/weather table, index it by timestamp and order it
# chronologically so later cells can slice and compare by time.
df = (
    pd.read_parquet("../data/processed/traffic_weather_with_segments.parquet")
    .set_index('timestamp')
    .sort_index()
)

df.head()

Unnamed: 0_level_0,origin,destination,traffic_speed,congestion_level,distance,origin_lat,origin_lon,dest_lat,dest_lon,temperature,...,hour_sin,hour_cos,dow_sin,dow_cos,month_sin,month_cos,origin_code,destination_code,segment_key,segment_idx
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2025-04-01 00:00:00+00:00,rrethi i flamurit,salla e sporteve 1 tetori,31.477501,-4.9275,1376.0,42.646832,21.156087,42.653815,21.167529,6.5,...,0.5,0.866025,0.781831,0.62349,0.866025,-0.5,0,6,rrethi i flamurit→salla e sporteve 1 tetori,0
2025-04-01 00:00:00+00:00,rrethi i madh (lakrishte),rrethi te komuna e vjeter,33.197502,-10.655,2420.0,42.653122,21.146059,42.66698,21.16398,6.5,...,0.5,0.866025,0.781831,0.62349,0.866025,-0.5,2,5,rrethi i madh (lakrishte)→rrethi te komuna e v...,3
2025-04-01 00:00:00+00:00,rrethi i maxi 24,rrethi i kazermes,27.445,8.515,3235.0,42.65265,21.175016,42.650434,21.138832,6.5,...,0.5,0.866025,0.781831,0.62349,0.866025,-0.5,3,1,rrethi i maxi 24→rrethi i kazermes,4
2025-04-01 00:00:00+00:00,te qafa,rrethi i flamurit,22.952499,23.4825,2576.0,42.665365,21.160591,42.64804,21.1576,6.5,...,0.5,0.866025,0.781831,0.62349,0.866025,-0.5,8,0,te qafa→rrethi i flamurit,9
2025-04-01 00:00:00+00:00,rrethi te ismeti,xhamia e llapit,29.514999,1.61,3154.0,42.65808,21.137243,42.67141,21.16175,6.5,...,0.5,0.866025,0.781831,0.62349,0.866025,-0.5,4,9,rrethi te ismeti→xhamia e llapit,5


In [13]:
# Drop the timezone from the index so it can be compared against the
# tz-naive split boundaries used further down.
naive_index = df.index.tz_localize(None)
df.index = naive_index
print(f"Timezone: {df.index.tz}")

Timezone: None


#### Select modeling features

In [14]:
# Column the model is trained to predict.
target_col = 'traffic_speed'

# Model inputs. The target column is kept in first position; the windowing
# step selects the target by position (index 0).
feature_cols = [
    target_col,
    # weather
    'temperature',
    'precipitation',
    # cyclical encodings of hour of day, day of week and month
    'hour_sin',
    'hour_cos',
    'dow_sin',
    'dow_cos',
    'month_sin',
    'month_cos',
]

#### Convert the dataframe into a [time × nodes × features] tensor

In [15]:
num_nodes = df['segment_idx'].nunique()
print("Number of nodes:", num_nodes)

# Dense array laid out as [time, nodes, features].
# (timestamp, segment) pairs with no record keep the initial zeros.
unique_times = df.index.unique().sort_values()
time_index = list(unique_times)
feature_tensor = np.zeros((len(time_index), num_nodes, len(feature_cols)))

# Position of every record along the time axis and along the node axis.
time_pos = unique_times.get_indexer(df.index)
node_pos = df['segment_idx'].to_numpy(dtype=int)

# segment_idx is used directly as the position on the node axis, so it must be
# a contiguous 0..num_nodes-1 code. Negative values would silently wrap around.
if len(node_pos) and (node_pos.min() < 0 or node_pos.max() >= num_nodes):
    raise ValueError(
        f"segment_idx must lie in [0, {num_nodes - 1}], "
        f"found range [{node_pos.min()}, {node_pos.max()}]"
    )

# Scatter all records in one vectorised assignment instead of a Python-level
# loop over rows. If a (timestamp, segment) pair occurs more than once, the
# last record in df order is the one that remains, as in a row-by-row fill.
feature_tensor[time_pos, node_pos, :] = df[feature_cols].to_numpy(dtype=float)

# Surface gaps: zero-filled cells are indistinguishable from real zero readings
# once the array is scaled and windowed.
observed = np.zeros(feature_tensor.shape[:2], dtype=bool)
observed[time_pos, node_pos] = True
n_missing = int((~observed).sum())
if n_missing:
    print(f"Warning: {n_missing} (timestamp, segment) cells have no record and remain zero-filled.")

feature_tensor.shape


Number of nodes: 12


(15406, 12, 9)

#### Train/Val/Test Split

In [16]:
# Chronological split boundaries (both inclusive):
#   train : everything up to the end of August 2025
#   val   : September 2025
#   test  : everything after the end of September 2025
train_end = pd.Timestamp("2025-08-31 23:59:59")
val_end = pd.Timestamp("2025-09-30 23:59:59")

timestamps = pd.to_datetime(time_index)

# Boolean masks over the time axis, one per split.
train_idx = timestamps <= train_end
val_idx = (timestamps <= val_end) & (timestamps > train_end)
test_idx = timestamps > val_end

# Slice the [time, nodes, features] array along its first axis.
X_train_raw, X_val_raw, X_test_raw = (
    feature_tensor[mask] for mask in (train_idx, val_idx, test_idx)
)

X_train_raw.shape, X_val_raw.shape, X_test_raw.shape

((11016, 12, 9), (2160, 12, 9), (2230, 12, 9))

#### Scale Features without Leakage

In [17]:
from sklearn.preprocessing import StandardScaler

# Flatten the training split to (time * nodes, features) so that each feature
# column gets a single mean/std shared by all nodes.
T_train, N, F = X_train_raw.shape
X_train_2d = np.reshape(X_train_raw, (-1, F))

# Fit on the training split only; validation and test data never contribute
# to the scaling statistics.
scaler = StandardScaler().fit(X_train_2d)

def scale_set(arr):
    """Apply the module-level `scaler` to a 3-D array.

    The array is flattened to two dimensions (all leading rows, last axis as
    columns), passed through `scaler.transform`, and reshaped back to its
    original three dimensions.
    """
    n_steps, n_nodes, n_feats = arr.shape
    flat_scaled = scaler.transform(arr.reshape(-1, n_feats))
    return flat_scaled.reshape(n_steps, n_nodes, n_feats)

# Scale every split with the same fitted transform.
X_train, X_val, X_test = (
    scale_set(split) for split in (X_train_raw, X_val_raw, X_test_raw)
)


#### Create Sliding Windows 

In [18]:
# seq_len = 12 (4 hours)
# pred_len = 3 (1 hour ahead)
# Predicting traffic_speed only, not the other features

seq_len = 12
pred_len = 3
target_index = 0  # traffic_speed

def create_sequences(data):
    """Cut a [time, nodes, features] array into (input, target) windows.

    Window ``t`` pairs ``data[t : t + seq_len]`` (all features) with the
    ``target_index`` feature of the following ``pred_len`` steps. The window
    length, horizon and target feature are read from the module-level
    ``seq_len``, ``pred_len`` and ``target_index``.

    Parameters
    ----------
    data : np.ndarray
        Array with time on axis 0 and features on axis 2.

    Returns
    -------
    X : np.ndarray, shape [num_windows, seq_len, nodes, features]
    Y : np.ndarray, shape [num_windows, pred_len, nodes]
        ``num_windows`` is ``T - seq_len - pred_len + 1``. When ``data`` is
        too short for a single window, both arrays are returned with a leading
        dimension of 0 and the usual trailing dimensions.
    """
    T = data.shape[0]

    # +1 so the window whose target ends on the very last timestep is kept.
    num_windows = T - seq_len - pred_len + 1

    if num_windows <= 0:
        # Keep the trailing dimensions so downstream code still sees arrays
        # of the expected rank.
        target_tail = data[0:0, :, target_index].shape[1:]
        X_empty = np.empty((0, seq_len) + data.shape[1:], dtype=data.dtype)
        Y_empty = np.empty((0, pred_len) + target_tail, dtype=data.dtype)
        return X_empty, Y_empty

    X = [data[t:t + seq_len] for t in range(num_windows)]               # each [seq_len, nodes, F]
    Y = [
        data[t + seq_len:t + seq_len + pred_len, :, target_index]       # each [pred_len, nodes]
        for t in range(num_windows)
    ]

    return np.array(X), np.array(Y)

# Window each split on its own so no window straddles a split boundary.
(X_train_seq, Y_train_seq), (X_val_seq, Y_val_seq), (X_test_seq, Y_test_seq) = (
    create_sequences(split) for split in (X_train, X_val, X_test)
)


#### Convert to torch tensors & save

In [19]:
# Windowed arrays to persist, in the order they are written to the file.
arrays_to_save = {
    'X_train': X_train_seq,
    'Y_train': Y_train_seq,
    'X_val':   X_val_seq,
    'Y_val':   Y_val_seq,
    'X_test':  X_test_seq,
    'Y_test':  Y_test_seq,
}

# Convert every array to a float32 torch tensor.
payload = {
    name: torch.as_tensor(values, dtype=torch.float32)
    for name, values in arrays_to_save.items()
}

# The fitted scaler is stored alongside the tensors.
# NOTE(review): this is a pickled Python object, not a tensor, so readers of
# this file need torch.load(..., weights_only=False) (the default is True
# since PyTorch 2.6). Only load it from a trusted source.
payload['scaler'] = scaler

torch.save(payload, "../data/processed/model_ready.pt")

print("Saved final model-ready dataset.")

Saved final model-ready dataset.
