In [None]:
import numpy as np
import joblib
import torch
import os
import pathlib

from sklearn.preprocessing import StandardScaler

ToDos

- Mögliche Überlegung: `id` als String behandeln und anschließend Label Encoding anwenden.
  Label Encoding nach der Umwandlung in Strings führt dazu, dass die IDs neu nummeriert werden; der Abstand zwischen den ursprünglichen IDs geht dabei verloren. Solange das Modell keine Bedeutung aus den Abständen zwischen den IDs ableitet, hat das keine negativen Auswirkungen.

In [2]:
def normalize_feature(train_batch, val_batch, scaler_dir=None):
    """
    Normalizes targets, edge attributes and node features of batched train/val graph data.

    All scalers are fitted on the training batch only and then applied to both
    batches, so no validation statistics enter the normalization.

    Steps:
      - ``y`` is standardized and stored as a float32 tensor of shape (n, 1).
      - ``edge_attr`` columns 0 and 2 (speed_rel and year) are standardized and
        column 1 (month) is replaced by a sin/cos pair with period 12. The new
        ``edge_attr`` has exactly four columns:
        ``[speed_rel_scaled, year_scaled, month_sin, month_cos]``; any other
        original column is dropped.
      - ``x`` (node features: lon, lat) is standardized.
      - The fitted scalers are saved as ``target_scaler.pkl``,
        ``edge_scaler.pkl`` and ``node_scaler.pkl`` for later reuse.

    Both batches are modified in place, so the function is not idempotent:
    calling it twice on the same batches re-reads the already encoded columns.
    To keep a failure from leaving the batches half-normalized, the output
    directory is resolved and the scalers are written *before* any attribute
    is overwritten.

    Parameters:
    -----------
    train_batch : batched graph object with tensor attributes ``y``, ``edge_attr`` and ``x``
        Training data; used to fit every scaler.
    val_batch : batched graph object with the same attributes
        Validation data; only transformed.
    scaler_dir : str or path-like, optional
        Directory the scalers are written to. Defaults to a ``scalers`` folder
        next to this file; when ``__file__`` is not defined (e.g. the code runs
        in a notebook cell) the current working directory is used as the base.

    Returns:
    --------
    (train_batch, val_batch) : the same objects that were passed in, normalized.
    """
    month_idx = 1
    feat_indices = [0, 2]  # speed_rel and year

    ### === Resolve the output directory up front === ###
    if scaler_dir is None:
        try:
            base_dir = pathlib.Path(__file__).parent
        except NameError:
            # Notebook cells / interactive sessions have no __file__.
            base_dir = pathlib.Path.cwd()
        scaler_dir = base_dir / "scalers"
    else:
        scaler_dir = pathlib.Path(scaler_dir)
    os.makedirs(scaler_dir, exist_ok=True)

    ### === Fit all scalers on the training batch only === ###
    y_scaler = StandardScaler().fit(train_batch.y.view(-1, 1).numpy())
    feat_scaler = StandardScaler().fit(train_batch.edge_attr[:, feat_indices].numpy())
    node_scaler = StandardScaler().fit(train_batch.x.numpy())

    ### === Save scalers (before any batch is mutated) === ###
    joblib.dump(y_scaler, scaler_dir / "target_scaler.pkl")
    joblib.dump(feat_scaler, scaler_dir / "edge_scaler.pkl")
    joblib.dump(node_scaler, scaler_dir / "node_scaler.pkl")

    ### === Apply the fitted scalers to both batches === ###
    for batch in (train_batch, val_batch):
        # Targets (y)
        batch.y = torch.tensor(
            y_scaler.transform(batch.y.view(-1, 1).numpy()), dtype=torch.float32
        )

        # Edge attributes: read every raw column before edge_attr is replaced.
        feat_scaled = torch.tensor(
            feat_scaler.transform(batch.edge_attr[:, feat_indices].numpy()),
            dtype=torch.float32,
        )
        month_raw = batch.edge_attr[:, month_idx]
        # Cyclical encoding for month (period 12).
        month_sin = torch.sin(2 * np.pi * month_raw / 12).view(-1, 1)
        month_cos = torch.cos(2 * np.pi * month_raw / 12).view(-1, 1)
        batch.edge_attr = torch.cat([feat_scaled, month_sin, month_cos], dim=1)

        # Node features (lon, lat)
        batch.x = torch.tensor(
            node_scaler.transform(batch.x.numpy()), dtype=torch.float32
        )

    return train_batch, val_batch

In [None]:
def normalize_test_features(test_batch, scaler_dir=None):
    """
    Normalizes a batched test graph using pre-fitted scalers from training phase.

    This includes:
      - Extracting the target from column 4 of ``edge_attr`` into ``y`` and
        standardizing it with the saved target scaler,
      - Applying cyclical (sin/cos, period 12) encoding to the 'month' edge
        feature (column 1),
      - Normalizing the other edge features (columns 0 and 2, e.g. speed_rel, year),
      - Normalizing node features (coordinates).

    After the call ``edge_attr`` has exactly four columns:
    ``[feat_0_scaled, feat_2_scaled, month_sin, month_cos]``. The batch is
    modified in place, so the function must not be applied twice to the same
    batch.

    Parameters:
    -----------
    test_batch : torch_geometric.data.Batch
        A batched PyTorch Geometric Data object representing test graphs.
    scaler_dir : str or path-like, optional
        Directory containing ``target_scaler.pkl``, ``edge_scaler.pkl`` and
        ``node_scaler.pkl``. Defaults to
        ``<cwd>/../../../src/utils/helper_functions/scalers``; that default
        depends on the current working directory, so pass the directory
        explicitly when running from anywhere else.

    Returns:
    --------
    test_batch : torch_geometric.data.Batch
        The normalized batch.

    Raises:
    -------
    FileNotFoundError
        If one of the scaler files is missing from ``scaler_dir``.

    Notes:
    ------
    The scalers are read with ``joblib.load``, which is pickle-based and can
    execute arbitrary code; only load scaler files from a trusted source.
    """

    # === Load scalers === #
    if scaler_dir is None:
        project_root = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
        scaler_dir = os.path.join(project_root, "src", "utils", "helper_functions", "scalers")
    scaler_dir = os.fspath(scaler_dir)

    scaler_files = ("target_scaler.pkl", "edge_scaler.pkl", "node_scaler.pkl")
    missing = [
        name for name in scaler_files
        if not os.path.isfile(os.path.join(scaler_dir, name))
    ]
    if missing:
        # Fail before touching the batch, and say where we looked.
        raise FileNotFoundError(
            f"Scaler file(s) {missing} not found in '{scaler_dir}'. "
            "Run the training normalization first or pass scaler_dir explicitly."
        )

    # NOTE(security): joblib.load unpickles; use trusted files only.
    y_scaler = joblib.load(os.path.join(scaler_dir, "target_scaler.pkl"))
    feat_scaler = joblib.load(os.path.join(scaler_dir, "edge_scaler.pkl"))
    node_scaler = joblib.load(os.path.join(scaler_dir, "node_scaler.pkl"))

    # === Define indices === #
    target_idx = 4
    feat_indices = [0, 2]
    month_idx = 1

    # Read every raw column first; edge_attr is replaced further down.
    raw_edge_attr = test_batch.edge_attr

    # --- Target y --- #
    y_raw = raw_edge_attr[:, target_idx].reshape(-1, 1)
    test_batch.y = torch.tensor(
        y_scaler.transform(y_raw.numpy()), dtype=torch.float32
    )

    # --- Edge features --- #
    edge_feats = raw_edge_attr[:, feat_indices].numpy()
    edge_feats_scaled = torch.tensor(
        feat_scaler.transform(edge_feats), dtype=torch.float32
    )

    # Month cyclical encoding
    month_raw = raw_edge_attr[:, month_idx]
    month_sin = torch.sin(2 * np.pi * month_raw / 12).view(-1, 1)
    month_cos = torch.cos(2 * np.pi * month_raw / 12).view(-1, 1)

    test_batch.edge_attr = torch.cat([edge_feats_scaled, month_sin, month_cos], dim=1)

    # --- Node features --- #
    test_batch.x = torch.tensor(
        node_scaler.transform(test_batch.x.numpy()), dtype=torch.float32
    )

    return test_batch
