In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the preprocessed data

df = pd.read_csv('prepared_data/preprocessed_full_data.csv')
# DataFrame.info() writes its report itself and returns None, so it must not be
# wrapped in print() (that only appends a stray "None" line to the output).
df.info()
df.head()

In [None]:
# Show the timestamp dtype before and after parsing it into datetimes.
# The values are stringified first and then handed to pd.to_datetime.
print(df["timestamp"].dtype)
df['timestamp'] = pd.to_datetime(df['timestamp'].astype(str))
print(df["timestamp"].dtype)

# Downscale
def optimize_dtypes(df):
    """
    Return a memory-optimized copy of ``df``; the input frame is left untouched.

    - float64/float32 columns are stored as float32
    - int64/int32/int16/int8 columns are downcast to the smallest integer
      dtype that holds their values
    - object columns are converted to ``category``
    """
    shrunk = df.copy()

    # Floats: everything ends up as float32.
    for name in shrunk.select_dtypes(include=["float64", "float32"]).columns:
        shrunk[name] = shrunk[name].astype("float32")

    # Integers: let pandas pick the narrowest integer dtype per column.
    for name in shrunk.select_dtypes(include=["int64", "int32", "int16", "int8"]).columns:
        shrunk[name] = pd.to_numeric(shrunk[name], downcast="integer")

    # Strings / generic objects: categorical encoding.
    for name in shrunk.select_dtypes(include=["object"]).columns:
        shrunk[name] = shrunk[name].astype("category")

    return shrunk

df = optimize_dtypes(df)
# info() prints its own report and returns None; wrapping it in print() would
# only add a stray "None" line.
df.info()

In [None]:
import numpy as np

def evaluate_block_predictions(Y_true, Y_pred):
    """
    Evaluate block forecasts with vectorized NumPy operations.

    Parameters
    ----------
    Y_true, Y_pred : array-like of shape (N, H)
        One row per forecast block, one column per horizon step.

    Returns
    -------
    dict
        "MAE"  : mean absolute error over all elements.
        "RMSE" : root mean squared error over all elements.
        "Corr" : mean of the per-row Pearson correlation between the true and
                 predicted trajectory. Rows where the correlation is undefined
                 (either trajectory is constant, so its variance is zero) are
                 skipped; NaN is returned when no row has a defined value.
    """
    Y_true = np.asarray(Y_true)
    Y_pred = np.asarray(Y_pred)

    # ---- Basic metrics (vectorized) ----
    diff = Y_true - Y_pred
    mae  = np.mean(np.abs(diff))
    rmse = np.sqrt(np.mean(diff * diff))
    print("MAE:", mae, "RMSE:", rmse)

    # ---- Vectorized correlation ----
    # Center every block (row) around its own mean
    Yt = Y_true - Y_true.mean(axis=1, keepdims=True)
    Yp = Y_pred - Y_pred.mean(axis=1, keepdims=True)
    print("Mean centered Y_true and Y_pred for correlation calculation.")

    # Numerator: per-row sum of products of the centered values
    num = np.sum(Yt * Yp, axis=1)

    # Denominator: sqrt(sum of squares(true) * sum of squares(pred))
    denom = np.sqrt(np.sum(Yt * Yt, axis=1) * np.sum(Yp * Yp, axis=1))
    print("Computed denominator for correlation calculation.")

    # Divide only where the denominator is non-zero. np.where(..., num / denom)
    # would still evaluate the division for every row and emit RuntimeWarnings
    # for constant blocks (e.g. a flat baseline forecast).
    corr_per_block = np.full_like(denom, np.nan)
    np.divide(num, denom, out=corr_per_block, where=denom != 0)

    # Average over the rows with a defined correlation; avoids the
    # "Mean of empty slice" warning of np.nanmean when every row is undefined.
    defined = ~np.isnan(corr_per_block)
    corr = corr_per_block[defined].mean() if defined.any() else np.nan
    print("Correlation:", corr)

    return {
        "MAE": mae,
        "RMSE": rmse,
        "Corr": corr
    }


In [None]:
import matplotlib.pyplot as plt

def plot_block_predictions(df, horizon=24, detector_id=None,
                           years=None, months=None,
                           true_prefix="future_", pred_prefix="pred_",
                           max_blocks=10):
    """
    Plot true and predicted forecast trajectories for one detector.

    Parameters
    ----------
    df : DataFrame
        Must contain "detector_id", "timestamp" (datetime) and the columns
        f"{true_prefix}{h}h" / f"{pred_prefix}{h}h" for h = 1..horizon.
    horizon : int
        Number of forecast steps drawn per block.
    detector_id : optional
        Detector to plot; defaults to the detector of the first row of ``df``.
    years, months : list-like, optional
        Keep only rows whose timestamp year / month is in the given values.
    true_prefix, pred_prefix : str
        Column-name prefixes of the true and predicted values.
    max_blocks : int
        Maximum number of blocks (start rows) to draw.

    Notes
    -----
    Blocks are every ``horizon``-th row of the filtered frame in its current
    row order; the rows are not sorted here, so non-overlapping blocks assume
    the frame is ordered by time (TODO confirm against the caller).
    """
    # Boolean indexing below always returns new frames and nothing is written
    # back, so a defensive copy of the (potentially very large) input is not needed.
    df_plot = df

    # ---- FILTERING ----
    if detector_id is None:
        if df_plot.empty:
            print("No rows available; nothing to plot.")
            return
        detector_id = df_plot["detector_id"].iloc[0]
    df_plot = df_plot[df_plot["detector_id"] == detector_id]

    if years is not None:
        df_plot = df_plot[df_plot["timestamp"].dt.year.isin(years)]

    if months is not None:
        df_plot = df_plot[df_plot["timestamp"].dt.month.isin(months)]

    # ---- Take blocks every 'horizon' timesteps ----
    df_blocks = df_plot.iloc[::horizon].head(max_blocks)

    if df_blocks.empty:
        # Avoid drawing an empty figure (and the matplotlib legend warning).
        print("No rows match the requested detector/years/months; nothing to plot.")
        return

    # ---- Column lists ----
    true_cols = [f"{true_prefix}{h}h" for h in range(1, horizon+1)]
    pred_cols = [f"{pred_prefix}{h}h" for h in range(1, horizon+1)]

    # Offsets t+1h ... t+horizon h are the same for every block.
    step_offsets = pd.to_timedelta(np.arange(1, horizon+1), "h")

    fig, ax = plt.subplots(figsize=(14, 7))
    print("Plotting...")
    for _, row in df_blocks.iterrows():
        base_time = row["timestamp"]
        horizon_times = base_time + step_offsets

        # iterrows() yields object-dtype rows for mixed-type frames; cast to
        # float so matplotlib receives a plain numeric array.
        ax.plot(horizon_times, row[true_cols].to_numpy(dtype=float),
                label=f"True (start {base_time})", alpha=0.6)

        ax.plot(horizon_times, row[pred_cols].to_numpy(dtype=float),
                label=f"Pred (start {base_time})", alpha=0.6)

    ax.set_title(f"{horizon}-hour Forecast Trajectories")
    ax.set_xlabel("Time")
    ax.set_ylabel("Congestion Index")
    ax.legend()
    plt.show()


In [None]:
def evaluate_and_plot_block(df, horizon=24,
                            detector_id=None,
                            years=None,
                            months=None,
                            true_prefix="future_",
                            pred_prefix="pred_"):
    """
    Compute multi-step forecast metrics for ``df`` and plot sample trajectories.

    The metrics are computed on every row of ``df``; ``detector_id``, ``years``
    and ``months`` are only forwarded to the plot. Returns the metrics dict
    produced by ``evaluate_block_predictions``.
    """
    steps = range(1, horizon + 1)

    # ---- Metrics on the full frame ----
    actual = df[[f"{true_prefix}{h}h" for h in steps]].values
    forecast = df[[f"{pred_prefix}{h}h" for h in steps]].values
    metrics = evaluate_block_predictions(actual, forecast)

    print("=== Block Forecast Evaluation ===")
    for name in metrics:
        print(f"{name}: {metrics[name]:.4f}")

    # ---- Plot (filters apply here only) ----
    plot_kwargs = dict(
        horizon=horizon,
        detector_id=detector_id,
        years=years,
        months=months,
        true_prefix=true_prefix,
        pred_prefix=pred_prefix,
    )
    plot_block_predictions(df, **plot_kwargs)

    return metrics


In [None]:
def historical_baseline_multi(df, window_size=5, horizon=24):
    """
    Build a rolling-mean baseline forecast for every horizon step.

    Parameters
    ----------
    df : DataFrame
        Must contain "detector_id", "timestamp" and "congestion_index".
    window_size : int
        Number of rows (current row included) averaged per detector.
    horizon : int
        Number of future steps to create targets / predictions for.

    Returns
    -------
    DataFrame
        Columns: detector_id, timestamp, congestion_index, future_1h..future_{horizon}h,
        hist_baseline, pred_1h..pred_{horizon}h. Rows lacking any future target
        are dropped. The input frame is not modified.

    Notes
    -----
    Targets are produced by shifting rows within each detector, so "h hours
    ahead" only holds if each detector's rows are sorted by time at a regular
    hourly step; this is not checked here.
    """
    df_h = df[["detector_id", "timestamp", "congestion_index"]].copy()

    # Build the per-detector grouping once instead of once per horizon step.
    # observed=True keeps only detectors that actually occur when detector_id
    # is categorical (and avoids the pandas 2.x FutureWarning about the
    # changing default); it has no effect for non-categorical keys.
    per_detector = df_h.groupby("detector_id", observed=True)["congestion_index"]

    # Future targets: value h rows ahead within the same detector
    future_cols = [f"future_{h}h" for h in range(1, horizon+1)]
    for h, col in enumerate(future_cols, start=1):
        df_h[col] = per_detector.shift(-h)

    # Baseline: rolling mean over the last `window_size` rows of the detector;
    # dropping the group level restores the original row index for alignment.
    df_h["hist_baseline"] = (
        per_detector
            .rolling(window_size, min_periods=1)
            .mean()
            .reset_index(level=0, drop=True)
    )

    # The same baseline value is used as the prediction for every horizon step
    for h in range(1, horizon+1):
        df_h[f"pred_{h}h"] = df_h["hist_baseline"]

    # Drop rows where ANY future target is missing
    df_h = df_h.dropna(subset=future_cols)

    return df_h


# Build the rolling-mean baseline frame (default window_size) with 24 future-target
# and 24 prediction columns per row.
print("Baseline: historical average congestion.")
df_historical = historical_baseline_multi(df, horizon=24)

In [None]:
# Score the baseline on all rows of df_historical; the `years` filter only
# restricts which rows are plotted.
evaluate_and_plot_block(df_historical, horizon=24, years=[2019])


In [None]:
# Add lags function
def add_lag_features(df, lags: list):
    """
    Add per-detector lagged copies of ``congestion_index`` to ``df``.

    For every ``lag`` in ``lags`` a column ``lag_{lag}h`` is written holding the
    value ``lag`` rows earlier within the same ``detector_id``. The frame is
    modified in place and also returned.

    The shift is by rows, so the "h" in the column name only means hours if
    each detector's rows are time-sorted at an hourly step (not checked here).
    """
    # Group once and reuse it for every lag instead of rebuilding the grouping
    # per column. observed=True only matters for a categorical detector_id: it
    # skips unused categories and avoids the pandas 2.x FutureWarning.
    per_detector = df.groupby("detector_id", observed=True)["congestion_index"]
    for lag in lags:
        df[f'lag_{lag}h'] = per_detector.shift(lag)
    return df

# Adds future targets
def add_future_targets(df, horizon: int):
    """
    Add per-detector future target columns to ``df``.

    For h = 1..horizon a column ``future_{h}h`` is written holding the
    ``congestion_index`` value h rows later within the same ``detector_id``.
    The frame is modified in place and also returned.

    The shift is by rows, so "h hours ahead" only holds if each detector's rows
    are time-sorted at an hourly step (not checked here).
    """
    # Group once and reuse it for every horizon step instead of rebuilding the
    # grouping per column. observed=True only matters for a categorical
    # detector_id: it skips unused categories and avoids the pandas 2.x FutureWarning.
    per_detector = df.groupby("detector_id", observed=True)["congestion_index"]
    for h in range(1, horizon + 1):
        df[f'future_{h}h'] = per_detector.shift(-h)
    return df

In [None]:
# Print the frame summary (columns, dtypes, non-null counts) before the
# lag / target columns are added.
df.info()

In [None]:
# Prepare data for training (LightGBM)
#
# Adds lag features and future targets per detector, drops rows with missing
# lags/targets, reduces the frame to the columns that are used downstream and
# splits it by calendar year into train / validation / test.

horizon = 24
lags = [1, 6, 12, 24]

feature_cols = [
    # --- Time features ---
    "hour",
    "day_of_week",
    "month",
    "year",              # optional, can remove to avoid overfitting
    #"is_weekend",

    # --- Calendar features ---
    "is_holiday",
    "is_school_holiday",
    "is_rush_hour",
    #"season",            # categorical

    # --- Weather features ---
    "temperature",
    #"dew_point",
    "precipitation",
    #"relative_humidity",
    "visibility",
    "cloud_cover",
    #"is_rain",
    #"is_snow",
    #"is_fog",

    # --- Road info ---
    "free_flow_speed",   # numerical
    "detector_id",      # categorical
    "congestion_index",  # current value

    # --- Spatial features ---
    "lon",
    "lat"
]

lags_cols = [f'lag_{lag}h' for lag in lags]
target_cols = [f'future_{h}h' for h in range(1, horizon + 1)]

# Bookkeeping columns that are not model inputs but are read later when the
# prediction frame is assembled (test_df["timestamp"]); they must survive the
# column reduction below. X_train / X_val / X_test select feature_cols +
# lags_cols explicitly, so these never reach the model.
id_cols = ["timestamp"]


df["detector_id"] = df["detector_id"].astype("category")
if "season" in df.columns:
    df["season"] = df["season"].astype("category")
df = add_lag_features(df, lags)
df = add_future_targets(df, horizon)

df = df.dropna(subset=lags_cols + target_cols)
# Non-inplace drop: produces a fresh frame instead of mutating the dropna()
# result, and keeps the original column order of the retained columns.
keep_cols = feature_cols + lags_cols + target_cols + id_cols
df = df.drop(columns=df.columns.difference(keep_cols))

for col in df.select_dtypes(include=["float64"]).columns:
    df[col] = df[col].astype("float32")

for col in df.select_dtypes(include=["int64"]).columns:
    df[col] = df[col].astype("int32")

# Year-based split; the lists are the single source of truth for the masks below.
# NOTE(review): targets are built before the split, so the last rows of a split
# carry target values that fall into the next split's year — confirm this is acceptable.
years_train = [2015, 2016, 2017, 2018, 2019, 2020, 2021]
years_val = [2022, 2023]
years_test = [2024]

train_df = df[df["year"].isin(years_train)]
val_df   = df[df["year"].isin(years_val)]
test_df  = df[df["year"].isin(years_test)]

X_train = train_df[feature_cols + lags_cols]
X_val   = val_df[feature_cols + lags_cols]
X_test  = test_df[feature_cols + lags_cols]

In [None]:
# Print the frame summary again, now for the reduced frame produced by the
# data-preparation cell.
df.info()

In [None]:
# LightGBM model training: one independent regressor per forecast horizon step.

import lightgbm as lgb

models = {}
horizon = 24

# LightGBM params
params = {
    "objective": "regression",
    "metric": "l2",
    "learning_rate": 0.05,
    "num_leaves": 96,
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,
    "min_data_in_leaf": 200,
    "max_bin": 255,
    "verbosity": -1,

    # Device settings (currently CPU; the gpu_* values are left at -1 / False)
    "device": "cpu",
    "gpu_platform_id": -1,
    "gpu_device_id": -1,
    "gpu_use_dp": False
}

# Declare only categorical columns that are really part of the feature matrix:
# LightGBM rejects names in categorical_feature that are not features, and
# "season" is currently commented out of feature_cols.
categorical_features = [c for c in ("detector_id", "season") if c in X_train.columns]

# Stop a horizon's training once the validation metric has not improved for
# this many rounds. Without it the validation set is evaluated every round but
# never used, and Booster.best_iteration (read at prediction time) is never set.
early_stopping_rounds = 50

print("Training 24 LightGBM models...")

for i, tgt in enumerate(target_cols):
    print(f"\n - Training horizon {i+1}/{horizon}: {tgt}")

    y_train = train_df[tgt].values
    y_val   = val_df[tgt].values

    train_ds = lgb.Dataset(
        X_train,
        y_train,
        categorical_feature=categorical_features
    )

    val_ds = lgb.Dataset(
        X_val,
        y_val,
        categorical_feature=categorical_features,
        reference=train_ds
    )

    model = lgb.train(
        params,
        train_ds,
        valid_sets=[train_ds, val_ds],
        valid_names=["train", "val"],
        num_boost_round=2000,
        callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds, verbose=False)]
    )

    models[tgt] = model

print("\n Training complete.")



In [None]:
# =====================================================
# Fast LightGBM Training (Optimized for Speed)
# =====================================================

import lightgbm as lgb
import numpy as np
from joblib import Parallel, delayed

horizon = 24
models = {}

# ================================
# 1) FAST DATA REDUCTION
# ================================
# Train on a fixed-seed 30% row sample of the training split; validation data is
# used in full. Note that this rebinds X_train / X_val from the preparation cell.
print("Sampling training data for speed...")
train_df_sampled = train_df.sample(frac=0.30, random_state=42)

X_train = train_df_sampled[feature_cols + lags_cols]
X_val   = val_df[feature_cols + lags_cols]

# ================================
# 2) LightGBM FAST CPU PARAMS
# ================================
params = {
    "objective": "regression",
    "metric": "l2",
    "learning_rate": 0.05,

    # Faster trees
    "num_leaves": 48,
    "min_data_in_leaf": 100,
    "max_bin": 255,

    # Regularization to stabilize
    "feature_fraction": 0.8,
    "bagging_fraction": 0.8,
    "bagging_freq": 3,

    "verbosity": -1,

    # CPU FAST MODE
    # NOTE(review): each model asks for all threads while the launcher runs
    # several models at once — confirm this does not oversubscribe the CPU.
    "device": "cpu",
    "num_threads": -1
}

# Check against the feature matrix that is actually handed to LightGBM (not the
# full df): a categorical name that is not a feature column is rejected.
categorical_features = [c for c in ("detector_id", "season") if c in X_train.columns]
target_cols = [f"future_{h}h" for h in range(1, horizon+1)]


# ================================
# 3) TRAIN ONE HORIZON
# ================================
def train_one_horizon(i, tgt):
    """
    Train the LightGBM model for a single horizon target column.

    Uses the notebook-level ``X_train`` / ``train_df_sampled`` for fitting and
    ``X_val`` / ``val_df`` for validation, with the notebook-level ``params``
    and ``categorical_features``.

    Parameters
    ----------
    i : int
        1-based position of the horizon step (used for the progress message only).
    tgt : str
        Name of the target column to fit.

    Returns
    -------
    tuple
        ``(tgt, model)`` where ``model`` is the trained booster.
    """
    print(f"\n - Training horizon {i}/{horizon}: {tgt}")

    train_ds = lgb.Dataset(
        X_train,
        train_df_sampled[tgt].values,
        categorical_feature=categorical_features
    )

    val_ds = lgb.Dataset(
        X_val,
        val_df[tgt].values,
        categorical_feature=categorical_features,
        reference=train_ds
    )

    model = lgb.train(
        params,
        train_ds,
        valid_sets=[val_ds],
        valid_names=["val"],
        num_boost_round=800,               # much smaller
        # Use the validation set that is evaluated every round: stop once it has
        # not improved for 50 rounds. This also sets model.best_iteration, which
        # is what the prediction step passes as num_iteration.
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)],
    )

    return tgt, model


# ================================
# 4) PARALLEL TRAINING (BIG SPEEDUP)
# ================================
# Launch one training task per target column (steps numbered from 1) on two
# workers, then index the returned (target, model) pairs by target name.
print("Training 24 LightGBM models in parallel...")
results = Parallel(n_jobs=2)(   # adjust based on CPU cores
    delayed(train_one_horizon)(step, name)
    for step, name in enumerate(target_cols, start=1)
)

# Store results
models = dict(results)

print("\n Training complete.")


In [None]:
def predict_24h(models, X):
    """
    Predict every horizon target for the rows of ``X``.

    Parameters
    ----------
    models : dict
        Maps each name in the notebook-level ``target_cols`` to its trained model.
    X : DataFrame
        Feature matrix to predict on.

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(X), len(target_cols)); column ``i`` holds
        the predictions of the model for ``target_cols[i]``.
    """
    # Size the output from target_cols (the list that is iterated) rather than
    # the separate global `horizon`: both are reassigned in several cells, and a
    # mismatch would raise an IndexError or leave silent all-zero columns.
    preds = np.zeros((len(X), len(target_cols)), dtype="float32")
    for i, tgt in enumerate(target_cols):
        booster = models[tgt]
        preds[:, i] = booster.predict(X, num_iteration=booster.best_iteration)
    return preds


Y_pred_test = predict_24h(models, X_test)

# Convert to DataFrame (default positional index, row i <-> row i of test_df)
pred_cols = [f"pred_{h}h" for h in range(1, horizon+1)]
pred_df = pd.DataFrame(Y_pred_test, columns=pred_cols)

# Attach identifier columns for readability. test_df only carries the columns
# kept by the data-preparation cell, which may not include "timestamp", so copy
# just the identifiers that are available instead of failing with a KeyError.
for id_col in ("timestamp", "detector_id"):
    if id_col in test_df.columns:
        pred_df[id_col] = test_df[id_col].values


In [None]:
# Put targets and predictions side by side (both on a fresh positional index)
# and score the test split.
merged = test_df.reset_index(drop=True).join(pred_df[pred_cols])
results = evaluate_block_predictions(merged[target_cols].values, merged[pred_cols].values)
