In [1]:
import os
import sys

# Make the project's packages importable from this notebook.
# A sys.path entry has to be a directory (or a zip archive); an entry that
# names the notebook file itself is ignored by the import system, so the path
# must stop at the folder that holds the notebook.
# NOTE(review): presumably this folder is also the one containing `src/` — confirm.
PROJECT_ROOT = r"C:\Users\adib4\OneDrive\Documents\Projets perso\CongestionAI"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import torch.nn as nn

from src.model_pipelines.dl_pipeline import train_model, evaluate, predict
from src.utils.model_evaluation import evaluate_and_plot_block
from src.utils.hist_baseline import historical_baseline_multi
from src.utils.preprocessing import cyclical_encode, scale_features, encode_detectors
from src.utils.sequences import create_nhits_sequences, NHitsDataset
from src.utils.plots import plot_training_curves
from src.model_pipelines.losses import (
    SpikeWeightedMSELoss,
    TwoTermSpikeLoss,
    DeltaLoss,
    LossConfig,
    create_loss
)

from src.utils.crafted_features import (
    SpikeFeatureConfig,
    add_spike_features,
    add_delta_features,
    add_rolling_stats,
    add_spike_labels,
    add_lags_and_drop
)


from src.models.mlp_forecaster import MLPForecaster
from src.models.n_hits import NHitsForecaster
from src.models.tcn_forecaster import TCNForecaster
from src.models.transformer_forecaster import TransformerForecaster

# CSV read into `df_base` by the data-loading cell; the path is relative, so it
# is resolved against the kernel's current working directory.
FILE_PATH = "prepared_data/preprocessed_full_data.csv"

In [2]:
# Load the prepared CSV, parse timestamps, and remember each row's original
# position in `orig_idx` (used later as the index of the split frames).
df_base = pd.read_csv(FILE_PATH)
df_base = df_base.assign(
    timestamp=pd.to_datetime(df_base["timestamp"]),
    orig_idx=df_base.index,
)
# Project helper; presumably adds the *_sin / *_cos calendar columns — see src.utils.preprocessing.
df_base = cyclical_encode(df_base)

In [6]:
# Descriptive settings for this run.
config = dict(
    n_blocks=6,
    hidden_dim=512,
    history="24h_hourly",
    weather_lags="sparse_24h",
)

# Fixed params
forecast_horizon = 8
nb_detectors = 20
# Order is [train years, validation years, test years]; an empty test list
# makes prepare_dl_data_with_spikes skip the test split.
years_split = [
    [year for year in range(2016, 2025) if year != 2019],
    [2019],
    [],
]

# Feature configs
# Columns handed to scale_features as `feature_cols_norm`.
feature_cols_norm_base = [
    "temperature",
    "precipitation",
    "visibility",
    "congestion_index",
    "free_flow_speed",
]
# Model input columns before the weather-lag columns are appended.
feature_cols_base = [
    "hour_sin", "hour_cos",
    "dow_sin", "dow_cos",
    "month_sin", "month_cos",
    "lon", "lat",
    "year", "season",
    "temperature", "precipitation", "visibility",
    "congestion_index", "free_flow_speed",
]

# History offsets 0..23.
h_offsets = [*range(24)]
# Dense lags 0, -1, ..., -7. The data-prep call visible in this notebook passes
# `w_lags` instead, so this list is not used by that call.
weather_lags = [-step for step in range(8)]
spike_config = None
# Sparse weather lags actually passed to prepare_dl_data_with_spikes.
w_lags = [-1, -2, -6, -8]

In [4]:
def prepare_dl_data_with_spikes(history_offsets, forecast_horizon, nb_detectors, df_base,
                                years_split, feature_cols_norm, feature_cols_base,
                                weather_lags, spike_config=None):
    """Prepare scaled train/val/test frames and model sequences, optionally with spike features.

    Steps, in order: keep the first ``nb_detectors`` detector ids found in
    ``df_base``; sort by detector and timestamp; fill a ``season`` code from
    ``month``; optionally add spike features; encode detectors; split by
    calendar year; scale; add weather lags; drop rows with missing spike
    features; trim to the needed columns; build sequences.

    Args:
        history_offsets: Forwarded unchanged to ``create_nhits_sequences``.
        forecast_horizon: Forwarded unchanged to ``create_nhits_sequences``.
        nb_detectors: Number of distinct ``detector_id`` values to keep, taken
            in order of first appearance in ``df_base``.
        df_base: Source frame. This function reads the columns
            ``detector_id``, ``timestamp``, ``month`` and ``orig_idx``.
        years_split: Indexable of three year lists ``[train, val, test]``. A
            falsy third entry disables the test split.
        feature_cols_norm: Column names passed to ``scale_features``.
        feature_cols_base: Base model input columns.
        weather_lags: Lags used to name the ``*_lag_{lag}h`` columns and passed
            to ``add_lags_and_drop``.
        spike_config: Optional config object. When given it must expose
            ``enable_deltas``, ``enable_rolling_stats``,
            ``get_feature_columns()`` and ``get_normalization_columns()``.

    Returns:
        A 17-tuple: the four outputs of ``create_nhits_sequences`` for train,
        then for val, then for test (four ``None`` values when there is no
        test split), followed by the ``train``, ``val``, ``test`` frames and
        the two scalers returned by ``scale_features``.
    """

    def _maybe(frame, transform):
        # Apply `transform` unless the (optional) test split is absent.
        return None if frame is None else transform(frame)

    print("Loading data...")
    selected_ids = df_base["detector_id"].unique()[:nb_detectors]
    df_small = (
        df_base[df_base["detector_id"].isin(selected_ids)]
        .copy()
        .sort_values(["detector_id", "timestamp"])
    )

    # Season encoding: 0 = Dec-Feb, 1 = Mar-May, 2 = Jun-Aug, 3 = Sep-Nov
    month = df_small["month"]
    season_rules = (
        ((month <= 2) | (month == 12), 0),
        ((month > 2) & (month <= 5), 1),
        ((month > 5) & (month <= 8), 2),
        ((month > 8) & (month <= 11), 3),
    )
    for month_mask, season_code in season_rules:
        df_small.loc[month_mask, "season"] = season_code

    # Work on copies so the caller's lists are never modified.
    feature_cols = feature_cols_base.copy()
    feature_cols_norm_full = feature_cols_norm.copy()

    # Optional spike features
    if spike_config is not None:
        print(f"Adding spike features: deltas={spike_config.enable_deltas}, rolling={spike_config.enable_rolling_stats}")
        df_small = add_spike_features(df_small, spike_config)
        spike_feature_cols = spike_config.get_feature_columns()
        spike_norm_cols = spike_config.get_normalization_columns()
        feature_cols = feature_cols + spike_feature_cols
        feature_cols_norm_full = feature_cols_norm_full + spike_norm_cols
        print(f"  Added columns: {spike_feature_cols}")

    # Detector encoding (the mapping is returned by the helper but not used here)
    df_small, det2idx = encode_detectors(df_small)

    # Register the weather-lag column names: all temperature lags, then
    # precipitation, then visibility.
    if "temperature" in feature_cols:
        lag_names = []
        lag_names.extend(f"temperature_lag_{lag}h" for lag in weather_lags)
        lag_names.extend(f"precipitation_lag_{lag}h" for lag in weather_lags)
        lag_names.extend(f"visibility_lag_{lag}h" for lag in weather_lags)
        feature_cols = feature_cols + lag_names

    # Year-based split, re-indexed by the original row position
    row_year = df_small["timestamp"].dt.year

    def _split(years):
        return df_small[row_year.isin(years)].copy().set_index("orig_idx")

    train = _split(years_split[0])
    val = _split(years_split[1])
    test = _split(years_split[2]) if years_split[2] else None

    # Normalization
    minmax_cols = ["lon", "lat", "year", "season"]
    train, val, test, std_scaler, mm_scaler = scale_features(
        train, val, test, feature_cols_norm_full, latlon_cols=minmax_cols
    )

    # Weather lags are added per split, after scaling
    if "temperature" in feature_cols_base:
        def _with_lags(frame):
            return add_lags_and_drop(frame, weather_lags)

        train = _with_lags(train)
        val = _with_lags(val)
        test = _maybe(test, _with_lags)

    # Remove rows where any spike feature is missing
    if spike_config is not None:
        spike_cols_in_df = [c for c in spike_feature_cols if c in train.columns]

        def _drop_incomplete(frame):
            return frame.dropna(subset=spike_cols_in_df)

        train = _drop_incomplete(train)
        val = _drop_incomplete(val)
        test = _maybe(test, _drop_incomplete)

    # Trim to the feature columns plus identifiers, skipping names absent from
    # train (congestion_index is already part of feature_cols)
    wanted = feature_cols + ["timestamp", "detector_id", "det_index"]
    keep_cols = [c for c in wanted if c in train.columns]

    train = train[keep_cols]
    val = val[keep_cols]
    test = _maybe(test, lambda frame: frame[keep_cols])

    # Build sequences
    def _sequences(frame):
        return create_nhits_sequences(frame, feature_cols, history_offsets, forecast_horizon)

    X_train_hist, Y_train, idx_train, det_train = _sequences(train)
    X_val_hist, Y_val, idx_val, det_val = _sequences(val)
    if test is None:
        X_test_hist, Y_test, idx_test, det_test = None, None, None, None
    else:
        X_test_hist, Y_test, idx_test, det_test = _sequences(test)

    print(f"Sequences created. Features: {len(feature_cols)}, Train samples: {len(Y_train)}")

    sequence_outputs = (
        X_train_hist, Y_train, idx_train, det_train,
        X_val_hist, Y_val, idx_val, det_val,
        X_test_hist, Y_test, idx_test, det_test,
    )
    return sequence_outputs + (train, val, test, std_scaler, mm_scaler)

In [8]:
# Build the scaled frames and the model-ready sequences (no spike features).
(
    X_train_hist, Y_train, idx_train, det_train,
    X_val_hist, Y_val, idx_val, det_val,
    X_test_hist, Y_test, idx_test, det_test,
    train, val, test, std_scaler, mm_scaler,
) = prepare_dl_data_with_spikes(
    history_offsets=h_offsets,
    forecast_horizon=forecast_horizon,
    nb_detectors=nb_detectors,
    df_base=df_base,
    years_split=years_split,
    feature_cols_norm=feature_cols_norm_base,
    feature_cols_base=feature_cols_base,
    weather_lags=w_lags,
    spike_config=None,
)

Loading data...
Sequences created. Features: 27, Train samples: 1068551


In [9]:
# Check what the 0.15 spike threshold means in this data.
spike_threshold = 0.15  # threshold under evaluation, in scaled congestion_index units

# Scaled by scale_features during data prep (presumably z-normalized — confirm in src.utils.preprocessing).
train_ci = train["congestion_index"]

# Compute absolute step-to-step deltas *per detector*.
# `train` stacks every detector's series one after another (sorted by
# detector_id, then timestamp), so a plain Series.diff() would also subtract
# the last reading of one detector from the first reading of the next and
# count that jump as a delta. Grouping keeps only within-detector changes.
# NOTE(review): consecutive rows of one detector may still be more than one
# step apart in time (e.g. across a year missing from the training split).
deltas = train.groupby("detector_id")["congestion_index"].diff().abs().dropna()

print("Delta stats (z-normalized):")
print(f"  Mean: {deltas.mean():.4f}")
print(f"  Std:  {deltas.std():.4f}")
print(f"  Median: {deltas.median():.4f}")
print(f"  90th percentile: {deltas.quantile(0.90):.4f}")
print(f"  95th percentile: {deltas.quantile(0.95):.4f}")
print(f"  99th percentile: {deltas.quantile(0.99):.4f}")

# What % is flagged as spike with current threshold?
pct_flagged = (deltas > spike_threshold).mean() * 100
print(f"\n% flagged as spike with threshold={spike_threshold}: {pct_flagged:.1f}%")

# Recommended: use 90th or 95th percentile
recommended_threshold = deltas.quantile(0.90)
print(f"\nRecommended threshold (90th pct): {recommended_threshold:.3f}")

Delta stats (z-normalized):
  Mean: 0.1639
  Std:  0.1985
  Median: 0.1011
  90th percentile: 0.3797
  95th percentile: 0.5263
  99th percentile: 0.9592

% flagged as spike with threshold=0.15: 36.4%

Recommended threshold (90th pct): 0.380
