# EDA and Baselines

This notebook covers data loading, STL Decomposition, and Changepoint Detection.

## 1. STL Decomposition

In [1]:
import os
import sys
from pathlib import Path

# Robustly determine project root and NAB path
def find_project_root(start=None, markers=('src', 'NAB')):
    """
    Locate the project root by walking up from `start`.

    Parameters
    ----------
    start : str or Path, optional
        Directory to start from. Defaults to the current working directory.
    markers : iterable of str
        Entries that must all exist inside a directory for it to count as
        the project root (this notebook imports from `src` and reads `NAB`).

    Returns
    -------
    Path
        The closest directory (starting with `start` itself) that contains
        every marker. If none does, falls back to the previous heuristic:
        the parent of `start` when `start` is named 'notebooks', else `start`.
    """
    start_dir = Path(os.getcwd()) if start is None else Path(start)
    for candidate in (start_dir, *start_dir.parents):
        if all((candidate / marker).exists() for marker in markers):
            return candidate
    # No marker-bearing ancestor: keep the original name-based guess.
    if start_dir.name == 'notebooks':
        return start_dir.parent
    return start_dir


current_dir = Path(os.getcwd())
project_root = find_project_root(current_dir)

# Add project root to sys.path
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

nab_root = str(project_root / 'NAB')
print(f"Project Root: {project_root}")
print(f"NAB Root: {nab_root}")
# Fail loudly and early: the pipelines below read files under NAB/.
if not Path(nab_root).is_dir():
    print(f"WARNING: NAB dataset directory not found at {nab_root}. "
          "Clone https://github.com/numenta/NAB there before running the cells below.")
# src/run_stl_analysis.py
# (the project root is already on sys.path from the setup code above, so the `src` imports below resolve)

from pathlib import Path
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import STL
from src.load_nab import load_series, load_labels, mark_anomaly_windows
from src.kalman_model import detect_anomalies_by_residual
from src.evaluate import compute_event_level_metrics

def run_stl_pipeline(nab_root: str, file_key: str, period: int = 48, save_dir: str = "./results/stl",
                     window: int = 48, persistence: int = 2, k_grid=None):
    """
    Run STL Decomposition and detect anomalies on the residual.

    Steps: load the series and its labels, fit a robust STL with the given
    `period`, save a four-panel decomposition plot, sweep the threshold
    multiplier `k` over `k_grid`, and write the per-point results and the
    metrics to `save_dir/<file_key with "/" replaced by "__">/`.

    Parameters
    ----------
    nab_root : str
        Root directory passed to `load_series` / `load_labels`.
    file_key : str
        Series key, e.g. "realKnownCause/nyc_taxi.csv". Labels are looked up
        under this key and, failing that, under "data/" + file_key.
    period : int
        Seasonal period handed to STL.
    save_dir : str
        Base output directory.
    window : int
        Rolling window forwarded to `detect_anomalies_by_residual`
        (default 48, the previously hard-coded value).
    persistence : int
        Persistence forwarded to `detect_anomalies_by_residual`
        (default 2, the previously hard-coded value).
    k_grid : iterable of float, optional
        Candidate threshold multipliers. Defaults to
        `np.linspace(3.0, 12.0, 10)`.

    Returns
    -------
    dict
        {"event_level": <metrics of the selected k>, "best_k": <float>}.

    Notes
    -----
    `k` is selected by maximising event-level F1 against the same labels
    that the reported metrics are computed on, so the reported score is an
    optimistic (in-sample) figure.
    """
    save_dir = Path(save_dir) / file_key.replace("/", "__")
    save_dir.mkdir(parents=True, exist_ok=True)

    if k_grid is None:
        k_grid = np.linspace(3.0, 12.0, 10)

    # load data
    df = load_series(nab_root, file_key)
    labels = load_labels(nab_root)
    label_times = labels.get(file_key, labels.get("data/" + file_key, []))
    df = mark_anomaly_windows(df, label_times, window_size=3) # gap=3 as requested

    values = df['value']

    print(f"Running STL Decomposition (period={period})...")
    res = STL(values, period=period, robust=True).fit()

    # Components
    trend = res.trend
    seasonal = res.seasonal
    resid = res.resid

    # Plot Decomposition: one panel per component, sharing the time axis.
    panels = [
        ('Original', 'Original Data', values, 'black'),
        ('Trend', 'Trend', trend, 'blue'),
        ('Seasonal', 'Seasonal', seasonal, 'green'),
        ('Residual', 'Residual', resid, 'red'),
    ]
    fig, axes = plt.subplots(4, 1, figsize=(15, 12), sharex=True)
    for ax, (label, title, series, color) in zip(axes, panels):
        ax.plot(df['timestamp'], series, label=label, color=color)
        ax.set_title(title)
        ax.legend()

    plt.tight_layout()
    plot_path = save_dir / "stl_decomposition.png"
    plt.savefig(plot_path)
    print(f"Saved decomposition plot to {plot_path}")

    # Detect Anomalies on Residuals
    # The residual is treated as the "anomaly signal": actual = resid,
    # expected mean = 0, and the whole residual history is the reference.
    resid_values = resid.values
    true_flags = df['is_anomaly'].values

    def _detect(k):
        # Single place for the detector settings so the sweep and the
        # fallback below cannot drift apart.
        return detect_anomalies_by_residual(
            resid_values,
            np.zeros_like(resid_values),
            resid_values,
            k=k,
            use_mad=False,
            use_rolling=True,
            window=window,
            persistence=persistence
        )

    print("\n--- Starting Threshold Sweep on Residuals (Rolling Sigma) ---")
    best_k = 3.0
    best_f1 = -1.0
    best_metrics = None
    best_flags = None

    for k_candidate in k_grid:
        flags_temp = _detect(k_candidate)
        m_evt = compute_event_level_metrics(true_flags, flags_temp, gap=3)
        f1 = m_evt['f1']

        # Strict ">" keeps the smallest k among ties.
        if f1 > best_f1:
            best_f1 = f1
            best_k = float(k_candidate)
            best_metrics = m_evt
            best_flags = flags_temp

    if best_flags is None:
        # No candidate beat the sentinel (empty grid or non-comparable F1):
        # fall back to the default k so the outputs are still populated.
        best_flags = _detect(best_k)
        best_metrics = compute_event_level_metrics(true_flags, best_flags, gap=3)
        best_f1 = best_metrics['f1']

    print(f"--- Best Threshold: k={best_k:.1f} with F1={best_f1:.4f} ---\n")

    # Save results
    out_df = df.copy()
    out_df['trend'] = trend
    out_df['seasonal'] = seasonal
    out_df['residual'] = resid
    out_df['detected'] = best_flags
    out_df.to_csv(save_dir / "stl_results.csv", index=False)

    metrics = {
        "event_level": best_metrics,
        "best_k": best_k
    }

    def _to_builtin(obj):
        # Only called by json for values it cannot encode itself. The metrics
        # come from an external helper and may hold NumPy scalars/arrays
        # (not visible here); convert those instead of failing.
        if hasattr(obj, "tolist"):
            return obj.tolist()
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")

    with open(save_dir / "metrics.json", "w") as f:
        json.dump(metrics, f, indent=2, default=_to_builtin)

    print(f"Saved results to {save_dir}")
    print("STL Event Metrics:", best_metrics)
    return metrics

if __name__ == "__main__":
    # Notebook entry point: run the STL pipeline on the NYC taxi series.
    # `project_root` comes from the setup code at the top of this cell.
    nab_root = str(project_root.joinpath('NAB'))
    file_key = "realKnownCause/nyc_taxi.csv"
    # period=48: daily seasonality (30 min * 48 = 24h)
    run_stl_pipeline(nab_root, file_key, period=48, save_dir="./results/stl")


FileNotFoundError: [Errno 2] No such file or directory: 'NAB\\data\\realKnownCause\\nyc_taxi.csv'

In [ ]:

# --- Residual Diagnostics ---
# We analyze the residuals from the STL decomposition to check for Normality and Heteroskedasticity.
from src.plotting import plot_residual_diagnostics
import pandas as pd

# Load the STL results written by run_stl_pipeline
# (<save_dir>/<file_key with "/" replaced by "__">/stl_results.csv).
stl_results_path = Path("./results/stl/realKnownCause__nyc_taxi.csv/stl_results.csv")
if stl_results_path.exists():
    df_stl = pd.read_csv(stl_results_path)
    residuals = df_stl['residual'].values

    print("Generating Residual Diagnostics...")
    save_dir = Path("./results/eda")
    # Create the output folder up front, as the ACF/PACF cell does; harmless
    # if plot_residual_diagnostics also creates it.
    save_dir.mkdir(parents=True, exist_ok=True)
    plot_residual_diagnostics(residuals, "STL_Residuals", save_dir)

    # Display the plots inline
    from IPython.display import Image, display
    for plot_name in ("residual_hist.png", "residual_qq.png", "residual_rolling_std.png"):
        plot_path = save_dir / plot_name
        if plot_path.exists():
            # str(): keeps this working on IPython versions whose Image
            # expects a string filename.
            display(Image(filename=str(plot_path)))
        else:
            print(f"Expected diagnostic plot not found: {plot_path}")
else:
    print("STL results not found. Run the STL cell above first.")




### Interpretation
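- **Seasonality**: The data shows strong daily and weekly seasonality. STL is run here with `period=48`, so it models the daily cycle explicitly; the weekly pattern is not modelled as a separate seasonal component.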
- **Residuals**: The residuals are **heavy-tailed** (high Kurtosis) and **heteroskedastic** (variance changes over time), as seen in the Rolling Volatility plot.
- **Implication**: Standard thresholding (mean +/- 3*std) will fail. We must use **Robust Statistics** (MAD) or **Adaptive Thresholding** (Rolling Sigma).



## 2. Bayesian Online Changepoint Detection (BOCPD)

In [None]:
# src/run_bocpd_on_taxi.py
import sys
import os
# Add project root to sys.path
# Reuse `project_root` from the setup cell (this cell already needs it to build
# `nab_root` further down) instead of assuming the root is the parent of the
# working directory, and skip the append when it is already present so
# re-running the cell does not grow sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.bocpd_model import BOCPD
from src.load_nab import load_series, load_labels, mark_anomaly_windows

def run_bocpd_pipeline(nab_root: str, file_key: str, save_dir: str = "./results/bocpd",
                       hazard_rate: float = 1/100, cp_threshold: float = 0.5):
    """
    Run BOCPD on one NAB series and save the changepoint-probability trace.

    Loads the series and its labels, standardises the values, feeds them one
    by one to `BOCPD`, then plots the data with labelled anomalies above the
    r=0 column of the run-length matrix and writes both the plot and a CSV to
    `save_dir/<file_key with "/" replaced by "__">/`.

    Parameters
    ----------
    nab_root : str
        Root directory passed to `load_series` / `load_labels`.
    file_key : str
        Series key, e.g. "realKnownCause/nyc_taxi.csv". Labels are looked up
        under this key and, failing that, under "data/" + file_key.
    save_dir : str
        Base output directory.
    hazard_rate : float
        Hazard rate passed to `BOCPD`. The default 1/100 means roughly one
        expected changepoint per 100 steps (about 50 hours at 30-minute
        sampling).
    cp_threshold : float
        Points whose changepoint probability exceeds this value are
        highlighted in the plot (default 0.5).

    Returns
    -------
    pandas.DataFrame
        Copy of the loaded frame with an added `cp_prob` column.

    Raises
    ------
    ValueError
        If the changepoint-probability trace does not have one entry per row
        of the series.
    """
    save_dir = Path(save_dir) / file_key.replace("/", "__")
    save_dir.mkdir(parents=True, exist_ok=True)

    # load data
    df = load_series(nab_root, file_key)
    labels = load_labels(nab_root)
    label_times = labels.get(file_key, labels.get("data/" + file_key, []))
    df = mark_anomaly_windows(df, label_times, window_size=1)

    values = df['value'].values
    # Normalize data for BOCPD (important for priors)
    mean_val = np.mean(values)
    std_val = np.std(values)
    if std_val == 0:
        # Constant series: avoid dividing by zero; centred values are all 0.
        std_val = 1.0
    values_norm = (values - mean_val) / std_val

    print("Initializing BOCPD...")
    bocpd = BOCPD(hazard_rate=hazard_rate, mean_prior=0, var_prior=1, var_data=1)

    print(f"Processing {len(values)} points...")
    for i, x in enumerate(values_norm):
        bocpd.update(x)
        if (i+1) % 1000 == 0:
            print(f"Processed {i+1}/{len(values)}")

    R_mat = bocpd.get_run_length_matrix()

    # Probability of a changepoint (run length r=0) at each step.
    # R_mat is expected to have shape (T+1, T+1) because it includes the
    # initial state, so the first row (initial prior) is skipped to align
    # with the data.
    # NOTE(review): in the textbook constant-hazard BOCPD recursion the
    # posterior mass at r=0 equals the hazard rate at every step, which would
    # make this curve flat and never cross `cp_threshold` - confirm against
    # src.bocpd_model.
    cp_probs = R_mat[1:, 0]
    if len(cp_probs) != len(df):
        raise ValueError(
            f"Changepoint trace has {len(cp_probs)} entries but the series has {len(df)} rows"
        )

    # Plot
    fig, axes = plt.subplots(2, 1, figsize=(15, 10), sharex=True)

    # Plot Data
    axes[0].plot(df['timestamp'], values, label='Data', color='black', alpha=0.7)
    # Plot Labeled Anomalies (mask computed once, used for both axes values)
    labeled = df[df['is_anomaly'] == 1]
    axes[0].scatter(labeled['timestamp'], labeled['value'], color='red', label='Labeled Anomaly')
    axes[0].set_title(f"Data & Labels: {file_key}")
    axes[0].legend()

    # Plot CP Probability
    axes[1].plot(df['timestamp'], cp_probs, label='Changepoint Prob (r=0)', color='blue')
    axes[1].set_title("Changepoint Probability")
    axes[1].set_ylim(0, 1.1)

    # Highlight high CP probability regions
    high_cp_indices = np.where(cp_probs > cp_threshold)[0]
    if len(high_cp_indices) > 0:
        axes[1].scatter(df['timestamp'].iloc[high_cp_indices], cp_probs[high_cp_indices], color='orange', s=10)

    plt.tight_layout()
    plot_path = save_dir / "bocpd_result.png"
    plt.savefig(plot_path)
    print(f"Saved plot to {plot_path}")

    # Save CP probabilities
    out_df = df.copy()
    out_df['cp_prob'] = cp_probs
    out_df.to_csv(save_dir / "cp_probs.csv", index=False)

    return out_df

if __name__ == "__main__":
    # Notebook entry point: run BOCPD on the NYC taxi series.
    # `project_root` is the notebook-level variable set by the setup cell.
    nab_root = str(project_root.joinpath('NAB'))
    file_key = "realKnownCause/nyc_taxi.csv"
    run_bocpd_pipeline(nab_root, file_key, save_dir="./results/bocpd")


In [None]:

# --- Additional EDA: ACF and PACF ---
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import matplotlib.pyplot as plt

# The pipeline cells only bind `df` inside run_stl_pipeline / run_bocpd_pipeline,
# so after Restart & Run All there is no notebook-level `df`. Reuse one if the
# user defined it; otherwise reload the series with the loader and NAB root
# set up in the earlier cells.
if 'df' in globals():
    acf_df = globals()['df']
else:
    acf_df = load_series(nab_root, "realKnownCause/nyc_taxi.csv")

fig, ax = plt.subplots(2, 1, figsize=(12, 8))
plot_acf(acf_df['value'], lags=50, ax=ax[0])
plot_pacf(acf_df['value'], lags=50, ax=ax[1])
plt.tight_layout()
save_dir = Path('./results/eda')
save_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(save_dir / 'acf_pacf.png')
print(f'Saved plot to {save_dir / "acf_pacf.png"}')
plt.show()
