In [1]:
# Imports and Setup
import os
import json
import shutil
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

sys.path.insert(0, os.path.abspath('../src'))

from utils import config
from utils.preprocessing import RobustScaler, StandardScaler, create_windows

In [2]:
# Configuration: every tunable below is read from the project `config` module.
DATA_ROOT = str(config.DATA_DIR)  # data directory rendered as a plain string

# Datasets iterated by the main processing loop.
TARGET_DATASETS = config.DATASETS

# Windowing parameters (window length and step between consecutive windows).
WINDOW_SIZE, STRIDE = config.WINDOW_SIZE, config.STRIDE

In [3]:
# Main Processing Loop
#
# For every dataset in TARGET_DATASETS this cell:
#   1. loads cleaned.csv / metadata.json from the dataset's cleaned directory,
#   2. splits the "value" series and "is_anomaly" labels at meta["train_end"],
#   3. fits StandardScaler and RobustScaler on the training part only and
#      transforms both parts,
#   4. derives window start indices and per-window labels,
#   5. resets the method-ready directory and writes all arrays plus a
#      metadata JSON into it.
#
# Everything is computed in memory first; the output directory is only reset
# once the computation has succeeded, so a failure (missing input, missing
# column, windowing error) leaves previously exported artifacts untouched.
#
# Raises:
#   ValueError: config.WINDOW_LABEL_MODE is neither "any" nor "majority".
#   FileNotFoundError: cleaned.csv or metadata.json is missing for a dataset.


def _fitted_stat(scaler, name):
    """Return a scaler statistic as a float, preferring ``<name>_`` over ``<name>``."""
    fitted_name = f"{name}_"
    if hasattr(scaler, fitted_name):
        return float(getattr(scaler, fitted_name))
    return float(getattr(scaler, name))


# The label mode does not depend on the dataset: validate it once, before any
# directory is reset or any file is written.
label_mode = config.WINDOW_LABEL_MODE
if label_mode not in ("any", "majority"):
    raise ValueError(
        f"Unsupported config.WINDOW_LABEL_MODE {label_mode!r}; expected 'any' or 'majority'."
    )

for dataset_name in TARGET_DATASETS:

    # Paths
    clean_dir = config.cleaned_dir(dataset_name)
    out_dir = config.method_ready_dir(dataset_name)

    # Load cleaned outputs (checked before anything destructive happens)
    clean_csv_path = clean_dir / "cleaned.csv"
    meta_json_path = clean_dir / "metadata.json"
    missing_inputs = [
        str(path) for path in (clean_csv_path, meta_json_path) if not path.exists()
    ]
    if missing_inputs:
        raise FileNotFoundError(
            f"Missing cleaned inputs for dataset {dataset_name!r}: "
            + ", ".join(missing_inputs)
        )

    df = pd.read_csv(clean_csv_path)
    with open(meta_json_path, "r") as f:
        meta = json.load(f)

    # Series and labels
    full_series = df["value"].to_numpy(dtype=float)
    full_labels = df["is_anomaly"].to_numpy(dtype=int)
    train_end = int(meta["train_end"])

    # Split: everything before train_end is train, the rest is test
    train_raw = full_series[:train_end]
    test_raw = full_series[train_end:]
    train_labels_point = full_labels[:train_end]
    test_labels_point = full_labels[train_end:]

    # Scale (fit on train only, then apply the same transform to test)
    z_scaler = StandardScaler()
    z_scaler.fit(train_raw)
    train_z = z_scaler.transform(train_raw)
    test_z = z_scaler.transform(test_raw)

    r_scaler = RobustScaler()
    r_scaler.fit(train_raw)
    train_robust = r_scaler.transform(train_raw)
    test_robust = r_scaler.transform(test_raw)

    # Window starts (global indices): train windows start at 0, test windows
    # are offset by train_end so they index into the full series.
    train_starts = np.arange(0, len(train_raw) - WINDOW_SIZE + 1, STRIDE, dtype=int)
    test_starts = np.arange(0, len(test_raw) - WINDOW_SIZE + 1, STRIDE, dtype=int) + train_end

    # Window labels
    # NOTE(review): assumes create_windows returns one row per start index
    # computed above (2-D, windows along axis 0) - confirm against
    # utils.preprocessing.
    train_lbl_win = create_windows(train_labels_point, WINDOW_SIZE, STRIDE)
    test_lbl_win = create_windows(test_labels_point, WINDOW_SIZE, STRIDE)

    if label_mode == "any":
        # A window is anomalous if it contains at least one anomalous point.
        train_win_labels = train_lbl_win.max(axis=1).astype(int)
        test_win_labels = test_lbl_win.max(axis=1).astype(int)
    else:  # "majority" (validated before the loop)
        # A window is anomalous if more than half of its points are anomalous.
        train_win_labels = (train_lbl_win.mean(axis=1) > 0.5).astype(int)
        test_win_labels = (test_lbl_win.mean(axis=1) > 0.5).astype(int)

    # Metadata describing the export
    method_meta = {
        "dataset": dataset_name,
        "window_parameters": {
            "size": WINDOW_SIZE,
            "stride": STRIDE,
            "label_mode": label_mode,
        },
        "variants": getattr(config, "EXPORT_VARIANTS", ["raw", "z", "robust"]),
        "scalers": {
            "z_score": {
                "mean": _fitted_stat(z_scaler, "mean"),
                "std": _fitted_stat(z_scaler, "std"),
            },
            "robust": {
                "median": _fitted_stat(r_scaler, "median"),
                "iqr": _fitted_stat(r_scaler, "iqr"),
            },
        },
        "counts": {
            "train_windows": int(len(train_starts)),
            "test_windows": int(len(test_starts)),
            "train_anomalies": int(train_win_labels.sum()),
            "test_anomalies": int(test_win_labels.sum()),
        },
    }

    # Reset output dir (only now that everything above succeeded)
    if out_dir.exists():
        shutil.rmtree(out_dir)
    config.ensure_dir(out_dir)

    # Save continuous arrays
    np.save(out_dir / "train_raw.npy", train_raw)
    np.save(out_dir / "test_raw.npy", test_raw)
    np.save(out_dir / "train_z.npy", train_z)
    np.save(out_dir / "test_z.npy", test_z)
    np.save(out_dir / "train_robust.npy", train_robust)
    np.save(out_dir / "test_robust.npy", test_robust)

    # Save window artifacts
    np.save(out_dir / "train_win_starts.npy", train_starts)
    np.save(out_dir / "test_win_starts.npy", test_starts)
    np.save(out_dir / "train_win_labels.npy", train_win_labels)
    np.save(out_dir / "test_win_labels.npy", test_win_labels)

    # Save metadata
    with open(out_dir / "method_ready_metadata.json", "w") as f:
        json.dump(method_meta, f, indent=2)