In [1]:
# Imports and Setup
# Standard library
import json
import sys
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd

# Put "<parent of the current working directory>/src" first on the module
# search path. This has to run before the `utils` imports below
# (presumably the `utils` package lives under src/ - confirm against the repo).
sys.path.insert(0, str(Path.cwd().parent.joinpath("src")))

# Project-local helpers (imported only after the sys.path tweak above)
from utils import config
from utils.load_dataset_ucr import load_raw_data, parse_ucr_filename
from utils.cleaning import clean_series

In [2]:
# Configuration
# Notebook-level aliases for the shared project settings in utils.config.
PINNED = config.PINNED_RAW_FILES  # per-dataset raw file name, looked up by dataset name
DATASETS = config.DATASETS        # dataset names to process

In [3]:
# Main Cleaning Loop
# For every configured dataset: load its pinned raw file, clean the signal,
# build point-wise anomaly labels, and write cleaned.csv + metadata.json
# into that dataset's cleaned directory.
for dataset_name in DATASETS:

    # Raw file (pinned): resolved from the constants set in the Configuration cell
    raw_path = config.ALL_RAW_DIR / PINNED[dataset_name]

    # Parse indices and load signal
    meta = parse_ucr_filename(str(raw_path))
    data = load_raw_data(str(raw_path))

    # Clean signal
    data, cleaning_report = clean_series(data)

    # Anomaly bounds come from the parsed file name.
    # NOTE(review): the slice below treats them as 0-based with an exclusive
    # end, and assumes clean_series() keeps every sample at its original
    # position - confirm both against parse_ucr_filename / clean_series.
    a0 = int(meta["anomaly_start"])
    a1 = int(meta["anomaly_end"])

    # NumPy slicing never raises on bad bounds: an end past the array is
    # clipped and a negative start counts from the end, so a mismatch would
    # silently write wrong labels. Fail loudly instead.
    if not 0 <= a0 <= a1 <= len(data):
        raise ValueError(
            f"{dataset_name}: anomaly range [{a0}, {a1}) does not fit the "
            f"cleaned series of length {len(data)} (raw file: {raw_path.name})"
        )

    # Point labels (0=normal, 1=anomaly)
    labels = np.zeros(len(data), dtype=int)
    labels[a0:a1] = 1  # exclusive

    # Save to data/<dataset>/cleaned/
    out_dir = config.ensure_dir(config.cleaned_dir(dataset_name))
    pd.DataFrame({"value": data, "is_anomaly": labels}).to_csv(out_dir / "cleaned.csv", index=False)

    # Extend the parsed file-name metadata with provenance for this run.
    # NOTE(review): cleaning_report must be JSON-serializable for json.dump
    # below - verify what clean_series() returns.
    meta.update({
        "dataset_name": dataset_name,
        "length": int(len(data)),
        "raw_file": raw_path.name,
        "cleaning_report": cleaning_report,
    })
    # Explicit encoding so the file does not depend on the platform default.
    with open(out_dir / "metadata.json", "w", encoding="utf-8") as f:
        json.dump(meta, f, indent=2)