In [1]:
# Imports
import sys
from pathlib import Path

# Put the parent directory of the working directory on the import path so the
# `src` package imported below can be found.
PROJECT_ROOT = Path("..").resolve()
project_root_str = str(PROJECT_ROOT)
if project_root_str not in sys.path:
    # Prepend once; re-running the cell must not add duplicate entries.
    sys.path.insert(0, project_root_str)

import json
import numpy as np
import pandas as pd

# Loader
from src.utils.load_dataset_ucr import load_dataset

In [2]:
# Configuration: dataset to process and the directories it reads from / writes to
DATASET_NAME = "respiration1"

RAW_DIR      = Path("../data/all_raw")          # where the raw .txt files are searched
BASE_DIR     = Path("../data") / DATASET_NAME   # per-dataset working directory
CLEANED_DIR  = BASE_DIR / "cleaned"             # destination of every artefact written below

# Ensure the output directories exist (no error if they are already there)
for out_dir in (BASE_DIR, CLEANED_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

for caption, shown in (
    ("Dataset:", DATASET_NAME),
    ("Raw directory:", RAW_DIR),
    ("Cleaned directory:", CLEANED_DIR),
):
    print(caption, shown)

Dataset: respiration1
Raw directory: ../data/all_raw
Cleaned directory: ../data/respiration1/cleaned


In [3]:
# Locate the raw UCR file for DATASET_NAME inside RAW_DIR.
# `glob` yields matches in no guaranteed order, so sort them: the file that is
# picked must not depend on the filesystem or on the machine running the notebook.
matching_files = sorted(RAW_DIR.glob(f"*{DATASET_NAME}*.txt"))

if not matching_files:
    raise FileNotFoundError(f"No UCR file found for dataset '{DATASET_NAME}' in {RAW_DIR}")

if len(matching_files) > 1:
    # The pattern matches any file name that merely *contains* DATASET_NAME,
    # so several files can qualify. Make the ambiguity visible instead of
    # silently taking whichever came first.
    print(f"Warning: {len(matching_files)} files match '{DATASET_NAME}'; using the first in sorted order:")
    for candidate in matching_files:
        print("   ", candidate)

txt_path = matching_files[0]
print("Found raw file:", txt_path)

Found raw file: ../data/all_raw/186_UCR_Anomaly_respiration1_100000_110260_110412.txt


In [4]:
# Read the raw file: `load_dataset` returns a pair that is unpacked into the
# series (`values_raw`) and its accompanying metadata (`meta`).
values_raw, meta = load_dataset(txt_path)

for caption, shown in (
    ("Loaded raw length:", len(values_raw)),
    ("Metadata:", meta),
):
    print(caption, shown)

Loaded raw length: 200000
Metadata: {'name': 'respiration1', 'train_end': 100000, 'anomaly_start': 110260, 'anomaly_end': 110412, 'length': 200000}


In [5]:
# Minimal cleaning: convert the raw series to float and drop NaN samples.
# NOTE: removing samples shifts the position of every later sample relative to
# the raw file, so raw-file indices no longer line up once any NaN is dropped.
raw_array = np.asarray(values_raw, dtype=float)
nan_mask = np.isnan(raw_array)
n_nan = int(nan_mask.sum())

# Boolean indexing returns a new array; the raw input is left untouched.
values = raw_array[~nan_mask]

# Report NaN removal based on what was actually removed, not on a comparison
# with the metadata (which can differ for unrelated reasons).
if n_nan:
    print(f"Warning: NaNs removed: raw length {len(raw_array)} → cleaned length {len(values)}")

# Separate sanity check: the loaded series should be as long as the metadata says.
if len(raw_array) != meta["length"]:
    print(f"Warning: raw length {len(raw_array)} does not match metadata length {meta['length']}")

print("Final cleaned length:", len(values))

Final cleaned length: 200000


In [6]:
# Persist the cleaned series as a CSV with a single "value" column and no index column
cleaned_csv_path = CLEANED_DIR / "cleaned.csv"

df_clean = pd.DataFrame({"value": values})
df_clean.to_csv(cleaned_csv_path, index=False)

print("Saved:", cleaned_csv_path)

Saved: ../data/respiration1/cleaned/cleaned.csv


In [7]:
# Generate point-wise labels: 1 inside the anomaly interval, 0 elsewhere.
a_start = meta["anomaly_start"]
a_end   = meta["anomaly_end"]

# `meta` is loaded together with the raw series, before any cleaning, so its
# anomaly bounds are positions in the raw series. `values` has had NaN samples
# removed, which shifts every later position. Mark the interval on the raw
# index first, then drop exactly the positions that cleaning dropped.
raw_array = np.asarray(values_raw, dtype=float)
keep_mask = ~np.isnan(raw_array)

# Slicing would silently truncate an out-of-range interval; fail loudly instead.
if not 0 <= a_start <= a_end < len(raw_array):
    raise ValueError(
        f"Anomaly interval [{a_start}, {a_end}] is outside the raw series "
        f"of length {len(raw_array)}"
    )

labels_raw = np.zeros(len(raw_array), dtype=int)
labels_raw[a_start : a_end + 1] = 1   # interval is inclusive of a_end
labels = labels_raw[keep_mask]

# One label per cleaned sample, otherwise the two saved files cannot be paired.
if len(labels) != len(values):
    raise ValueError(
        f"Label count {len(labels)} does not match cleaned series length {len(values)}"
    )

# Save labels to JSON
labels_out = {"labels": labels.tolist()}

with open(CLEANED_DIR / "labels_cleaned.json", "w") as f:
    json.dump(labels_out, f, indent=4)

print("Saved:", CLEANED_DIR / "labels_cleaned.json")
print("Anomalous points:", labels.sum())

Saved: ../data/respiration1/cleaned/labels_cleaned.json
Anomalous points: 153


In [8]:
# Write the metadata as indented JSON alongside the other cleaned artefacts
meta_path = CLEANED_DIR / "metadata.json"

with meta_path.open("w") as meta_file:
    json.dump(meta, meta_file, indent=4)

print("Saved:", meta_path)

Saved: ../data/respiration1/cleaned/metadata.json


In [9]:
# Final check: report whether each artefact written above is present on disk.
# Captions are padded by hand so the True/False column lines up.
for caption, filename in (
    ("Cleaned.csv exists ", "cleaned.csv"),
    ("labels_cleaned.json", "labels_cleaned.json"),
    ("metadata.json      ", "metadata.json"),
):
    print(caption, (CLEANED_DIR / filename).exists())

Cleaned.csv exists  True
labels_cleaned.json True
metadata.json       True
