# USAD

In [None]:
import numpy as np
import pandas as pd
import polars as pl
import torch
import torch.utils.data as data_utils
from sklearn import preprocessing
from tqdm import tqdm
import os
import time
import sys

# Add project root to path
sys.path.append(os.path.abspath("../.."))

from spectrum.models import USAD
from spectrum.models.sr.sr import SpectralResidual
from spectrum.utils import device

# Batch size handed to the DataLoaders built in process_hybrid.
BATCH_SIZE = 1024
# First positional argument to USAD(...); presumably the number of training epochs - TODO confirm.
N_EPOCHS = 100
# Multiplied by WINDOW_SIZE to form z_size, the third argument to USAD(...).
HIDDEN_SIZE = 100
# Rows per sliding window; also passed to SpectralResidual as window_size.
WINDOW_SIZE = 12

# Output directory for the per-dataset prediction CSVs and summary.csv of the hybrid run.
results_dir = "../../results/models/hybrid_sr_usad"
os.makedirs(results_dir, exist_ok=True)
# Dataset ids to process; each id selects ../../datasets/Tencent/{train,test}/<id>.csv.
selected_ids = [1, 2, 3]

def find_best_threshold(scores, true_labels, thresholds=None):
    """Return the threshold that maximises F1 of ``scores > threshold``.

    A sample is predicted anomalous when its score is strictly greater than
    the candidate threshold. Candidates are tried from highest to lowest and
    only a strictly better F1 replaces the current best, so ties are resolved
    in favour of the highest threshold.

    Args:
        scores: 1-D array-like of anomaly scores. NaN is replaced by 0.0 and
            +inf by the largest finite score (1.0 if none is finite) before
            any threshold is computed.
        true_labels: Array-like of 0/1 ground-truth labels aligned with
            ``scores``.
        thresholds: Optional iterable of candidate thresholds. When ``None``
            the candidates are percentiles of ``scores`` (0-85 in steps of 5,
            90-99 in steps of 1, plus a few values above 99), or the single
            score value when all scores are equal.

    Returns:
        ``(best_threshold, best_metrics)`` where ``best_metrics`` has the keys
        ``threshold``, ``accuracy``, ``precision``, ``recall`` and ``f1``.
        With no scores or no candidates, the threshold falls back to 0.0 and
        every metric is 0.
    """
    scores = np.asarray(scores)
    true_labels = np.asarray(true_labels).astype(int)

    finite = np.isfinite(scores)
    if not finite.all():
        cap = np.max(scores[finite]) if finite.any() else 1.0
        scores = np.nan_to_num(scores, nan=0.0, posinf=cap)

    if thresholds is None:
        if scores.size == 0:
            # Nothing to derive candidates from; np.min/np.percentile would raise.
            thresholds = []
        else:
            lowest = np.min(scores)
            if lowest == np.max(scores):
                thresholds = [lowest]
            else:
                percentiles = [
                    *range(0, 90, 5),
                    *range(90, 100),
                    99.1, 99.3, 99.5, 99.7, 99.9, 99.95, 99.99,
                ]
                thresholds = list(np.percentile(scores, percentiles))

    thresholds = sorted(set(thresholds), reverse=True)
    best_f1 = -1
    best_threshold = thresholds[0] if thresholds else 0.0
    best_metrics = {
        'threshold': best_threshold,
        'accuracy': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'f1': 0.0,
    }

    # The label masks do not depend on the threshold, so build them once.
    is_pos = true_labels == 1
    is_neg = true_labels == 0

    for threshold in thresholds:
        flagged = scores > threshold
        TP = (is_pos & flagged).sum()
        FP = (is_neg & flagged).sum()
        TN = (is_neg & ~flagged).sum()
        FN = (is_pos & ~flagged).sum()

        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_metrics = {'threshold': threshold, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

    return best_threshold, best_metrics

def process_hybrid(data_id):
    """Run the hybrid SR + USAD detector on one Tencent dataset and save its predictions.

    Spectral Residual scores the ``system_usage:0`` column of the test set and
    USAD is trained on every other feature column. Each score stream gets its
    own F1-optimal threshold from ``find_best_threshold`` and the two binary
    predictions are combined with a logical OR. The combined prediction is
    written to ``<results_dir>/<data_id>.csv``.

    NOTE: both thresholds are selected using the test labels, so the reported
    metrics are a best-case figure rather than a held-out estimate.

    Args:
        data_id: Identifier used to build the train/test CSV paths and the
            name of the output file.

    Returns:
        dict with ``id``, the hybrid ``f1``/``precision``/``recall``/
        ``accuracy``, and the stand-alone ``sr_f1`` and ``usad_f1``.

    Raises:
        ValueError: If USAD yields no scores (e.g. the test set has fewer
            than ``WINDOW_SIZE`` rows), so nothing can be evaluated.
    """
    print(f"\nProcessing: {data_id} (Hybrid SR + USAD)")

    # Read data
    train_df = pd.read_csv(f"../../datasets/Tencent/train/{data_id}.csv")
    test_df = pd.read_csv(f"../../datasets/Tencent/test/{data_id}.csv")

    # Labels
    test_labels = test_df["label"].to_numpy().astype(int)

    # --- 1. SR on system_usage:0 ---
    sr_col = "system_usage:0"
    sr_scores = np.zeros(len(test_df))

    if sr_col in test_df.columns:
        print(f"  Running SR on {sr_col}...")
        sr_model = SpectralResidual(window_size=WINDOW_SIZE)
        # Assumes predict() returns one score per input row; the alignment
        # below relies on it - TODO confirm against SpectralResidual.
        sr_scores_pl = sr_model.predict(pl.Series(test_df[sr_col].values))
        sr_scores = sr_scores_pl.to_numpy()
        sr_scores = np.nan_to_num(sr_scores, nan=0.0)
    else:
        print(f"  Warning: {sr_col} missing. SR scores set to 0.")

    # --- 2. USAD on Rest ---
    cols_exclude = ["timestamp", "label", sr_col]
    train_usad = train_df.drop([c for c in cols_exclude if c in train_df.columns], axis=1).astype(float)
    test_usad = test_df.drop([c for c in cols_exclude if c in test_df.columns], axis=1).astype(float)

    print(f"  Running USAD on {train_usad.shape[1]} features...")

    # Normalize (scaler is fitted on the training split only)
    scaler = preprocessing.MinMaxScaler()
    x_train = scaler.fit_transform(train_usad.values)
    x_test = scaler.transform(test_usad.values)

    # Windowing
    def make_windows(data, ws):
        """Return every length-``ws`` sliding window of ``data`` as (n - ws + 1, ws, F)."""
        n = data.shape[0]
        # n == ws still yields exactly one window; only shorter inputs yield none.
        if n < ws:
            return np.empty((0, ws, data.shape[1]))
        idx = np.arange(ws)[None, :] + np.arange(n - ws + 1)[:, None]
        return data[idx]

    train_win = make_windows(x_train, WINDOW_SIZE)
    test_win = make_windows(x_test, WINDOW_SIZE)

    # Flatten each window into a single vector for USAD
    w_size = WINDOW_SIZE * x_train.shape[1]
    z_size = WINDOW_SIZE * HIDDEN_SIZE
    train_flat = train_win.reshape(-1, w_size)
    test_flat = test_win.reshape(-1, w_size)

    # DataLoaders (order is preserved so scores stay aligned with time)
    def make_loader(arr):
        dataset = data_utils.TensorDataset(torch.from_numpy(arr).float())
        return data_utils.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=False)

    # First 80% of the training windows train the model, the rest validate it.
    split = int(0.8 * len(train_flat))
    train_loader = make_loader(train_flat[:split])
    val_loader = make_loader(train_flat[split:])
    test_loader = make_loader(test_flat)

    # Train USAD
    model_usad = USAD(N_EPOCHS, w_size, z_size).to(device())
    model_usad.fit(train_loader, val_loader)

    # Predict USAD
    res = model_usad.predict(test_loader)
    if len(res) > 0:
        usad_scores = torch.cat(res).cpu().numpy()
    else:
        usad_scores = np.array([])

    if usad_scores.size == 0:
        # Fail with a clear message instead of an opaque numpy reduction error
        # raised later during threshold search.
        raise ValueError(
            f"Dataset {data_id}: USAD produced no scores "
            f"(test rows={len(test_df)}, WINDOW_SIZE={WINDOW_SIZE}); cannot evaluate."
        )

    # --- 3. Combine ---
    # Align lengths. USAD output length is N - W + 1.
    valid_len = len(usad_scores)
    # Each window's score is attributed to the last row of that window.
    start_idx = WINDOW_SIZE - 1

    # Slice SR scores and Labels to match USAD
    sr_scores_aligned = sr_scores[start_idx : start_idx + valid_len]
    labels_aligned = test_labels[start_idx : start_idx + valid_len]

    print(f"  Aligned lengths: {valid_len}")

    # Optimize Thresholds
    print("  Optimizing SR threshold...")
    best_th_sr, metrics_sr = find_best_threshold(sr_scores_aligned, labels_aligned)
    pred_sr = (sr_scores_aligned > best_th_sr).astype(int)
    print(f"    SR F1: {metrics_sr.get('f1', 0):.4f}")

    print("  Optimizing USAD threshold...")
    best_th_usad, metrics_usad = find_best_threshold(usad_scores, labels_aligned)
    pred_usad = (usad_scores > best_th_usad).astype(int)
    print(f"    USAD F1: {metrics_usad.get('f1', 0):.4f}")

    # Logical OR: a point is anomalous if either detector flags it
    pred_final = (pred_sr | pred_usad).astype(int)

    # Calculate Final Metrics
    TP = ((labels_aligned == 1) & (pred_final == 1)).sum()
    FP = ((labels_aligned == 0) & (pred_final == 1)).sum()
    TN = ((labels_aligned == 0) & (pred_final == 0)).sum()
    FN = ((labels_aligned == 1) & (pred_final == 0)).sum()

    accuracy = (TP + TN) / len(labels_aligned) if len(labels_aligned) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print(f"  Hybrid Result: F1={f1:.4f}, P={precision:.4f}, R={recall:.4f}")

    # Save Results
    full_df = pd.concat([train_df, test_df])
    complete_values = full_df.iloc[:, 1].to_numpy() # approx
    complete_timestamps = range(len(complete_values)) # approx

    # Rows without a prediction (training rows and the first WINDOW_SIZE - 1
    # test rows) stay 0.
    complete_predictions = np.zeros(len(complete_values))
    pred_start_full = len(train_df) + WINDOW_SIZE - 1
    end_idx_full = pred_start_full + valid_len

    complete_predictions[pred_start_full : end_idx_full] = pred_final

    res_df = pd.DataFrame({
        'timestamp': complete_timestamps,
        'value': complete_values,
        'predicted': complete_predictions
    })
    res_df.to_csv(os.path.join(results_dir, f"{data_id}.csv"), index=False)

    return {
        'id': data_id,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'sr_f1': metrics_sr.get('f1', 0),
        'usad_f1': metrics_usad.get('f1', 0)
    }

# Run Loop
# Evaluate each selected dataset in turn. There is no per-dataset error
# handling here: an exception from process_hybrid stops the loop.
results = []
for i in selected_ids:
    results.append(process_hybrid(i))

# One row per dataset, built from the dicts returned by process_hybrid.
summary_df = pd.DataFrame(results)
print("\nSummary:")
print(summary_df.round(4))
summary_df.to_csv(os.path.join(results_dir, "summary.csv"), index=False)

In [51]:
# Batch size handed to the DataLoaders built by the USAD-only functions in this notebook.
BATCH_SIZE = 1024
# First positional argument to USAD(...); presumably the number of training epochs - TODO confirm.
N_EPOCHS = 100
# Multiplied by WINDOW_SIZE to form z_size, the third argument to USAD(...).
HIDDEN_SIZE = 100
# Rows per sliding window.
WINDOW_SIZE = 12

# Output directory for the per-dataset result CSVs and the usad.csv summary.
results_dir = "../../results/models/usad"
os.makedirs(results_dir, exist_ok=True)
# Dataset ids to process; each id selects ../../datasets/Tencent/{train,test}/<id>.csv.
selected_ids = [1, 2, 3]

def find_best_threshold(scores, true_labels, thresholds=None):
    """Return the threshold that maximises F1 of ``scores > threshold``.

    A sample is predicted anomalous when its score is strictly greater than
    the candidate threshold. Candidates are tried from highest to lowest and
    only a strictly better F1 replaces the current best, so ties are resolved
    in favour of the highest threshold.

    Args:
        scores: 1-D array-like of anomaly scores. NaN is replaced by 0.0 and
            +inf by the largest finite score (1.0 if none is finite) so that
            a single bad value cannot turn every percentile into NaN.
        true_labels: Array-like of 0/1 ground-truth labels aligned with
            ``scores``.
        thresholds: Optional iterable of candidate thresholds. When ``None``
            the candidates are percentiles of ``scores`` (0-85 in steps of 5,
            90-99 in steps of 1, plus a few values above 99), or the single
            score value when all scores are equal.

    Returns:
        ``(best_threshold, best_metrics)``. ``best_metrics`` holds
        ``threshold``, ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``fnr``, ``fpr`` and the confusion counts ``tp``/``fp``/``tn``/``fn``.
        With no scores or no candidates, the threshold falls back to 0.0 and
        every metric is 0.
    """
    scores = np.asarray(scores)
    true_labels = np.asarray(true_labels).astype(int)

    finite = np.isfinite(scores)
    if not finite.all():
        cap = np.max(scores[finite]) if finite.any() else 1.0
        scores = np.nan_to_num(scores, nan=0.0, posinf=cap)

    if thresholds is None:
        if scores.size == 0:
            # Nothing to derive candidates from; np.min/np.percentile would raise.
            thresholds = []
        else:
            lowest = np.min(scores)
            # Handle case where scores are all same (e.g. 0)
            if lowest == np.max(scores):
                thresholds = [lowest]
            else:
                percentiles = [
                    *range(0, 90, 5),
                    *range(90, 100),
                    99.1, 99.3, 99.5, 99.7, 99.9, 99.95, 99.99,
                ]
                thresholds = list(np.percentile(scores, percentiles))

    thresholds = sorted(set(thresholds), reverse=True)

    best_f1 = -1
    best_threshold = thresholds[0] if thresholds else 0.0

    # Returned unchanged only when there are no candidate thresholds.
    best_metrics = {
        'threshold': best_threshold,
        'accuracy': 0.0,
        'precision': 0.0,
        'recall': 0.0,
        'f1': 0.0,
        'fnr': 0.0,
        'fpr': 0.0,
        'tp': 0,
        'fp': 0,
        'tn': 0,
        'fn': 0
    }

    # The label masks do not depend on the threshold, so build them once.
    is_pos = true_labels == 1
    is_neg = true_labels == 0

    for threshold in thresholds:
        flagged = scores > threshold
        TP = (is_pos & flagged).sum()
        FP = (is_neg & flagged).sum()
        TN = (is_neg & ~flagged).sum()
        FN = (is_pos & ~flagged).sum()

        total = TP + TN + FP + FN
        accuracy = (TP + TN) / total if total > 0 else 0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
        fnr = FN / (FN + TP) if (FN + TP) > 0 else 0
        fpr = FP / (FP + TN) if (FP + TN) > 0 else 0

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_metrics = {
                'threshold': threshold,
                'accuracy': accuracy,
                'precision': precision,
                'recall': recall,
                'f1': f1,
                'fnr': fnr,
                'fpr': fpr,
                'tp': TP,
                'fp': FP,
                'tn': TN,
                'fn': FN
            }

    return best_threshold, best_metrics

def process_single_id(data_id):
    """Train and evaluate USAD on one Tencent dataset and save per-row results.

    All columns except ``timestamp`` and ``label`` are min-max scaled (fitted
    on the training split), cut into sliding windows of ``WINDOW_SIZE`` rows
    and flattened. USAD is trained on the first 80% of the training windows,
    validated on the rest, and scores every test window. A window counts as
    anomalous when any row inside it is labelled 1. The F1-optimal threshold
    is then chosen by ``find_best_threshold``.

    NOTE: the threshold is selected using the test labels, so the reported
    metrics are a best-case figure rather than a held-out estimate.

    Args:
        data_id: Identifier used to build the train/test CSV paths and the
            name of the output file.

    Returns:
        dict with ``id``, timing and sample-count fields, ``best_threshold``
        and every entry of the metrics dict returned by
        ``find_best_threshold``.

    Raises:
        ValueError: If the model yields no scores (e.g. the test set has
            fewer than ``WINDOW_SIZE`` rows), so nothing can be evaluated.
    """
    print(f"processing: {data_id}")

    # Read data
    train_df_raw = pd.read_csv(f"../../datasets/Tencent/train/{data_id}.csv")
    test_df_raw = pd.read_csv(f"../../datasets/Tencent/test/{data_id}.csv")

    # Prepare Train Data
    train_vals = train_df_raw.drop(["timestamp", "label"], axis=1, errors='ignore')
    train_vals = train_vals.astype(float)

    # Prepare Test Data
    test_vals = test_df_raw.drop(["timestamp", "label"], axis=1, errors='ignore')
    test_vals = test_vals.astype(float)

    # Normalization (scaler is fitted on the training split only)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_train = min_max_scaler.fit_transform(train_vals.values)
    x_test = min_max_scaler.transform(test_vals.values)

    # Create Windows
    def make_windows(data_arr):
        """Return every WINDOW_SIZE-row sliding window of ``data_arr`` (N, F) as (N - W + 1, W, F)."""
        n = data_arr.shape[0]
        # n == WINDOW_SIZE still yields exactly one window; only shorter
        # inputs yield none.
        if n < WINDOW_SIZE:
            return np.empty((0, WINDOW_SIZE, data_arr.shape[1]))
        indexer = np.arange(WINDOW_SIZE)[None, :] + np.arange(n - WINDOW_SIZE + 1)[:, None]
        return data_arr[indexer]

    train_windows = make_windows(x_train)
    test_windows = make_windows(x_test)

    # Flatten windows for USAD
    w_size = WINDOW_SIZE * x_train.shape[1]
    z_size = WINDOW_SIZE * HIDDEN_SIZE

    train_windows_flat = train_windows.reshape(-1, w_size)
    test_windows_flat = test_windows.reshape(-1, w_size)

    # Split Train/Val (chronological 80/20; order is preserved, never shuffled)
    split_idx = int(0.8 * len(train_windows_flat))
    train_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(
            torch.from_numpy(train_windows_flat[:split_idx]).float()
        ),
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    val_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(
            torch.from_numpy(train_windows_flat[split_idx:]).float()
        ),
        batch_size=BATCH_SIZE,
        shuffle=False
    )
    test_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(
            torch.from_numpy(test_windows_flat).float()
        ),
        batch_size=BATCH_SIZE,
        shuffle=False
    )

    # Train
    model = USAD(N_EPOCHS, w_size, z_size).to(device())

    print("  training model...")
    start_time = time.time()
    model.fit(train_loader, val_loader)
    training_time = time.time() - start_time

    print("  anomaly detection...")
    start_time = time.time()
    results_list = model.predict(test_loader)
    if len(results_list) > 0:
        scores = torch.cat(results_list).cpu().numpy()
    else:
        scores = np.array([])
    scoring_time = time.time() - start_time

    if scores.size == 0:
        # min()/max() below and the threshold search cannot run on an empty
        # array; report the actual cause instead of a numpy reduction error.
        raise ValueError(
            f"Dataset {data_id}: model produced no scores "
            f"(test rows={len(test_df_raw)}, WINDOW_SIZE={WINDOW_SIZE}); cannot evaluate."
        )

    print(f"  Scores stats: min={scores.min():.6f}, max={scores.max():.6f}, mean={scores.mean():.6f}")

    # Window-based Labeling Logic
    test_true_labels = test_df_raw["label"].to_numpy()

    # Create windows of labels using the same make_windows function
    # We reshape labels to (N, 1) then flatten the result to (num_windows, WINDOW_SIZE)
    label_windows = make_windows(test_true_labels.reshape(-1, 1)).reshape(-1, WINDOW_SIZE)

    # If ANY point in window is anomaly (1), then window label is 1
    y_test = (label_windows.sum(axis=1) > 0).astype(int)

    # Align lengths
    min_len = min(len(scores), len(y_test))
    scores = scores[:min_len]
    y_test = y_test[:min_len]

    print("  finding best threshold...")
    best_threshold, best_metrics = find_best_threshold(scores, y_test)

    # Save Results (Complete Data)
    full_df = pd.concat([train_df_raw, test_df_raw])

    if "value" in full_df.columns:
        complete_values = full_df["value"].to_numpy()
    else:
        complete_values = full_df.iloc[:, 1].to_numpy() # Use first feature column

    if "label" in full_df.columns:
        complete_labels = full_df["label"].to_numpy()
    else:
        complete_labels = np.zeros(len(complete_values))

    if "timestamp" in full_df.columns:
        complete_timestamps = full_df["timestamp"].to_numpy()
    else:
        complete_timestamps = range(len(complete_values))

    # Rows without a score (training rows and the first WINDOW_SIZE - 1 test
    # rows) keep prediction 0 and score 0.
    complete_predictions = np.zeros(len(complete_values))
    complete_anomaly_scores = np.zeros(len(complete_values))

    # Align predictions
    # Test scores correspond to windows.
    # We align the score to the END of the window (point-based alignment for visualization)
    # Start index in full_df: len(train) + WINDOW_SIZE - 1

    pred_start_idx = len(train_df_raw) + WINDOW_SIZE - 1

    end_idx = pred_start_idx + len(scores)
    if end_idx > len(complete_values):
        end_idx = len(complete_values)
        scores = scores[:end_idx - pred_start_idx]

    complete_predictions[pred_start_idx:end_idx] = (scores > best_threshold).astype(int)
    complete_anomaly_scores[pred_start_idx:end_idx] = scores

    result_df = pd.DataFrame({
        'timestamp': complete_timestamps,
        'value': complete_values,
        'label': complete_labels,
        'predicted': complete_predictions,
        'anomaly_score': complete_anomaly_scores
    })

    output_file = os.path.join(results_dir, f"{data_id}.csv")
    result_df.to_csv(output_file, index=False)
    print(f"  results saved to: {output_file}")

    return {
        'id': data_id,
        'training_time': training_time,
        'testing_time': scoring_time,
        'total_time': training_time + scoring_time,
        'train_samples': len(train_df_raw),
        'test_samples': len(test_df_raw),
        'best_threshold': best_threshold,
        **best_metrics
    }

# Main Loop
# Process every selected dataset, report its metrics, then write and print an
# aggregate summary. A failing dataset is logged and skipped.
import traceback

print(f"processing {len(selected_ids)} datasets...")
all_results = []

for data_id in tqdm(selected_ids, desc="processing"):
    try:
        result = process_single_id(data_id)
        all_results.append(result)

        print(f"  ID {data_id} completed:")
        for metric in ('best_threshold', 'f1', 'precision', 'recall', 'accuracy'):
            print(f"    {metric}: {result[metric]:.4f}")
    except Exception as e:
        # One bad dataset must not abort the remaining ones.
        print(f"  processing {data_id} failed: {e}")
        traceback.print_exc()

if not all_results:
    print("no results")
else:
    summary_df = pd.DataFrame(all_results)
    summary_file = os.path.join(results_dir, "usad.csv")
    summary_df.to_csv(summary_file, index=False)
    print(f"summary results saved to: {summary_file}")

    rule = "=" * 80
    print("\n" + rule)
    print("USAD anomaly detection results")
    print(rule)
    print(f"processed {len(all_results)} datasets")
    # Mean and standard deviation across datasets for each headline metric.
    for title, column in (("F1", "f1"), ("precision", "precision"),
                          ("recall", "recall"), ("accuracy", "accuracy")):
        series = summary_df[column]
        print(f"average {title}: {series.mean():.4f} ± {series.std():.4f}")
    print(f"average training time: {summary_df['training_time'].mean():.2f}s")
    print(f"average scoring time: {summary_df['testing_time'].mean():.2f}s")
    print(rule)

    print("details:")
    display_cols = ['id', 'f1', 'precision', 'recall', 'accuracy', 'best_threshold', 'tp', 'fp', 'tn', 'fn']
    print(summary_df[display_cols].round(4))

processing 3 datasets...


processing:   0%|          | 0/3 [00:00<?, ?it/s]

processing: 1
  training model...
Epoch [0], val_loss1: 0.2308, val_loss2: 0.2286
Epoch [1], val_loss1: 0.2270, val_loss2: -0.0001
Epoch [2], val_loss1: 0.2232, val_loss2: -0.0742
Epoch [3], val_loss1: 0.2187, val_loss2: -0.1092
Epoch [4], val_loss1: 0.2133, val_loss2: -0.1281
Epoch [5], val_loss1: 0.2068, val_loss2: -0.1384
Epoch [6], val_loss1: 0.1989, val_loss2: -0.1433
Epoch [7], val_loss1: 0.1896, val_loss2: -0.1444
Epoch [8], val_loss1: 0.1788, val_loss2: -0.1424
Epoch [9], val_loss1: 0.1667, val_loss2: -0.1379
Epoch [10], val_loss1: 0.1535, val_loss2: -0.1312
Epoch [11], val_loss1: 0.1394, val_loss2: -0.1219
Epoch [12], val_loss1: 0.1267, val_loss2: -0.1125
Epoch [13], val_loss1: 0.1142, val_loss2: -0.1024
Epoch [14], val_loss1: 0.1018, val_loss2: -0.0921
Epoch [15], val_loss1: 0.0912, val_loss2: -0.0827
Epoch [16], val_loss1: 0.0819, val_loss2: -0.0740
Epoch [17], val_loss1: 0.0734, val_loss2: -0.0662
Epoch [18], val_loss1: 0.0660, val_loss2: -0.0593
Epoch [19], val_loss1: 0.06

processing:  33%|███▎      | 1/3 [00:14<00:28, 14.31s/it]

Epoch [99], val_loss1: 0.0363, val_loss2: -0.0356
  anomaly detection...
  Scores stats: min=0.052722, max=726000.250000, mean=2218.185059
  finding best threshold...
  results saved to: ../../results/models/usad/1.csv
  ID 1 completed:
    best_threshold: 0.4465
    f1: 0.8627
    precision: 0.7586
    recall: 1.0000
    accuracy: 0.9951
processing: 2
  training model...
Epoch [0], val_loss1: 0.2307, val_loss2: 0.2337
Epoch [1], val_loss1: 0.2290, val_loss2: 0.0000
Epoch [2], val_loss1: 0.2259, val_loss2: -0.0755
Epoch [3], val_loss1: 0.2218, val_loss2: -0.1110
Epoch [4], val_loss1: 0.2165, val_loss2: -0.1299
Epoch [5], val_loss1: 0.2096, val_loss2: -0.1398
Epoch [6], val_loss1: 0.2014, val_loss2: -0.1439
Epoch [7], val_loss1: 0.1917, val_loss2: -0.1439
Epoch [8], val_loss1: 0.1807, val_loss2: -0.1409
Epoch [9], val_loss1: 0.1685, val_loss2: -0.1356
Epoch [10], val_loss1: 0.1553, val_loss2: -0.1284
Epoch [11], val_loss1: 0.1415, val_loss2: -0.1198
Epoch [12], val_loss1: 0.1281, val_lo

processing:  67%|██████▋   | 2/3 [00:26<00:12, 12.77s/it]

Epoch [99], val_loss1: 0.0351, val_loss2: -0.0345
  anomaly detection...
  Scores stats: min=0.048963, max=1.011518, mean=0.213491
  finding best threshold...
  results saved to: ../../results/models/usad/2.csv
  ID 2 completed:
    best_threshold: 0.4389
    f1: 0.3038
    precision: 0.9231
    recall: 0.1818
    accuracy: 0.9872
processing: 3
  training model...
Epoch [0], val_loss1: 0.2288, val_loss2: 0.2313
Epoch [1], val_loss1: 0.2273, val_loss2: -0.0000
Epoch [2], val_loss1: 0.2244, val_loss2: -0.0751
Epoch [3], val_loss1: 0.2203, val_loss2: -0.1103
Epoch [4], val_loss1: 0.2149, val_loss2: -0.1291
Epoch [5], val_loss1: 0.2082, val_loss2: -0.1390
Epoch [6], val_loss1: 0.2000, val_loss2: -0.1432
Epoch [7], val_loss1: 0.1905, val_loss2: -0.1434
Epoch [8], val_loss1: 0.1797, val_loss2: -0.1406
Epoch [9], val_loss1: 0.1675, val_loss2: -0.1353
Epoch [10], val_loss1: 0.1546, val_loss2: -0.1280
Epoch [11], val_loss1: 0.1410, val_loss2: -0.1192
Epoch [12], val_loss1: 0.1264, val_loss2: -0

processing: 100%|██████████| 3/3 [00:37<00:00, 12.44s/it]

Epoch [99], val_loss1: 0.0360, val_loss2: -0.0353
  anomaly detection...
  Scores stats: min=0.052601, max=1.022958, mean=0.222413
  finding best threshold...
  results saved to: ../../results/models/usad/3.csv
  ID 3 completed:
    best_threshold: 1.0229
    f1: 0.0000
    precision: 0.0000
    recall: 0.0000
    accuracy: 0.9998
summary results saved to: ../../results/models/usad/usad.csv

USAD anomaly detection results
processed 3 datasets
average F1: 0.3888 ± 0.4376
average precision: 0.5606 ± 0.4924
average recall: 0.3939 ± 0.5327
average accuracy: 0.9940 ± 0.0063
average training time: 12.36s
average scoring time: 0.03s
details:
   id      f1  precision  recall  accuracy  best_threshold  tp  fp    tn  fn
0   1  0.8627     0.7586  1.0000    0.9951          0.4465  66  21  4222   0
1   2  0.3038     0.9231  0.1818    0.9872          0.4389  12   1  4242  54
2   3  0.0000     0.0000  0.0000    0.9998          1.0229   0   1  4308   0





In [52]:
def process_synthetic_data_usad():
    """Train and evaluate USAD on the synthetic dataset and save per-row results.

    All columns except ``timestamp`` and ``label`` are min-max scaled (fitted
    on the training split), cut into sliding windows of ``WINDOW_SIZE`` rows
    and flattened. USAD is trained on the first 80% of the training windows,
    validated on the rest, and scores every test window. A window counts as
    anomalous when any row inside it is labelled 1; without a ``label`` column
    every window is treated as normal. Results are written to
    ``../../results/models/usad_synthetic/synthetic.csv`` and the metrics are
    printed.

    NOTE: the threshold is selected using the test labels, so the printed
    metrics are a best-case figure rather than a held-out estimate.

    Returns:
        None. If either input CSV is missing, an error is printed and the
        function returns early.

    Raises:
        ValueError: If the model yields no scores (e.g. the test set has
            fewer than ``WINDOW_SIZE`` rows), so nothing can be evaluated.
    """
    print("\n" + "="*40)
    print("Processing Synthetic Dataset (USAD)...")
    print("="*40)

    # Paths
    train_path = "../../datasets/synthetic/train/train.csv"
    test_path = "../../datasets/synthetic/test/test.csv"

    # Both files come from the same preprocessing notebook, so a missing test
    # file gets the same hint as a missing train file.
    for path in (train_path, test_path):
        if not os.path.exists(path):
            print(f"Error: {path} not found. Please run preprocess/synthetic.ipynb first.")
            return

    # Read data
    train_df_raw = pd.read_csv(train_path)
    test_df_raw = pd.read_csv(test_path)

    print(f"  Train shape: {train_df_raw.shape}")
    print(f"  Test shape: {test_df_raw.shape}")

    # Prepare Numeric Data
    # Drop timestamp and label
    cols_to_drop = ["timestamp", "label"]
    train_vals = train_df_raw.drop([c for c in cols_to_drop if c in train_df_raw.columns], axis=1)
    test_vals = test_df_raw.drop([c for c in cols_to_drop if c in test_df_raw.columns], axis=1)

    train_vals = train_vals.astype(float)
    test_vals = test_vals.astype(float)

    # Normalization (scaler is fitted on the training split only)
    min_max_scaler = preprocessing.MinMaxScaler()
    x_train = min_max_scaler.fit_transform(train_vals.values)
    x_test = min_max_scaler.transform(test_vals.values)

    # Windowing
    def make_windows(data_arr):
        """Return every WINDOW_SIZE-row sliding window of ``data_arr`` (N, F) as (N - W + 1, W, F)."""
        n = data_arr.shape[0]
        # n == WINDOW_SIZE still yields exactly one window; only shorter
        # inputs yield none.
        if n < WINDOW_SIZE:
            return np.empty((0, WINDOW_SIZE, data_arr.shape[1]))
        indexer = np.arange(WINDOW_SIZE)[None, :] + np.arange(n - WINDOW_SIZE + 1)[:, None]
        return data_arr[indexer]

    train_windows = make_windows(x_train)
    test_windows = make_windows(x_test)

    # Flatten for USAD
    w_size = WINDOW_SIZE * x_train.shape[1]
    z_size = WINDOW_SIZE * HIDDEN_SIZE

    train_windows_flat = train_windows.reshape(-1, w_size)
    test_windows_flat = test_windows.reshape(-1, w_size)

    # DataLoaders (chronological 80/20 train/val split; never shuffled)
    split_idx = int(0.8 * len(train_windows_flat))
    train_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(torch.from_numpy(train_windows_flat[:split_idx]).float()),
        batch_size=BATCH_SIZE, shuffle=False
    )
    val_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(torch.from_numpy(train_windows_flat[split_idx:]).float()),
        batch_size=BATCH_SIZE, shuffle=False
    )
    test_loader = torch.utils.data.DataLoader(
        data_utils.TensorDataset(torch.from_numpy(test_windows_flat).float()),
        batch_size=BATCH_SIZE, shuffle=False
    )

    # Train
    model = USAD(N_EPOCHS, w_size, z_size).to(device())

    print("  training model...")
    start_time = time.time()
    model.fit(train_loader, val_loader)
    training_time = time.time() - start_time

    print("  anomaly detection...")
    start_time = time.time()
    results_list = model.predict(test_loader)
    if len(results_list) > 0:
        scores = torch.cat(results_list).cpu().numpy()
    else:
        scores = np.array([])
    scoring_time = time.time() - start_time

    if scores.size == 0:
        # min()/max() below and the threshold search cannot run on an empty
        # array; report the actual cause instead of a numpy reduction error.
        raise ValueError(
            f"Synthetic dataset: model produced no scores "
            f"(test rows={len(test_df_raw)}, WINDOW_SIZE={WINDOW_SIZE}); cannot evaluate."
        )

    print(f"  Scores stats: min={scores.min():.6f}, max={scores.max():.6f}, mean={scores.mean():.6f}")

    # Window-based Labeling Logic: a window is anomalous if any row in it is.
    if "label" in test_df_raw.columns:
        test_true_labels = test_df_raw["label"].to_numpy()
        label_windows = make_windows(test_true_labels.reshape(-1, 1)).reshape(-1, WINDOW_SIZE)
        y_test = (label_windows.sum(axis=1) > 0).astype(int)
    else:
        y_test = np.zeros(len(scores))

    # Align lengths
    min_len = min(len(scores), len(y_test))
    scores = scores[:min_len]
    y_test = y_test[:min_len]

    print("  finding best threshold...")
    best_threshold, best_metrics = find_best_threshold(scores, y_test)

    # Save Results
    results_dir_syn = "../../results/models/usad_synthetic"
    os.makedirs(results_dir_syn, exist_ok=True)

    full_df = pd.concat([train_df_raw, test_df_raw])

    # Use first feature column for visualization
    feature_cols = [c for c in train_df_raw.columns if c not in cols_to_drop]
    if feature_cols:
        complete_values = full_df[feature_cols[0]].to_numpy()
    else:
        complete_values = full_df.iloc[:, 0].to_numpy()

    if "label" in full_df.columns:
        complete_labels = full_df["label"].to_numpy()
    else:
        complete_labels = np.zeros(len(complete_values))

    if "timestamp" in full_df.columns:
        complete_timestamps = full_df["timestamp"].to_numpy()
    else:
        complete_timestamps = range(len(complete_values))

    # Rows without a score (training rows and the first WINDOW_SIZE - 1 test
    # rows) keep prediction 0 and score 0.
    complete_predictions = np.zeros(len(complete_values))
    complete_anomaly_scores = np.zeros(len(complete_values))

    # Align predictions: each window's score is placed on the window's last row.
    pred_start_idx = len(train_df_raw) + WINDOW_SIZE - 1

    end_idx = pred_start_idx + len(scores)
    if end_idx > len(complete_values):
        end_idx = len(complete_values)
        scores = scores[:end_idx - pred_start_idx]

    complete_predictions[pred_start_idx:end_idx] = (scores > best_threshold).astype(int)
    complete_anomaly_scores[pred_start_idx:end_idx] = scores

    result_df = pd.DataFrame({
        'timestamp': complete_timestamps,
        'value': complete_values,
        'label': complete_labels,
        'predicted': complete_predictions,
        'anomaly_score': complete_anomaly_scores
    })

    output_file = os.path.join(results_dir_syn, "synthetic.csv")
    result_df.to_csv(output_file, index=False)
    print(f"  results saved to: {output_file}")

    print("\nResults:")
    print(f"  best_threshold: {best_threshold:.4f}")
    for k, v in best_metrics.items():
        if isinstance(v, float):
            print(f"  {k}: {v:.4f}")
        else:
            print(f"  {k}: {v}")

# Run the synthetic-data experiment; results are printed and written to disk by the function itself.
process_synthetic_data_usad()


Processing Synthetic Dataset (USAD)...
  Train shape: (2500, 12)
  Test shape: (2500, 12)
  training model...
Epoch [0], val_loss1: 0.0418, val_loss2: 0.0413
Epoch [1], val_loss1: 0.0419, val_loss2: -0.0000
Epoch [2], val_loss1: 0.0422, val_loss2: -0.0140
Epoch [3], val_loss1: 0.0425, val_loss2: -0.0212
Epoch [4], val_loss1: 0.0429, val_loss2: -0.0257
Epoch [5], val_loss1: 0.0434, val_loss2: -0.0289
Epoch [6], val_loss1: 0.0439, val_loss2: -0.0313
Epoch [7], val_loss1: 0.0445, val_loss2: -0.0333
Epoch [8], val_loss1: 0.0451, val_loss2: -0.0350
Epoch [9], val_loss1: 0.0457, val_loss2: -0.0365
Epoch [10], val_loss1: 0.0463, val_loss2: -0.0378
Epoch [11], val_loss1: 0.0470, val_loss2: -0.0391
Epoch [12], val_loss1: 0.0478, val_loss2: -0.0404
Epoch [13], val_loss1: 0.0488, val_loss2: -0.0418
Epoch [14], val_loss1: 0.0501, val_loss2: -0.0434
Epoch [15], val_loss1: 0.0516, val_loss2: -0.0452
Epoch [16], val_loss1: 0.0532, val_loss2: -0.0471
Epoch [17], val_loss1: 0.0546, val_loss2: -0.0487
