In [16]:
import numpy as np
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import logging
import os
import sys
import warnings
import torch.utils.data as data_utils
from sklearn import preprocessing
from tqdm import tqdm
import os
import time

# Put the directory two levels up (presumably the repository root) on sys.path so the local `spectrum` package can be imported

sys.path.append(os.path.abspath("../.."))
from spectrum.models import USAD
from spectrum.utils import device
from spectrum.utils.random import set_random_state
from spectrum.models import USAD, SpectralResidual
from spectrum.utils import device
# Send INFO-level (and above) log records from the root logger to the cell output.
logging.basicConfig(level=logging.INFO)

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation / numerical warnings — narrow the filter if needed.
warnings.filterwarnings("ignore")

# Plot appearance: apply the seaborn theme first, then the Matplotlib
# overrides, keeping the original ordering (theme before overrides).
sns.set_theme(style="whitegrid")
plt.rcParams["axes.edgecolor"] = "0.3"
plt.rcParams["axes.linewidth"] = 0.8
plt.rcParams["font.size"] = 12
plt.rcParams["axes.titlesize"] = 14
plt.rcParams["axes.labelsize"] = 12
plt.rcParams["axes.titleweight"] = "bold"
plt.rcParams["legend.fontsize"] = 10
plt.rcParams["figure.dpi"] = 120
plt.rcParams["legend.frameon"] = False

# Project helper called with its defaults; presumably seeds the RNGs — the
# actual seed value lives in spectrum.utils.random and is not visible here.
set_random_state()

In [17]:
# Datasets to evaluate: each id maps to datasets/Tencent/{train,test}/<id>.csv.
selected_ids = [1, 2, 3]

# Hyperparameters shared by the hybrid runs in this notebook.
WINDOW_SIZE = 32    # window length used by SR, the USAD sliding windows and the LSTM
HIDDEN_SIZE = 100   # multiplied by WINDOW_SIZE to get the USAD latent size (z_size)
N_EPOCHS = 20       # epoch count handed to the USAD and LSTM models
BATCH_SIZE = 1024   # batch size of the USAD train / validation / test DataLoaders

# Destination of the SR + USAD per-dataset CSVs and summary.csv; created if missing.
results_dir = "../../results/models/sr_usad"
os.makedirs(results_dir, exist_ok=True)

def find_best_threshold(scores, true_labels, thresholds=None):
    """Pick the score cut-off that maximises F1 against ``true_labels``.

    A sample is predicted anomalous when ``score > threshold`` (strictly
    greater). Candidates are tried from highest to lowest and only a strictly
    better F1 replaces the current best, so ties resolve to the highest
    threshold.

    Args:
        scores: 1-D array-like of anomaly scores. NaN is replaced by 0.0 and
            +inf by the largest finite score (1.0 if no score is finite);
            -inf is left to ``np.nan_to_num``'s default replacement.
        true_labels: array-like of 0/1 labels with the same length as
            ``scores``.
        thresholds: optional iterable of candidate thresholds. When omitted,
            candidates are percentiles of ``scores``: every 5th percentile
            from 0 to 85, every integer percentile from 90 to 99, and a fine
            grid from 99.1 to 99.99. Constant scores yield the single
            candidate ``scores[0]``.

    Returns:
        ``(best_threshold, best_metrics)``. ``best_metrics`` has the keys
        ``threshold``, ``accuracy``, ``precision``, ``recall`` and ``f1``.
        When there is no candidate at all (e.g. empty ``scores``) the result
        is ``(0.0, {})``.
    """
    # Accept lists / Series as well as ndarrays. The input dtype is kept so the
    # returned threshold compares against the caller's scores exactly as before.
    scores = np.asarray(scores)
    true_labels = np.asarray(true_labels).astype(int)

    # Sanitise non-finite scores so percentiles and comparisons are well defined.
    finite = np.isfinite(scores)
    if not finite.all():
        ceiling = np.max(scores[finite]) if finite.any() else 1.0
        scores = np.nan_to_num(scores, nan=0.0, posinf=ceiling)

    if thresholds is None:
        if scores.size == 0:
            # np.min / np.percentile raise on empty input; there is nothing to search.
            thresholds = []
        elif np.min(scores) == np.max(scores):
            thresholds = [scores[0]]
        else:
            # Coarse grid over the bulk of the distribution, finer towards the
            # upper tail where anomaly cut-offs usually sit.
            percentiles = [
                *range(0, 90, 5),
                *range(90, 100),
                99.1, 99.3, 99.5, 99.7, 99.9, 99.95, 99.99,
            ]
            thresholds = list(np.percentile(scores, percentiles))

    thresholds = sorted(set(thresholds), reverse=True)
    best_f1 = -1
    best_threshold = thresholds[0] if thresholds else 0.0
    best_metrics = {}

    # Label masks do not depend on the threshold, so build them once.
    is_anomaly = true_labels == 1
    is_normal = true_labels == 0

    for threshold in thresholds:
        flagged = scores > threshold
        TP = (is_anomaly & flagged).sum()
        FP = (is_normal & flagged).sum()
        TN = (is_normal & ~flagged).sum()
        FN = (is_anomaly & ~flagged).sum()

        accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            best_metrics = {'threshold': threshold, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

    return best_threshold, best_metrics

def process_hybrid(data_id):
    """Run the SR + USAD hybrid detector on one Tencent dataset and save the result.

    Spectral Residual (SR) scores the ``system_usage:0`` column of the test
    split. USAD is trained on the remaining feature columns of the train split
    and scores sliding windows of the test split. Each detector gets its own
    threshold from ``find_best_threshold`` and the final prediction is the
    logical OR of the two thresholded predictions.

    Note: both thresholds are tuned against the test labels, so the reported
    metrics are best-case values rather than held-out estimates.

    Args:
        data_id: Dataset identifier; used to build the train/test CSV paths
            and the output file name.

    Returns:
        dict with keys ``id``, ``f1``, ``precision``, ``recall``, ``accuracy``
        (hybrid metrics over the aligned region) plus ``sr_f1`` and
        ``usad_f1`` (best F1 of each detector on its own).

    Side effects:
        Prints progress and writes ``<results_dir>/<data_id>.csv`` containing
        the test frame plus ``predicted``, ``sr_score`` and ``usad_score``.
    """
    print(f"\nProcessing: {data_id} (Hybrid SR + USAD)")

    # Read data
    train_df = pd.read_csv(f"../../datasets/Tencent/train/{data_id}.csv")
    test_df = pd.read_csv(f"../../datasets/Tencent/test/{data_id}.csv")

    # Labels
    test_labels = test_df["label"].to_numpy().astype(int)

    # --- 1. SR on system_usage:0 ---
    sr_col = "system_usage:0"
    sr_scores = np.zeros(len(test_df))

    if sr_col in test_df.columns:
        print(f"  Running SR on {sr_col}...")
        sr_model = SpectralResidual(window_size=WINDOW_SIZE)
        # The model is fed a polars Series; NaN scores are mapped to 0.0.
        sr_scores = np.nan_to_num(sr_model.predict(pl.Series(test_df[sr_col].values)), nan=0.0)
    else:
        print(f"  Warning: {sr_col} missing. SR scores set to 0.")

    # --- 2. USAD on Rest ---
    cols_exclude = ["timestamp", "label", sr_col]
    train_usad = train_df.drop([c for c in cols_exclude if c in train_df.columns], axis=1).astype(float)
    test_usad = test_df.drop([c for c in cols_exclude if c in test_df.columns], axis=1).astype(float)

    print(f"  Running USAD on {train_usad.shape[1]} features...")

    # Normalize: the scaler is fitted on the train split only and reused for test.
    scaler = preprocessing.MinMaxScaler()
    x_train = scaler.fit_transform(train_usad.values)
    x_test = scaler.transform(test_usad.values)

    # Windowing
    def make_windows(data, ws):
        """Return every length-``ws`` sliding window of ``data`` (stride 1)."""
        n = data.shape[0]
        # n == ws still yields exactly one window; only shorter inputs yield none.
        if n < ws:
            return np.empty((0, ws, data.shape[1]))
        idx = np.arange(ws)[None, :] + np.arange(n - ws + 1)[:, None]
        return data[idx]

    train_win = make_windows(x_train, WINDOW_SIZE)
    test_win = make_windows(x_test, WINDOW_SIZE)

    # Flatten each (window, feature) block into a single vector per window.
    w_size = WINDOW_SIZE * x_train.shape[1]
    z_size = WINDOW_SIZE * HIDDEN_SIZE
    train_flat = train_win.reshape(-1, w_size)
    test_flat = test_win.reshape(-1, w_size)

    # DataLoaders: first 80% of the train windows for fitting, the rest for validation.
    split = int(0.8 * len(train_flat))
    train_loader = torch.utils.data.DataLoader(data_utils.TensorDataset(torch.from_numpy(train_flat[:split]).float()), batch_size=BATCH_SIZE, shuffle=False)
    val_loader = torch.utils.data.DataLoader(data_utils.TensorDataset(torch.from_numpy(train_flat[split:]).float()), batch_size=BATCH_SIZE, shuffle=False)
    test_loader = torch.utils.data.DataLoader(data_utils.TensorDataset(torch.from_numpy(test_flat).float()), batch_size=BATCH_SIZE, shuffle=False)

    # Train USAD
    model_usad = USAD(N_EPOCHS, w_size, z_size).to(device())
    model_usad.fit(train_loader, val_loader)

    # Predict USAD: the result is concatenated as a sequence of per-batch tensors.
    res = model_usad.predict(test_loader)
    if len(res) > 0:
        usad_scores = torch.cat(res).cpu().numpy()
    else:
        usad_scores = np.array([])

    # --- 3. Combine ---
    # Assumes one USAD score per window, attributed to the window's last row
    # (hence the WINDOW_SIZE - 1 offset) — TODO confirm against USAD.predict.
    valid_len = len(usad_scores)
    start_idx = WINDOW_SIZE - 1

    sr_scores_aligned = sr_scores[start_idx : start_idx + valid_len]
    labels_aligned = test_labels[start_idx : start_idx + valid_len]

    print(f"  Aligned lengths: {valid_len}")

    # Optimize Thresholds
    print("  Optimizing SR threshold...")
    best_th_sr, metrics_sr = find_best_threshold(sr_scores_aligned, labels_aligned)
    pred_sr = (sr_scores_aligned > best_th_sr).astype(int)
    print(f"    SR F1: {metrics_sr.get('f1', 0):.4f}")

    print("  Optimizing USAD threshold...")
    best_th_usad, metrics_usad = find_best_threshold(usad_scores, labels_aligned)
    pred_usad = (usad_scores > best_th_usad).astype(int)
    print(f"    USAD F1: {metrics_usad.get('f1', 0):.4f}")

    # Logical OR: a point is anomalous if either detector flags it.
    pred_final = (pred_sr | pred_usad).astype(int)

    # Metrics
    TP = ((labels_aligned == 1) & (pred_final == 1)).sum()
    FP = ((labels_aligned == 0) & (pred_final == 1)).sum()
    TN = ((labels_aligned == 0) & (pred_final == 0)).sum()
    FN = ((labels_aligned == 1) & (pred_final == 0)).sum()

    accuracy = (TP + TN) / len(labels_aligned) if len(labels_aligned) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    print(f"  Hybrid Result: F1={f1:.4f}, P={precision:.4f}, R={recall:.4f}")

    # Save Results
    # Rows before the first complete window have no USAD score; their
    # prediction and usad_score are left at 0.
    complete_predictions = np.zeros(len(test_df))
    complete_predictions[start_idx : start_idx + valid_len] = pred_final

    res_df = test_df.copy()
    res_df["predicted"] = complete_predictions
    res_df["sr_score"] = sr_scores

    # usad_score aligned
    usad_scores_full = np.zeros(len(test_df))
    usad_scores_full[start_idx : start_idx + valid_len] = usad_scores
    res_df["usad_score"] = usad_scores_full

    output_file = os.path.join(results_dir, f"{data_id}.csv")
    res_df.to_csv(output_file, index=False)
    print(f"  Saved to {output_file}")

    return {
        'id': data_id,
        'f1': f1,
        'precision': precision,
        'recall': recall,
        'accuracy': accuracy,
        'sr_f1': metrics_sr.get('f1', 0),
        'usad_f1': metrics_usad.get('f1', 0)
    }

# Run the SR + USAD pipeline once per selected dataset and collect the metric dicts.
results = []
for dataset_id in selected_ids:
    run_metrics = process_hybrid(dataset_id)
    results.append(run_metrics)

# One row per dataset; shown rounded, stored at full precision.
summary_df = pd.DataFrame(results)
summary_path = os.path.join(results_dir, "summary.csv")
print("\nSummary:")
print(summary_df.round(4))
summary_df.to_csv(summary_path, index=False)


Processing: 1 (Hybrid SR + USAD)
  Running SR on system_usage:0...
  Running USAD on 4 features...
Epoch [0], val_loss1: 0.2184, val_loss2: 0.2194
Epoch [1], val_loss1: 0.1966, val_loss2: -0.0021
Epoch [2], val_loss1: 0.1633, val_loss2: -0.0588
Epoch [3], val_loss1: 0.1265, val_loss2: -0.0681
Epoch [4], val_loss1: 0.0931, val_loss2: -0.0590
Epoch [5], val_loss1: 0.0685, val_loss2: -0.0468
Epoch [6], val_loss1: 0.0531, val_loss2: -0.0376
Epoch [7], val_loss1: 0.0446, val_loss2: -0.0325
Epoch [8], val_loss1: 0.0410, val_loss2: -0.0314
Epoch [9], val_loss1: 0.0415, val_loss2: -0.0336
Epoch [10], val_loss1: 0.0445, val_loss2: -0.0375
Epoch [11], val_loss1: 0.0450, val_loss2: -0.0387
Epoch [12], val_loss1: 0.0411, val_loss2: -0.0358
Epoch [13], val_loss1: 0.0371, val_loss2: -0.0326
Epoch [14], val_loss1: 0.0371, val_loss2: -0.0328
Epoch [15], val_loss1: 0.0394, val_loss2: -0.0350
Epoch [16], val_loss1: 0.0413, val_loss2: -0.0370
Epoch [17], val_loss1: 0.0417, val_loss2: -0.0376
Epoch [18],

In [18]:
from spectrum.models import LSTM

# Destination of the SR + LSTM per-dataset CSVs and summary.csv; created if missing.
results_dir_lstm = "../../results/models/sr_lstm"
os.makedirs(results_dir_lstm, exist_ok=True)

def process_hybrid_lstm(data_id):
    """Score one Tencent dataset with the SR + LSTM hybrid and persist the outcome.

    Spectral Residual scores the ``system_usage:0`` column of the test split,
    while the project LSTM model is fitted on the min-max scaled remaining
    train columns and scores the test split. Each score stream gets its own
    threshold via ``find_best_threshold`` (tuned on the test labels) and the
    two binary predictions are OR-ed.

    Args:
        data_id: Dataset identifier used in the input CSV paths and the
            output file name.

    Returns:
        dict with ``id``, the hybrid ``f1`` / ``precision`` / ``recall`` /
        ``accuracy`` and the stand-alone ``sr_f1`` / ``lstm_f1``.

    Side effects:
        Prints progress and writes ``<results_dir_lstm>/<data_id>.csv``.
    """
    print(f"\nProcessing: {data_id} (Hybrid SR + LSTM)")

    # Load both splits; labels are taken from the test split.
    train_frame = pd.read_csv(f"../../datasets/Tencent/train/{data_id}.csv")
    test_frame = pd.read_csv(f"../../datasets/Tencent/test/{data_id}.csv")
    n_test = len(test_frame)
    y_true = test_frame["label"].to_numpy().astype(int)

    # --- Stage 1: Spectral Residual on the dedicated column ---
    sr_col = "system_usage:0"
    if sr_col in test_frame.columns:
        print(f"  Running SR on {sr_col}...")
        detector = SpectralResidual(window_size=WINDOW_SIZE)
        raw_sr = detector.predict(pl.Series(test_frame[sr_col].values))
        sr_scores = np.nan_to_num(raw_sr, nan=0.0)
    else:
        print(f"  Warning: {sr_col} missing.")
        # Without the column SR contributes all-zero scores.
        sr_scores = np.zeros(n_test)

    # --- Stage 2: LSTM on every remaining feature column ---
    non_feature_cols = ["timestamp", "label", sr_col]
    train_features = train_frame.drop(
        columns=[name for name in non_feature_cols if name in train_frame.columns]
    )
    test_features = test_frame.drop(
        columns=[name for name in non_feature_cols if name in test_frame.columns]
    )

    print(f"  Running LSTM on {train_features.shape[1]} features...")

    # Scale with statistics from the train split only.
    scaler = preprocessing.MinMaxScaler()
    x_train = scaler.fit_transform(train_features.values)
    x_test = scaler.transform(test_features.values)

    # Every input feature is also listed as an output dimension.
    n_features = x_train.shape[1]
    forecaster = LSTM(
        input_size=n_features,
        output_dims=list(range(n_features)),
        epochs=N_EPOCHS,
        window_size=WINDOW_SIZE,
    )
    forecaster.fit(x_train)
    lstm_scores = forecaster.predict(x_test)

    # --- Stage 3: align and combine ---
    # The LSTM scores are assumed to cover the trailing rows of the test split
    # (verify against LSTM.predict), so everything is aligned to the end.
    valid_len = len(lstm_scores)
    start_idx = n_test - valid_len
    scored = slice(start_idx, start_idx + valid_len)
    sr_tail = sr_scores[scored]
    y_tail = y_true[scored]

    print(f"  Aligned lengths: {valid_len}")

    print("  Optimizing SR threshold...")
    best_th_sr, metrics_sr = find_best_threshold(sr_tail, y_tail)
    pred_sr = (sr_tail > best_th_sr).astype(int)
    print(f"    SR F1: {metrics_sr.get('f1', 0):.4f}")

    print("  Optimizing LSTM threshold...")
    best_th_lstm, metrics_lstm = find_best_threshold(lstm_scores, y_tail)
    pred_lstm = (lstm_scores > best_th_lstm).astype(int)
    print(f"    LSTM F1: {metrics_lstm.get('f1', 0):.4f}")

    # A point is anomalous when either detector flags it.
    pred_final = (pred_sr | pred_lstm).astype(int)

    def ratio(numerator, denominator):
        """Divide, falling back to 0 when the denominator is not positive."""
        return numerator / denominator if denominator > 0 else 0

    # Confusion-matrix counts over the aligned region.
    is_anomaly, is_normal = y_tail == 1, y_tail == 0
    flagged, cleared = pred_final == 1, pred_final == 0
    TP = (is_anomaly & flagged).sum()
    FP = (is_normal & flagged).sum()
    TN = (is_normal & cleared).sum()
    FN = (is_anomaly & cleared).sum()

    accuracy = ratio(TP + TN, len(y_tail))
    precision = ratio(TP, TP + FP)
    recall = ratio(TP, TP + FN)
    f1 = ratio(2 * precision * recall, precision + recall)

    print(f"  Hybrid Result: F1={f1:.4f}, P={precision:.4f}, R={recall:.4f}")

    # Rows outside the scored region keep prediction 0 and LSTM score 0.
    complete_predictions = np.zeros(n_test)
    complete_predictions[scored] = pred_final
    lstm_scores_full = np.zeros(n_test)
    lstm_scores_full[scored] = lstm_scores

    res_df = test_frame.assign(
        predicted=complete_predictions,
        sr_score=sr_scores,
        lstm_score=lstm_scores_full,
    )
    res_df.to_csv(os.path.join(results_dir_lstm, f"{data_id}.csv"), index=False)
    print(f"  Saved to {results_dir_lstm}/{data_id}.csv")

    return dict(
        id=data_id,
        f1=f1,
        precision=precision,
        recall=recall,
        accuracy=accuracy,
        sr_f1=metrics_sr.get('f1', 0),
        lstm_f1=metrics_lstm.get('f1', 0),
    )

# Run the SR + LSTM pipeline once per selected dataset and collect the metric dicts.
results_lstm = []
for dataset_id in selected_ids:
    results_lstm.append(process_hybrid_lstm(dataset_id))

# Build the summary table once; show it rounded, store it at full precision.
lstm_summary = pd.DataFrame(results_lstm)
print("\nLSTM Hybrid Summary:")
print(lstm_summary.round(4))
lstm_summary.to_csv(os.path.join(results_dir_lstm, "summary.csv"), index=False)


Processing: 1 (Hybrid SR + LSTM)
  Running SR on system_usage:0...
  Running LSTM on 4 features...
  Aligned lengths: 4288
  Optimizing SR threshold...
    SR F1: 0.7143
  Optimizing LSTM threshold...
    LSTM F1: 0.7143
  Hybrid Result: F1=1.0000, P=1.0000, R=1.0000
  Saved to ../../results/models/sr_lstm/1.csv

Processing: 2 (Hybrid SR + LSTM)
  Running SR on system_usage:0...
  Running LSTM on 4 features...
  Aligned lengths: 4288
  Optimizing SR threshold...
    SR F1: 0.7143
  Optimizing LSTM threshold...
    LSTM F1: 0.7143
  Hybrid Result: F1=1.0000, P=1.0000, R=1.0000
  Saved to ../../results/models/sr_lstm/2.csv

Processing: 3 (Hybrid SR + LSTM)
  Running SR on system_usage:0...
  Running LSTM on 4 features...
  Aligned lengths: 4288
  Optimizing SR threshold...
    SR F1: 0.0000
  Optimizing LSTM threshold...
    LSTM F1: 0.0000
  Hybrid Result: F1=0.0000, P=0.0000, R=0.0000
  Saved to ../../results/models/sr_lstm/3.csv

LSTM Hybrid Summary:
   id   f1  precision  recall  ac