# Time-Adaptive Anomaly Detection

This notebook addresses the issue of increasing anomaly scores over time by implementing:
1. Sliding window approach
2. Proper data normalization
3. Alternative anomaly detection methods

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from xgboost import XGBRegressor
from sklearn.metrics import roc_auc_score
import time

# Assuming you have these custom modules
from dataset import get_df_action, get_features_ts, get_train_test_data
from plots import plot_anomalies, plot_anomalies_over_time

# Apply matplotlib's "Solarize_Light2" style sheet to all figures created below
plt.style.use("Solarize_Light2")

In [None]:
# Root directories of the dataset (relative paths): one for the normal
# recordings and one for the collision recordings. The trailing comments keep
# the alternative Google Drive locations.
ROOTDIR_DATASET_NORMAL =  '../../dataset/normal'#'/content/drive/MyDrive/Kuka_v1/normal'
ROOTDIR_DATASET_ANOMALY = '../../dataset/collisions'#'/content/drive/MyDrive/Kuka_v1/collisions'

# TF_ENABLE_ONEDNN_OPTS=0 means that the model will not use the oneDNN library for optimization
# NOTE(review): TensorFlow is not imported in the visible cells — confirm this setting is still needed.

import os
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

In [None]:
# Period string embedded in the recording file names ("..._rbtc_{freq}s.csv");
# presumably the sampling period in seconds.
# Alternative values kept from the original cell: '1.0', '0.01', '0.005'.
freq = '0.1'


def _recording_paths(root, recordings, tag=""):
    """Build the CSV and metadata paths of the given recordings.

    Parameters
    ----------
    root : str
        Directory that contains the recording files.
    recordings : iterable of int
        Recording numbers (the ``r`` in ``rec{r}...``).
    tag : str, optional
        Text inserted right after ``rec{r}`` (``"_collision"`` for the
        collision recordings, empty for the normal ones).

    Returns
    -------
    tuple of (list of str, list of str)
        ``(csv_paths, metadata_paths)``, both in the order of ``recordings``.
    """
    stems = [os.path.join(root, f"rec{r}{tag}_20220811_rbtc_{freq}s") for r in recordings]
    return [stem + ".csv" for stem in stems], [stem + ".metadata" for stem in stems]


# NORMAL DATA
filepath_csv, filepath_meta = _recording_paths(ROOTDIR_DATASET_NORMAL, [0, 2, 3, 4])
df_action, df, df_meta, action2int = get_df_action(filepath_csv, filepath_meta)


# COLLISION DATA
# The context manager closes the workbook once both sheets have been read.
with pd.ExcelFile(os.path.join(ROOTDIR_DATASET_ANOMALY, "20220811_collisions_timestamp.xlsx")) as xls:
    collision_rec1 = pd.read_excel(xls, sheet_name='rec1')
    collision_rec5 = pd.read_excel(xls, sheet_name='rec5')

collisions = pd.concat([collision_rec1, collision_rec5])
# Keep the rows flagged "i" in 'Inizio/fine' (presumably the collision starts)
# and shift their timestamps back by two hours.
# NOTE(review): the 2 h shift looks like a time-zone correction — confirm.
is_start = collisions['Inizio/fine'] == "i"
collisions_init = collisions.loc[is_start, 'Timestamp'] - pd.Timedelta(hours=2)

# Both collision recordings together ...
filepath_csv, filepath_meta = _recording_paths(ROOTDIR_DATASET_ANOMALY, [1, 5], tag="_collision")
df_action_collision, df_collision, df_meta_collision, action2int_collision = get_df_action(filepath_csv, filepath_meta)

# ... and each collision recording on its own.
filepath_csv, filepath_meta = _recording_paths(ROOTDIR_DATASET_ANOMALY, [1], tag="_collision")
df_action_collision_1, df_collision_1, df_meta_collision_1, action2int_collision_1 = get_df_action(filepath_csv, filepath_meta)

filepath_csv, filepath_meta = _recording_paths(ROOTDIR_DATASET_ANOMALY, [5], tag="_collision")
df_action_collision_5, df_collision_5, df_meta_collision_5, action2int_collision_5 = get_df_action(filepath_csv, filepath_meta)

In [None]:
start_time = time.time()
# freq is the period string used in the file names; its reciprocal is passed
# to the feature extractor as the frequency.
frequency = 1 / float(freq)


def _statistical_features(actions, meta, mapping):
    """Run get_features_ts in "statistical" mode at the notebook's frequency."""
    return get_features_ts("statistical", actions, meta, frequency, mapping)


df_features = _statistical_features(df_action, df_meta, action2int)
df_features_collision = _statistical_features(
    df_action_collision, df_meta_collision, action2int_collision
)
df_features_collision_1 = _statistical_features(
    df_action_collision_1, df_meta_collision_1, action2int_collision_1
)
df_features_collision_5 = _statistical_features(
    df_action_collision_5, df_meta_collision_5, action2int_collision_5
)

# Report how long feature extraction took
elapsed = time.time() - start_time
print("--- %s seconds ---" % elapsed)

In [None]:
def _split_against(collision_features):
    """Pair the normal-recording features with one collision feature set."""
    return get_train_test_data(df_features, collision_features, full_normal=True)


# One split per collision set: both recordings together, then rec1 and rec5 alone
X_train, y_train, X_test, y_test = _split_against(df_features_collision)
X_train1, y_train1, X_test1, y_test1 = _split_against(df_features_collision_1)
X_train5, y_train5, X_test5, y_test5 = _split_against(df_features_collision_5)

In [None]:
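# Data loading and preprocessing are done in the cells above, which produce
# X_train / X_test (features) and y_train / y_test (labels).

# The sections below assume X_train and X_test are pandas DataFrames
# (they are sliced with .iloc and indexed by column name). A DatetimeIndex
# is assumed so that the feature plots have a meaningful time axis.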

## 1. Sliding Window Approach

In [None]:
def sliding_window_anomaly_detection(X_train, X_test, window_size=1000, stride=100):
    """Score ``X_test`` chunk by chunk with a model refit on a window of ``X_train``.

    For every chunk of ``stride`` consecutive test rows starting at position
    ``i``, a window of training rows is selected, a StandardScaler and an
    XGBRegressor are refit on that window (the regressor is trained to
    reproduce its own scaled input), and the per-row mean squared difference
    between the scaled test chunk and the model's prediction is used as the
    anomaly score.

    The training window nominally ends at ``i``, but it is clamped so that it
    is never empty: it always ends between ``window_size`` and
    ``len(X_train)`` and spans up to ``window_size`` rows. Without the clamp
    the very first chunk (``i == 0``) would select zero training rows and the
    scaler would fail, as would any chunk starting more than ``window_size``
    rows past the end of ``X_train``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected by position.
    X_test : pandas.DataFrame
        Features to score, with the same columns as ``X_train``.
    window_size : int, default 1000
        Maximum number of training rows used for each refit.
    stride : int, default 100
        Number of test rows scored per refit.

    Returns
    -------
    numpy.ndarray
        One score per row of ``X_test``, in row order.

    Raises
    ------
    ValueError
        If ``window_size`` or ``stride`` is smaller than 1, or if ``X_train``
        is empty while ``X_test`` is not.
    """
    if window_size < 1 or stride < 1:
        raise ValueError("window_size and stride must be positive integers")

    n_train = len(X_train)
    n_test = len(X_test)
    if n_test == 0:
        return np.array([])
    if n_train == 0:
        raise ValueError("X_train must contain at least one row")

    anomaly_scores = []
    scaler = StandardScaler()
    model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

    for i in range(0, n_test, stride):
        # Select the window of training data, clamped to stay non-empty and
        # inside X_train (see docstring).
        window_end = min(max(i, window_size), n_train)
        window_start = max(0, window_end - window_size)
        train_window = X_train.iloc[window_start:window_end]

        # Normalize with statistics of the training window only
        train_scaled = scaler.fit_transform(train_window)
        test_scaled = scaler.transform(X_test.iloc[i:i + stride])

        # Train the model to reconstruct its own input
        model.fit(train_scaled, train_scaled)

        # Reconstruction error. The reshape keeps a 1-D prediction (single
        # feature) from broadcasting against the (n, 1) input.
        test_pred = np.asarray(model.predict(test_scaled)).reshape(test_scaled.shape)
        mse = np.mean(np.power(test_scaled - test_pred, 2), axis=1)
        anomaly_scores.extend(mse)

    return np.array(anomaly_scores)

# Score the test set with the sliding-window model (default window and stride)
sliding_window_scores = sliding_window_anomaly_detection(X_train, X_test)

# Scores in test-row order
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(sliding_window_scores)
ax.set_title('Anomaly Scores using Sliding Window Approach')
ax.set_xlabel('Sample Index')
ax.set_ylabel('Anomaly Score')
plt.show()

## 2. Alternative Anomaly Detection Methods

In [None]:
def isolation_forest_detection(X_train, X_test):
    """Return Isolation Forest anomaly scores for ``X_test``.

    Both sets are standardised with a StandardScaler fit on ``X_train`` only,
    and the forest is fit on the standardised training set.

    Returns the negated ``score_samples`` output, one value per row of
    ``X_test``. Per the scikit-learn documentation ``score_samples`` is lower
    for more abnormal points, so larger returned values mean more anomalous.
    """
    forest = IsolationForest(contamination=0.1, random_state=42)
    standardiser = StandardScaler()

    train_std = standardiser.fit_transform(X_train)
    test_std = standardiser.transform(X_test)
    forest.fit(train_std)

    # Flip the sign so that higher means more anomalous
    normality = forest.score_samples(test_std)
    return -normality

def local_outlier_factor_detection(X_train, X_test):
    """Return Local Outlier Factor anomaly scores for ``X_test``.

    Both sets are standardised with a StandardScaler fit on ``X_train`` only.
    The detector is created with ``novelty=True`` and fit on the standardised
    training set, then used to score the unseen test rows.

    Returns the negated ``score_samples`` output, one value per row of
    ``X_test``. Per the scikit-learn documentation ``score_samples`` is lower
    for more abnormal points, so larger returned values mean more anomalous.
    """
    detector = LocalOutlierFactor(n_neighbors=20, contamination=0.1, novelty=True)
    standardiser = StandardScaler()

    train_std = standardiser.fit_transform(X_train)
    test_std = standardiser.transform(X_test)
    detector.fit(train_std)

    # Flip the sign so that higher means more anomalous
    normality = detector.score_samples(test_std)
    return -normality

# Score the test set with the two alternative detectors
if_scores = isolation_forest_detection(X_train, X_test)
lof_scores = local_outlier_factor_detection(X_train, X_test)

# One panel per method, stacked vertically, in test-row order
fig, axes = plt.subplots(3, 1, figsize=(12, 10))
panels = (
    ('XGBoost with Sliding Window', sliding_window_scores),
    ('Isolation Forest', if_scores),
    ('Local Outlier Factor', lof_scores),
)
for ax, (title, scores) in zip(axes, panels):
    ax.plot(scores)
    ax.set_title(title)
fig.tight_layout()
plt.show()

## 3. Evaluation and Comparison

In [None]:
def evaluate_anomaly_detection(y_true, anomaly_scores):
    """Return the ROC AUC of ``anomaly_scores`` against the labels ``y_true``.

    Thin wrapper around ``roc_auc_score``; both arguments are passed through
    unchanged.
    """
    return roc_auc_score(y_true, anomaly_scores)

# y_test is taken to be the ground-truth anomaly labels of X_test
scores_by_method = {
    "XGBoost with Sliding Window": sliding_window_scores,
    "Isolation Forest": if_scores,
    "Local Outlier Factor": lof_scores,
}

print("AUC-ROC Scores:")
for method_name, method_scores in scores_by_method.items():
    print(f"{method_name}: {evaluate_anomaly_detection(y_test, method_scores):.4f}")

## 4. Analyzing Time Dependency

In [None]:
def plot_feature_over_time(X, feature_name):
    """Plot column ``feature_name`` of ``X`` against the index of ``X``.

    Draws a 12x6 line plot titled "<feature_name> over Time" and shows it.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(X.index, X[feature_name])
    ax.set_title(f'{feature_name} over Time')
    ax.set_xlabel('Time')
    ax.set_ylabel('Value')
    plt.show()

# Plot a few important features over time.
# Names that are not columns of X_test are skipped. If none of them match
# (e.g. the placeholders below were never replaced), the first three columns
# are plotted instead so the cell does not fail with a KeyError.
important_features = ['feature1', 'feature2', 'feature3']  # Replace with actual feature names
features_to_plot = [name for name in important_features if name in X_test.columns]
if not features_to_plot:
    features_to_plot = list(X_test.columns[:3])

for feature in features_to_plot:
    plot_feature_over_time(X_test, feature)

## 5. Conclusion and Next Steps

Based on the results of these different approaches, we can draw the following conclusions:

1. If the sliding window approach significantly reduces the upward trend in anomaly scores, this suggests that your data drifts over time.

2. If Isolation Forest or Local Outlier Factor performs better (higher AUC-ROC), these methods may be more suitable for your specific dataset.

3. The feature plots over time can help identify if certain features are causing the time-dependent behavior.

Next steps:
1. Fine-tune the best performing model (adjust window size, number of estimators, etc.)
2. Investigate any features showing strong time dependency and consider how to handle them (e.g., detrending, differencing)
3. Consider implementing a more advanced time series anomaly detection method if the time component is crucial
4. Regularly retrain your model on recent data to adapt to potential concept drift