In [23]:
import river
import pandas as pd

In [24]:
# Read the resource-events CSV into a DataFrame.
data_path = "../data/resource_events.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

# Quick look at what was loaded: (rows, columns) and the column names.
print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

Dataset shape: (454, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [25]:
# Columns selected as model inputs (order is preserved downstream).
feature_cols = [
    "cpu_ns",
    # *_faults columns
    "user_faults", "kernel_faults",
    # vm_* columns
    "vm_mmap_bytes", "vm_munmap_bytes",
    "vm_brk_grow_bytes", "vm_brk_shrink_bytes",
    # bytes_* columns
    "bytes_written", "bytes_read",
]

# Descriptive columns carried alongside each record; they are not model inputs.
context_cols = [
    "pid", "comm", "user",
    "container_id", "container_image",
    "wall_time_ms",
]

In [26]:
# Split the loaded frame into model inputs (X) and descriptive context.
# Both are copies, so later changes to them do not write back into `df`.
X = df.loc[:, feature_cols].copy()
context = df.loc[:, context_cols].copy()

# Preview the first rows of each part.
for heading, preview in (
    ("\nFeature matrix sample:", X),
    ("\nContext sample:", context),
):
    print(heading)
    display(preview.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,3374785,0,0,0,0,0,0,0,0
1,899677,14,0,0,0,0,0,498,2660
2,0,0,0,0,0,0,0,0,0
3,12700969,2,0,0,0,0,0,0,4
4,3758715,23,1,0,0,0,0,17126,14909



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,237,chronyd,,,,1757364894741
1,4226,containerd-shim,root,,,1757364894741
2,3806,buildkitd,root,,,1757364894741
3,4323,VM Periodic Tas,systemd-network,04242fd9b1ae2cf32adc297f0b79e8120c2833f0e4cfe5...,provectuslabs/kafka-ui:latest,1757364894741
4,7710,main,aleyi,,,1757364894741


In [27]:
# Replace missing feature values with 0 before building the record stream.
X = X.fillna(value=0)

In [28]:
# Build the stream of (features, context) pairs consumed by the detectors.
#
# Rows are paired strictly by position. `X` and `context` were sliced from the
# same frame, so row k of one describes row k of the other. Pairing by position
# (instead of feeding an index *label* from iterrows() into `.iloc`) stays
# correct even if the frame's index is not the default 0..n-1 range, and
# `to_dict(orient="records")` keeps each column's own dtype rather than
# upcasting every row to a single common dtype.
feature_rows = X.to_dict(orient="records")
context_rows = context.to_dict(orient="records")
assert len(feature_rows) == len(context_rows), "features and context must have the same number of rows"

records = list(zip(feature_rows, context_rows))

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 3374785, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 237, 'comm': 'chronyd', 'user': nan, 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757364894741})


In [29]:
from river import compose, preprocessing, anomaly, feature_extraction
import numpy as np

In [30]:
# Four streaming anomaly-detection pipelines. Each one rescales the features
# first and then passes them to a river anomaly detector.

# Half-Space Trees on min-max scaled features.
hst = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(n_trees=25, height=8, window_size=250, seed=42),
)

# Local Outlier Factor on standardized features.
lof = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.LocalOutlierFactor(n_neighbors=35),
)

# One-Class SVM applied directly to the standardized features.
ocsvm_linear = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01),
)
# `model` is a generic alias that ends up bound to the linear One-Class SVM pipeline.
model = ocsvm_linear

# One-Class SVM on top of a seeded RBFSampler feature map.
ocsvm_rbf = compose.Pipeline(
    preprocessing.StandardScaler(),
    feature_extraction.RBFSampler(n_components=50, seed=42),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01),
)

In [31]:

# Score-then-learn pass over the record stream: every detector scores a record
# first and only afterwards learns from it.
detectors = {
    "hst_score": hst,
    "lof_score": lof,
    "ocsvm_linear_score": ocsvm_linear,
    "ocsvm_rbf_score": ocsvm_rbf,
}

results = []
for features, meta in records:
    # Output row layout: context columns, feature columns, then one score per detector.
    scored = {**meta, **features}
    for score_col, detector in detectors.items():
        scored[score_col] = detector.score_one(features)
        detector.learn_one(features)
    results.append(scored)

# NOTE: `df` is rebound here to the scored results (context + features + scores).
df = pd.DataFrame(results)

In [32]:
def summarize_scores(scores, name):
    """Print summary statistics and percentile thresholds for a set of scores.

    Parameters
    ----------
    scores : array-like of numbers
        Scores to summarize (pandas Series, NumPy array, list, ...). Missing
        values (NaN) are dropped first so that the percentiles are computed on
        the same values as the mean/std/min/max.
    name : str
        Label printed as the heading of the summary.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    # Coerce to a float Series so plain lists/arrays work too, then drop NaN:
    # np.percentile would otherwise return NaN as soon as one score is missing.
    values = pd.Series(scores, dtype="float64").dropna()

    print(f"📊 {name}")
    if values.empty:
        # Nothing to summarize; avoid calling np.percentile on an empty array.
        print("No non-missing scores to summarize.\n")
        return

    mean = values.mean()
    std = values.std()  # pandas default: sample standard deviation (ddof=1)
    min_ = values.min()
    max_ = values.max()
    q95 = np.percentile(values, 95)
    q99 = np.percentile(values, 99)
    # Share of scores strictly above each threshold, in percent.
    rate95 = (values > q95).mean() * 100
    rate99 = (values > q99).mean() * 100

    print(f"Mean: {mean:.4f}, Std: {std:.4f}")
    print(f"Min: {min_:.4f}, Max: {max_:.4f}")
    print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate95:.2f}%")
    print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate99:.2f}%\n")

# Print the score summary of each detector in turn.
score_columns = ("hst_score", "lof_score", "ocsvm_linear_score", "ocsvm_rbf_score")
for col in score_columns:
    summarize_scores(df[col], col)

📊 hst_score
Mean: 0.0682, Std: 0.1190
Min: 0.0000, Max: 0.9680
95th percentile: 0.1726 → anomaly rate ~ 4.85%
99th percentile: 0.6755 → anomaly rate ~ 1.10%

📊 lof_score
Mean: 6.9240, Std: 72.4857
Min: 0.0000, Max: 1515.2155
95th percentile: 11.2425 → anomaly rate ~ 5.07%
99th percentile: 40.1253 → anomaly rate ~ 1.10%

📊 ocsvm_linear_score
Mean: 0.0254, Std: 0.3814
Min: -2.5002, Max: 5.2788
95th percentile: 0.1129 → anomaly rate ~ 5.07%
99th percentile: 0.7240 → anomaly rate ~ 1.10%

📊 ocsvm_rbf_score
Mean: 1.7567, Std: 0.2289
Min: 0.0000, Max: 2.0873
95th percentile: 1.9962 → anomaly rate ~ 5.07%
99th percentile: 2.0092 → anomaly rate ~ 1.10%



In [33]:
def top_anomalies(series, q=99, frame=None, id_col="pid"):
    """Return the identifiers of rows whose score exceeds the q-th percentile.

    Parameters
    ----------
    series : pandas.Series
        Scores, indexed like ``frame`` (the boolean mask built from it is used
        to select rows of ``frame``).
    q : float, default 99
        Percentile (0-100) used as the threshold; only scores strictly greater
        than it are selected.
    frame : pandas.DataFrame, optional
        Frame holding the identifier column. Defaults to the notebook-level
        ``df`` when omitted.
    id_col : str, default "pid"
        Column of ``frame`` whose values are returned.

    Returns
    -------
    set
        Distinct ``id_col`` values of the selected rows.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level results frame

    # nanpercentile ignores missing scores; a plain percentile would turn the
    # threshold into NaN (and the result into an empty set) on a single NaN.
    threshold = np.nanpercentile(series, q)
    return set(frame.loc[series > threshold, id_col])

# PIDs flagged by each detector (scores above that detector's default percentile threshold).
anoms_hst, anoms_lof, anoms_lin, anoms_rbf = (
    top_anomalies(df[score_col])
    for score_col in ("hst_score", "lof_score", "ocsvm_linear_score", "ocsvm_rbf_score")
)

# Number of PIDs that pairs of detectors agree on.
overlap_pairs = (
    ("Overlap HST & LOF:", anoms_hst, anoms_lof),
    ("Overlap HST & OCSVM (linear):", anoms_hst, anoms_lin),
    ("Overlap HST & OCSVM (rbf):", anoms_hst, anoms_rbf),
    ("Overlap LOF & OCSVM (linear):", anoms_lof, anoms_lin),
    ("Overlap LOF & OCSVM (rbf):", anoms_lof, anoms_rbf),
)
for label, left, right in overlap_pairs:
    print(label, len(left.intersection(right)))

Overlap HST & LOF: 1
Overlap HST & OCSVM (linear): 1
Overlap HST & OCSVM (rbf): 0
Overlap LOF & OCSVM (linear): 3
Overlap LOF & OCSVM (rbf): 0
