In [1]:
import river
import pandas as pd

In [2]:
# Load the captured event table from the repository's data directory.
data_path = "../data/resource_events2.csv"
df = pd.read_csv(data_path)

# Sanity check: size of the table and the columns that are available.
n_rows_cols = df.shape
column_names = list(df.columns)
print("Dataset shape:", n_rows_cols)
print("Columns:", column_names)

Dataset shape: (5024, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [3]:
# Numeric columns that are scored by the anomaly detectors further down.
# Grouped by the prefix/suffix of the column names for readability.
feature_cols = [
    # CPU
    "cpu_ns",
    # page faults
    "user_faults", "kernel_faults",
    # virtual-memory calls (mmap / munmap / brk)
    "vm_mmap_bytes", "vm_munmap_bytes",
    "vm_brk_grow_bytes", "vm_brk_shrink_bytes",
    # I/O volume
    "bytes_written", "bytes_read",
]

# Identifying columns that travel alongside each record so a score can be
# traced back to its source row; they are not part of the feature matrix.
context_cols = [
    "pid", "comm", "user",
    "container_id", "container_image",
    "wall_time_ms",
]

In [4]:
# Split the raw frame into the model inputs and the identifying context.
# Both are copies, so later edits to them do not touch `df`.
X = df.loc[:, feature_cols].copy()
context = df.loc[:, context_cols].copy()

# Preview the first rows of each half.
for heading, preview in (
    ("\nFeature matrix sample:", X),
    ("\nContext sample:", context),
):
    print(heading)
    display(preview.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,562689,2,0,0,0,0,0,0,0
3,275860759,2757,5,0,0,0,0,304006,173388
4,0,0,0,0,0,0,0,0,0



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,307,weston,aleyi,,,1757522094277
1,680,systemd-logind,root,,,1757522094277
2,2444,buildkitd,root,,,1757522094277
3,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522094277
4,2461,Relay(1457),root,,,1757522094277


In [5]:
# Replace missing feature values with 0 before the rows are turned into dicts.
X = X.fillna(value=0)

In [6]:
# Build one (features, context) tuple per row.
#
# X and context are both column-slices of the same frame, so they share row
# order. Pairing them *by position* keeps the two halves in sync whatever the
# index looks like; the previous `iterrows()` + `context.iloc[i]` mixed index
# labels with positional lookup and was only correct for a default RangeIndex.
# `to_dict(orient="records")` also avoids building a Series per row.
feature_dicts = X.to_dict(orient="records")
context_dicts = context.to_dict(orient="records")
records = list(zip(feature_dicts, context_dicts))

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 0, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 307, 'comm': 'weston', 'user': 'aleyi', 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757522094277})


In [7]:
from river import compose, preprocessing, anomaly, feature_extraction
import numpy as np

In [8]:
# Four streaming anomaly detectors, each wrapped in its own pipeline so that
# scaling state is never shared between models.
#
# NOTE: the earlier `lof = model = ...` / `ocsvm_linear = model = ...` chained
# assignments were dropped: they only created a stray global `model` that
# ended up aliasing the linear OCSVM pipeline.

# Half-Space Trees: river's implementation assumes features in [0, 1] unless
# explicit limits are given, hence the MinMaxScaler in front of it.
hst = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(
        n_trees=25,
        height=8,
        window_size=250,
        seed=42
    )
)

# Local Outlier Factor on standardized features.
lof = compose.Pipeline(
    preprocessing.StandardScaler(),   # <-- better for LOF
    anomaly.LocalOutlierFactor(n_neighbors=35)
)

# One-class SVM directly on standardized features (linear decision function).
ocsvm_linear = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01)
)

# Same one-class SVM, but fed 50 random RBF features (RBFSampler) instead of
# the raw standardized columns.
ocsvm_rbf = compose.Pipeline(
    preprocessing.StandardScaler(),
    feature_extraction.RBFSampler(n_components=50, seed=42),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01)
)

In [9]:

# Output column name -> detector. Insertion order fixes both the order in
# which the models are called and the order of the score columns.
detectors = {
    "hst_score": hst,
    "lof_score": lof,
    "ocsvm_linear_score": ocsvm_linear,
    "ocsvm_rbf_score": ocsvm_rbf,
}

# Single pass over the stream: every record is scored by a model *before*
# that model learns from it.
results = []
for features, meta in records:
    row = {**meta, **features}
    for score_col, detector in detectors.items():
        row[score_col] = detector.score_one(features)
        detector.learn_one(features)
    results.append(row)

# NOTE: `df` is rebound here from the raw CSV frame to the scored frame
# (context + features + one score column per detector); later cells read it.
df = pd.DataFrame(results)

In [10]:
def summarize_scores(scores, name):
    """Print descriptive statistics and percentile cut-offs for one score column.

    Parameters
    ----------
    scores : pandas.Series
        Anomaly scores produced by a single detector.
    name : str
        Label printed as the heading of the summary.

    Notes
    -----
    * Percentiles are computed with ``np.nanpercentile`` so that missing
      scores are ignored, matching pandas' ``mean``/``std``/``min``/``max``
      which skip NaN by default. With ``np.percentile`` a single NaN made
      both cut-offs (and therefore both rates) meaningless.
    * The "anomaly rate" is the share of scores strictly above the sample's
      own 95th/99th percentile, so it is roughly 5% / 1% by construction
      (lower when many scores tie at the cut-off). It is a sanity check on
      the cut-off, not an independent estimate of how anomalous the data is.
    """
    mean = scores.mean()
    std = scores.std()
    min_ = scores.min()
    max_ = scores.max()
    # NaN-aware cut-offs; identical to np.percentile when there are no NaNs.
    q95, q99 = np.nanpercentile(scores, [95, 99])
    # NaN scores compare False, so they are never counted as anomalies.
    rate95 = (scores > q95).mean() * 100
    rate99 = (scores > q99).mean() * 100
    print(f"📊 {name}")
    print(f"Mean: {mean:.4f}, Std: {std:.4f}")
    print(f"Min: {min_:.4f}, Max: {max_:.4f}")
    print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate95:.2f}%")
    print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate99:.2f}%\n")

# Print the summary for each detector's score column in turn.
score_columns = ("hst_score", "lof_score", "ocsvm_linear_score", "ocsvm_rbf_score")
for score_col in score_columns:
    summarize_scores(df[score_col], score_col)

📊 hst_score
Mean: 0.0382, Std: 0.0988
Min: 0.0000, Max: 0.9950
95th percentile: 0.0747 → anomaly rate ~ 4.78%
99th percentile: 0.6720 → anomaly rate ~ 1.02%

📊 lof_score
Mean: 6.5968, Std: 351.2722
Min: 0.0000, Max: 24893.0546
95th percentile: 2.6994 → anomaly rate ~ 5.02%
99th percentile: 8.5252 → anomaly rate ~ 1.02%

📊 ocsvm_linear_score
Mean: 0.0170, Std: 0.7714
Min: -0.2228, Max: 48.7503
95th percentile: 0.0298 → anomaly rate ~ 5.02%
99th percentile: 0.2314 → anomaly rate ~ 1.02%

📊 ocsvm_rbf_score
Mean: 3.5407, Std: 1.5292
Min: 0.0000, Max: 6.9829
95th percentile: 6.2886 → anomaly rate ~ 5.02%
99th percentile: 6.7174 → anomaly rate ~ 1.02%



In [11]:
def top_anomalies(series, q=99, frame=None, id_col="pid"):
    """Return the set of identifiers whose score exceeds the q-th percentile.

    Parameters
    ----------
    series : pandas.Series
        Scores to threshold; must share its index with ``frame``.
    q : float, default 99
        Percentile (0-100) of ``series`` used as the cut-off. Only scores
        strictly greater than the cut-off are selected.
    frame : pandas.DataFrame, optional
        Frame holding the identifier column. Defaults to the notebook-level
        ``df`` so existing calls keep working; pass it explicitly to avoid
        depending on whatever ``df`` is currently bound to.
    id_col : str, default "pid"
        Column of ``frame`` whose values are collected.

    Returns
    -------
    set
        Distinct ``id_col`` values of the selected rows. Several rows that
        share an identifier collapse into one element.
    """
    if frame is None:
        frame = df  # fall back to the notebook's scored frame
    # NaN-aware cut-off: with np.percentile a single NaN score would make the
    # threshold NaN and silently select nothing.
    cutoff = np.nanpercentile(series, q)
    return set(frame.loc[series > cutoff, id_col])

# PIDs flagged by each detector at the default (99th percentile) cut-off.
anoms_hst, anoms_lof, anoms_lin, anoms_rbf = (
    top_anomalies(df[score_col])
    for score_col in ("hst_score", "lof_score", "ocsvm_linear_score", "ocsvm_rbf_score")
)

# Pairwise agreement: number of PIDs flagged by both detectors of a pair.
overlap_report = (
    ("Overlap HST & LOF:", anoms_hst, anoms_lof),
    ("Overlap HST & OCSVM (linear):", anoms_hst, anoms_lin),
    ("Overlap HST & OCSVM (rbf):", anoms_hst, anoms_rbf),
    ("Overlap LOF & OCSVM (linear):", anoms_lof, anoms_lin),
    ("Overlap LOF & OCSVM (rbf):", anoms_lof, anoms_rbf),
)
for label, left, right in overlap_report:
    print(label, len(left & right))

Overlap HST & LOF: 3
Overlap HST & OCSVM (linear): 3
Overlap HST & OCSVM (rbf): 0
Overlap LOF & OCSVM (linear): 7
Overlap LOF & OCSVM (rbf): 10


In [12]:
# Show the set of PIDs that Half-Space Trees flagged above its 99th-percentile score.
display(anoms_hst)

{1292, 3798, 7414}