In [1]:
import river
import pandas as pd

In [2]:
data_path = "../data/resource_events2.csv"
df = pd.read_csv(data_path)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

Dataset shape: (5024, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [3]:
feature_cols = [
    "cpu_ns",
    "user_faults",
    "kernel_faults",
    "vm_mmap_bytes",
    "vm_munmap_bytes",
    "vm_brk_grow_bytes",
    "vm_brk_shrink_bytes",
    "bytes_written",
    "bytes_read"
]

context_cols = [
    "pid",
    "comm",
    "user",
    "container_id",
    "container_image",
    "wall_time_ms"
]

In [4]:
X = df[feature_cols].copy()
context = df[context_cols].copy()

print("\nFeature matrix sample:")
display(X.head())

print("\nContext sample:")
display(context.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,562689,2,0,0,0,0,0,0,0
3,275860759,2757,5,0,0,0,0,304006,173388
4,0,0,0,0,0,0,0,0,0



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,307,weston,aleyi,,,1757522094277
1,680,systemd-logind,root,,,1757522094277
2,2444,buildkitd,root,,,1757522094277
3,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522094277
4,2461,Relay(1457),root,,,1757522094277


In [5]:
X = X.fillna(0)

In [6]:
records = []
for i, row in X.iterrows():
    record = row.to_dict()
    # Keep context attached
    records.append((record, context.iloc[i].to_dict()))

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 0, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 307, 'comm': 'weston', 'user': 'aleyi', 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757522094277})


In [7]:
from river import compose, preprocessing, anomaly
import numpy as np

In [8]:
model = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(
        n_trees=25,
        height=8,
        window_size=250,
        seed=42
    )
)

scores = []

for features, meta in records:
    score = model.score_one(features)  # anomaly score
    model.learn_one(features)          # update model
    
    record_out = {**meta, **features, "anomaly_score": score}
    scores.append(record_out)

In [9]:
results = pd.DataFrame(scores)
mean_score = results["anomaly_score"].mean()
std_score = results["anomaly_score"].std()
min_score = results["anomaly_score"].min()
max_score = results["anomaly_score"].max()

q95 = np.percentile(results["anomaly_score"], 95)
q99 = np.percentile(results["anomaly_score"], 99)

rate_q95 = (results["anomaly_score"] > q95).mean()
rate_q99 = (results["anomaly_score"] > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 0.0382, Std: 0.0988
Min: 0.0000, Max: 0.9950
95th percentile: 0.0747 → anomaly rate ~ 4.78%
99th percentile: 0.6720 → anomaly rate ~ 1.02%


In [10]:
top_anomalies = results.sort_values("anomaly_score", ascending=False).head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
622,7414,apt.systemd.dai,root,,,1757521306063,347785956,12977,930,425988070,136538932,11952128,53248,5206627,15270070,0.994984
364,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521266173,273437862,9400,11,50339840,0,0,0,648924,174294,0.956053
420,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521276113,193046777,16290,14,0,0,0,0,669786,178912,0.941874
3885,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521884851,325590752,20877,1,0,0,0,0,788837,181584,0.911238
4512,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521994527,341435748,20719,22,0,0,0,0,1187378,178632,0.911207
3027,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521735215,314573875,21823,8,16777216,0,0,0,964279,180617,0.911207
3198,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521765148,359909533,23001,16,65011712,0,0,0,1122034,180186,0.911207
2798,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521695278,281736385,19316,7,0,0,0,0,632135,179420,0.892321
4284,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521954716,283691550,22922,21,0,0,0,0,1318073,181270,0.892156
4113,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521924779,270658970,21942,3,52428800,0,0,0,899197,179312,0.890529
