In [10]:
import river
import pandas as pd

In [11]:
data_path = "../data/resource_events.csv"
df = pd.read_csv(data_path)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

Dataset shape: (454, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [12]:
feature_cols = [
    "cpu_ns",
    "user_faults",
    "kernel_faults",
    "vm_mmap_bytes",
    "vm_munmap_bytes",
    "vm_brk_grow_bytes",
    "vm_brk_shrink_bytes",
    "bytes_written",
    "bytes_read"
]

context_cols = [
    "pid",
    "comm",
    "user",
    "container_id",
    "container_image",
    "wall_time_ms"
]

In [13]:
X = df[feature_cols].copy()
context = df[context_cols].copy()

print("\nFeature matrix sample:")
display(X.head())

print("\nContext sample:")
display(context.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,3374785,0,0,0,0,0,0,0,0
1,899677,14,0,0,0,0,0,498,2660
2,0,0,0,0,0,0,0,0,0
3,12700969,2,0,0,0,0,0,0,4
4,3758715,23,1,0,0,0,0,17126,14909



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,237,chronyd,,,,1757364894741
1,4226,containerd-shim,root,,,1757364894741
2,3806,buildkitd,root,,,1757364894741
3,4323,VM Periodic Tas,systemd-network,04242fd9b1ae2cf32adc297f0b79e8120c2833f0e4cfe5...,provectuslabs/kafka-ui:latest,1757364894741
4,7710,main,aleyi,,,1757364894741


In [14]:
X = X.fillna(0)

In [15]:
records = []
for i, row in X.iterrows():
    record = row.to_dict()
    # Keep context attached
    records.append((record, context.iloc[i].to_dict()))

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 3374785, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 237, 'comm': 'chronyd', 'user': nan, 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757364894741})


In [16]:
from river import compose, preprocessing, anomaly
import numpy as np

In [17]:
model = compose.Pipeline(
    preprocessing.MinMaxScaler(),
    anomaly.HalfSpaceTrees(
        n_trees=25,
        height=8,
        window_size=250,
        seed=42
    )
)

scores = []

for features, meta in records:
    score = model.score_one(features)  # anomaly score
    model.learn_one(features)          # update model
    
    record_out = {**meta, **features, "anomaly_score": score}
    scores.append(record_out)

In [19]:
results = pd.DataFrame(scores)
mean_score = results["anomaly_score"].mean()
std_score = results["anomaly_score"].std()
min_score = results["anomaly_score"].min()
max_score = results["anomaly_score"].max()

q95 = np.percentile(results["anomaly_score"], 95)
q99 = np.percentile(results["anomaly_score"], 99)

rate_q95 = (results["anomaly_score"] > q95).mean()
rate_q99 = (results["anomaly_score"] > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 0.0682, Std: 0.1190
Min: 0.0000, Max: 0.9680
95th percentile: 0.1726 → anomaly rate ~ 4.85%
99th percentile: 0.6755 → anomaly rate ~ 1.10%


In [20]:
top_anomalies = results.sort_values("anomaly_score", ascending=False).head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
449,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364874742,348349489,18452,5,47185920,0,0,0,674030,183123,0.968013
396,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364864718,325719769,8720,5,8388608,0,0,0,576286,174232,0.9541
290,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364844725,353755936,5125,3,31457280,0,0,0,297237,176937,0.869841
343,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364854727,368097514,4861,7,15204352,0,0,0,328007,174000,0.863321
326,3343,dockerd,root,,,1757364854727,18194130,38,0,0,0,0,0,133389,2784570,0.721252
274,3343,dockerd,root,,,1757364844724,15963104,246,0,0,0,0,0,95060,394547,0.634856
289,3145,init,root,,,1757364844725,34101546,4,0,0,0,0,0,280114,269508,0.550442
327,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364854727,148282893,3184,0,18559120,16670720,0,0,11529,15608,0.474294
433,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364874742,140733148,924,0,4730880,4685824,0,0,11915,10668,0.474294
275,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364844724,155888882,2062,0,9843152,8306688,0,0,11207,10233,0.474294
