In [21]:
import river
import pandas as pd

In [22]:
# Location of the resource-event export, relative to the notebook directory.
data_path = "../data/resource_events.csv"

# Read the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=data_path)

# Sanity check right after loading: (rows, columns) and the column names.
print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

Dataset shape: (454, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [23]:
# Columns handed to the anomaly detector as model inputs.
feature_cols = [
    # CPU counter
    "cpu_ns",
    # fault counters
    "user_faults", "kernel_faults",
    # vm_* byte counters
    "vm_mmap_bytes", "vm_munmap_bytes",
    "vm_brk_grow_bytes", "vm_brk_shrink_bytes",
    # read/write byte counters
    "bytes_written", "bytes_read",
]

# Columns that are not modelled but are kept next to every record so a
# score can be traced back to the row it came from.
context_cols = [
    "pid", "comm", "user",
    "container_id", "container_image",
    "wall_time_ms",
]

In [24]:
# Split the loaded frame into model inputs (X) and descriptive metadata
# (context). Both are copies, so later edits do not write back into `df`.
X = df.loc[:, feature_cols].copy()
context = df.loc[:, context_cols].copy()

# Preview the first rows of each piece under its own heading.
for heading, frame in (
    ("\nFeature matrix sample:", X),
    ("\nContext sample:", context),
):
    print(heading)
    display(frame.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,3374785,0,0,0,0,0,0,0,0
1,899677,14,0,0,0,0,0,498,2660
2,0,0,0,0,0,0,0,0,0
3,12700969,2,0,0,0,0,0,0,4
4,3758715,23,1,0,0,0,0,17126,14909



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,237,chronyd,,,,1757364894741
1,4226,containerd-shim,root,,,1757364894741
2,3806,buildkitd,root,,,1757364894741
3,4323,VM Periodic Tas,systemd-network,04242fd9b1ae2cf32adc297f0b79e8120c2833f0e4cfe5...,provectuslabs/kafka-ui:latest,1757364894741
4,7710,main,aleyi,,,1757364894741


In [25]:
# Replace missing feature values with 0 so every record is fully numeric.
X = X.fillna(value=0)

In [26]:
# Build one (features, context) tuple per row, pairing the two frames by
# *position*. The earlier approach fed the index label from `X.iterrows()`
# into `context.iloc[...]`, which only lines up when the index is the default
# 0..n-1 range. `iterrows()` also coerces each row to a single dtype.
feature_rows = X.to_dict(orient="records")
context_rows = context.to_dict(orient="records")

# Both frames are cut from the same `df`; make a silent truncation by zip()
# impossible if that ever stops being true.
assert len(feature_rows) == len(context_rows), "X and context must have the same number of rows"

records = list(zip(feature_rows, context_rows))

print("\nExample record with context:")
if records:  # nothing to preview when the input file has no rows
    print(records[0])


Example record with context:
({'cpu_ns': 3374785, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 237, 'comm': 'chronyd', 'user': nan, 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757364894741})


In [27]:
from river import compose, preprocessing, anomaly
import numpy as np

In [31]:
# Online Local Outlier Factor detector applied to the features as-is
# (no scaling step in this cell).
model = anomaly.LocalOutlierFactor(n_neighbors=35)

# One output row per input record: context + features + score.
scores = []

for features, meta in records:
    # Score first, then learn: the score is computed before the model
    # has been updated with this record.
    score = model.score_one(features)
    model.learn_one(features)

    # Merge in the same order as before: context keys, then feature keys,
    # then the score.
    record_out = dict(meta)
    record_out.update(features)
    record_out["anomaly_score"] = score
    scores.append(record_out)

In [32]:
# Collect the per-record outputs of the scoring loop into one table.
results = pd.DataFrame(scores)

# Look the score column up once and reuse it for every statistic below.
anomaly_scores = results["anomaly_score"]

# Summary statistics of the score distribution.
mean_score = anomaly_scores.mean()
std_score = anomaly_scores.std()
min_score = anomaly_scores.min()
max_score = anomaly_scores.max()

# Candidate thresholds: 95th and 99th percentile of the observed scores.
q95 = np.percentile(anomaly_scores, 95)
q99 = np.percentile(anomaly_scores, 99)

# Share of records scoring strictly above each threshold.
rate_q95 = (anomaly_scores > q95).mean()
rate_q99 = (anomaly_scores > q99).mean()

# The chart emoji and arrows are written as the intended Unicode characters
# rather than their mis-decoded byte sequences.
print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 3.8148, Std: 32.4527
Min: 0.0000, Max: 620.1936
95th percentile: 5.2860 → anomaly rate ~ 5.07%
99th percentile: 25.6968 → anomaly rate ~ 1.10%


In [33]:
# Ten highest-scoring records, highest score first.
top_anomalies = (
    results
    .sort_values(by="anomaly_score", ascending=False)
    .iloc[:10]
)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
130,8000,systemd,root,,,1757364814694,19607244,3665,23,3069060436,1384950036,425984,0,1175,95282,620.193579
109,7710,main,aleyi,,,1757364814693,599916,341,2,504446976,201326592,0,0,140,125,265.316893
249,7710,main,aleyi,,,1757364844723,3989631,329,3,223158272,67108864,0,0,16461,14825,147.796413
101,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364884750,384032268,4117,5,27262976,0,0,0,365434,172279,55.541904
85,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364884750,164648524,4307,0,19653652,18128896,4096,0,11766,15831,30.718021
148,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364814694,285944578,4738,2,34603008,0,0,0,261545,175699,21.244063
133,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364814694,132470841,4447,0,26537596,25927680,0,0,1869,6129,17.734422
102,3225,containerd,root,,,1757364884750,90622885,4,0,0,0,0,0,10568,9375,13.213989
241,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364834694,363505296,19821,10,42991616,0,0,0,909854,186448,11.644188
343,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364854727,368097514,4861,7,15204352,0,0,0,328007,174000,11.181956


In [34]:
# Same detector as before, now preceded by an online standard scaler.
# Both steps are chained in a pipeline so each record is scaled before it
# reaches the Local Outlier Factor step.
scaler = preprocessing.StandardScaler()
detector = anomaly.LocalOutlierFactor(n_neighbors=35)
model = compose.Pipeline(scaler, detector)

# One output row per input record: context + features + score.
scores = []

for features, meta in records:
    # Score first, then learn: the score is computed before the pipeline
    # has been updated with this record.
    score = model.score_one(features)
    model.learn_one(features)

    # The stored features are the original (unscaled) values.
    record_out = dict(meta)
    record_out.update(features)
    record_out["anomaly_score"] = score
    scores.append(record_out)

In [35]:
# Collect the per-record outputs of the scoring loop into one table.
results = pd.DataFrame(scores)

# Look the score column up once and reuse it for every statistic below.
anomaly_scores = results["anomaly_score"]

# Summary statistics of the score distribution.
mean_score = anomaly_scores.mean()
std_score = anomaly_scores.std()
min_score = anomaly_scores.min()
max_score = anomaly_scores.max()

# Candidate thresholds: 95th and 99th percentile of the observed scores.
q95 = np.percentile(anomaly_scores, 95)
q99 = np.percentile(anomaly_scores, 99)

# Share of records scoring strictly above each threshold.
rate_q95 = (anomaly_scores > q95).mean()
rate_q99 = (anomaly_scores > q99).mean()

# The chart emoji and arrows are written as the intended Unicode characters
# rather than their mis-decoded byte sequences.
print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 6.9240, Std: 72.4857
Min: 0.0000, Max: 1515.2155
95th percentile: 11.2425 → anomaly rate ~ 5.07%
99th percentile: 40.1253 → anomaly rate ~ 1.10%


In [36]:
# Ten highest-scoring records from the scaled-pipeline run, highest first.
top_anomalies = (
    results
    .sort_values(by="anomaly_score", ascending=False)
    .iloc[:10]
)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
130,8000,systemd,root,,,1757364814694,19607244,3665,23,3069060436,1384950036,425984,0,1175,95282,1515.21552
109,7710,main,aleyi,,,1757364814693,599916,341,2,504446976,201326592,0,0,140,125,264.409405
85,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364884750,164648524,4307,0,19653652,18128896,4096,0,11766,15831,143.512571
258,356,systemd-udevd,root,,,1757364844723,1455939,95,4,0,0,0,0,0,142,45.161868
48,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364894742,385668638,7622,4,0,0,0,0,545051,173592,42.586765
116,356,systemd-udevd,root,,,1757364814694,2192408,102,4,0,0,0,0,0,126,37.942584
249,7710,main,aleyi,,,1757364844723,3989631,329,3,223158272,67108864,0,0,16461,14825,35.268741
124,7980,ebpf_loader,root,,,1757364814694,553920,199,4,4456448,0,0,0,1159,16078,34.919767
101,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364884750,384032268,4117,5,27262976,0,0,0,365434,172279,25.923178
84,3343,dockerd,root,,,1757364884750,18749037,23,0,0,0,0,0,107104,396077,22.890474
