In [1]:
import river
import pandas as pd

In [2]:
# Location of the raw resource-event export, relative to the notebook.
data_path = "../data/resource_events2.csv"

# Load the whole CSV into memory as a single frame.
df = pd.read_csv(filepath_or_buffer=data_path)

# Quick sanity check of what was loaded: (rows, columns) and the column names.
dataset_shape = df.shape
column_names = list(df.columns)
print("Dataset shape:", dataset_shape)
print("Columns:", column_names)

Dataset shape: (5024, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [3]:
# Numeric columns fed to the anomaly detector, laid out by name prefix.
feature_cols = [
    "cpu_ns",
    "user_faults", "kernel_faults",
    "vm_mmap_bytes", "vm_munmap_bytes",
    "vm_brk_grow_bytes", "vm_brk_shrink_bytes",
    "bytes_written", "bytes_read",
]

# Identifying columns carried alongside each record; they are not used as
# model inputs, only to label rows in the results.
context_cols = [
    "pid", "comm", "user",
    "container_id", "container_image",
    "wall_time_ms",
]

In [4]:
# Independent copies, so later edits to X / context never touch df.
X = df.loc[:, feature_cols].copy()
context = df.loc[:, context_cols].copy()

# Preview the first rows of each frame under its own heading.
for heading, frame in (
    ("\nFeature matrix sample:", X),
    ("\nContext sample:", context),
):
    print(heading)
    display(frame.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,562689,2,0,0,0,0,0,0,0
3,275860759,2757,5,0,0,0,0,304006,173388
4,0,0,0,0,0,0,0,0,0



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,307,weston,aleyi,,,1757522094277
1,680,systemd-logind,root,,,1757522094277
2,2444,buildkitd,root,,,1757522094277
3,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522094277
4,2461,Relay(1457),root,,,1757522094277


In [5]:
# Replace any missing feature values with 0.
X = X.fillna(value=0)

In [6]:
# Build one (features, context) pair per row, pairing the two frames strictly
# by position.
#
# The previous loop took the index *label* yielded by `X.iterrows()` and passed
# it to the *positional* `context.iloc[i]`. That only lines up while the index
# is exactly 0..n-1; after any filtering, sorting or re-indexing it would pair
# features with the wrong context row or raise IndexError. `iterrows()` also
# builds a Series per row and coerces each row to a single common dtype.
# `to_dict(orient="records")` avoids both problems and keeps per-column types.
assert len(X) == len(context), "feature and context frames must have the same number of rows"

records = list(
    zip(
        X.to_dict(orient="records"),        # model inputs: {column: value}
        context.to_dict(orient="records"),  # context kept attached to each row
    )
)

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 0, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 307, 'comm': 'weston', 'user': 'aleyi', 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757522094277})


In [7]:
from river import compose, preprocessing, anomaly
import numpy as np

In [8]:
# Online Local Outlier Factor on the raw (unscaled) features.
model = anomaly.LocalOutlierFactor(n_neighbors=35)

scores = []

for features, meta in records:
    # Score first, then learn: each record is scored by a model that has
    # not yet been updated with that record.
    score = model.score_one(features)
    model.learn_one(features)

    # One flat output row: context columns, then features, then the score.
    record_out = dict(meta)
    record_out.update(features)
    record_out["anomaly_score"] = score
    scores.append(record_out)

In [9]:
results = pd.DataFrame(scores)
anomaly_scores = results["anomaly_score"]

# Location and spread of the score distribution.
mean_score, std_score = anomaly_scores.mean(), anomaly_scores.std()
min_score, max_score = anomaly_scores.min(), anomaly_scores.max()

# Both percentile cut-offs in a single call.
q95, q99 = np.percentile(anomaly_scores, [95, 99])

# Share of rows strictly above each cut-off. Because the cut-offs are
# percentiles of this same column, these land near 5% and 1% by construction.
rate_q95 = (anomaly_scores > q95).mean()
rate_q99 = (anomaly_scores > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 1.0600, Std: 8.3506
Min: 0.0000, Max: 447.0571
95th percentile: 1.3139 → anomaly rate ~ 5.02%
99th percentile: 4.3233 → anomaly rate ~ 1.02%


In [10]:
# Rank every row by score, highest first, and keep the ten most anomalous.
ranked_results = results.sort_values(by="anomaly_score", ascending=False)
top_anomalies = ranked_results.head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
179,7303,systemd,root,,,1757521226215,20792835,3647,31,3069584724,1452058900,450560,0,1176,95335,447.057072
237,7058,main,aleyi,,,1757521236222,4235436,579,5,294989824,67108864,0,0,14649,13692,206.954428
669,7058,main,aleyi,,,1757521316070,3465237,69,1,142610432,67108864,0,0,18277,16275,172.229507
447,7058,main,aleyi,,,1757521276113,3377062,122,0,142872576,67108864,0,0,17666,15439,165.670625
1105,7058,main,aleyi,,,1757521395892,2338345,6,0,75501568,0,0,0,17383,15090,164.340371
284,7058,main,aleyi,,,1757521246173,4014390,295,2,218701824,67108864,0,0,15488,13583,128.337735
119,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522084277,270467231,4916,4,0,0,0,0,423628,173100,66.477575
131,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522084277,116199865,2631,0,13722892,11530240,0,0,12269,15969,27.401799
73,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522104278,86715551,402,0,2023424,1298432,0,0,12073,10999,21.256191
665,2237,containerd-shim,root,,,1757521316070,1284991,42,0,4194304,0,0,0,506,2736,18.774578


In [11]:
# Same detector as before, now preceded by online standardisation of the
# features (the original note on this step: "better for LOF").
scaler = preprocessing.StandardScaler()
detector = anomaly.LocalOutlierFactor(n_neighbors=35)
model = compose.Pipeline(scaler, detector)

scores = []

for features, meta in records:
    # Score with the current pipeline state, then update it with this record.
    score = model.score_one(features)
    model.learn_one(features)

    # One flat output row: context columns, then features, then the score.
    record_out = dict(meta)
    record_out.update(features)
    record_out["anomaly_score"] = score
    scores.append(record_out)

In [12]:
results = pd.DataFrame(scores)
score_series = results["anomaly_score"]

# Location and spread of the score distribution.
mean_score = score_series.mean()
std_score = score_series.std()
min_score = score_series.min()
max_score = score_series.max()

# Both percentile cut-offs in a single call.
q95, q99 = np.percentile(score_series, [95, 99])

# Share of rows strictly above each cut-off. Because the cut-offs are
# percentiles of this same column, these land near 5% and 1% by construction.
above_q95 = score_series > q95
above_q99 = score_series > q99
rate_q95 = above_q95.mean()
rate_q99 = above_q99.mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 6.5968, Std: 351.2722
Min: 0.0000, Max: 24893.0546
95th percentile: 2.6994 → anomaly rate ~ 5.02%
99th percentile: 8.5252 → anomaly rate ~ 1.02%


In [13]:
# Ten highest-scoring rows from the scaled pipeline run, most anomalous first.
results_by_score = results.sort_values(by="anomaly_score", ascending=False)
top_anomalies = results_by_score.head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
179,7303,systemd,root,,,1757521226215,20792835,3647,31,3069584724,1452058900,450560,0,1176,95335,24893.054582
131,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522084277,116199865,2631,0,13722892,11530240,0,0,12269,15969,400.541852
622,7414,apt.systemd.dai,root,,,1757521306063,347785956,12977,930,425988070,136538932,11952128,53248,5206627,15270070,287.025805
107,1292,dockerd,root,,,1757522104279,9930664,2,0,0,0,0,0,132155,2784290,230.747544
73,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522104278,86715551,402,0,2023424,1298432,0,0,12073,10999,151.684989
60,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522104278,211277623,4630,8,0,0,0,0,896835,176126,56.200604
447,7058,main,aleyi,,,1757521276113,3377062,122,0,142872576,67108864,0,0,17666,15439,50.833172
119,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522084277,270467231,4916,4,0,0,0,0,423628,173100,49.239291
88,977,init,root,,,1757522104279,15670043,3,0,0,0,0,0,355755,345213,38.036238
68,242,init,root,,,1757522104278,22737212,39,0,315412,32768,0,0,2380,23990,35.892255
