In [20]:
import river
import pandas as pd

In [21]:
# Read the event CSV into a DataFrame, then report its shape and column names.
data_path = "../data/resource_events2.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

print("Dataset shape:", df.shape)
print("Columns:", list(df.columns))

Dataset shape: (5024, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [22]:
# Columns selected from df as the feature matrix (see the X = df[feature_cols] cell).
feature_cols = [
    "cpu_ns",
    "user_faults", "kernel_faults",
    "vm_mmap_bytes", "vm_munmap_bytes",
    "vm_brk_grow_bytes", "vm_brk_shrink_bytes",
    "bytes_written", "bytes_read",
]

# Columns carried alongside each feature row so results can be traced back
# to a process / container / timestamp; they are not used as features.
context_cols = [
    "pid", "comm", "user",
    "container_id", "container_image",
    "wall_time_ms",
]

In [23]:
# Split df into the feature matrix and the context columns. Both are copies,
# so later reassignment or edits of X / context do not touch df.
X = df.loc[:, feature_cols].copy()
context = df.loc[:, context_cols].copy()

# Preview the first rows of each frame under its own heading.
for heading, frame in (
    ("\nFeature matrix sample:", X),
    ("\nContext sample:", context),
):
    print(heading)
    display(frame.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0
2,562689,2,0,0,0,0,0,0,0
3,275860759,2757,5,0,0,0,0,304006,173388
4,0,0,0,0,0,0,0,0,0



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,307,weston,aleyi,,,1757522094277
1,680,systemd-logind,root,,,1757522094277
2,2444,buildkitd,root,,,1757522094277
3,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522094277
4,2461,Relay(1457),root,,,1757522094277


In [24]:
# Replace missing feature values with 0 before rows are turned into records.
X = X.fillna(value=0)

In [25]:
# Build a list of (features_dict, context_dict) tuples, one per row.
#
# The previous loop used `for i, row in X.iterrows()` and then `context.iloc[i]`.
# iterrows() yields index *labels*, while .iloc expects *positions*, so the pairing
# was only correct for a default 0..n-1 index (it breaks after any filter/sort).
# iterrows() also coerces each row to a single dtype. Converting both frames with
# to_dict("records") and zipping them pairs rows strictly by position and keeps
# each column's own dtype.
assert len(X) == len(context), "feature and context frames must have the same number of rows"
records = list(zip(X.to_dict("records"), context.to_dict("records")))

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 0, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 307, 'comm': 'weston', 'user': 'aleyi', 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757522094277})


In [26]:
from river import compose, preprocessing, anomaly, feature_extraction
import numpy as np

In [27]:
# Pipeline: StandardScaler -> OneClassSVM.
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01),
)

# Walk the records in order: score each one first, then let the model learn from it.
scores = []
for x, ctx in records:
    anomaly_score = model.score_one(x)
    model.learn_one(x)

    # Context keys first, then feature keys, then the score (same key order as before).
    row = dict(ctx)
    row.update(x)
    row["anomaly_score"] = anomaly_score
    scores.append(row)



In [28]:
# Collect the scored records into a frame and summarise the score column.
results = pd.DataFrame(scores)

mean_score, std_score, min_score, max_score = results["anomaly_score"].agg(
    ["mean", "std", "min", "max"]
)

# Empirical 95th / 99th percentile cut-offs and the share of rows strictly above each.
q95, q99 = np.percentile(results["anomaly_score"], [95, 99])
rate_q95, rate_q99 = (
    (results["anomaly_score"] > cutoff).mean() for cutoff in (q95, q99)
)

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 0.0170, Std: 0.7714
Min: -0.2228, Max: 48.7503
95th percentile: 0.0298 → anomaly rate ~ 5.02%
99th percentile: 0.2314 → anomaly rate ~ 1.02%


In [29]:
# Ten rows with the highest anomaly_score, highest first.
top_anomalies = results.sort_values(by="anomaly_score", ascending=False).iloc[:10]
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
622,7414,apt.systemd.dai,root,,,1757521306063,347785956,12977,930,425988070,136538932,11952128,53248,5206627,15270070,48.750309
179,7303,systemd,root,,,1757521226215,20792835,3647,31,3069584724,1452058900,450560,0,1176,95335,21.533098
15,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522094277,104181245,4,0,0,2228224,0,0,12091,10967,10.971862
107,1292,dockerd,root,,,1757522104279,9930664,2,0,0,0,0,0,132155,2784290,1.542469
4500,1292,dockerd,root,,,1757521984617,19541211,265,0,0,0,0,0,349837,5179496,1.540917
4742,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522034464,254847684,21375,10,14680064,0,0,0,987097,182162,1.507727
214,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521236222,271198581,22759,17,89653248,0,0,0,974970,195457,1.340162
131,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522084277,116199865,2631,0,13722892,11530240,0,0,12269,15969,1.271849
2798,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521695278,281736385,19316,7,0,0,0,0,632135,179420,1.2341
4569,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757522004532,318052073,3298,5,0,0,0,0,477277,174925,1.029699


In [30]:
# Pipeline: StandardScaler -> RBFSampler (50 components, fixed seed) -> OneClassSVM.
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    feature_extraction.RBFSampler(n_components=50, seed=42),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01),
)

# Walk the records in order: score each one first, then let the model learn from it.
scores = []
for x, ctx in records:
    anomaly_score = model.score_one(x)
    model.learn_one(x)

    # Context keys first, then feature keys, then the score (same key order as before).
    row = dict(ctx)
    row.update(x)
    row["anomaly_score"] = anomaly_score
    scores.append(row)



In [31]:
# Collect the scored records into a frame and summarise the score column.
results = pd.DataFrame(scores)

mean_score, std_score, min_score, max_score = results["anomaly_score"].agg(
    ["mean", "std", "min", "max"]
)

# Empirical 95th / 99th percentile cut-offs and the share of rows strictly above each.
q95, q99 = np.percentile(results["anomaly_score"], [95, 99])
rate_q95, rate_q99 = (
    (results["anomaly_score"] > cutoff).mean() for cutoff in (q95, q99)
)

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 3.5407, Std: 1.5292
Min: 0.0000, Max: 6.9829
95th percentile: 6.2886 → anomaly rate ~ 5.02%
99th percentile: 6.7174 → anomaly rate ~ 1.02%


In [32]:
# Ten rows with the highest anomaly_score, highest first.
top_anomalies = results.sort_values(by="anomaly_score", ascending=False).iloc[:10]
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score
4721,348,Relay(21),root,,,1757522024456,8736338,0,0,0,0,0,0,107508,107508,6.982885
4711,977,init,root,,,1757522024456,19861001,0,0,0,0,0,0,187001,185358,6.925
4701,7283,ebpf_loader,root,,,1757522024456,1415962,0,0,0,0,0,0,11026,18700,6.894498
4687,1206,python3.10,root,,,1757522024455,631847,0,0,0,0,0,0,0,64,6.889412
4686,2461,Relay(1457),root,,,1757522024455,0,0,0,0,0,0,0,0,0,6.889389
4688,3767,containerd-shim,root,,,1757522024455,55070,4,0,0,0,0,0,273,1406,6.886229
4689,440,systemd-udevd,root,,,1757522024455,852097,10,0,0,0,0,0,0,142,6.884625
4690,283,systemd,root,,,1757522024455,876112,0,0,0,0,0,0,0,125,6.880506
4692,414,systemd-journal,root,,,1757522024455,824856,24,0,0,0,0,0,0,1559,6.877891
4691,655,dbus-daemon,messagebus,,,1757522024455,833637,0,0,0,0,0,0,0,0,6.876632


In [33]:
# Pipeline: StandardScaler -> QuantileFilter(q=0.95) wrapping a OneClassSVM.
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.QuantileFilter(anomaly.OneClassSVM(nu=0.2), q=0.95),
)

# Walk the records in order: score, classify that score with the filter as it
# stands before this record is learned, then update the pipeline.
scores = []
for x, ctx in records:
    anomaly_score = model.score_one(x)
    flagged = model['QuantileFilter'].classify(anomaly_score)
    model.learn_one(x)

    # Context keys first, then feature keys, then score and flag (same key order as before).
    row = dict(ctx)
    row.update(x)
    row["anomaly_score"] = anomaly_score
    row["is_anomaly"] = flagged
    scores.append(row)


In [34]:
# Collect the scored records into a frame and summarise the score column.
results = pd.DataFrame(scores)

mean_score, std_score, min_score, max_score = results["anomaly_score"].agg(
    ["mean", "std", "min", "max"]
)

# Empirical 95th / 99th percentile cut-offs and the share of rows strictly above each.
q95, q99 = np.percentile(results["anomaly_score"], [95, 99])
rate_q95, rate_q99 = (
    (results["anomaly_score"] > cutoff).mean() for cutoff in (q95, q99)
)

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 0.0151, Std: 0.7192
Min: -0.4273, Max: 41.3767
95th percentile: 0.0207 → anomaly rate ~ 5.02%
99th percentile: 0.1835 → anomaly rate ~ 1.02%


In [35]:
# Ten rows with the highest anomaly_score, highest first.
top_anomalies = results.sort_values(by="anomaly_score", ascending=False).iloc[:10]
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score,is_anomaly
622,7414,apt.systemd.dai,root,,,1757521306063,347785956,12977,930,425988070,136538932,11952128,53248,5206627,15270070,41.376691,True
179,7303,systemd,root,,,1757521226215,20792835,3647,31,3069584724,1452058900,450560,0,1176,95335,29.414969,True
131,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522084277,116199865,2631,0,13722892,11530240,0,0,12269,15969,1.844534,True
73,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757522104278,86715551,402,0,2023424,1298432,0,0,12073,10999,1.443343,True
4284,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521954716,283691550,22922,21,0,0,0,0,1318073,181270,1.186228,True
3198,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521765148,359909533,23001,16,65011712,0,0,0,1122034,180186,1.168857,True
3039,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757521735216,119976884,11786,0,51512868,51056640,0,0,12095,11049,1.051566,True
4170,3798,ThreadPool,systemd-resolve,f111e1925e8b07a6b917d9c963da0ce52f4d23bf654dc5...,clickhouse/clickhouse-server:latest,1757521934707,311464927,5702,5,0,0,0,0,556816,174357,0.90894,True
4182,3797,ExpirationReape,aleyi,8f1c9b9e79a5fc17b1d8041c747462a33ed682a85abf04...,apache/kafka:latest,1757521934708,124473191,12904,0,55716804,38453248,0,0,12064,16019,0.899797,True
4500,1292,dockerd,root,,,1757521984617,19541211,265,0,0,0,0,0,349837,5179496,0.782259,True


In [36]:
# Pipeline: StandardScaler -> RBFSampler (50 components, fixed seed)
# -> QuantileFilter(q=0.95) wrapping a OneClassSVM.
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    feature_extraction.RBFSampler(n_components=50, seed=42),
    anomaly.QuantileFilter(anomaly.OneClassSVM(nu=0.2), q=0.95),
)

# Walk the records in order: score, classify that score with the filter as it
# stands before this record is learned, then update the pipeline.
scores = []
for x, ctx in records:
    anomaly_score = model.score_one(x)
    flagged = model['QuantileFilter'].classify(anomaly_score)
    model.learn_one(x)

    # Context keys first, then feature keys, then score and flag (same key order as before).
    row = dict(ctx)
    row.update(x)
    row["anomaly_score"] = anomaly_score
    row["is_anomaly"] = flagged
    scores.append(row)


In [37]:
# Collect the scored records into a frame and summarise the score column.
results = pd.DataFrame(scores)

mean_score, std_score, min_score, max_score = results["anomaly_score"].agg(
    ["mean", "std", "min", "max"]
)

# Empirical 95th / 99th percentile cut-offs and the share of rows strictly above each.
q95, q99 = np.percentile(results["anomaly_score"], [95, 99])
rate_q95, rate_q99 = (
    (results["anomaly_score"] > cutoff).mean() for cutoff in (q95, q99)
)

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 4.7957, Std: 1.9943
Min: 0.0000, Max: 8.2069
95th percentile: 7.5465 → anomaly rate ~ 5.02%
99th percentile: 8.1276 → anomaly rate ~ 1.02%


In [38]:
# Ten rows with the highest anomaly_score, highest first.
top_anomalies = results.sort_values(by="anomaly_score", ascending=False).iloc[:10]
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score,is_anomaly
4921,242,init,root,,,1757522064363,10472813,10,0,81920,81920,0,0,0,39420,8.206915,True
4864,242,init,root,,,1757522054363,10824874,6,0,49152,49152,0,0,0,24132,8.203401,True
4883,7058,main,aleyi,,,1757522054363,759602,0,0,0,0,0,0,17484,15547,8.186078,True
4872,7283,ebpf_loader,root,,,1757522054363,1004067,0,0,0,0,0,0,10746,18692,8.182469,True
4940,7058,main,aleyi,,,1757522064363,2893184,2,0,0,0,0,0,17736,15679,8.172274,True
4929,7283,ebpf_loader,root,,,1757522064363,1674695,0,0,0,0,0,0,10885,18684,8.169768,True
4881,3982,C2 CompilerThre,systemd-network,04443e68885fad8bbb002b4be5070431920521f6f04a2f...,provectuslabs/kafka-ui:latest,1757522054363,8547291,9,0,98304,0,0,0,389,4876,8.165677,True
4869,3936,grafana,,919236c9febc1b0ca2a6baf24fe5ab4b2fa35e0b7f838c...,grafana/grafana:latest,1757522054363,10854737,0,0,0,0,0,0,328,328,8.156498,True
4878,273,Relay(9),root,,,1757522054363,1014491,0,0,0,0,0,0,6520,6520,8.156061,True
4909,1,mini_init,root,,,1757522054363,12413089,0,0,0,0,0,0,2,62,8.153859,True
