In [1]:
import river
import pandas as pd

In [2]:
data_path = "../data/resource_events.csv"
df = pd.read_csv(data_path)

print("Dataset shape:", df.shape)
print("Columns:", df.columns.tolist())

Dataset shape: (454, 25)
Columns: ['pid', 'comm', 'uid', 'gid', 'ppid', 'user_pid', 'user_ppid', 'cgroup_id', 'cgroup_name', 'user', 'cpu_ns', 'user_faults', 'kernel_faults', 'vm_mmap_bytes', 'vm_munmap_bytes', 'vm_brk_grow_bytes', 'vm_brk_shrink_bytes', 'bytes_written', 'bytes_read', 'isActive', 'wall_time_dt', 'wall_time_ms', 'container_id', 'container_image', 'container_labels_json']


In [3]:
feature_cols = [
    "cpu_ns",
    "user_faults",
    "kernel_faults",
    "vm_mmap_bytes",
    "vm_munmap_bytes",
    "vm_brk_grow_bytes",
    "vm_brk_shrink_bytes",
    "bytes_written",
    "bytes_read"
]

context_cols = [
    "pid",
    "comm",
    "user",
    "container_id",
    "container_image",
    "wall_time_ms"
]

In [4]:
X = df[feature_cols].copy()
context = df[context_cols].copy()

print("\nFeature matrix sample:")
display(X.head())

print("\nContext sample:")
display(context.head())


Feature matrix sample:


Unnamed: 0,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read
0,3374785,0,0,0,0,0,0,0,0
1,899677,14,0,0,0,0,0,498,2660
2,0,0,0,0,0,0,0,0,0
3,12700969,2,0,0,0,0,0,0,4
4,3758715,23,1,0,0,0,0,17126,14909



Context sample:


Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms
0,237,chronyd,,,,1757364894741
1,4226,containerd-shim,root,,,1757364894741
2,3806,buildkitd,root,,,1757364894741
3,4323,VM Periodic Tas,systemd-network,04242fd9b1ae2cf32adc297f0b79e8120c2833f0e4cfe5...,provectuslabs/kafka-ui:latest,1757364894741
4,7710,main,aleyi,,,1757364894741


In [5]:
X = X.fillna(0)

In [6]:
records = []
for i, row in X.iterrows():
    record = row.to_dict()
    # Keep context attached
    records.append((record, context.iloc[i].to_dict()))

print("\nExample record with context:")
print(records[0])


Example record with context:
({'cpu_ns': 3374785, 'user_faults': 0, 'kernel_faults': 0, 'vm_mmap_bytes': 0, 'vm_munmap_bytes': 0, 'vm_brk_grow_bytes': 0, 'vm_brk_shrink_bytes': 0, 'bytes_written': 0, 'bytes_read': 0}, {'pid': 237, 'comm': 'chronyd', 'user': nan, 'container_id': nan, 'container_image': nan, 'wall_time_ms': 1757364894741})


In [17]:
from river import compose, preprocessing, anomaly, feature_extraction
import numpy as np

In [13]:
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01)
)

scores = []
for features, meta in records:
    score = model.score_one(features)    # anomaly score
    model.learn_one(features)    # update model
    
    record_out = {**meta, **features, "anomaly_score": score, "is_anomaly": is_anomaly}
    scores.append(record_out)



In [14]:
results = pd.DataFrame(scores)
mean_score = results["anomaly_score"].mean()
std_score = results["anomaly_score"].std()
min_score = results["anomaly_score"].min()
max_score = results["anomaly_score"].max()

q95 = np.percentile(results["anomaly_score"], 95)
q99 = np.percentile(results["anomaly_score"], 99)

rate_q95 = (results["anomaly_score"] > q95).mean()
rate_q99 = (results["anomaly_score"] > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 0.0254, Std: 0.3814
Min: -2.5002, Max: 5.2788
95th percentile: 0.1129 → anomaly rate ~ 5.07%
99th percentile: 0.7240 → anomaly rate ~ 1.10%


In [15]:
top_anomalies = results.sort_values("anomaly_score", ascending=False).head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score,is_anomaly
109,7710,main,aleyi,,,1757364814693,599916,341,2,504446976,201326592,0,0,140,125,5.278824,False
48,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364894742,385668638,7622,4,0,0,0,0,545051,173592,3.724458,False
85,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364884750,164648524,4307,0,19653652,18128896,4096,0,11766,15831,3.432682,False
32,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364894742,154071537,643,0,2043904,2826240,0,0,11650,10617,2.009545,False
101,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364884750,384032268,4117,5,27262976,0,0,0,365434,172279,0.990927,False
31,3343,dockerd,root,,,1757364894742,13634988,2,0,0,0,0,0,97343,392651,0.487368,False
432,3343,dockerd,root,,,1757364874742,17993810,229,0,0,0,0,0,108681,515402,0.426102,False
379,3343,dockerd,root,,,1757364864718,11106978,3,0,0,0,0,0,94570,392626,0.334299,False
342,3145,init,root,,,1757364854727,30947172,12,0,0,0,0,0,357948,346971,0.327998,False
47,3145,init,root,,,1757364894742,24890952,1,0,0,0,0,0,274524,267375,0.284075,False


In [18]:
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    feature_extraction.RBFSampler(n_components=50, seed=42),
    anomaly.OneClassSVM(nu=0.1, intercept_lr=0.01)
)

scores = []
for features, meta in records:
    score = model.score_one(features)    # anomaly score
    model.learn_one(features)    # update model
    
    record_out = {**meta, **features, "anomaly_score": score, "is_anomaly": is_anomaly}
    scores.append(record_out)



In [19]:
results = pd.DataFrame(scores)
mean_score = results["anomaly_score"].mean()
std_score = results["anomaly_score"].std()
min_score = results["anomaly_score"].min()
max_score = results["anomaly_score"].max()

q95 = np.percentile(results["anomaly_score"], 95)
q99 = np.percentile(results["anomaly_score"], 99)

rate_q95 = (results["anomaly_score"] > q95).mean()
rate_q99 = (results["anomaly_score"] > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 1.7567, Std: 0.2289
Min: 0.0000, Max: 2.0873
95th percentile: 1.9962 → anomaly rate ~ 5.07%
99th percentile: 2.0092 → anomaly rate ~ 1.10%


In [20]:
top_anomalies = results.sort_values("anomaly_score", ascending=False).head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score,is_anomaly
1,4226,containerd-shim,root,,,1757364894741,899677,14,0,0,0,0,0,498,2660,2.087273,False
38,4295,containerd-shim,root,,,1757364894742,341543,9,0,0,0,0,0,514,2666,2.021918,False
61,2826,Relay(9),root,,,1757364884749,4089590,0,0,0,0,0,0,8846,8846,2.017847,False
76,7980,ebpf_loader,root,,,1757364884750,982185,18,0,0,0,0,0,10683,22397,2.012075,False
56,4323,VM Periodic Tas,systemd-network,04242fd9b1ae2cf32adc297f0b79e8120c2833f0e4cfe5...,provectuslabs/kafka-ui:latest,1757364884749,18327243,19,0,65536,0,0,0,389,4880,2.010456,False
58,3750,containerd-shim,root,,,1757364884749,2680589,28,0,0,0,0,0,759,4104,2.008007,False
37,3281,rpcbind,systemd-network,,,1757364894742,0,0,0,0,0,0,0,0,0,2.005408,False
35,469,systemd-resolve,systemd-resolve,,,1757364894742,0,0,0,0,0,0,0,0,0,2.005405,False
54,4226,containerd-shim,root,,,1757364884749,865108,14,0,0,0,0,0,498,2660,2.004468,False
34,512,systemd-timesyn,systemd-timesync,,,1757364894742,0,0,0,0,0,0,0,0,0,2.004149,False


In [24]:
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    anomaly.QuantileFilter(
        anomaly.OneClassSVM(nu=0.2),
        q=0.95
    )
)


scores = []
for features, meta in records:
    score = model.score_one(features)    # anomaly score
    is_anomaly = model['QuantileFilter'].classify(score)
    model.learn_one(features)    # update model
    
    record_out = {**meta, **features, "anomaly_score": score, "is_anomaly": is_anomaly}
    scores.append(record_out)


In [25]:
results = pd.DataFrame(scores)
mean_score = results["anomaly_score"].mean()
std_score = results["anomaly_score"].std()
min_score = results["anomaly_score"].min()
max_score = results["anomaly_score"].max()

q95 = np.percentile(results["anomaly_score"], 95)
q99 = np.percentile(results["anomaly_score"], 99)

rate_q95 = (results["anomaly_score"] > q95).mean()
rate_q99 = (results["anomaly_score"] > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 0.0041, Std: 0.4973
Min: -9.2882, Max: 2.9621
95th percentile: 0.0787 → anomaly rate ~ 5.07%
99th percentile: 0.6033 → anomaly rate ~ 1.10%


In [26]:
top_anomalies = results.sort_values("anomaly_score", ascending=False).head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score,is_anomaly
48,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364894742,385668638,7622,4,0,0,0,0,545051,173592,2.962102,True
32,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364894742,154071537,643,0,2043904,2826240,0,0,11650,10617,2.323321,True
109,7710,main,aleyi,,,1757364814693,599916,341,2,504446976,201326592,0,0,140,125,2.285372,True
85,4143,client-metrics-,aleyi,a1df103d039148efb25fc00f7cff04203adcf7c0849240...,apache/kafka:latest,1757364884750,164648524,4307,0,19653652,18128896,4096,0,11766,15831,2.167738,True
326,3343,dockerd,root,,,1757364854727,18194130,38,0,0,0,0,0,133389,2784570,0.848505,True
194,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364824694,326423382,5147,9,23072768,0,0,0,424627,185385,0.385883,True
101,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364884750,384032268,4117,5,27262976,0,0,0,365434,172279,0.381997,True
241,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364834694,363505296,19821,10,42991616,0,0,0,909854,186448,0.274913,True
343,4145,ThreadPool,systemd-resolve,43ec67b6e84c43b3ef18e9bae1a5bcc5a53a9d8189a9f4...,clickhouse/clickhouse-server:latest,1757364854727,368097514,4861,7,15204352,0,0,0,328007,174000,0.238822,True
47,3145,init,root,,,1757364894742,24890952,1,0,0,0,0,0,274524,267375,0.22476,True


In [28]:
model = compose.Pipeline(
    preprocessing.StandardScaler(),
    feature_extraction.RBFSampler(n_components=50, seed=42),
    anomaly.QuantileFilter(
        anomaly.OneClassSVM(nu=0.2),
        q=0.95
    )
)


scores = []
for features, meta in records:
    score = model.score_one(features)    # anomaly score
    is_anomaly = model['QuantileFilter'].classify(score)
    model.learn_one(features)    # update model
    
    record_out = {**meta, **features, "anomaly_score": score, "is_anomaly": is_anomaly}
    scores.append(record_out)


In [29]:
results = pd.DataFrame(scores)
mean_score = results["anomaly_score"].mean()
std_score = results["anomaly_score"].std()
min_score = results["anomaly_score"].min()
max_score = results["anomaly_score"].max()

q95 = np.percentile(results["anomaly_score"], 95)
q99 = np.percentile(results["anomaly_score"], 99)

rate_q95 = (results["anomaly_score"] > q95).mean()
rate_q99 = (results["anomaly_score"] > q99).mean()

print("📊 Anomaly Score Metrics")
print(f"Mean: {mean_score:.4f}, Std: {std_score:.4f}")
print(f"Min: {min_score:.4f}, Max: {max_score:.4f}")
print(f"95th percentile: {q95:.4f} → anomaly rate ~ {rate_q95*100:.2f}%")
print(f"99th percentile: {q99:.4f} → anomaly rate ~ {rate_q99*100:.2f}%")


📊 Anomaly Score Metrics
Mean: 1.7420, Std: 0.3436
Min: 0.0000, Max: 2.4509
95th percentile: 2.3984 → anomaly rate ~ 5.07%
99th percentile: 2.4050 → anomaly rate ~ 1.10%


In [30]:
top_anomalies = results.sort_values("anomaly_score", ascending=False).head(10)
display(top_anomalies)

Unnamed: 0,pid,comm,user,container_id,container_image,wall_time_ms,cpu_ns,user_faults,kernel_faults,vm_mmap_bytes,vm_munmap_bytes,vm_brk_grow_bytes,vm_brk_shrink_bytes,bytes_written,bytes_read,anomaly_score,is_anomaly
447,247,init,root,,,1757364874742,19169638,20,0,163840,163840,0,0,0,43860,2.450941,True
424,7980,ebpf_loader,root,,,1757364874742,5620004,8,0,0,0,0,0,10594,22397,2.437649,True
405,7710,main,aleyi,,,1757364874742,1946024,70,0,262144,0,0,0,17058,14870,2.424775,True
409,2826,Relay(9),root,,,1757364874742,2218554,0,0,0,0,0,0,6839,6839,2.41219,True
406,3750,containerd-shim,root,,,1757364874742,530262,24,0,0,0,0,0,762,4107,2.405397,True
425,4092,containerd-shim,root,,,1757364874742,2831484,19,0,0,0,0,0,504,2672,2.404692,True
421,4118,containerd-shim,root,,,1757364874742,1120382,19,0,0,0,0,0,568,2914,2.40418,True
402,4226,containerd-shim,root,,,1757364874742,1607377,18,0,0,0,0,0,498,2660,2.403139,True
427,2339,Relay(2008),root,,,1757364874742,148507,0,0,0,0,0,0,1601,1601,2.402043,True
410,2827,wsl-bootstrap,root,,,1757364874742,876853,5,0,0,0,0,0,5056,424,2.401275,True
