In [1]:
import os
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import plotly.graph_objects as go
from tqdm import tqdm

from vmft_lad.BaseDetector import BaseDetector
from heap_providers.SubsequenceMaxHeap import SubsequenceMaxHeap
from heap_providers.ListProvider import ListMaxHeapProvider
from log_inference_providers.NullLogInferenceProvider import NullLogInferenceProvider

In [2]:
# Detector parameters reused by every detector built in this notebook.
window_size = 4
anomaly_threshold = 0.5

# Load the benign test dataset; only its "value" column is fed to the detectors.
dataset_df = pd.read_csv(
    "../tests/test_data/benign.csv",
    header=0,
)
data = dataset_df["value"].tolist()

In [3]:
# Baseline run: a detector backed by the list-based max-heap provider.
inference_provider = NullLogInferenceProvider()
max_heap_provider = ListMaxHeapProvider()

modelUsingList = BaseDetector(
    data=data,
    windowSize=window_size,
    probationaryPeriod=150,
    subsequenceMatchThreshold=0,
    anomalyThreshold=anomaly_threshold,
    inferenceProvider=inference_provider,
    maxHeapProvider=max_heap_provider,
)

n_records = len(data)
anomaly_scores = []
record_handle_times = []
labels = [0] * n_records  # every record carries label 0

print("Starting detection...")
for i in tqdm(range(n_records)):
    # Time one handleRecord call (plus storing its score) per record index.
    start = timer()
    anomaly_scores.append(modelUsingList.handleRecord(i))
    end = timer()
    record_handle_times.append(end - start)

print("Finished detection.")

# Attach scores, labels and per-record timings to a copy of the input frame.
list_model_out_df = dataset_df.assign(
    anomaly_score=anomaly_scores,
    label=labels,
    record_handle_time=record_handle_times,
)

Starting detection...


100%|██████████| 6520/6520 [00:01<00:00, 3810.43it/s]

Finished detection.





In [4]:
# Mean per-record handling time of the list-backed run, in timer units (seconds).
mean_handle_time = list_model_out_df["record_handle_time"].mean()
print(f"Average record handle time: {mean_handle_time}")

Average record handle time: 0.0002598581134969374


In [5]:
# Start the results table with the list-backed baseline, scaled by 1000
# (seconds -> milliseconds) and rounded to four decimals.
avg_time_list = 1000 * list_model_out_df["record_handle_time"].mean()
avgs = dict(List=round(avg_time_list, 4))

In [6]:
# Sweep the subsequence-match threshold for the SubsequenceMaxHeap-backed
# detector and store the mean per-record handling time (ms) in `avgs`,
# keyed by the threshold rendered as a string.
subsequence_match_thresholds = np.arange(0, 0.5, 0.05).round(3).tolist()

for subsequence_match_threshold in tqdm(subsequence_match_thresholds):
    # A new heap provider and detector are built for every threshold.
    max_heap_provider = SubsequenceMaxHeap()

    modelUsingPQ = BaseDetector(
        inferenceProvider=inference_provider,
        maxHeapProvider=max_heap_provider,
        data=data,
        windowSize=window_size,
        probationaryPeriod=150,
        subsequenceMatchThreshold=subsequence_match_threshold,
        anomalyThreshold=anomaly_threshold)

    # Scores go into a per-run list instead of the baseline cell's
    # `anomaly_scores`, which previously grew by len(data) on every iteration.
    # The append stays inside the timed region so the measurement matches the
    # way the list-backed baseline was timed.
    pq_anomaly_scores = []
    record_handle_times = []
    for i in range(len(data)):
        start = timer()
        pq_anomaly_scores.append(modelUsingPQ.handleRecord(i))
        end = timer()
        record_handle_times.append(end - start)

    pq_model_out_df = pd.DataFrame({"record_handle_time": record_handle_times})
    avg_time_pq = pq_model_out_df["record_handle_time"].mean() * 1000
    avgs[str(subsequence_match_threshold)] = round(avg_time_pq, 4)


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]


In [7]:
print(avgs)

# Compare the "List" baseline with the fastest and slowest of the remaining
# entries in `avgs` (all values are mean per-record times in milliseconds).
avg_time_list = avgs["List"]
pq_times = [value for key, value in avgs.items() if key != "List"]

if pq_times:
    min_val = min(pq_times)
    max_val = max(pq_times)
    print("Maximum savings: ", avg_time_list - min_val, "ms", (avg_time_list - min_val) / avg_time_list * 100, "%")
    print("Minimum savings: ", avg_time_list - max_val, "ms", (avg_time_list - max_val) / avg_time_list * 100, "%")
else:
    # Nothing to compare against: avoid the TypeError that subtracting None
    # from the baseline would raise.
    min_val = max_val = None
    print("No threshold timings recorded; run the threshold sweep first.")

{'List': 0.2599, '0.0': 0.2228, '0.05': 0.1593, '0.1': 0.1137, '0.15': 0.0912, '0.2': 0.0712, '0.25': 0.0537, '0.3': 0.0478, '0.35': 0.0569, '0.4': 0.0534, '0.45': 0.0466}
Maximum savings:  0.21330000000000002 ms 82.07002693343594 %
Minimum savings:  0.03710000000000002 ms 14.274721046556374 %


In [18]:
# NOTE(review): this overwrites the `avgs` dict measured earlier with
# hard-coded timings (ms) that differ from the values printed by this notebook
# -- presumably from a separate benchmark run; confirm which numbers the
# figure is supposed to show.
avgs = {
    "List": 0.4915,
    "0.0": 0.3771,
    "0.05": 0.2465,
    "0.1": 0.1994,
    "0.15": 0.1687,
    "0.2": 0.1165,
    "0.25": 0.0948,
    "0.3": 0.0829,
    "0.35": 0.0755,
    "0.4": 0.0768,
    "0.45": 0.0747,
}

# The first bar ("List") is drawn in orange; every other bar is blue.
colors = ["#1f78b4"] * len(avgs)
colors[0] = "#ff7f00"

x = list(avgs.keys())
y = list(avgs.values())

fig = go.Figure(
    data=[go.Bar(x=x, y=y, text=y, textposition="auto", marker_color=colors)]
)

fig.update_layout(
    template="simple_white",
    width=677,  # 0.45 textwidth
    height=534,
    font=dict(family="Arial", size=17.2, color="black"),
    margin=dict(t=20, r=10, l=70, b=70, autoexpand=False),
    yaxis_title="Record processing time (ms)",
    yaxis_title_standoff=5,
    xaxis_title="Similarity threshold (η)",
    xaxis_title_standoff=10,
)

fig.update_annotations(font_size=24)

# Create the output folder up front so the export does not fail when
# ./images is missing (e.g. on a fresh checkout).
image_path = "./images/fig_eval_eta.png"
os.makedirs(os.path.dirname(image_path), exist_ok=True)
fig.write_image(image_path, scale=4, width=677, height=534)

fig.show()