In [6]:
import psutil
import os
import pandas as pd
from timeit import default_timer as timer

from vmft_lad.BaseDetector import BaseDetector
from heap_providers.SubsequenceMaxHeap import SubsequenceMaxHeap
from heap_providers.ListProvider import ListMaxHeapProvider
from log_inference_providers.FakeLogInferenceProvider import FakeLogInferenceProvider


In [7]:
window_size = 4
anomaly_threshold = 0.15
subsequence_match_threshold = 0.05
probationary_period = 150

inference_provider = FakeLogInferenceProvider()

In [8]:
def get_data():
    for dataset_name in ["benign", "hdd", "oom", "buffer-io"]:
        dataset_dir = f"../../../data/{dataset_name}"
        datafile_count = 0
        for datasetFile in os.scandir(dataset_dir):
            if datafile_count >= 2:
                break
            if not datasetFile.name.endswith(".csv"):
                continue
            
            dataset_df = pd.read_csv(datasetFile.path, header=0)
            data = dataset_df["value"].tolist()
            datafile_count += 1
            yield data

In [9]:
# Execution time
def eval_execution_time(max_heap_provider_class):
    execution_time_min = float("inf")
    execution_time_max = 0
    execution_time_avg = None
    
    for data in get_data():
        max_heap_provider = max_heap_provider_class()

        model = BaseDetector(
            inferenceProvider=inference_provider,
            maxHeapProvider=max_heap_provider,
            data=data,
            windowSize=window_size,
            probationaryPeriod=probationary_period,
            subsequenceMatchThreshold=subsequence_match_threshold,
            anomalyThreshold=anomaly_threshold)
        
        for i in range(len(data)):
            start = timer()
            model.handleRecord(i)
            end = timer()
            execution_time = end - start
            execution_time_min = min(execution_time_min, execution_time)
            execution_time_max = max(execution_time_max, execution_time)
            execution_time_avg = execution_time if execution_time_avg is None else (execution_time_avg + execution_time) / 2

    # print(f"Execution time (min): {execution_time_min*1000 }ms")
    # print(f"Execution time (max): {execution_time_max*1000} ms")
    print(f"Execution time (avg): {execution_time_avg*1000 } ms")
    return execution_time_avg*100


In [10]:
avg_time_list = eval_execution_time(ListMaxHeapProvider)
avg_time_pq = eval_execution_time(SubsequenceMaxHeap)

print(f"Average record handle time (list): {avg_time_list} ms")
print(f"Average record handle time (pq): {avg_time_pq} ms")

print(f"Savings: {avg_time_list - avg_time_pq} ms ({(avg_time_list - avg_time_pq) / avg_time_list * 100}%)")

Execution time (avg): 0.21861921060071662 ms
Execution time (avg): 0.21236385797950746 ms
Average record handle time (list): 0.021861921060071662 ms
Average record handle time (pq): 0.021236385797950744 ms
Savings: 0.0006255352621209187 ms (2.861300525247017%)
