In [13]:
from typing import List
import pandas as pd
from timeit import default_timer as timer
from tqdm import tqdm
import plotly.graph_objects as go

from vmft_lad.BaseDetector import BaseDetector
from heap_providers.SubsequenceMaxHeap import SubsequenceMaxHeap
from log_inference_providers.NullLogInferenceProvider import NullLogInferenceProvider
from log_inference_providers.BartLarge.FastBartLargeZeroShotLogInference import FastBartLargeZeroShotLogInference
from log_inference_providers.Falcon7B.Falcon7BFewShotLogInference import FastFalcon7BFewShotLogInference
from log_inference_providers.GPT3.FastGPT3FewShotLogInference import FastGPT3FewShotLogInference
from log_inference_providers.FakeLogInferenceProvider import FakeLogInferenceProvider
from log_inference_providers.EmertonMonarch7B.FastEmertonMonarch7BFewShotLogInference import FastEmertonMonarch7BFewShotLogInference

In [14]:
# --- Detector settings -------------------------------------------------------
# test_vmft_lad forwards these to BaseDetector as windowSize, anomalyThreshold,
# subsequenceMatchThreshold and probationaryPeriod respectively.
window_size = 4
anomaly_threshold = 0.15
subsequence_match_threshold = 0.1
# Also used by test_vmft_lad to size the shaded "Training region" in the plot.
probationary_period = 150

# --- Log inference backend ---------------------------------------------------
# Lookup table from a short backend name to the provider class imported above.
inference_providers = {
    "Null": NullLogInferenceProvider,
    "Fake": FakeLogInferenceProvider,
    "BartLarge": FastBartLargeZeroShotLogInference,
    "Falcon7B": FastFalcon7BFewShotLogInference,
    "GPT3": FastGPT3FewShotLogInference,
    "EmertonMonarch7B": FastEmertonMonarch7BFewShotLogInference,
}

# Pick the backend by name and build a single shared instance; every call to
# test_vmft_lad reuses this same object.
inference_provider_name = "EmertonMonarch7B"
inference_provider = inference_providers[inference_provider_name]()

In [15]:
def test_vmft_lad(dataset_df: pd.DataFrame) -> None:
    """Run the detector over one dataset, report timing, and plot the scores.

    The ``"value"`` column of ``dataset_df`` is handed to a fresh
    ``BaseDetector``; ``handleRecord(i)`` is then called once per row index and
    its return value is collected as that row's anomaly score. Each call is
    timed individually.

    Reads the module-level settings ``inference_provider``, ``window_size``,
    ``probationary_period``, ``subsequence_match_threshold`` and
    ``anomaly_threshold``.

    Args:
        dataset_df: Frame containing a ``"value"`` column (plotted as
            "Log key"). The frame itself is not modified; a copy is used.

    Returns:
        None. The average per-record handling time is printed in milliseconds
        and a Plotly figure is shown.
    """
    data = dataset_df["value"].tolist()
    model = BaseDetector(
        inferenceProvider=inference_provider,
        maxHeapProvider=SubsequenceMaxHeap(),
        data=data,
        windowSize=window_size,
        probationaryPeriod=probationary_period,
        subsequenceMatchThreshold=subsequence_match_threshold,
        anomalyThreshold=anomaly_threshold,
    )

    anomaly_scores = []
    record_handle_times = []
    print("Starting detection...")
    for i in tqdm(range(len(data))):
        start = timer()
        anomaly_scores.append(model.handleRecord(i))
        end = timer()
        record_handle_times.append(end - start)
    print("Finished detection.")

    # Attach the per-record results to a copy so the caller's frame is untouched.
    model_out_df = dataset_df.copy()
    model_out_df["anomaly_score"] = anomaly_scores
    model_out_df["record_handle_time"] = record_handle_times

    avg_time = model_out_df["record_handle_time"].mean() * 1000
    print(f"Average record handle time: {avg_time} ms")

    # Stretch the scores by the largest observed value so both traces share
    # one y-axis.
    scaling_factor = model_out_df["value"].max()
    normalized_scores = [score * scaling_factor for score in anomaly_scores]

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        y=model_out_df["value"],
        name='Log key',
        line=go.scatter.Line(color='rgba(0,0,190, 0.2)'),
    ))
    fig.add_trace(go.Scatter(
        y=normalized_scores,
        name='Anomaly score',
        line_color='rgb(11, 132, 165)',
    ))

    # Shade the probationary period. x and y must have the same length (the
    # previous code passed probationary_period + 1 x values against
    # probationary_period y values), and the band is clamped so it never
    # extends past the end of a dataset shorter than the probationary period.
    training_len = min(probationary_period, len(data))
    fig.add_trace(go.Scatter(
        x=list(range(training_len)),
        y=[scaling_factor] * training_len,
        name='Training region',
        fill='tozeroy',
        mode='none',
        line_color='rgba(0, 0, 0, 0.4)',
        fillcolor='rgba(0, 0, 0, 0.4)',
    ))

    fig.update_yaxes(title_text="Log key")
    fig.update_xaxes(title_text="Time step")

    fig.show()

In [16]:
# Load ./test_data/benign.csv (first row is the header) and run the detector on it.
benign_df = pd.read_csv(
    "./test_data/benign.csv",
    header=0,
)
test_vmft_lad(dataset_df=benign_df)

Starting detection...


100%|██████████| 3260/3260 [00:00<00:00, 5207.84it/s]

Finished detection.
Average record handle time: 0.18784214723936674 ms





In [17]:
# Load ./test_data/benign2.csv and run the detector on it. The frame gets its
# own name so it does not overwrite the `benign_df` loaded from benign.csv in
# the earlier cell; reusing that name made results depend on cell run order.
benign2_df = pd.read_csv("./test_data/benign2.csv", header=0)
test_vmft_lad(benign2_df)

Starting detection...


100%|██████████| 554/554 [00:00<00:00, 9722.08it/s]

Finished detection.
Average record handle time: 0.07909837545138346 ms





In [18]:
# Load ./test_data/hdd.csv (first row is the header) and run the detector on it.
hdd_df = pd.read_csv(
    "./test_data/hdd.csv",
    header=0,
)
test_vmft_lad(dataset_df=hdd_df)


Starting detection...


100%|██████████| 1141/1141 [00:00<00:00, 6091.67it/s]

Finished detection.
Average record handle time: 0.167326117440837 ms



