In [1]:
import pandas as pd
from tqdm import tqdm
import json
import os

from vmft_lad.BaseDetector import BaseDetector
from heap_providers.SubsequenceMaxHeap import SubsequenceMaxHeap
from log_inference_providers.FakeLogInferenceProvider import FakeLogInferenceProvider
from log_inference_providers.BartLarge import FastBartLargeZeroShotLogInference
from log_inference_providers.EmertonMonarch7B import FastEmertonMonarch7BFewShotLogInference
from log_inference_providers.Cyrax7B import FastCyrax7BFewShotLogInference
from log_inference_providers.Falcon7B import Falcon7BFewShotLogInference
from log_inference_providers.GPT3.FastGPT3FewShotLogInference import FastGPT3FewShotLogInference

In [2]:
# Registry of the selectable log-inference backends, keyed by a short name.
# The values are the provider classes themselves (not instances); the entry
# that gets looked up is instantiated by the code that selects it.
inference_providers = dict(
    [
        ("Fake", FakeLogInferenceProvider),
        ("BartLarge", FastBartLargeZeroShotLogInference),
        ("Falcon7B", Falcon7BFewShotLogInference),
        ("Cyrax7B", FastCyrax7BFewShotLogInference),
        ("GPT3", FastGPT3FewShotLogInference),
        ("EmertonMonarch7B", FastEmertonMonarch7BFewShotLogInference),
    ]
)

In [3]:
# --- Run configuration -------------------------------------------------------
# Name of the dataset to process; it is interpolated into the input path
# ("../data/<dataset_name>") and into the results path.
dataset_name = "cpu"

# Key into `inference_providers` selecting the log-inference backend.
inference_provider_name = "GPT3"

# Detector settings. Each value is forwarded unchanged to BaseDetector as the
# keyword argument of the same name in camelCase (windowSize, anomalyThreshold,
# subsequenceMatchThreshold, probationaryPeriod).
# NOTE(review): units and valid ranges are defined by BaseDetector, not here.
probationary_period = 150
window_size = 4
subsequence_match_threshold = 0.05
anomaly_threshold = 0.15

# Look up the selected provider class and construct it with no arguments.
inference_provider = inference_providers[inference_provider_name]()

In [4]:
# Resolve the input directory for the configured dataset and load its metadata
# file (a JSON document that is later indexed by series name).
dataset_dir = f"../data/{dataset_name}"
# Use a context manager so the file handle is closed as soon as the JSON is
# parsed, instead of being left open for the lifetime of the kernel.
with open(f"{dataset_dir}/dataset_metadata.json") as metadata_file:
    dataset_metadata = json.load(metadata_file)

# Results directory. The provider-specific layout below is the intended
# alternative that the "CHANGE THIS LATER" marker refers to.
# output_dir = f"./output/{dataset_name}/{inference_provider_name}"
output_dir = f"../results/vmft-lad/{dataset_name}" # CHANGE THIS LATER
# exist_ok=True makes this cell safe to re-run.
os.makedirs(output_dir, exist_ok=True)

In [5]:
# Score every CSV series found in `dataset_dir` and write one result CSV per
# series into `output_dir`. Each result file is the input frame plus two
# columns: `anomaly_score` (value returned by the detector for that row) and
# `label` (0/1 ground truth derived from the series' "label_region" metadata).
#
# Directory entries are visited in name order: os.scandir yields them in
# arbitrary order, and a fixed order makes repeated runs visit the files in the
# same sequence.
for datasetFile in sorted(os.scandir(dataset_dir), key=lambda entry: entry.name):
    if not datasetFile.name.endswith(".csv"):
        continue

    dataset_df = pd.read_csv(datasetFile.path, header=0)
    data = dataset_df["value"].tolist()

    # Keep the per-file name in its own variable. Reassigning the notebook-level
    # `dataset_name` here would leave it pointing at the last CSV processed, so
    # re-running the cell that derives `dataset_dir` / `output_dir` from it
    # would silently use the wrong paths.
    series_name = datasetFile.name.split(".")[0]

    # Row index from which the series is labelled anomalous; the value 0 is
    # treated as "no labelled region" (every row gets label 0).
    label_index = dataset_metadata[series_name]["label_region"]
    has_label_region = label_index != 0

    # A fresh heap and detector are built for every series; the inference
    # provider instance is shared across all of them.
    max_heap_provider = SubsequenceMaxHeap()
    model = BaseDetector(
        inferenceProvider=inference_provider,
        maxHeapProvider=max_heap_provider,
        data=data,
        windowSize=window_size,
        probationaryPeriod=probationary_period,
        subsequenceMatchThreshold=subsequence_match_threshold,
        anomalyThreshold=anomaly_threshold)

    out_df = dataset_df.copy()

    # Records are fed to the detector one index at a time, in order.
    for i in range(len(data)):
        out_df.at[i, 'anomaly_score'] = model.handleRecord(i)
        out_df.at[i, 'label'] = 1 if (has_label_region and i >= label_index) else 0

    out_df.to_csv(f"{output_dir}/vmft-lad_{series_name}.csv", index=False)

Anomaly detected at window  182  with score  0.27225732337736935
Log templates: 
Finished <:*:> of <:*:> <:*:>
new group: <:*:> GID=<:NUM:>
new user: <:*:> UID=<:NUM:>, GID=<:NUM:>, <:*:> <:*:> <:*:>
Starting <:*:> of <:*:> <:*:>
Anomaly detected at window  184  with score  0.18873762376237624
Log templates: 
new user: <:*:> UID=<:NUM:>, GID=<:NUM:>, <:*:> <:*:> <:*:>
Starting <:*:> of <:*:> <:*:>
systemd-tmpfiles-clean.service: Succeeded.
pam <:*:> password changed for <:*:>
Anomaly detected at window  242  with score  0.1585677749360614
Log templates: 
pam <:*:> session closed for user <:*:>
new user: <:*:> UID=<:NUM:>, GID=<:NUM:>, <:*:> <:*:> <:*:>
new group: <:*:> GID=<:NUM:>
pam <:*:> password changed for <:*:>
Anomaly detected at window  292  with score  0.22010398613518198
Log templates: 
Using default interface naming scheme <:*:>
<:*:> Link UP
pam <:*:> session opened for user <:*:> by <:*:>
[ <:NUM:>.<:NUM:>] L1TF CPU bug present and SMT on, data leak possible. See CVE-<:NUM