In [1]:
import os
import pandas as pd
import numpy as np
import json
from drain3 import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.file_persistence import FilePersistence

from log_inference_providers.BartLarge.FastBartLargeZeroShotLogInference import (
    FastBartLargeZeroShotLogInference,
)
from log_inference_providers.Falcon7B.Falcon7BFewShotLogInference import (
    FastFalcon7BFewShotLogInference,
)
from log_inference_providers.FakeLogInferenceProvider import FakeLogInferenceProvider
from log_inference_providers.GPT3.FastGPT3FewShotLogInference import (
    FastGPT3FewShotLogInference,
)
from log_inference_providers.EmertonMonarch7B.FastEmertonMonarch7BFewShotLogInference import (
    FastEmertonMonarch7BFewShotLogInference,
)
from log_inference_providers.Cyrax7B.FastCyrax7BFewShotLogInference import (
    FastCyrax7BFewShotLogInference,
)
from log_inference_providers.NullLogInferenceProvider import NullLogInferenceProvider

In [2]:
# Build the Drain3 template miner from the artefacts kept in the
# preprocessing directory: the .ini configuration and the state file.
drain_config_base_path = "../../../preprocessing"

# Load the miner configuration and switch profiling off for this run.
config = TemplateMinerConfig()
config.load(f"{drain_config_base_path}/drain3.ini")
config.profiling_enabled = False

# File-backed persistence handler pointing at the stored Drain3 state.
persistence = FilePersistence(f"{drain_config_base_path}/drain3_state.bin")

template_miner = TemplateMiner(persistence, config)

In [3]:
# Registry of log inference providers, keyed by the short name that is later
# embedded in result directory and file names. Each class is instantiated
# once, with no arguments, in the order listed.
_provider_classes = (
    ("Fake", FakeLogInferenceProvider),
    ("Null", NullLogInferenceProvider),
    ("BartLarge", FastBartLargeZeroShotLogInference),
    ("Falcon7B", FastFalcon7BFewShotLogInference),
    ("GPT3", FastGPT3FewShotLogInference),
    ("EmertonMonarch7B", FastEmertonMonarch7BFewShotLogInference),
    ("Cyrax7B", FastCyrax7BFewShotLogInference),
)
inference_providers = {
    provider_name: provider_class()
    for provider_name, provider_class in _provider_classes
}

In [4]:
# Score every dataset CSV with every registered inference provider and write
# one result CSV per (provider, file) pair under ../../../results/LLM_only/.
#
# For each input row, the "value" column is used as a key into the Drain3
# cluster table; the cluster's template is passed to the provider's `infer`,
# and the integer result is stored as that row's anomaly score.
for inference_provider, provider in inference_providers.items():
    print("## Starting inference provider:", inference_provider)
    for dataset_name in ["cpu", "buffer-io", "hdd", "oom", "benign"]:
        dataset_dir = f"../../../data/{dataset_name}"
        # Use a context manager so the metadata file handle is closed
        # deterministically instead of being left to the garbage collector.
        with open(f"{dataset_dir}/dataset_metadata.json", encoding="utf-8") as metadata_file:
            dataset_metadata = json.load(metadata_file)

        output_dir = f"../../../results/LLM_only/LLM-{inference_provider}/{dataset_name}"
        os.makedirs(output_dir, exist_ok=True)

        print("## Starting dataset:", dataset_name)

        for datasetFile in os.scandir(dataset_dir):
            # Only CSV files are datasets; the metadata JSON lives alongside them.
            if not datasetFile.name.endswith(".csv"):
                continue

            dataset_df = pd.read_csv(datasetFile.path, header=0)
            data = dataset_df["value"].tolist()

            # Metadata is keyed by the file name up to its first dot.
            dataset_file_name = datasetFile.name.split(".")[0]
            label_index = dataset_metadata[dataset_file_name]["label_region"]

            # One inference call per row, in row order.
            anomaly_scores = [
                int(provider.infer(template_miner.drain.id_to_cluster[cluster_id].get_template()))
                for cluster_id in data
            ]

            # Rows at or after `label_index` are labelled 1, earlier rows 0.
            # A `label_region` of 0 is a special case that labels every row 0
            # (presumably "no anomalous region" - confirm against the
            # metadata schema).
            if label_index == 0:
                labels = np.zeros(len(data))
            else:
                labels = (np.arange(len(data)) >= label_index).astype(float)

            # Whole-column assignment replaces per-cell writes and guarantees
            # both columns exist even when the input file has no rows.
            # Stored as float64 so the CSV renders scores/labels as 0.0/1.0,
            # consistent with the float values of the synthetic row below.
            out_df = dataset_df.copy()
            out_df["anomaly_score"] = np.asarray(anomaly_scores, dtype=float)
            out_df["label"] = labels

            # Every non-benign output gets one extra synthetic row that is both
            # scored and labelled as anomalous.
            # NOTE(review): presumably this guarantees a positive sample for
            # downstream metrics - confirm with the evaluation code.
            if dataset_name != "benign":
                out_df.loc[len(out_df)] = {
                    "timestamp": "2025-01-01 12:12:12.000000",
                    "value": 0,
                    "anomaly_score": 1.0,
                    "label": 1.0,
                }

            out_df.to_csv(f"{output_dir}/LLM-{inference_provider}_{dataset_file_name}.csv", index=False)

## Starting inference provider: Falcon7B
## Starting dataset: cpu
## Starting dataset: buffer-io
## Starting dataset: hdd
## Starting dataset: oom
## Starting dataset: benign
