In [13]:
import os
import pandas as pd
import numpy as np
import json
from concurrent.futures.thread import ThreadPoolExecutor

In [14]:
# Size of the evaluation windows used by `execute` in the benign region:
# a window closes whenever the row index reaches a multiple of this value.
benign_eval_window = 200

# `execute` starts scanning each result file at row `probationary_period + 1`.
probationary_period = 0 # 150 # 0 for DeepLog, 150 for others

dataset_name = "buffer-io"
dataset_dir = f"../data/{dataset_name}"
# Use a context manager so the metadata file handle is closed deterministically
# instead of being left for the garbage collector.
with open(f"{dataset_dir}/dataset_metadata.json") as metadata_file:
    dataset_metadata = json.load(metadata_file)

In [15]:
def execute(threshold: float, model_name:str, criteria_1: dict, criteria_2:dict):
    """Count detection outcomes for one model at a single anomaly threshold.

    Every ``*.csv`` file under ``../results/LLM_only/{model_name}/{dataset_name}``
    is scanned in three phases. The boundaries come from the ``label_region``
    and ``failure_point`` entries that ``dataset_metadata`` holds for the file:

    1. Benign region (rows before ``label_region - 1``): rows are grouped into
       windows that close on multiples of ``benign_eval_window``. A window
       containing at least one score ``>= threshold`` adds one FP, otherwise
       one TN (both criteria).
    2. Rows before ``failure_point``: the first score ``>= threshold`` adds one
       TP to both criteria (early detection) and ends the scan of that file.
    3. Remaining rows: the first score ``>= threshold`` adds one TP to
       criteria 1 and one TP_LATE to criteria 2 (late detection).

    A file with no detection in phases 2 and 3 adds one FN to both criteria.

    Reads the module-level names ``dataset_name``, ``dataset_metadata``,
    ``benign_eval_window`` and ``probationary_period``.

    Args:
        threshold: Scores greater than or equal to this value count as a
            positive prediction.
        model_name: Name of the results sub-directory; it is also the prefix
            stripped (together with one separator character and the ``.csv``
            suffix) from each file name to obtain the metadata key.
        criteria_1: Output mapping; ``criteria_1[threshold]`` is set to the
            criteria-1 counters (threshold, TP, TN, FP, FN).
        criteria_2: Output mapping; ``criteria_2[threshold]`` is set to the
            criteria-2 counters (same keys plus TP_LATE).

    Returns:
        None. Results are delivered through ``criteria_1`` and ``criteria_2``.
    """
    print("Start evaluating threshold: ", threshold)

    temp_results_criteria_1 = {
        "threshold": threshold,
        "TP": 0,
        "TN": 0,
        "FP": 0,
        "FN": 0,
    }
    temp_results_criteria_2 = {
        "threshold": threshold,
        "TP": 0,
        "TN": 0,
        "FP": 0,
        "FN": 0,
        "TP_LATE": 0,
    }

    for resultFile in os.scandir(f"../results/LLM_only/{model_name}/{dataset_name}"):
        if not resultFile.name.endswith(".csv"):
            continue

        dataset_result_df = pd.read_csv(resultFile.path, header=0)
        # Extract the score column once: `df.iloc[i][col]` materialises a full
        # row Series on every access, which dominates the run time of the loops
        # below. Element-wise indexing into the array keeps the out-of-range
        # IndexError if the metadata points past the end of the file.
        anomaly_scores = dataset_result_df["anomaly_score"].to_numpy()
        num_rows = len(anomaly_scores)

        # File names look like "<model_name>?<data_file_name>.csv", where "?"
        # is a single separator character.
        data_file_name = resultFile.name[len(model_name) + 1 : -4]

        label_index = dataset_metadata[data_file_name]["label_region"] - 1
        failure_point_index = dataset_metadata[data_file_name]["failure_point"]

        i = probationary_period + 1  # iterator for the data points

        # Benign region: one FP/TN per window.
        while i < label_index:
            detection_in_window = False
            # A window always holds at least one row and closes when `i` hits
            # a multiple of the window size or the start of the label region.
            while True:
                if anomaly_scores[i] >= threshold:
                    detection_in_window = True
                i += 1
                if i % benign_eval_window == 0 or i >= label_index:
                    break

            outcome = "FP" if detection_in_window else "TN"
            temp_results_criteria_1[outcome] += 1
            temp_results_criteria_2[outcome] += 1

        # Anomalous region before the failure point: early detection.
        positive_prediction = False
        while i < failure_point_index:
            if anomaly_scores[i] >= threshold:
                positive_prediction = True
                temp_results_criteria_1["TP"] += 1
                temp_results_criteria_2["TP"] += 1
                break
            i += 1

        # From the failure point to the end of the file: late detection.
        if not positive_prediction:
            while i < num_rows:
                if anomaly_scores[i] >= threshold:
                    positive_prediction = True
                    temp_results_criteria_1["TP"] += 1
                    temp_results_criteria_2["TP_LATE"] += 1 # Can make this FN as well
                    break
                i += 1

        if not positive_prediction:
            temp_results_criteria_1["FN"] += 1
            temp_results_criteria_2["FN"] += 1

    criteria_1[threshold] = temp_results_criteria_1
    criteria_2[threshold] = temp_results_criteria_2

    print("Done evaluating threshold: ", threshold)


In [16]:
# thresholds = np.arange(0, 1.05, 0.05).round(3).tolist()
thresholds = [0.5]

for model_name in ["LLM-BartLarge", "LLM-Cyrax7B", "LLM-GPT3", "LLM-EmertonMonarch7B", "LLM-Falcon7B"]:
    print("\nStart evaluating model: ", model_name)

    eval_results_dir = f"./results_llm/{dataset_name}/{model_name}"
    os.makedirs(eval_results_dir, exist_ok=True)

    # Filled in by `execute`, one entry per threshold.
    criteria_1 = {}
    criteria_2 = {}

    # Leaving the `with` block waits for every submitted task to finish.
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [
            executor.submit(execute, threshold, model_name, criteria_1, criteria_2)
            for threshold in thresholds
        ]

    # An exception raised inside a worker is stored on its future and is only
    # re-raised by `.result()`. Without this, a failed evaluation would go
    # unnoticed and surface later as a confusing error on an empty DataFrame.
    for future in futures:
        future.result()

    df_criteria_1 = pd.DataFrame(criteria_1).T.sort_values("threshold")
    df_criteria_1.to_csv(f"{eval_results_dir}/criteria1.csv", index=False)

    df_criteria_2 = pd.DataFrame(criteria_2).T.sort_values("threshold")
    df_criteria_2.to_csv(f"{eval_results_dir}/criteria2.csv", index=False)

    print("Done evaluating model: ", model_name)

print("Finish execution")


Start evaluating model:  LLM-BartLarge
Start evaluating threshold:  0.5
Done evaluating threshold:  0.5
Done evaluating model:  LLM-BartLarge

Start evaluating model:  LLM-Cyrax7B
Start evaluating threshold:  0.5
Done evaluating threshold:  0.5
Done evaluating model:  LLM-Cyrax7B

Start evaluating model:  LLM-GPT3
Start evaluating threshold:  0.5
Done evaluating threshold:  0.5
Done evaluating model:  LLM-GPT3

Start evaluating model:  LLM-EmertonMonarch7B
Start evaluating threshold:  0.5
Done evaluating threshold:  0.5
Done evaluating model:  LLM-EmertonMonarch7B

Start evaluating model:  LLM-Falcon7B
Start evaluating threshold:  0.5
Done evaluating threshold:  0.5
Done evaluating model:  LLM-Falcon7B
Finish execution
