In [40]:
import os
import pandas as pd
import json
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from concurrent.futures.thread import ThreadPoolExecutor

from vmft_lad.BaseDetector import BaseDetector
from heap_providers.SubsequenceMaxHeap import SubsequenceMaxHeap
from log_inference_providers.GPT3.FastGPT3FewShotLogInference import FastGPT3FewShotLogInference
from log_inference_providers.FakeLogInferenceProvider import FakeLogInferenceProvider


In [41]:
# Detector hyper-parameters shared by the evaluation cells below.
# NOTE: `eval_window_size` takes its own `window_size` parameter, which shadows
# this module-level value inside that function.
window_size = 4
# Detection cut-off: a record counts as a detection when its anomaly score is
# >= this value; it is also forwarded to BaseDetector as `anomalyThreshold`.
anomaly_threshold = 0.2
# Forwarded to BaseDetector as `subsequenceMatchThreshold`.
subsequence_match_threshold = 0.05
# probationary_period = 150
# Benign-region records are grouped into evaluation windows that end whenever
# the record index reaches a multiple of this value.
benign_eval_window = 200

# The GPT-3 backed provider is left disabled; the fake provider is the one in use.
# inference_provider = FastGPT3FewShotLogInference()
inference_provider = FakeLogInferenceProvider()

In [42]:
def get_data(dataset_dir, max_files=2):
    """Yield ``(dataset_name, values)`` pairs for the CSV files in ``dataset_dir``.

    Args:
        dataset_dir: Directory to scan (non-recursively) for ``*.csv`` files.
        max_files: Maximum number of CSV files to yield. Defaults to 2, the cap
            this notebook has always used; pass ``None`` to yield every file.

    Yields:
        Tuples of ``(dataset_name, data)`` where ``dataset_name`` is the file
        name up to its first ``"."`` and ``data`` is the ``"value"`` column of
        the CSV as a plain Python list.

    Raises:
        KeyError: If a CSV file has no ``"value"`` column.
    """
    # os.scandir holds an OS directory handle and yields entries in arbitrary
    # order. Close it deterministically and sort by name so that the
    # ``max_files`` cap always selects the same files on every machine.
    with os.scandir(dataset_dir) as entries:
        csv_entries = sorted(
            (entry for entry in entries if entry.name.endswith(".csv")),
            key=lambda entry: entry.name,
        )

    for entry in csv_entries[:max_files]:
        dataset_name = entry.name.split(".")[0]
        dataset_df = pd.read_csv(entry.path, header=0)
        yield dataset_name, dataset_df["value"].tolist()

In [43]:
def eval_window_size(window_size: int, fb_period:int, criteria_2: dict):
    """Run the detector with one ``(window_size, fb_period)`` setting over all dataset groups.

    For every dataset yielded by ``get_data`` in each group directory, a fresh
    ``BaseDetector`` is built and fed record indices starting at
    ``fb_period + 1``. Outcomes are tallied as follows:

    * Benign region (indices below ``label_region - 1``): records are grouped
      into windows that end when the index reaches a multiple of
      ``benign_eval_window``. A window containing at least one score
      ``>= anomaly_threshold`` counts as one FP, otherwise as one TN.
    * Anomalous region (indices below ``failure_point``): the first score
      ``>= anomaly_threshold`` counts as one TP (early detection).
    * Otherwise the dataset counts as one FN, whether the detector fires only
      at/after the failure point (late detection) or never fires.

    Args:
        window_size: Passed to ``BaseDetector`` as ``windowSize``.
        fb_period: Passed to ``BaseDetector`` as ``probationaryPeriod``; also
            determines the first record index that is evaluated.
        criteria_2: Output mapping. ``criteria_2[window_size]`` is set to a dict
            with keys ``"window_size"``, ``"TP"``, ``"TN"``, ``"FP"``, ``"FN"``.

    Returns:
        None. Results are reported through ``criteria_2``.
    """
    temp_results_criteria_2 = {
        "window_size": window_size,
        "TP": 0,
        "TN": 0,
        "FP": 0,
        "FN": 0,
    }

    # The group (directory) name is kept distinct from the per-file dataset name
    # so the inner loop no longer shadows the outer loop variable.
    for dataset_group in ["benign", "hdd", "oom", "buffer-io", "cpu"]:
        dataset_dir = f"../../../data/{dataset_group}"
        # Context manager closes the metadata file instead of leaking the handle.
        with open(f"{dataset_dir}/dataset_metadata.json") as metadata_file:
            dataset_metadata = json.load(metadata_file)

        for dataset_name, data in get_data(dataset_dir):
            label_index = dataset_metadata[dataset_name]["label_region"] - 1
            failure_point_index = dataset_metadata[dataset_name]["failure_point"]

            max_heap_provider = SubsequenceMaxHeap()

            model = BaseDetector(
                inferenceProvider=inference_provider,
                maxHeapProvider=max_heap_provider,
                data=data,
                windowSize=window_size,
                probationaryPeriod=fb_period,
                subsequenceMatchThreshold=subsequence_match_threshold,
                anomalyThreshold=anomaly_threshold)

            i = fb_period + 1  # iterator for the data points

            # Benign region: one FP/TN verdict per evaluation window.
            while i < label_index:
                anomaly_score_window = [model.handleRecord(i)]
                i += 1
                while i % benign_eval_window != 0 and i < label_index:
                    anomaly_score_window.append(model.handleRecord(i))
                    i += 1
                if any(score >= anomaly_threshold for score in anomaly_score_window):
                    temp_results_criteria_2["FP"] += 1
                else:
                    temp_results_criteria_2["TN"] += 1

            # Anomalous region: a detection before the failure point is a TP.
            positive_prediction = False
            while i < failure_point_index:
                anomaly_score = model.handleRecord(i)
                positive_prediction = anomaly_score >= anomaly_threshold
                if positive_prediction:  # Early detection
                    temp_results_criteria_2["TP"] += 1
                    break
                i += 1

            # Past the failure point: a detection here is too late and counts as FN.
            while not positive_prediction and i < len(data):
                anomaly_score = model.handleRecord(i)
                positive_prediction = anomaly_score >= anomaly_threshold
                if positive_prediction:  # Late detection
                    temp_results_criteria_2["FN"] += 1
                    break
                i += 1

            # The detector never fired on this dataset.
            if not positive_prediction:
                temp_results_criteria_2["FN"] += 1

        # Same dict object on every iteration, so re-assigning is idempotent; it
        # publishes the running tallies after each completed group.
        criteria_2[window_size] = temp_results_criteria_2

    print("Done evaluating window_size: ", window_size)

In [44]:
def tpr(TP, FN):
    """Return the true-positive rate ``TP / (TP + FN)``.

    When there are no positive cases at all (``TP + FN == 0``) the rate is
    defined as 1 rather than dividing by zero.
    """
    positives = TP + FN
    return TP / positives if positives != 0 else 1


def fpr(FP, TN):
    """Return the false-positive rate ``FP / (FP + TN)``.

    When there are no negative cases at all (``FP + TN == 0``) the rate is
    defined as 0 rather than dividing by zero.
    """
    negatives = FP + TN
    return FP / negatives if negatives != 0 else 0

In [45]:
# Parameter grid for the sweep: probationary periods 0..350 in steps of 50 and
# window sizes 0..40 in steps of 5. Both are plain lists of Python ints and are
# reused below as the y / x coordinates of the surface plots.
fb_periods = list(range(0, 400, 50))
# fb_periods = [0, 15, 30, 45] + fb_periods
window_sizes = list(range(0, 45, 5))
# window_sizes = [1, 2, 3, 4, 5, 8] + window_sizes

# tpr_vs_fb_period_vs_window_sizes = {}
# fpr_vs_fb_period_vs_window_sizes = {}

# for fb_period in fb_periods:
#     print(f"___FB_PERIOD___: {fb_period}")
#     criteria_2 = {}
#     with ThreadPoolExecutor(max_workers=8) as executor:
#         args = [(window_size, fb_period, criteria_2) for window_size in window_sizes]
#         # for window_size in window_sizes:
#         #     executor.submit(eval_window_size, window_size, criteria_2)
#         executor.map(lambda p: eval_window_size(*p), args)

#     df_criteria_2 = pd.DataFrame(criteria_2).T.sort_values("window_size")
#     df_criteria_2["TPR"] = df_criteria_2.apply(
#                 lambda row: tpr(row["TP"], row["FN"]), axis=1
#             )
#     df_criteria_2["FPR"] = df_criteria_2.apply(
#         lambda row: fpr(row["FP"], row["TN"]), axis=1
#     )
#     window_sizes_vs_tpr_dict = df_criteria_2.set_index("window_size")["TPR"].to_dict()
#     window_sizes_vs_fpr_dict = df_criteria_2.set_index("window_size")["FPR"].to_dict()
#     tpr_vs_fb_period_vs_window_sizes[fb_period] = window_sizes_vs_tpr_dict
#     fpr_vs_fb_period_vs_window_sizes[fb_period] = window_sizes_vs_fpr_dict



In [46]:
# df_tpr = pd.DataFrame(tpr_vs_fb_period_vs_window_sizes).T
# df_tpr

In [47]:
# df_fpr = pd.DataFrame(fpr_vs_fb_period_vs_window_sizes).T
# df_fpr

In [48]:
# df_tpr.to_csv("tpr_vs_fb_period_vs_window_sizes.csv")
# df_fpr.to_csv("fpr_vs_fb_period_vs_window_sizes.csv")

In [49]:
# Load the cached sweep results. The first CSV column is consumed as the row
# index and the header row as column labels, leaving only the rate values.
# NOTE: despite the `df_` prefix, both names hold NumPy arrays, not DataFrames.
df_tpr = pd.read_csv(
    "tpr_vs_fb_period_vs_window_sizes.csv", header=0, index_col=0
).to_numpy()
df_fpr = pd.read_csv(
    "fpr_vs_fb_period_vs_window_sizes.csv", header=0, index_col=0
).to_numpy()

In [50]:

# Surface-plot coordinates: window sizes along x, probationary periods along y.
x = window_sizes
y = fb_periods

In [51]:
# Light-to-dark blue colorscale; the TPR surface cell further down reuses it.
custom_colorscale = [
    [0, 'rgb(209, 229, 240)'],
    [0.25, 'rgb(146, 197, 222)'],
    [0.5, 'rgb(67, 147, 195)'],
    [0.75, 'rgb(33, 102, 172)'],
    [1, 'rgb(5, 48, 97)'],
]

# FPR surface over window size (x) and probationary period (y). The color
# range is pinned to [0, 1] so the FPR and TPR figures share one scale.
fig = go.Figure(data=[go.Surface(
    z=df_fpr, x=x, y=y,
    colorscale=custom_colorscale,
    colorbar=dict(lenmode='fraction', len=0.8, thickness=15),
    cmin=0, cmax=1,
)])

camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=1.2, y=1.2, z=1.8)
)

fig.update_layout(
    # title='FPR',
    autosize=False, template='plotly_white',
    width=450, height=380,
    font=dict(
        family="Arial",
        size=12,
        color="black"
    ),
    scene_camera=camera,
    margin=dict(l=2, r=20, b=15, t=15),
    title_xref="paper",
    title_y=0.05,
    title_x=0.5,
    scene=dict(
        xaxis=dict(title='m', range=[-0.1, 41]),
        yaxis=dict(title='M', range=[-0.1, 381]),
        zaxis=dict(title='FPR', range=[-0.1, 1.1])
    )
)
# NOTE(review): autorange="reversed" is applied after the explicit x/y ranges
# above; presumably it takes precedence over them - confirm on the rendered axes.
fig.update_scenes(xaxis_autorange="reversed", yaxis_autorange="reversed")

# write_image does not create missing directories, so make sure the output
# folder exists before exporting (otherwise a fresh checkout fails here).
os.makedirs("images", exist_ok=True)
fig.write_image("images/fpr_vs_fb_period_vs_window_sizes.png", scale=4, width=450, height=380)
fig.show()

In [52]:
# TPR surface over window size (x) and probationary period (y). Uses the same
# colorscale and the same pinned [0, 1] color range as the FPR figure.
fig = go.Figure(data=[go.Surface(
    z=df_tpr, x=x, y=y,
    colorscale=custom_colorscale,
    colorbar=dict(lenmode='fraction', len=0.8, thickness=15),
    cmin=0, cmax=1,
)])

camera = dict(
    up=dict(x=0, y=0, z=1),
    center=dict(x=0, y=0, z=0),
    eye=dict(x=1.2, y=1.2, z=1.8)
)

fig.update_layout(
    # title='TPR',
    autosize=False, template='plotly_white',
    width=450, height=380,
    font=dict(
        family="Arial",
        size=12,
        color="black"
    ),
    scene_camera=camera,
    margin=dict(l=2, r=20, b=15, t=15),
    title_xref="paper",
    title_y=0.05,
    title_x=0.5,
    scene=dict(
        xaxis=dict(title='m', range=[-0.1, 41]),
        yaxis=dict(title='M', range=[-0.1, 381]),
        zaxis=dict(title='TPR', range=[-0.1, 1.1])
    )
)
# NOTE(review): autorange="reversed" is applied after the explicit x/y ranges
# above; presumably it takes precedence over them - confirm on the rendered axes.
fig.update_scenes(xaxis_autorange="reversed", yaxis_autorange="reversed")

# write_image does not create missing directories, so make sure the output
# folder exists before exporting (otherwise a fresh checkout fails here).
os.makedirs("images", exist_ok=True)
fig.write_image("images/tpr_vs_fb_period_vs_window_sizes.png", scale=4, width=450, height=380)
fig.show()