In [1]:
import os
import sys
import numpy as np
import json as json
import pandas as pd
from datetime import datetime

# General Paths
data_path = os.getcwd()
project_path = os.path.dirname(data_path)
benchmarks_path = os.path.join(project_path, 'benchmarks')

NAB_path = os.path.join(benchmarks_path, 'NAB')
NAB_config = os.path.join(NAB_path, 'config')
NAB_dataset = os.path.join(NAB_path, 'data')
NAB_labels = os.path.join(NAB_path, 'labels')
NAB_results = os.path.join(NAB_path, 'results')


In [2]:
def accuracy(row):
    return (row['TP']+row['TN']) / (row['TP']+row['FP']+row['TN']+row['FN'])

def precision(row):
    return (row['TP']) / (row['TP'] + row['FP'])

def recall(row):
    return (row['TP']) / (row['TP'] + row['FN'])

def fscore(row):
    p = precision(row)
    r = recall(row)
    return (2*p*r) / (p+r)

In [3]:
agg = {
    'solution': [],
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1-score': []
}

# Loop though each Solution with NAB Results
for i, solution in enumerate(os.listdir(NAB_results)):
    if os.path.isdir(os.path.join(NAB_results, solution)):
        # print(f"{i:2d} : {solution}")
        solution_result_path = os.path.join(NAB_results, solution)

        # Get the Standard Results
        standard_results_csv = f"{solution}_standard_scores.csv"
        standard_results_path = os.path.join(solution_result_path, standard_results_csv)
        df = pd.read_csv(standard_results_path)
        total_results = df[df['Detector']=='Totals']
        agg['solution'].append(solution)
        agg['accuracy'].append(total_results.apply(accuracy, axis=1).values[0])
        agg['precision'].append(total_results.apply(precision, axis=1).values[0])
        agg['recall'].append(total_results.apply(recall, axis=1).values[0])
        agg['f1-score'].append(total_results.apply(fscore, axis=1).values[0])

        
agg = pd.DataFrame.from_dict(agg)
display(agg)

Unnamed: 0,solution,accuracy,precision,recall,f1-score
0,knncad,0.898727,0.242131,0.002986,0.005898
1,numentaTM,0.899532,0.562358,0.007404,0.014616
2,twitterADVec,0.899433,0.517974,0.009464,0.018589
3,skyline,0.899868,0.571921,0.019824,0.038319
4,earthgeckoSkyline,0.898895,0.289544,0.003224,0.006378
5,numenta,0.899628,0.610687,0.007165,0.014164
6,bayesChangePt,0.897369,0.116494,0.003015,0.005879
7,,0.899364,0.5,3e-05,6e-05
8,expose,0.898213,0.192308,0.003583,0.007034
9,relativeEntropy,0.899568,0.588391,0.006658,0.013166


In [4]:
def make_pred(row, threshold):
    return int(row['anomaly_score'] >= threshold)

In [8]:
solution_thresh = {}
with open(os.path.join(NAB_config, 'thresholds.json')) as f:
    solution_thresh = json.load(f)

agg = {
    'solution': [],
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1-score': []
}

# Loop though each Solution with NAB Results
for i, solution in enumerate(os.listdir(NAB_results)):
    if os.path.isdir(os.path.join(NAB_results, solution)):
        # print(f"{i:2d} : {solution}")
        solution_result_path = os.path.join(NAB_results, solution)
        
        solution_results = {
            'task_id': [],
            'TP': [],
            'FP': [],
            'TN': [],
            'FN': []
        }

        # Loop through each Task Family in the Dataset
        for j, task_family in enumerate(os.listdir(solution_result_path)):
            if os.path.isdir(os.path.join(solution_result_path, task_family)):
                solution_task_family_path = os.path.join(solution_result_path, task_family)

                for k, task in enumerate(os.listdir(solution_task_family_path)):
                    task_results = os.path.join(solution_task_family_path, task)
                    df = pd.read_csv(task_results)

                    thresh = solution_thresh[solution]['standard']['threshold']
                    # Add Prediction Column
                    df['pred'] = df.apply(lambda row: make_pred(row, thresh), axis=1)

                    # Go through the Splits
                    # Loop thrugh the Task Data to Create Frames Consistent Size
                    split_nu = 0
                    split_size = 50
                    split_start = 0
                    split_end = split_size
                    # (While we can still get 20(tracks)x50(samples)
                    while split_start + split_size < len(df): 

                        # DO THE SPLITZ
                        split_df = df.iloc[split_start:split_end, :]
                        split_start += split_size
                        split_end += split_size

                        # Check Split Pred and Label
                        split_pred = any([split_df['pred'].values[0]==1])
                        split_label = any([split_df['label'].values[0]==1])

                        # Check for TP, FP, TN, FN
                        tp, fp, tn, fn = 0,0,0,0
                        if split_pred and split_label:
                            tp += 1
                        if split_pred and not split_label:
                            fp += 1
                        if not split_pred and not split_label:
                            tn += 1
                        if not split_pred and split_label:
                            fn += 1

                        # Append to the Solution Results
                        task_id = f"{task}-{split_nu}"
                        solution_results['task_id'].append(task_id)
                        solution_results['TP'].append(tp)
                        solution_results['FP'].append(fp)
                        solution_results['TN'].append(tn)
                        solution_results['FN'].append(fn)

                        split_nu += 1

        # Aggregate Solution Results 
        solution_results = pd.DataFrame.from_dict(solution_results)
        solution_agg = solution_results.sum(axis=0)
        solution_agg = pd.DataFrame(solution_agg)

        agg['solution'].append(solution)
        agg['accuracy'].append(solution_agg.apply(accuracy, axis=0).values[0])
        agg['precision'].append(solution_agg.apply(precision, axis=0).values[0])
        agg['recall'].append(solution_agg.apply(recall, axis=0).values[0])
        agg['f1-score'].append(solution_agg.apply(fscore, axis=0).values[0])

agg = pd.DataFrame.from_dict(agg)
display(agg)

                                                         0
task_id  knncad_ec2_network_in_5abac7.csv-0knncad_ec2_n...
TP                                                       3
FP                                                       5
TN                                                    6605
FN                                                     663
                                                         0
task_id  numentaTM_ec2_cpu_utilization_825cc2.csv-0nume...
TP                                                       3
FP                                                      10
TN                                                    6600
FN                                                     663
                                                         0
task_id  twitterADVec_ec2_cpu_utilization_825cc2.csv-0t...
TP                                                       3
FP                                                      15
TN                                                    65

  """


                                                         0
task_id  expose_ec2_cpu_utilization_c6585a.csv-0expose_...
TP                                                       2
FP                                                      19
TN                                                    6591
FN                                                     664
                                                         0
task_id  relativeEntropy_iio_us-east-1_i-a2eb1cd9_Netwo...
TP                                                       4
FP                                                       6
TN                                                    6604
FN                                                     662
                                                         0
task_id  ARTime_ec2_disk_write_bytes_1ef3de.csv-0ARTime...
TP                                                       5
FP                                                       6
TN                                                    66

  """


                                                         0
task_id  contextOSE_ec2_network_in_257a54.csv-0contextO...
TP                                                       2
FP                                                       2
TN                                                    6608
FN                                                     664
                                                         0
task_id  windowedGaussian_ec2_disk_write_bytes_1ef3de.c...
TP                                                      10
FP                                                      11
TN                                                    6599
FN                                                     656


Unnamed: 0,solution,accuracy,precision,recall,f1-score
0,knncad,0.908191,0.375,0.004505,0.008902
1,numentaTM,0.907504,0.230769,0.004505,0.008837
2,twitterADVec,0.906817,0.166667,0.004505,0.008772
3,skyline,0.908466,0.5,0.022523,0.043103
4,earthgeckoSkyline,0.907642,0.25,0.004505,0.00885
5,numenta,0.907916,0.3,0.004505,0.008876
6,bayesChangePt,0.903381,0.163636,0.013514,0.024965
7,,0.908466,,0.0,
8,expose,0.90613,0.095238,0.003003,0.005822
9,relativeEntropy,0.908191,0.4,0.006006,0.011834


In [7]:
display(agg.sort_values(by='f1-score', axis=0))

Unnamed: 0,solution,accuracy,precision,recall,f1-score
8,expose,0.90613,0.095238,0.003003,0.005822
11,htmjava,0.907367,0.166667,0.003003,0.0059
14,contextOSE,0.908466,0.5,0.003003,0.00597
2,twitterADVec,0.906817,0.166667,0.004505,0.008772
1,numentaTM,0.907504,0.230769,0.004505,0.008837
4,earthgeckoSkyline,0.907642,0.25,0.004505,0.00885
5,numenta,0.907916,0.3,0.004505,0.008876
0,knncad,0.908191,0.375,0.004505,0.008902
9,relativeEntropy,0.908191,0.4,0.006006,0.011834
10,ARTime,0.908329,0.454545,0.007508,0.014771
