In [1]:
import os
import sys
import numpy as np
import json as json
import pandas as pd
from datetime import datetime

# Directory layout, resolved from the notebook's working directory
data_path = os.getcwd()
project_path = os.path.dirname(data_path)
benchmarks_path = os.path.join(project_path, 'benchmarks')

# NAB benchmark tree: <benchmarks>/NAB/{config,data,labels,results}
NAB_path = os.path.join(benchmarks_path, 'NAB')
NAB_config, NAB_dataset, NAB_labels, NAB_results = (
    os.path.join(NAB_path, subdir)
    for subdir in ('config', 'data', 'labels', 'results')
)


In [2]:
def accuracy(row):
    """Return the accuracy, (TP + TN) / (TP + FP + TN + FN).

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'TP', 'FP', 'TN' and 'FN'
        (e.g. a pandas Series or a dict of counts).

    Returns
    -------
    float
        The accuracy, or NaN when all four counts are zero.
    """
    correct = row['TP'] + row['TN']
    total = row['TP'] + row['FP'] + row['TN'] + row['FN']
    # np.true_divide yields NaN for 0/0 instead of raising ZeroDivisionError
    # (plain Python ints) and errstate silences the RuntimeWarning numpy
    # integers would otherwise emit.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.true_divide(correct, total)

def precision(row):
    """Return the precision, TP / (TP + FP).

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'TP' and 'FP'
        (e.g. a pandas Series or a dict of counts).

    Returns
    -------
    float
        The precision, or NaN when nothing was predicted positive
        (TP + FP == 0).
    """
    predicted_positive = row['TP'] + row['FP']
    # 0/0 becomes NaN without raising (Python ints) or warning (numpy ints).
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.true_divide(row['TP'], predicted_positive)

def recall(row):
    """Return the recall (true-positive rate), TP / (TP + FN).

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'TP' and 'FN'
        (e.g. a pandas Series or a dict of counts).

    Returns
    -------
    float
        The recall, or NaN when there are no actual positives
        (TP + FN == 0).
    """
    actual_positive = row['TP'] + row['FN']
    # 0/0 becomes NaN without raising (Python ints) or warning (numpy ints).
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.true_divide(row['TP'], actual_positive)

def fpr(row):
    """Return the false-positive rate, FP / (FP + TN).

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'FP' and 'TN'
        (e.g. a pandas Series or a dict of counts).

    Returns
    -------
    float
        The false-positive rate, or NaN when there are no actual negatives
        (FP + TN == 0).
    """
    actual_negative = row['FP'] + row['TN']
    # 0/0 becomes NaN without raising (Python ints) or warning (numpy ints).
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.true_divide(row['FP'], actual_negative)

def fnr(row):
    """Return the false-negative rate, FN / (FN + TP).

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'FN' and 'TP'
        (e.g. a pandas Series or a dict of counts).

    Returns
    -------
    float
        The false-negative rate, or NaN when there are no actual positives
        (FN + TP == 0).
    """
    actual_positive = row['FN'] + row['TP']
    # 0/0 becomes NaN without raising (Python ints) or warning (numpy ints).
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.true_divide(row['FN'], actual_positive)

def fscore(row):
    """Return the F1 score, the harmonic mean 2*p*r / (p + r) of precision and recall.

    Parameters
    ----------
    row : mapping-like
        Object indexable by the keys 'TP', 'FP' and 'FN'
        (e.g. a pandas Series or a dict of counts).

    Returns
    -------
    float
        The F1 score, or NaN when precision or recall is undefined
        (TP + FP == 0 or TP + FN == 0) or when both are zero.
    """
    # Precision and recall are computed inline so that every division in the
    # score is covered by the same zero-denominator handling: 0/0 becomes NaN
    # without raising (Python ints) or warning (numpy ints).
    with np.errstate(divide='ignore', invalid='ignore'):
        p = np.true_divide(row['TP'], row['TP'] + row['FP'])
        r = np.true_divide(row['TP'], row['TP'] + row['FN'])
        return np.true_divide(2 * p * r, p + r)

In [3]:
# Metric name -> function applied to each detector's 'Totals' row, in column order
metric_fns = {
    'accuracy': accuracy,
    'precision': precision,
    'recall': recall,
    'f1-score': fscore,
}

agg = {'solution': []}
for metric_name in metric_fns:
    agg[metric_name] = []

# Visit every detector directory under the NAB results folder
for solution in os.listdir(NAB_results):
    solution_result_path = os.path.join(NAB_results, solution)
    if not os.path.isdir(solution_result_path):
        continue

    # Read the detector's standard-profile score sheet and keep its 'Totals' row
    standard_results_path = os.path.join(
        solution_result_path, f"{solution}_standard_scores.csv"
    )
    df = pd.read_csv(standard_results_path)
    total_results = df[df['Detector'] == 'Totals']

    agg['solution'].append(solution)
    for metric_name, metric_fn in metric_fns.items():
        agg[metric_name].append(total_results.apply(metric_fn, axis=1).values[0])

agg = pd.DataFrame.from_dict(agg)
display(agg)

Unnamed: 0,solution,accuracy,precision,recall,f1-score
0,knncad,0.898727,0.242131,0.002986,0.005898
1,numentaTM,0.899532,0.562358,0.007404,0.014616
2,twitterADVec,0.899433,0.517974,0.009464,0.018589
3,skyline,0.899868,0.571921,0.019824,0.038319
4,earthgeckoSkyline,0.898895,0.289544,0.003224,0.006378
5,numenta,0.899628,0.610687,0.007165,0.014164
6,bayesChangePt,0.897369,0.116494,0.003015,0.005879
7,,0.899364,0.5,3e-05,6e-05
8,expose,0.898213,0.192308,0.003583,0.007034
9,relativeEntropy,0.899568,0.588391,0.006658,0.013166


In [4]:
def make_pred(row, threshold):
    """Return 1 when the row's 'anomaly_score' is at or above ``threshold``, else 0."""
    is_anomalous = row['anomaly_score'] >= threshold
    return int(is_anomalous)

In [5]:
# Window-level evaluation: every task file is cut into consecutive windows of
# `split_size` rows; a window is predicted (labelled) anomalous when any of its
# rows has pred == 1 (label == 1). Confusion counts are summed per detector.

# Detector thresholds, read as solution_thresh[<detector>]['standard']['threshold']
with open(os.path.join(NAB_config, 'thresholds.json')) as f:
    solution_thresh = json.load(f)

count_columns = ['TP', 'FP', 'TN', 'FN']
split_size = 50  # rows per evaluation window


def _window_records(task_df, task, split_size):
    """Return one confusion record per full window of ``split_size`` rows.

    Parameters
    ----------
    task_df : pandas.DataFrame
        Task results containing the binary columns 'pred' and 'label'.
    task : str
        Task file name, used as the prefix of each record's 'task_id'.
    split_size : int
        Number of rows per window.

    Returns
    -------
    list of dict
        One {'task_id', 'TP', 'FP', 'TN', 'FN'} dict per window, exactly one
        of the four counts being 1. Trailing rows that do not fill a whole
        window are ignored so that all windows have the same size.
    """
    records = []
    # Floor division keeps the final window when the row count is an exact
    # multiple of split_size; the former `start + size < len(df)` test dropped it.
    for split_nu in range(len(task_df) // split_size):
        split_df = task_df.iloc[split_nu * split_size:(split_nu + 1) * split_size]
        split_pred = bool((split_df['pred'].values == 1).any())
        split_label = bool((split_df['label'].values == 1).any())
        records.append({
            'task_id': f"{task}-{split_nu}",
            'TP': int(split_pred and split_label),
            'FP': int(split_pred and not split_label),
            'TN': int(not split_pred and not split_label),
            'FN': int(not split_pred and split_label),
        })
    return records


agg = {
    'solution': [],
    'accuracy': [],
    'precision': [],
    'recall': [],
    'f1-score': [],
    'fpr': [],
    'fnr': []
}

# Loop through each solution (detector) that has NAB results
for solution in os.listdir(NAB_results):
    solution_result_path = os.path.join(NAB_results, solution)
    if not os.path.isdir(solution_result_path):
        continue

    # The threshold is fixed per detector, so look it up once rather than per task
    thresh = solution_thresh[solution]['standard']['threshold']

    window_records = []
    # Loop through each task family in the dataset
    for task_family in os.listdir(solution_result_path):
        solution_task_family_path = os.path.join(solution_result_path, task_family)
        if not os.path.isdir(solution_task_family_path):
            continue

        for task in os.listdir(solution_task_family_path):
            df = pd.read_csv(os.path.join(solution_task_family_path, task))
            # Vectorized thresholding: 1 where anomaly_score >= thresh, else 0
            df['pred'] = (df['anomaly_score'] >= thresh).astype(int)
            window_records.extend(_window_records(df, task, split_size))

    # Aggregate the per-window counts for this solution. Only the count columns
    # are summed; summing 'task_id' would concatenate every id string.
    solution_results = pd.DataFrame(window_records, columns=['task_id'] + count_columns)
    solution_agg = solution_results[count_columns].astype('int64').sum(axis=0)

    agg['solution'].append(solution)
    agg['accuracy'].append(accuracy(solution_agg))
    agg['precision'].append(precision(solution_agg))
    agg['recall'].append(recall(solution_agg))
    agg['f1-score'].append(fscore(solution_agg))
    agg['fpr'].append(fpr(solution_agg))
    agg['fnr'].append(fnr(solution_agg))

agg = pd.DataFrame.from_dict(agg)
display(agg)

  """


Unnamed: 0,solution,accuracy,precision,recall,f1-score,fpr,fnr
0,knncad,0.865173,0.254237,0.134961,0.176322,0.047399,0.865039
1,numentaTM,0.869984,0.316594,0.186375,0.234628,0.048169,0.813625
2,twitterADVec,0.877268,0.34748,0.16838,0.22684,0.037858,0.83162
3,skyline,0.873969,0.327543,0.169666,0.223539,0.041705,0.830334
4,earthgeckoSkyline,0.868197,0.276543,0.143959,0.189349,0.045091,0.856041
5,numenta,0.875756,0.35,0.188946,0.245409,0.042013,0.811054
6,bayesChangePt,0.803189,0.125858,0.141388,0.133172,0.117575,0.858612
7,,0.893073,,0.0,,0.0,1.0
8,expose,0.846894,0.158537,0.100257,0.122835,0.063712,0.899743
9,relativeEntropy,0.880154,0.367232,0.167095,0.229682,0.034472,0.832905


In [7]:
# Rank the solutions by F1 score, best first
ranked_by_f1 = agg.sort_values(by='f1-score', ascending=False)
display(ranked_by_f1)

Unnamed: 0,solution,accuracy,precision,recall,f1-score,fpr,fnr
10,ARTime,0.887576,0.440828,0.191517,0.267025,0.029086,0.808483
5,numenta,0.875756,0.35,0.188946,0.245409,0.042013,0.811054
11,htmjava,0.869434,0.320833,0.197943,0.244833,0.050169,0.802057
12,randomCutForest,0.874382,0.338863,0.183805,0.238333,0.042936,0.816195
1,numentaTM,0.869984,0.316594,0.186375,0.234628,0.048169,0.813625
9,relativeEntropy,0.880154,0.367232,0.167095,0.229682,0.034472,0.832905
2,twitterADVec,0.877268,0.34748,0.16838,0.22684,0.037858,0.83162
3,skyline,0.873969,0.327543,0.169666,0.223539,0.041705,0.830334
14,contextOSE,0.892523,0.49,0.125964,0.200409,0.015697,0.874036
15,windowedGaussian,0.862562,0.26383,0.159383,0.198718,0.053247,0.840617
