In [4]:
import sys
sys.path.append('../')
import os
import json
import pandas as pd
from pathlib import Path

#custom imports
import config
import nab_utils

ROOT_RESULTS=config.NAB_ASSETS / "predictions"
SAVE_DIRECTORY= config.NAB_ASSETS / "processed_results"

with open(config.NAB_ASSETS / "config" / "thresholds.json") as json_file:
    thresholds_file=json.load(json_file)


/app/nab
ROOT_PATH = /app


In [2]:
def get_scores_for_dataset(root_directory, data_group, dataset):
    """ For every model grab the results for the specified database.txt
    Reuturns: A dict with the key being the model name and a dataframe storing the results of each record"""

    anomaly_scores=pd.DataFrame()
    folders=next(os.walk(root_directory))[1] #GET ALL MODEL FOLDERS

    for model in folders:
        filepath=Path(root_directory) / model / data_group / Path(model +"_"+ dataset)
        file_results=pd.read_csv(filepath)
        anomaly_scores[model]=file_results["anomaly_score"]
    
    #add original datetimes and values
    anomaly_scores["timestamp"]=file_results["timestamp"]
    anomaly_scores["value"]=file_results["value"]

    return anomaly_scores

def calculate_threshold_cutoffs(df_scores, profile, thresholds):
    """Get optimal thresholds and apply cutoff
    Takes in the anomaly score dict and a profile to choose from for the config threshold filter
    Also calculates a sum of how many models passed the threshold for each model """
    
    for model in df_scores.columns:
        if model=="timestamp" or model=="value":
            continue
        df_scores[model]=df_scores[model].map(lambda x: x > thresholds[model][profile]["threshold"])
    
    #calculate sums
    item_list = [e for e in list(df_scores) if e not in ('timestamp','value')] #don't add timestamp or value to sum calculation
    df_scores["sum"]=df_scores[item_list].sum(axis=1)

    return df_scores

In [3]:
dataset="art_daily_jumpsdown.csv"
data_group="artificialWithAnomaly"
anomaly_scores=get_scores_for_dataset(ROOT_RESULTS,data_group,dataset)
treshold_scores=calculate_threshold_cutoffs(anomaly_scores.copy(),"standard",thresholds_file)

In [5]:
anomaly_scores=nab_utils.format_dataframe(anomaly_scores)
treshold_scores=nab_utils.format_dataframe(treshold_scores)

In [6]:
joins=anomaly_scores.join(treshold_scores,rsuffix='_threshold')
nab_utils.add_labels_to_dataset(joins,data_group+"/"+dataset)
nab_utils.add_anomaly_window_to_dataset(joins,10)
#treshold_scores[treshold_scores["sum"]>=1]

In [7]:
joins

Unnamed: 0_level_0,bayesChangePt,knncad,random,value,bayesChangePt_threshold,knncad_threshold,random_threshold,value_threshold,sum,anomaly_label,anomaly_window
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2014-04-01 00:00:00,0.0,0.0,0.639427,18.090486,False,False,False,18.090486,0,False,
2014-04-01 00:05:00,0.0,0.0,0.025011,20.359843,False,False,False,20.359843,0,False,
2014-04-01 00:10:00,0.0,0.0,0.275029,21.105470,False,False,False,21.105470,0,False,
2014-04-01 00:15:00,0.0,0.0,0.223211,21.151585,False,False,False,21.151585,0,False,
2014-04-01 00:20:00,0.0,0.0,0.736471,18.137141,False,False,False,18.137141,0,False,
...,...,...,...,...,...,...,...,...,...,...,...
2014-04-14 23:35:00,0.0,0.5,0.408647,18.269290,False,False,False,18.269290,0,False,
2014-04-14 23:40:00,0.0,0.5,0.189021,19.087351,False,False,False,19.087351,0,False,
2014-04-14 23:45:00,0.0,0.5,0.709149,19.594689,False,False,False,19.594689,0,False,
2014-04-14 23:50:00,0.0,0.5,0.789422,19.767817,False,False,False,19.767817,0,False,


In [64]:
def evaluate_performance(dataset, thresholds, window_size, max_thresholds=100):
    df_stats={}
    if len(thresholds) > max_thresholds:
        thresholds=sample(thresholds.tolist(), max_thresholds)

        for threshold in thresholds:
            df_stats[threshold]={}
            df_stats[threshold]["tp"]=0
            df_stats[threshold]["fn"]=0
            df_stats[threshold]["fp"]=0

            predictions=[i for i,v in enumerate(dataset["sum"]>=threshold) if v] #predictions are when the sum at that time surpasses the threshold

            for prediction in predictions:
                if(dataset.loc[dataset.index[prediction], 'anomaly_window']<1):#if the prediction is not insdie an anomaly window
                    df_stats[threshold]["fp"]=df_stats[threshold]["fp"]+1
                else:
                    df_stats[threshold]["tp"]=df_stats[threshold]["tp"]+1
            
            anomalies=[i for i,v in enumerate(dataset["anomaly_label"]==True) if v]
            for anomaly in anomalies: #for each labeled anomaly in the file
                detect=False
                window=range(-window_size,window_size+1) #get window for hits
                for entry in window: # for any entry in the window

                    if(anomaly+entry<0) or (anomaly+entry>len(dataset)): #avoid going out of bounds on the data
                        continue
                    if (dataset.loc[dataset.index[anomaly+entry], 'sum']>threshold): #if the sum at that time surpasses the threshold
                        detect=True #count this anomaly as found
                    
                if (detect==True):
                    df_stats[threshold]["tp"]=df_stats[threshold]["tp"]+1
                else:
                    df_stats[threshold]["fn"]=df_stats[threshold]["fn"]+1
            df_stats[threshold]["tn"]=len(dataset) - (df_stats[threshold]["tp"]+df_stats[threshold]["fp"]+df_stats[threshold]["fn"])+1

            #calculate metrics
            if (df_stats[threshold]["tp"]+df_stats[threshold]["fn"]>0):#if no labels in set we can a division by zero error
                df_stats[threshold]["sensitivity"]=df_stats[threshold]["tp"]/(df_stats[threshold]["tp"]+df_stats[threshold]["fn"])
            else:
                df_stats[threshold]["sensitivity"]=1
            
            df_stats[threshold]["specificitity"]=df_stats[threshold]["tn"]+df_stats[threshold]["fn"]/(df_stats[threshold]["tn"]+df_stats[threshold]["fp"])

            df_results=pd.DataFrame(df_stats)

            return df_results

Unnamed: 0_level_0,bayesChangePt,knncad,random,value,sum
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014-04-01 09:00:00,True,False,False,69.971759,1
2014-04-01 16:30:00,False,False,True,81.581763,1
2014-04-01 18:00:00,True,False,False,32.555578,1
2014-04-01 21:10:00,False,False,True,19.56456,1
2014-04-02 09:00:00,True,False,False,69.917116,1
2014-04-02 18:00:00,True,False,False,33.620965,1
2014-04-02 19:20:00,False,False,True,21.861459,1
2014-04-03 09:00:00,True,False,False,62.965356,1
2014-04-03 12:25:00,False,False,True,87.460645,1
2014-04-03 18:00:00,True,False,False,33.017279,1
