# Scores of injection experiments

In [1]:
from os import listdir
from os.path import isfile, join
import pickle
import numpy as np
import pprint

In [2]:
results_directory = "../data/results/"
evaluation_data = {}

### Load results

In [3]:
results_meta = {}

# get files and ids
for name in listdir(results_directory):
    path = join(results_directory, name)
    if isfile(path):
        id = name[:-7]
        results_meta[id] = {"result_pickle":path}

# add data
for key in results_meta:
    with open(results_meta[key]["result_pickle"], 'rb') as handle:
        results_meta[key]["data"] = pickle.load(handle)

# print info
for key in results_meta: 
    print(
        key, " ",
        results_meta[key]["result_pickle"], " ",
        type(results_meta[key]["data"]), " ",
        len(results_meta[key]["data"])
    )  

twitter_diff_dist   ../data/results/twitter_diff_dist.pickle   <class 'tuple'>   3
twitter_drift_induction   ../data/results/twitter_drift_induction.pickle   <class 'dict'>   3
amazon_diff_classes   ../data/results/amazon_diff_classes.pickle   <class 'dict'>   3
amazon_same_dist   ../data/results/amazon_same_dist.pickle   <class 'dict'>   3
twitter_same_dist   ../data/results/twitter_same_dist.pickle   <class 'dict'>   3
twitter_diff_classes   ../data/results/twitter_diff_classes.pickle   <class 'dict'>   3
amazon_drift_induction   ../data/results/amazon_drift_induction.pickle   <class 'dict'>   3


### Extract results: twitter_drift_induction

In [4]:
# set config from notebook
permutations = 10
modes = ['bert_768', 'bow_50', 'bow_768']
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
detectors = {
    'csdd': 'CosineSimilarityDriftDetector()',
    'kts' : 'KernelTwoSampleDriftDetector()',
    'aks' : 'AlibiKSDetector()',
    'ammd': 'AlibiMMDDetector()',
    'lsdd': 'AlibiLSDDDetector()',
    'cdbd': 'CDBDDetector()',
}
result_pickle = results_meta["twitter_drift_induction"]["result_pickle"]
try:
    del results
except NameError:
    pass

In [5]:
evaluation_data["twitter"] = {}

try:
    results
except NameError:
    print('Loading result pickle: ', result_pickle)
    with open(result_pickle, 'rb') as handle:
        results = pickle.load(handle)    

for mode in modes:
    print(mode)
    plot_data = {}
       
    for detector in detectors:
        plot_data[detector] = []
        for n in range(len(results[mode][detector]['predictions'][0])):
            nth_entries = [results[mode][detector]['predictions'][i][n] for i in range(permutations)]
            plot_data[detector].append(np.mean(nth_entries))

    evaluation_data["twitter"][mode] = plot_data

Loading result pickle:  ../data/results/twitter_drift_induction.pickle
bert_768
bow_50
bow_768


### Extract results: amazon_drift_induction.ipynb

In [6]:
# set config from notebook
permutations = 10
modes = ['bert_768', 'bow_50', 'bow_768']
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
detectors = {
    'csdd': "",
    'kts' : "",
    'aks' : "",
    'ammd': "",
    'lsdd': "",
    'cdbd': "",
}
result_pickle = results_meta["amazon_drift_induction"]["result_pickle"]
try:
    del results
except NameError:
    pass

In [7]:
evaluation_data["amazon"] = {}

try:
    results
except NameError:
    print('Loading result pickle: ', result_pickle)
    with open(result_pickle, 'rb') as handle:
        results = pickle.load(handle)    

# plot
for mode in modes:
    print(mode)
    plot_data = {}
       
    for detector in detectors:
        plot_data[detector] = []
        for n in range(len(results[mode][detector]['predictions'][0])):
            nth_entries = [results[mode][detector]['predictions'][i][n] for i in range(permutations)]
            plot_data[detector].append(np.mean(nth_entries))
            
    evaluation_data["amazon"][mode] = plot_data

Loading result pickle:  ../data/results/amazon_drift_induction.pickle
bert_768
bow_50
bow_768


### Configuration of conducted experiments

In [8]:
target_percentages = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
negative_words_percentage = np.divide(target_percentages, 2)
print("negative_words_percentage", negative_words_percentage)

negative_words_percentage [0.    0.025 0.05  0.075 0.1   0.125 0.15  0.175 0.2   0.225 0.25  0.275
 0.3   0.325 0.35  0.375 0.4   0.425 0.45  0.475 0.5  ]


### Add comparison detector values

In [9]:
evaluation_data['test'] = {}
evaluation_data['test']['testmode'] = {}

fake_predictions = [0] * 21
fake_predictions[0] = 1
evaluation_data['test']['testmode']['perfect'] = fake_predictions

fake_predictions = [1] * 21
fake_predictions[0] = 0
evaluation_data['test']['testmode']['bad'] = fake_predictions

fake_predictions = negative_words_percentage
fake_predictions[0] = .012
fake_predictions = np.divide(.01, fake_predictions)
evaluation_data['test']['testmode']['fake'] = fake_predictions

### Print overview

In [10]:
for dataset in evaluation_data:
    for mode in evaluation_data[dataset]:
        for detector in evaluation_data[dataset][mode]:
            first = np.round(evaluation_data[dataset][mode][detector][:4], 4)
            last = np.round(evaluation_data[dataset][mode][detector][0:4], 4)
            print(dataset, mode, detector, len(evaluation_data[dataset][mode][detector]), "  ", first, "...", last)

twitter bert_768 csdd 21    [0.9998 0.9998 0.9998 0.9997] ... [0.9998 0.9998 0.9998 0.9997]
twitter bert_768 kts 21    [0.591  0.5846 0.538  0.4364] ... [0.591  0.5846 0.538  0.4364]
twitter bert_768 aks 21    [0.5178 0.5154 0.5094 0.5009] ... [0.5178 0.5154 0.5094 0.5009]
twitter bert_768 ammd 21    [0.547 0.532 0.453 0.347] ... [0.547 0.532 0.453 0.347]
twitter bert_768 lsdd 21    [0.572 0.518 0.47  0.374] ... [0.572 0.518 0.47  0.374]
twitter bert_768 cdbd 21    [0.2411 0.2306 0.2263 0.2172] ... [0.2411 0.2306 0.2263 0.2172]
twitter bow_50 csdd 21    [0.9951 0.9945 0.9934 0.9918] ... [0.9951 0.9945 0.9934 0.9918]
twitter bow_50 kts 21    [0.3424 0.2188 0.0786 0.0164] ... [0.3424 0.2188 0.0786 0.0164]
twitter bow_50 aks 21    [0.4931 0.4776 0.4279 0.3877] ... [0.4931 0.4776 0.4279 0.3877]
twitter bow_50 ammd 21    [0.383 0.22  0.087 0.013] ... [0.383 0.22  0.087 0.013]
twitter bow_50 lsdd 21    [0.443 0.294 0.141 0.049] ... [0.443 0.294 0.141 0.049]
twitter bow_50 cdbd 21    [0.2029 

# Functions for metric scores

In [11]:
def eval_metric_ext(negative_words_percentage, p_values, factor_penalty, print_info):
    
    # remove first element (no injected drift)
    neg_words_perc = negative_words_percentage[1:]
    p_vals = p_values[1:]
        
    # Scores 0 (bad) to 1 (good)
    scores = []
    for i in range(len(p_vals)):
        scores.append(1 - p_vals[i])
    if(print_info):
        print(" difference score ", round(numpy.sum(scores), 4))
    
    # No injected drift in first element
    # High penalty for low p-value
    penalty = (1 - p_values[0]) * factor_penalty
    for i in range(len(scores)):
        scores[i] = (scores[i] - penalty)
    if(print_info):
        print(" penalty ", round(penalty, 4), "= 1 -", round(p_values[0], 4))
        print(round(numpy.sum(scores), 4), "penalty score")
        
    # Earlier drift detection is better (sensitivity)
    sum_rating = 0
    for i in range(len(scores)):
        rating = 1/neg_words_perc[i]
        sum_rating += rating
        scores[i] = scores[i] * rating
    
    if(print_info):
        print(" scores ", numpy.round(scores, 2))
        print(" sum_rating ", round(sum_rating, 4))
        print(" result ", round(numpy.sum(scores) / sum_rating, 4))
    
    return np.sum(scores) / sum_rating


def get_label(name):
    mappings = {
        "cdbd" : "CDBD",
        "csdd" : "CosSim",
        "kts" : "KTS",
        "aks" : "KS",
        "lsdd" : "LSDD",
        "ammd" : "MMD",
        
        "twitterbert_768" : "Twitter, BERT-768",
        "twitterbow_50" : "Twitter, BoW-50",
        "twitterbow_768" : "Twitter, BoW-768",
        "amazonbert_768" : "Amazon, BERT-768",
        "amazonbow_50" : "Amazon, BoW-50",
        "amazonbow_768" : "Amazon, BoW-768",
    }
    if(name in mappings):
        return mappings[name]
    else:
        print("Unknown:", name)
        return name


def get_sorted_detectors(detector_keys):
    labels = {}
    for detector in detectors:
        labels[get_label(detector)] = detector
    sorted_ = []
    for detector in sorted(labels.keys()):
        sorted_.append(labels[detector])
    return sorted_


def format_float(value):
    rounded = np.round(value, 4)
    if(rounded == 0): # handle -0.0
        rounded = 0
    return (format(rounded, '.4f'))

### Scores (with penalty)

In [12]:
print_scores_with_penalty = True

scores = {}
for dataset in evaluation_data:
    scores[dataset] = {}
    for mode in evaluation_data[dataset]:
        scores[dataset][mode] = {}
        for detector in evaluation_data[dataset][mode]:
            scores[dataset][mode][detector] = eval_metric_ext(negative_words_percentage, evaluation_data[dataset][mode][detector], 1, False)

In [13]:
if(print_scores_with_penalty):
    pprint.pprint(scores)

{'amazon': {'bert_768': {'aks': 0.1038891377276589,
                         'ammd': 0.35207229452446626,
                         'cdbd': 0.0,
                         'csdd': 0.0009419449035998787,
                         'kts': 0.3470358487863721,
                         'lsdd': 0.32282094194420985},
            'bow_50': {'aks': 0.08493507904662598,
                       'ammd': 0.30006730517979396,
                       'cdbd': 0.0,
                       'csdd': 0.00322754592606801,
                       'kts': 0.3305730797663515,
                       'lsdd': 0.35501300728480323},
            'bow_768': {'aks': 0.031178169007641,
                        'ammd': 0.224893615182149,
                        'cdbd': -8.00317837646986e-11,
                        'csdd': 0.000788754483987109,
                        'kts': 0.20467471119561784,
                        'lsdd': 0.09724017361297943}},
 'test': {'testmode': {'bad': -1.0000000000000002,
                       'fake': 

In [14]:
sorted_detectors = get_sorted_detectors(scores["twitter"]["bert_768"])
sorted_datasets = sorted(scores.keys())
sorted_modes = ['bow_50', 'bow_768', 'bert_768']

# Get values to compute means
s = {}
c = {}
sd = {}
cd = {}
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            sd[detector] = sd.get(detector,0) + score
            s[dataset+mode] = s.get(dataset+mode,0) + score
            cd[detector] = cd.get(detector,0) + 1
            c[dataset+mode] = c.get(dataset+mode,0) + 1


            
end=" , "
print("Dataset, Model", end=end)
for detector in sorted_detectors:
    print(get_label(detector), end=end)
print("$\\varnothing$", end=end) # ∅ $\varnothing$
    
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        print()
        print(get_label(dataset+mode), end=end)
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            print(format_float(score), end=end)
            
        print(format_float(s[dataset+mode]/c[dataset+mode]), end=end)

print()
print('$\\varnothing$     ,        ', end=end) # ∅ $\varnothing$
for detector in sorted_detectors:
    print(format_float(sd[detector]/cd[detector]), end=end)

Dataset, Model , CDBD , CosSim , KS , KTS , LSDD , MMD , $\varnothing$ , 
Amazon, BoW-50 , 0.0000 , 0.0032 , 0.0849 , 0.3306 , 0.3550 , 0.3001 , 0.1790 , 
Amazon, BoW-768 , 0.0000 , 0.0008 , 0.0312 , 0.2047 , 0.0972 , 0.2249 , 0.0931 , 
Amazon, BERT-768 , 0.0000 , 0.0009 , 0.1039 , 0.3470 , 0.3228 , 0.3521 , 0.1878 , 
Twitter, BoW-50 , -0.0240 , 0.0191 , 0.1790 , 0.2681 , 0.3343 , 0.3084 , 0.1808 , 
Twitter, BoW-768 , -0.0107 , 0.0588 , 0.1493 , 0.3368 , 0.2826 , 0.3340 , 0.1918 , 
Twitter, BERT-768 , 0.0406 , 0.0002 , 0.0681 , 0.2749 , 0.2832 , 0.2753 , 0.1571 , 
$\varnothing$     ,         , 0.0010 , 0.0138 , 0.1027 , 0.2937 , 0.2792 , 0.2991 , 

In [15]:
if(False):
    pprint.pprint(s)
    pprint.pprint(c)
    pprint.pprint(sd)
    pprint.pprint(cd)

### Scores (without penalty)

In [16]:
print_scores_without_penalty = True

scoresNoPenalty = {}
for dataset in evaluation_data:
    scoresNoPenalty[dataset] = {}
    for mode in evaluation_data[dataset]:
        scoresNoPenalty[dataset][mode] = {}
        for detector in evaluation_data[dataset][mode]:
            scoresNoPenalty[dataset][mode][detector] = eval_metric_ext(negative_words_percentage, evaluation_data[dataset][mode][detector], 0, False)

In [17]:
if(print_scores_without_penalty):
    pprint.pprint(scoresNoPenalty)

{'amazon': {'bert_768': {'aks': 0.5541084112948587,
                         'ammd': 0.6100722945244665,
                         'cdbd': 1.0000000000000002,
                         'csdd': 0.001617742366063258,
                         'kts': 0.6262358487863721,
                         'lsdd': 0.6018209709120674},
            'bow_50': {'aks': 0.5691815118278455,
                       'ammd': 0.7690673051797943,
                       'cdbd': 1.0000000000000002,
                       'csdd': 0.005845381924603167,
                       'kts': 0.7507730797663515,
                       'lsdd': 0.7400130260602666},
            'bow_768': {'aks': 0.5183443949178279,
                        'ammd': 0.6678936151821491,
                        'cdbd': 0.9999999999115522,
                        'csdd': 0.005456990262795705,
                        'kts': 0.7606747111956179,
                        'lsdd': 0.6042401833285367}},
 'test': {'testmode': {'bad': 0.0,
                       'f

In [18]:
sorted_detectors = get_sorted_detectors(scores["twitter"]["bert_768"])
sorted_datasets = sorted(scores.keys())
sorted_modes = ['bow_50', 'bow_768', 'bert_768']

# Get values to compute means
s = {}
c = {}
sd = {}
cd = {}
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        for detector in sorted_detectors:
            score = scoresNoPenalty[dataset][mode][detector]
            sd[detector] = sd.get(detector,0) + score
            s[dataset+mode] = s.get(dataset+mode,0) + score
            cd[detector] = cd.get(detector,0) + 1
            c[dataset+mode] = c.get(dataset+mode,0) + 1


            
end=" , "
print("Dataset, Model", end=end)
for detector in sorted_detectors:
    print(get_label(detector), end=end)
print("$\\varnothing$", end=end) # ∅ $\varnothing$
    
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        print()
        print(get_label(dataset+mode), end=end)
        for detector in sorted_detectors:
            score = scoresNoPenalty[dataset][mode][detector]
            print(format_float(score), end=end)
            
        print(format_float(s[dataset+mode]/c[dataset+mode]), end=end)

print()
print('$\\varnothing$     ,        ', end=end) # ∅ $\varnothing$
for detector in sorted_detectors:
    print(format_float(sd[detector]/cd[detector]), end=end)

Dataset, Model , CDBD , CosSim , KS , KTS , LSDD , MMD , $\varnothing$ , 
Amazon, BoW-50 , 1.0000 , 0.0058 , 0.5692 , 0.7508 , 0.7400 , 0.7691 , 0.6391 , 
Amazon, BoW-768 , 1.0000 , 0.0055 , 0.5183 , 0.7607 , 0.6042 , 0.6679 , 0.5928 , 
Amazon, BERT-768 , 1.0000 , 0.0016 , 0.5541 , 0.6262 , 0.6018 , 0.6101 , 0.5656 , 
Twitter, BoW-50 , 0.7731 , 0.0240 , 0.6858 , 0.9257 , 0.8913 , 0.9254 , 0.7042 , 
Twitter, BoW-768 , 0.7043 , 0.1351 , 0.6507 , 0.8718 , 0.8526 , 0.8980 , 0.6854 , 
Twitter, BERT-768 , 0.7995 , 0.0004 , 0.5503 , 0.6839 , 0.7112 , 0.7283 , 0.5789 , 
$\varnothing$     ,         , 0.8795 , 0.0287 , 0.5881 , 0.7698 , 0.7335 , 0.7665 , 

In [19]:
if(False):
    pprint.pprint(s)
    pprint.pprint(c)
    pprint.pprint(sd)
    pprint.pprint(cd)