# Scores of injection experiments (**metric version C**)

This rates the first results very high, but last results probably too low.

Printed tables can be pasted at https://www.tablesgenerator.com/

In [1]:
from os import listdir
from os.path import isfile, join
import pickle
import numpy as np
import pprint

In [2]:
results_directory = "../data/results/"
evaluation_data = {}

### Load results

In [3]:
results_meta = {}

# get files and ids
for name in listdir(results_directory):
    path = join(results_directory, name)
    if isfile(path):
        id = name[:-7]
        results_meta[id] = {"result_pickle":path}

# add data
for key in results_meta:
    with open(results_meta[key]["result_pickle"], 'rb') as handle:
        results_meta[key]["data"] = pickle.load(handle)

# print info
for key in results_meta: 
    print(
        key, " ",
        results_meta[key]["result_pickle"], " ",
        type(results_meta[key]["data"]), " ",
        len(results_meta[key]["data"])
    )  

twitter_diff_dist   ../data/results/twitter_diff_dist.pickle   <class 'tuple'>   3
twitter_drift_induction   ../data/results/twitter_drift_induction.pickle   <class 'dict'>   3
amazon_diff_classes   ../data/results/amazon_diff_classes.pickle   <class 'dict'>   3
amazon_same_dist   ../data/results/amazon_same_dist.pickle   <class 'dict'>   3
twitter_same_dist   ../data/results/twitter_same_dist.pickle   <class 'dict'>   3
twitter_diff_classes   ../data/results/twitter_diff_classes.pickle   <class 'dict'>   3
amazon_drift_induction   ../data/results/amazon_drift_induction.pickle   <class 'dict'>   3


### Extract results: twitter_drift_induction

In [4]:
# set config from notebook
permutations = 10
modes = ['bert_768', 'bow_50', 'bow_768']
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
detectors = {
    'csdd': 'CosineSimilarityDriftDetector()',
    'kts' : 'KernelTwoSampleDriftDetector()',
    'aks' : 'AlibiKSDetector()',
    'ammd': 'AlibiMMDDetector()',
    'lsdd': 'AlibiLSDDDetector()',
    'cdbd': 'CDBDDetector()',
}
result_pickle = results_meta["twitter_drift_induction"]["result_pickle"]
try:
    del results
except NameError:
    pass

In [5]:
evaluation_data["twitter"] = {}

try:
    results
except NameError:
    print('Loading result pickle: ', result_pickle)
    with open(result_pickle, 'rb') as handle:
        results = pickle.load(handle)    

for mode in modes:
    print(mode)
    plot_data = {}
       
    for detector in detectors:
        plot_data[detector] = []
        for n in range(len(results[mode][detector]['predictions'][0])):
            nth_entries = [results[mode][detector]['predictions'][i][n] for i in range(permutations)]
            plot_data[detector].append(np.mean(nth_entries))

    evaluation_data["twitter"][mode] = plot_data

Loading result pickle:  ../data/results/twitter_drift_induction.pickle
bert_768
bow_50
bow_768


### Extract results: amazon_drift_induction.ipynb

In [6]:
# set config from notebook
permutations = 10
modes = ['bert_768', 'bow_50', 'bow_768']
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
detectors = {
    'csdd': "",
    'kts' : "",
    'aks' : "",
    'ammd': "",
    'lsdd': "",
    'cdbd': "",
}
result_pickle = results_meta["amazon_drift_induction"]["result_pickle"]
try:
    del results
except NameError:
    pass

In [7]:
evaluation_data["amazon"] = {}

try:
    results
except NameError:
    print('Loading result pickle: ', result_pickle)
    with open(result_pickle, 'rb') as handle:
        results = pickle.load(handle)    

# plot
for mode in modes:
    print(mode)
    plot_data = {}
       
    for detector in detectors:
        plot_data[detector] = []
        for n in range(len(results[mode][detector]['predictions'][0])):
            nth_entries = [results[mode][detector]['predictions'][i][n] for i in range(permutations)]
            plot_data[detector].append(np.mean(nth_entries))
            
    evaluation_data["amazon"][mode] = plot_data

Loading result pickle:  ../data/results/amazon_drift_induction.pickle
bert_768
bow_50
bow_768


### Configuration of conducted experiments

In [8]:
target_percentages = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
negative_words_percentage = np.divide(target_percentages, 2)
print("negative_words_percentage", negative_words_percentage)

negative_words_percentage [0.    0.025 0.05  0.075 0.1   0.125 0.15  0.175 0.2   0.225 0.25  0.275
 0.3   0.325 0.35  0.375 0.4   0.425 0.45  0.475 0.5  ]


### Add comparison detector values

In [9]:
evaluation_data['test'] = {}
evaluation_data['test']['testmode'] = {}

fake_predictions = [0] * 21
fake_predictions[0] = 1
evaluation_data['test']['testmode']['perfect'] = fake_predictions

fake_predictions = [1] * 21
fake_predictions[0] = 0
evaluation_data['test']['testmode']['bad'] = fake_predictions

fake_predictions = negative_words_percentage
fake_predictions[0] = .012
fake_predictions = np.divide(.01, fake_predictions)
evaluation_data['test']['testmode']['fake'] = fake_predictions

### Print overview

In [10]:
for dataset in evaluation_data:
    for mode in evaluation_data[dataset]:
        for detector in evaluation_data[dataset][mode]:
            first = np.round(evaluation_data[dataset][mode][detector][:4], 4)
            last = np.round(evaluation_data[dataset][mode][detector][0:4], 4)
            print(dataset, mode, detector, len(evaluation_data[dataset][mode][detector]), "  ", first, "...", last)

twitter bert_768 csdd 21    [0.9998 0.9998 0.9998 0.9997] ... [0.9998 0.9998 0.9998 0.9997]
twitter bert_768 kts 21    [0.591  0.5846 0.538  0.4364] ... [0.591  0.5846 0.538  0.4364]
twitter bert_768 aks 21    [0.5178 0.5154 0.5094 0.5009] ... [0.5178 0.5154 0.5094 0.5009]
twitter bert_768 ammd 21    [0.547 0.532 0.453 0.347] ... [0.547 0.532 0.453 0.347]
twitter bert_768 lsdd 21    [0.572 0.518 0.47  0.374] ... [0.572 0.518 0.47  0.374]
twitter bert_768 cdbd 21    [0.2411 0.2306 0.2263 0.2172] ... [0.2411 0.2306 0.2263 0.2172]
twitter bow_50 csdd 21    [0.9951 0.9945 0.9934 0.9918] ... [0.9951 0.9945 0.9934 0.9918]
twitter bow_50 kts 21    [0.3424 0.2188 0.0786 0.0164] ... [0.3424 0.2188 0.0786 0.0164]
twitter bow_50 aks 21    [0.4931 0.4776 0.4279 0.3877] ... [0.4931 0.4776 0.4279 0.3877]
twitter bow_50 ammd 21    [0.383 0.22  0.087 0.013] ... [0.383 0.22  0.087 0.013]
twitter bow_50 lsdd 21    [0.443 0.294 0.141 0.049] ... [0.443 0.294 0.141 0.049]
twitter bow_50 cdbd 21    [0.2029 

# Functions for metric scores

In [11]:
def eval_metric_pow(negative_words_percentage, p_values, factor_penalty, power_base, print_info):
    
    # remove first element (no injected drift)
    neg_words_perc = negative_words_percentage[1:]
    p_vals = p_values[1:]
        
    # Scores 0 (bad) to 1 (good)
    scores = []
    for i in range(len(p_vals)):
        scores.append(1 - p_vals[i])
    if(print_info):
        print(" difference score ", round(np.sum(scores), 4))
    
    # No injected drift in first element
    # High penalty for low p-value
    penalty = (1 - p_values[0]) * factor_penalty
    for i in range(len(scores)):
        scores[i] = (scores[i] - penalty)
    if(print_info):
        print(" penalty ", round(penalty, 4), "= 1 -", round(p_values[0], 4))
        print(round(np.sum(scores), 4), "penalty score")
        
    # Earlier drift detection is better (sensitivity)
    sum_rating = 0
    for i in range(len(scores)):
        rating = pow(power_base, 1/neg_words_perc[i])
        sum_rating += rating
        scores[i] = scores[i] * rating
    
    if(print_info):
        print(" scores ", np.round(scores, 2))
        print(" sum_rating ", round(sum_rating, 4))
        print(" result ", round(np.sum(scores) / sum_rating, 4))
    
    return np.sum(scores) / sum_rating


def get_label(name):
    mappings = {
        "cdbd" : "CDBD",
        "csdd" : "CosSim",
        "kts" : "KTS",
        "aks" : "KS",
        "lsdd" : "LSDD",
        "ammd" : "MMD",
        
        "twitterbert_768" : "Twitter, BERT-768",
        "twitterbow_50"   : "Twitter, BoW-50  ",
        "twitterbow_768"  : "Twitter, BoW-768 ",
        "amazonbert_768"  : "Amazon,  BERT-768",
        "amazonbow_50"    : "Amazon,  BoW-50  ",
        "amazonbow_768"   : "Amazon,  BoW-768 ",
    }
    if(name in mappings):
        return mappings[name]
    else:
        print("Unknown:", name)
        return name


def get_sorted_detectors(detector_keys):
    labels = {}
    for detector in detectors:
        labels[get_label(detector)] = detector
    sorted_ = []
    for detector in sorted(labels.keys()):
        sorted_.append(labels[detector])
    return sorted_


def format_float(value):
    rounded = np.round(value, 4)
    if(rounded == 0): # handle -0.0
        rounded = 0
    return (format(rounded, '.4f'))

### Print config

In [12]:
print_textbf = True
best_marker_start = "\\textbf{"
best_marker_end = "}"
best_marker_start = "⬛"
best_marker_end = ""

### Scores (with penalty)

In [13]:
print_scores_with_penalty = True

scores = {}
for dataset in evaluation_data:
    scores[dataset] = {}
    for mode in evaluation_data[dataset]:
        scores[dataset][mode] = {}
        for detector in evaluation_data[dataset][mode]:
            scores[dataset][mode][detector] = eval_metric_pow(negative_words_percentage, evaluation_data[dataset][mode][detector], 1, 2, False)

In [14]:
if(print_scores_with_penalty):
    pprint.pprint(scores)

{'amazon': {'bert_768': {'aks': 0.006043029549983538,
                         'ammd': 0.006000091330780542,
                         'cdbd': 0.0,
                         'csdd': 5.501515712396836e-05,
                         'kts': 0.027200064300705467,
                         'lsdd': 0.018000041219333688},
            'bow_50': {'aks': 0.003643577894274681,
                       'ammd': 0.025000116259637672,
                       'cdbd': 0.0,
                       'csdd': 9.965912989239957e-05,
                       'kts': 0.04300010769702112,
                       'lsdd': 0.08100007228193422},
            'bow_768': {'aks': 0.0007441663713498737,
                        'ammd': 0.004000049987231799,
                        'cdbd': -6.431310448433687e-11,
                        'csdd': 2.1278925353253043e-05,
                        'kts': -0.00979991831936586,
                        'lsdd': 0.021999981726778296}},
 'test': {'testmode': {'bad': -0.9999999999999998,
        

In [15]:
sorted_detectors = get_sorted_detectors(scores["twitter"]["bert_768"])
sorted_datasets = sorted(scores.keys())
sorted_modes = ['bow_50', 'bow_768', 'bert_768']

# Get values to compute means
s = {}
c = {}
sd = {}
cd = {}
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            sd[detector] = sd.get(detector,0) + score
            s[dataset+mode] = s.get(dataset+mode,0) + score
            cd[detector] = cd.get(detector,0) + 1
            c[dataset+mode] = c.get(dataset+mode,0) + 1


            
end=" , "
print("Dataset, Model   ", end=end)
for detector in sorted_detectors:
    #print(get_label(detector), end=end)
    print("{:<6}".format(get_label(detector)), end=end)
print("$\\varnothing$", end=end) # ∅ $\varnothing$
    
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        print()
        print(get_label(dataset+mode), end=end)
        
        max_detector = None
        max_score = -2
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            if(score > max_score):
                max_score = score
                max_detector = detector
                
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            if(print_textbf and detector == max_detector):
                print(best_marker_start, end="") # backslash could not be parsed in latex tool
            print(format_float(score), end="")
            if(print_textbf and detector == max_detector):
                print(best_marker_end, end="")
                
            print("", end=end)
            
        print(format_float(s[dataset+mode]/c[dataset+mode]), end=end)

print()
print('$\\varnothing$     ,       ', end=end) # ∅ $\varnothing$
for detector in sorted_detectors:
    print(format_float(sd[detector]/cd[detector]), end=end)

Dataset, Model    , CDBD   , CosSim , KS     , KTS    , LSDD   , MMD    , $\varnothing$ , 
Amazon,  BoW-50   , 0.0000 , 0.0001 , 0.0036 , 0.0430 , ⬛0.0810 , 0.0250 , 0.0255 , 
Amazon,  BoW-768  , 0.0000 , 0.0000 , 0.0007 , -0.0098 , ⬛0.0220 , 0.0040 , 0.0028 , 
Amazon,  BERT-768 , 0.0000 , 0.0001 , 0.0060 , ⬛0.0272 , 0.0180 , 0.0060 , 0.0095 , 
Twitter, BoW-50   , -0.0115 , 0.0006 , 0.0155 , 0.1236 , 0.1490 , ⬛0.1630 , 0.0734 , 
Twitter, BoW-768  , 0.0304 , 0.0001 , 0.0096 , 0.0878 , 0.1430 , ⬛0.1550 , 0.0710 , 
Twitter, BERT-768 , 0.0104 , 0.0000 , 0.0025 , 0.0064 , ⬛0.0540 , 0.0150 , 0.0147 , 
$\varnothing$     ,        , 0.0049 , 0.0001 , 0.0063 , 0.0464 , 0.0778 , 0.0613 , 

In [16]:
if(False):
    pprint.pprint(s)
    pprint.pprint(c)
    pprint.pprint(sd)
    pprint.pprint(cd)

### Scores (without penalty)

In [17]:
print_scores_without_penalty = True

scoresNoPenalty = {}
for dataset in evaluation_data:
    scoresNoPenalty[dataset] = {}
    for mode in evaluation_data[dataset]:
        scoresNoPenalty[dataset][mode] = {}
        for detector in evaluation_data[dataset][mode]:
            scoresNoPenalty[dataset][mode][detector] = eval_metric_pow(negative_words_percentage, evaluation_data[dataset][mode][detector], 0, 2, False)

In [18]:
if(print_scores_without_penalty):
    pprint.pprint(scoresNoPenalty)

{'amazon': {'bert_768': {'aks': 0.4562623031171832,
                         'ammd': 0.26400009133078056,
                         'cdbd': 0.9999999999999998,
                         'csdd': 0.0007308126195873471,
                         'kts': 0.30640006430070543,
                         'lsdd': 0.29700007018719105},
            'bow_50': {'aks': 0.4878900106754941,
                       'ammd': 0.4940001162596377,
                       'cdbd': 0.9999999999999998,
                       'csdd': 0.0027174951284275554,
                       'kts': 0.46320010769702114,
                       'lsdd': 0.4660000910573973},
            'bow_768': {'aks': 0.48791039228153654,
                        'ammd': 0.4470000499872317,
                        'cdbd': 0.9999999999272705,
                        'csdd': 0.0046895147041618465,
                        'kts': 0.5462000816806339,
                        'lsdd': 0.5289999914423353}},
 'test': {'testmode': {'bad': 0.0,
                 

In [19]:
sorted_detectors = get_sorted_detectors(scores["twitter"]["bert_768"])
sorted_datasets = sorted(scores.keys())
sorted_modes = ['bow_50', 'bow_768', 'bert_768']

# Get values to compute means
s = {}
c = {}
sd = {}
cd = {}
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        for detector in sorted_detectors:
            score = scoresNoPenalty[dataset][mode][detector]
            sd[detector] = sd.get(detector,0) + score
            s[dataset+mode] = s.get(dataset+mode,0) + score
            cd[detector] = cd.get(detector,0) + 1
            c[dataset+mode] = c.get(dataset+mode,0) + 1


            
end=" , "
print("Dataset, Model   ", end=end)
for detector in sorted_detectors:
    print("{:<6}".format(get_label(detector)), end=end)
print("$\\varnothing$", end=end) # ∅ $\varnothing$
    
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        print()
        print(get_label(dataset+mode), end=end)
        
        max_detector = None
        max_score = -2
        for detector in sorted_detectors:
            if(detector == "cdbd"): # exclude from max scores
                continue
            score = scoresNoPenalty[dataset][mode][detector]
            if(score > max_score):
                max_score = score
                max_detector = detector
        
        for detector in sorted_detectors:
            score = scoresNoPenalty[dataset][mode][detector]
            if(print_textbf and detector == max_detector):
                print(best_marker_start, end="") # backslash could not be parsed in latex tool
            print(format_float(score), end="")
            if(print_textbf and detector == max_detector):
                print(best_marker_end, end="")
            print("", end=end)
            
        print(format_float(s[dataset+mode]/c[dataset+mode]), end=end)

print()
print('$\\varnothing$     ,       ', end=end) # ∅ $\varnothing$
for detector in sorted_detectors:
    print(format_float(sd[detector]/cd[detector]), end=end)

Dataset, Model    , CDBD   , CosSim , KS     , KTS    , LSDD   , MMD    , $\varnothing$ , 
Amazon,  BoW-50   , 1.0000 , 0.0027 , 0.4879 , 0.4632 , 0.4660 , ⬛0.4940 , 0.4856 , 
Amazon,  BoW-768  , 1.0000 , 0.0047 , 0.4879 , ⬛0.5462 , 0.5290 , 0.4470 , 0.5025 , 
Amazon,  BERT-768 , 1.0000 , 0.0007 , ⬛0.4563 , 0.3064 , 0.2970 , 0.2640 , 0.3874 , 
Twitter, BoW-50   , 0.7856 , 0.0055 , 0.5224 , ⬛0.7812 , 0.7060 , 0.7800 , 0.5968 , 
Twitter, BoW-768  , 0.7454 , 0.0765 , 0.5111 , 0.6228 , 0.7130 , ⬛0.7190 , 0.5646 , 
Twitter, BERT-768 , 0.7694 , 0.0002 , ⬛0.4846 , 0.4154 , 0.4820 , 0.4680 , 0.4366 , 
$\varnothing$     ,        , 0.8834 , 0.0151 , 0.4917 , 0.5225 , 0.5322 , 0.5287 , 

In [20]:
if(False):
    pprint.pprint(s)
    pprint.pprint(c)
    pprint.pprint(sd)
    pprint.pprint(cd)