# Scores of injection experiments

In [1]:
from os import listdir
from os.path import isfile, join
import pickle
import numpy as np
import pprint

In [2]:
results_directory = "../data/results/"
evaluation_data = {}

### Load results

In [3]:
results_meta = {}

# get files and ids
for name in listdir(results_directory):
    path = join(results_directory, name)
    if isfile(path):
        id = name[:-7]
        results_meta[id] = {"result_pickle":path}

# add data
for key in results_meta:
    with open(results_meta[key]["result_pickle"], 'rb') as handle:
        results_meta[key]["data"] = pickle.load(handle)

# print info
for key in results_meta: 
    print(
        key, " ",
        results_meta[key]["result_pickle"], " ",
        type(results_meta[key]["data"]), " ",
        len(results_meta[key]["data"])
    )  

twitter_diff_dist   ../data/results/twitter_diff_dist.pickle   <class 'tuple'>   3
twitter_drift_induction   ../data/results/twitter_drift_induction.pickle   <class 'dict'>   3
amazon_diff_classes   ../data/results/amazon_diff_classes.pickle   <class 'dict'>   3
amazon_same_dist   ../data/results/amazon_same_dist.pickle   <class 'dict'>   3
twitter_same_dist   ../data/results/twitter_same_dist.pickle   <class 'dict'>   3
twitter_diff_classes   ../data/results/twitter_diff_classes.pickle   <class 'dict'>   3
amazon_drift_induction   ../data/results/amazon_drift_induction.pickle   <class 'dict'>   3


### Extract results: twitter_drift_induction

In [4]:
# set config from notebook
permutations = 10
modes = ['bert_768', 'bow_50', 'bow_768']
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
detectors = {
    'csdd': 'CosineSimilarityDriftDetector()',
    'kts' : 'KernelTwoSampleDriftDetector()',
    'aks' : 'AlibiKSDetector()',
    'ammd': 'AlibiMMDDetector()',
    'lsdd': 'AlibiLSDDDetector()',
    'cdbd': 'CDBDDetector()',
}
result_pickle = results_meta["twitter_drift_induction"]["result_pickle"]
try:
    del results
except NameError:
    pass

In [5]:
evaluation_data["twitter"] = {}

try:
    results
except NameError:
    print('Loading result pickle: ', result_pickle)
    with open(result_pickle, 'rb') as handle:
        results = pickle.load(handle)    

for mode in modes:
    print(mode)
    plot_data = {}
       
    for detector in detectors:
        plot_data[detector] = []
        for n in range(len(results[mode][detector]['predictions'][0])):
            nth_entries = [results[mode][detector]['predictions'][i][n] for i in range(permutations)]
            plot_data[detector].append(np.mean(nth_entries))

    evaluation_data["twitter"][mode] = plot_data

Loading result pickle:  ../data/results/twitter_drift_induction.pickle
bert_768
bow_50
bow_768


### Extract results: amazon_drift_induction.ipynb

In [6]:
# set config from notebook
permutations = 10
modes = ['bert_768', 'bow_50', 'bow_768']
target_percentages = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
detectors = {
    'csdd': "",
    'kts' : "",
    'aks' : "",
    'ammd': "",
    'lsdd': "",
    'cdbd': "",
}
result_pickle = results_meta["amazon_drift_induction"]["result_pickle"]
try:
    del results
except NameError:
    pass

In [7]:
evaluation_data["amazon"] = {}

try:
    results
except NameError:
    print('Loading result pickle: ', result_pickle)
    with open(result_pickle, 'rb') as handle:
        results = pickle.load(handle)    

# plot
for mode in modes:
    print(mode)
    plot_data = {}
       
    for detector in detectors:
        plot_data[detector] = []
        for n in range(len(results[mode][detector]['predictions'][0])):
            nth_entries = [results[mode][detector]['predictions'][i][n] for i in range(permutations)]
            plot_data[detector].append(np.mean(nth_entries))
            
    evaluation_data["amazon"][mode] = plot_data

Loading result pickle:  ../data/results/amazon_drift_induction.pickle
bert_768
bow_50
bow_768


### Configuration of conducted experiments

In [8]:
target_percentages = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
negative_words_percentage = np.divide(target_percentages, 2)
print("negative_words_percentage", negative_words_percentage)

negative_words_percentage [0.    0.025 0.05  0.075 0.1   0.125 0.15  0.175 0.2   0.225 0.25  0.275
 0.3   0.325 0.35  0.375 0.4   0.425 0.45  0.475 0.5  ]


### Add comparison detector values

In [9]:
evaluation_data['test'] = {}
evaluation_data['test']['testmode'] = {}

fake_predictions = [0] * 21
fake_predictions[0] = 1
evaluation_data['test']['testmode']['perfect'] = fake_predictions

fake_predictions = [1] * 21
fake_predictions[0] = 0
evaluation_data['test']['testmode']['bad'] = fake_predictions

fake_predictions = negative_words_percentage
fake_predictions[0] = .012
fake_predictions = np.divide(.01, fake_predictions)
evaluation_data['test']['testmode']['fake'] = fake_predictions

### Print overview

In [10]:
for dataset in evaluation_data:
    for mode in evaluation_data[dataset]:
        for detector in evaluation_data[dataset][mode]:
            first = np.round(evaluation_data[dataset][mode][detector][:4], 4)
            last = np.round(evaluation_data[dataset][mode][detector][0:4], 4)
            print(dataset, mode, detector, len(evaluation_data[dataset][mode][detector]), "  ", first, "...", last)

twitter bert_768 csdd 21    [0.9998 0.9998 0.9998 0.9997] ... [0.9998 0.9998 0.9998 0.9997]
twitter bert_768 kts 21    [0.591  0.5846 0.538  0.4364] ... [0.591  0.5846 0.538  0.4364]
twitter bert_768 aks 21    [0.5178 0.5154 0.5094 0.5009] ... [0.5178 0.5154 0.5094 0.5009]
twitter bert_768 ammd 21    [0.547 0.532 0.453 0.347] ... [0.547 0.532 0.453 0.347]
twitter bert_768 lsdd 21    [0.572 0.518 0.47  0.374] ... [0.572 0.518 0.47  0.374]
twitter bert_768 cdbd 21    [0.8219 0.8201 0.8238 0.7922] ... [0.8219 0.8201 0.8238 0.7922]
twitter bow_50 csdd 21    [0.9951 0.9945 0.9934 0.9918] ... [0.9951 0.9945 0.9934 0.9918]
twitter bow_50 kts 21    [0.3424 0.2188 0.0786 0.0164] ... [0.3424 0.2188 0.0786 0.0164]
twitter bow_50 aks 21    [0.4931 0.4776 0.4279 0.3877] ... [0.4931 0.4776 0.4279 0.3877]
twitter bow_50 ammd 21    [0.383 0.22  0.087 0.013] ... [0.383 0.22  0.087 0.013]
twitter bow_50 lsdd 21    [0.443 0.294 0.141 0.049] ... [0.443 0.294 0.141 0.049]
twitter bow_50 cdbd 21    [0.5944 

# Functions for metric scores

In [11]:
def eval_metric_ext(negative_words_percentage, p_values, factor_penalty, print_info):
    
    # remove first element (no injected drift)
    neg_words_perc = negative_words_percentage[1:]
    p_vals = p_values[1:]
        
    # Scores 0 (bad) to 1 (good)
    scores = []
    for i in range(len(p_vals)):
        scores.append(1 - p_vals[i])
    if(print_info):
        print(" difference score ", round(numpy.sum(scores), 4))
    
    # No injected drift in first element
    # High penalty for low p-value
    penalty = (1 - p_values[0]) * factor_penalty
    for i in range(len(scores)):
        scores[i] = (scores[i] - penalty)
    if(print_info):
        print(" penalty ", round(penalty, 4), "= 1 -", round(p_values[0], 4))
        print(round(numpy.sum(scores), 4), "penalty score")
        
    # Earlier drift detection is better (sensitivity)
    sum_rating = 0
    for i in range(len(scores)):
        rating = 1/neg_words_perc[i]
        sum_rating += rating
        scores[i] = scores[i] * rating
    
    if(print_info):
        print(" scores ", numpy.round(scores, 2))
        print(" sum_rating ", round(sum_rating, 4))
        print(" result ", round(numpy.sum(scores) / sum_rating, 4))
    
    return np.sum(scores) / sum_rating


def get_label(name):
    mappings = {
        "cdbd" : "CDBD",
        "csdd" : "CosSim",
        "kts" : "KTS/MMD",
        "aks" : "KS",
        "lsdd" : "LSDD",
        "ammd" : "MMD",
        
        "twitterbert_768" : "Twitter, BERT-768",
        "twitterbow_50"   : "Twitter, BoW-50  ",
        "twitterbow_768"  : "Twitter, BoW-768 ",
        "amazonbert_768"  : "Amazon,  BERT-768",
        "amazonbow_50"    : "Amazon,  BoW-50  ",
        "amazonbow_768"   : "Amazon,  BoW-768 ",
    }
    if(name in mappings):
        return mappings[name]
    else:
        print("Unknown:", name)
        return name


def get_sorted_detectors(detector_keys):
    labels = {}
    for detector in detectors:
        labels[get_label(detector)] = detector
    sorted_ = []
    for detector in sorted(labels.keys()):
        sorted_.append(labels[detector])
    return sorted_


def format_float(value):
    rounded = np.round(value, 4)
    if(rounded == 0): # handle -0.0
        rounded = 0
    return (format(rounded, '.4f'))

# Configuration

In [12]:
max_index = 20 # 20: all, 12: to 0.3
print("Maximim index for evaluation:", max_index, "which is percentage", negative_words_percentage[max_index])

print_textbf = True
#best_marker_start = "\\textbf{" 
#best_marker_start = "BSLSHBSLSHtextbf{" # backslashes could not be parsed in online tool
#best_marker_end = "}"
best_marker_start = ""
best_marker_end = "⬛"

Maximim index for evaluation: 20 which is percentage 0.5


### Scores (with penalty)

In [13]:
print_scores_with_penalty = True
alpha = 1

scores = {}
for dataset in evaluation_data:
    scores[dataset] = {}
    for mode in evaluation_data[dataset]:
        scores[dataset][mode] = {}
        for detector in evaluation_data[dataset][mode]:
            scores[dataset][mode][detector] = eval_metric_ext(negative_words_percentage[:max_index], evaluation_data[dataset][mode][detector][:max_index], alpha, False)

In [14]:
if(print_scores_with_penalty):
    pprint.pprint(scores)

{'amazon': {'bert_768': {'aks': 0.10018429189047545,
                         'ammd': 0.34657685597543997,
                         'cdbd': 0.0705508117628693,
                         'csdd': 0.0008739953374681871,
                         'kts': 0.34179639793680017,
                         'lsdd': 0.3172092135260019},
            'bow_50': {'aks': 0.08151503231844369,
                       'ammd': 0.29681265972751,
                       'cdbd': 0.20156011591095302,
                       'csdd': 0.002978010641077654,
                       'kts': 0.32708879196445717,
                       'lsdd': 0.3513488859136617},
            'bow_768': {'aks': 0.029287486922476864,
                        'ammd': 0.2202271737741508,
                        'cdbd': 0.0977993823393399,
                        'csdd': 0.0007262213844526468,
                        'kts': 0.2013299718440915,
                        'lsdd': 0.09291686007044979}},
 'test': {'testmode': {'bad': -1.0000000000000002,


In [15]:
sorted_detectors = get_sorted_detectors(scores["twitter"]["bert_768"])
sorted_detectors.remove("csdd") # Remove CosSim
sorted_detectors.remove("ammd") # Remove MMD
sorted_datasets = sorted(scores.keys())
sorted_modes = ['bow_50', 'bow_768', 'bert_768']

print("alpha", alpha)
print("max_index", max_index)

# Get values to compute means
s = {}
c = {}
sd = {}
cd = {}
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            sd[detector] = sd.get(detector,0) + score
            cd[detector] = cd.get(detector,0) + 1
            #if(detector == "cdbd" or detector == "csdd"): # Exclude detectors from mode mean
            #    print("Skipping", detector, "for mode mean")
            #    continue
            s[dataset+mode] = s.get(dataset+mode,0) + score
            c[dataset+mode] = c.get(dataset+mode,0) + 1


            
end=" , "
print("Dataset, Model   ", end=end)
for detector in sorted_detectors:
    print(get_label(detector), end=end)
print("$\\varnothing$", end=end) # ∅ $\varnothing$
    
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        print()
        print(get_label(dataset+mode), end=end)
        
        max_detector = None
        max_score = -2
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            if(score > max_score):
                max_score = score
                max_detector = detector
                
        for detector in sorted_detectors:
            score = scores[dataset][mode][detector]
            if(print_textbf and detector == max_detector):
                print(best_marker_start, end="")
            print(format_float(score), end="")
            if(print_textbf and detector == max_detector):
                print(best_marker_end, end="")
            print("", end=end)
            
        print(format_float(s[dataset+mode]/c[dataset+mode]), end=end)

print()
print('$\\varnothing$     ,       ', end=end) # ∅ $\varnothing$
for detector in sorted_detectors:
    print(format_float(sd[detector]/cd[detector]), end=end)

alpha 1
max_index 20
Dataset, Model    , CDBD , KS , KTS/MMD , LSDD , $\varnothing$ , 
Amazon,  BoW-50   , 0.2016 , 0.0815 , 0.3271 , 0.3513⬛ , 0.2404 , 
Amazon,  BoW-768  , 0.0978 , 0.0293 , 0.2013⬛ , 0.0929 , 0.1053 , 
Amazon,  BERT-768 , 0.0706 , 0.1002 , 0.3418⬛ , 0.3172 , 0.2074 , 
Twitter, BoW-50   , 0.0145 , 0.1750 , 0.2671 , 0.3328⬛ , 0.1974 , 
Twitter, BoW-768  , 0.1236 , 0.1448 , 0.3350⬛ , 0.2806 , 0.2210 , 
Twitter, BERT-768 , 0.0727 , 0.0648 , 0.2704 , 0.2792⬛ , 0.1718 , 
$\varnothing$     ,        , 0.0968 , 0.0993 , 0.2905 , 0.2757 , 

In [16]:
if(False):
    pprint.pprint(s)
    pprint.pprint(c)
    pprint.pprint(sd)
    pprint.pprint(cd)

### Scores (without penalty)

In [17]:
print_scores_without_penalty = True
alpha = 0

scoresNoPenalty = {}
for dataset in evaluation_data:
    scoresNoPenalty[dataset] = {}
    for mode in evaluation_data[dataset]:
        scoresNoPenalty[dataset][mode] = {}
        for detector in evaluation_data[dataset][mode]:
            scoresNoPenalty[dataset][mode][detector] = eval_metric_ext(negative_words_percentage[:max_index], evaluation_data[dataset][mode][detector][:max_index], alpha, False)

In [18]:
if(print_scores_without_penalty):
    pprint.pprint(scoresNoPenalty)

{'amazon': {'bert_768': {'aks': 0.5504035654576752,
                         'ammd': 0.6045768559754402,
                         'cdbd': 0.24527233704484538,
                         'csdd': 0.0015497927999315662,
                         'kts': 0.6209963979368003,
                         'lsdd': 0.5962092424938594},
            'bow_50': {'aks': 0.5657614650996633,
                       'ammd': 0.7658126597275102,
                       'cdbd': 0.623461477899139,
                       'csdd': 0.005595846639612811,
                       'kts': 0.7472887919644573,
                       'lsdd': 0.7363489046891251},
            'bow_768': {'aks': 0.5164537128326637,
                        'ammd': 0.6632271737741509,
                        'cdbd': 0.6694941971503965,
                        'csdd': 0.005394457163261242,
                        'kts': 0.7573299718440916,
                        'lsdd': 0.5999168697860071}},
 'test': {'testmode': {'bad': 0.0,
                       '

In [19]:
sorted_detectors = get_sorted_detectors(scores["twitter"]["bert_768"])
sorted_detectors.remove("csdd") # Remove CosSim
sorted_detectors.remove("ammd") # Remove MMD
sorted_datasets = sorted(scores.keys())
sorted_modes = ['bow_50', 'bow_768', 'bert_768']

print("alpha", alpha)
print("max_index", max_index)

# Get values to compute means
s = {}
c = {}
sd = {}
cd = {}
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        for detector in sorted_detectors:
            score = scoresNoPenalty[dataset][mode][detector]
            sd[detector] = sd.get(detector,0) + score
            cd[detector] = cd.get(detector,0) + 1
            #if(detector == "cdbd" or detector == "csdd"): # Exclude detectors from mode mean
            #    print("Skipping", detector, "for mode mean")
            #    continue
            c[dataset+mode] = c.get(dataset+mode,0) + 1
            s[dataset+mode] = s.get(dataset+mode,0) + score


            
end=" , "
print("Dataset, Model   ", end=end)
for detector in sorted_detectors:
    print(get_label(detector), end=end)
print("$\\varnothing$", end=end) # ∅ $\varnothing$
    
for dataset in sorted_datasets:
    if(dataset == "test"):
        continue
    for mode in sorted_modes:
        print()
        print(get_label(dataset+mode), end=end)
        
        max_detector = None
        max_score = -2
        for detector in sorted_detectors:
            #if(detector == "cdbd"): # exclude from max scores
            #    continue
            score = scoresNoPenalty[dataset][mode][detector]
            if(score > max_score):
                max_score = score
                max_detector = detector

        for detector in sorted_detectors:
            score = scoresNoPenalty[dataset][mode][detector]
            if(print_textbf and detector == max_detector):
                print(best_marker_start, end="")
            print(format_float(score), end="")
            if(print_textbf and detector == max_detector):
                print(best_marker_end, end="")
            print("", end=end)
            
        print(format_float(s[dataset+mode]/c[dataset+mode]), end=end)

print()
print('$\\varnothing$     ,       ', end=end) # ∅ $\varnothing$
for detector in sorted_detectors:
    print(format_float(sd[detector]/cd[detector]), end=end)

alpha 0
max_index 20
Dataset, Model    , CDBD , KS , KTS/MMD , LSDD , $\varnothing$ , 
Amazon,  BoW-50   , 0.6235 , 0.5658 , 0.7473⬛ , 0.7363 , 0.6682 , 
Amazon,  BoW-768  , 0.6695 , 0.5165 , 0.7573⬛ , 0.5999 , 0.6358 , 
Amazon,  BERT-768 , 0.2453 , 0.5504 , 0.6210⬛ , 0.5962 , 0.5032 , 
Twitter, BoW-50   , 0.4202 , 0.6819 , 0.9247⬛ , 0.8898 , 0.7291 , 
Twitter, BoW-768  , 0.7898 , 0.6462 , 0.8700⬛ , 0.8506 , 0.7892 , 
Twitter, BERT-768 , 0.2508 , 0.5470 , 0.6794 , 0.7072⬛ , 0.5461 , 
$\varnothing$     ,        , 0.4998 , 0.5846 , 0.7666 , 0.7300 , 

In [20]:
if(False):
    pprint.pprint(s)
    pprint.pprint(c)
    pprint.pprint(sd)
    pprint.pprint(cd)