# Drift experiments

Executes detectors on different models and saves results in the format:

```
{'data_id': {'detector_id': {'predictions': [.1,.2,.3],
                             'time_detect': 60.00,
                             'time_fit': 1.00}}}
```

## Data

In [1]:
import yaml
import os.path
import pickle

# Set data paths
config          = yaml.safe_load(open("config.yaml", "r"))
bow_50_file  = os.path.join(config["EMBEDDINGS_DIRECTORY"], "amazon_drift_bow_50.pickle")
bow_768_file = os.path.join(config["EMBEDDINGS_DIRECTORY"], "amazon_drift_bow_768.pickle")
results_file = os.path.join(config["EXPERIMENTS_DIRECTORY"], "results_a")
print("bow_50_file", bow_50_file)
print("bow_768_file", bow_768_file)

# Load data
data = {}
with open(bow_50_file, "rb") as handle:
    data["bow_50"] = pickle.load(handle)
print("Samples:", len(data["bow_50"]['orig'][0]), len(data["bow_50"]['drifted'][0][0]), len(data["bow_50"]['train'][0]))
with open(bow_768_file, "rb") as handle:
    data["bow_768"] = pickle.load(handle)
print("Samples:", len(data["bow_768"]['orig'][0]), len(data["bow_768"]['drifted'][0][0]), len(data["bow_768"]['train'][0]))

bow_50_file /home/eml4u/EML4U/data/amazon/amazon_drift_bow_50.pickle
bow_768_file /home/eml4u/EML4U/data/amazon/amazon_drift_bow_768.pickle
Samples: 10000 10000 10000
Samples: 10000 10000 10000


In [None]:
# Print example data
if(False):
    print_model = data["bow_50"]
    print(type(print_model), len(print_model))
    for key, value in print_model.items() :
        print (key, type(value), len(value))
        for i in range(len(value)) :
            print (value[i][0])
            print()

## Dimension reduction

In [2]:
from sklearn.decomposition import PCA
import numpy as np

# https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html
# https://scikit-learn.org/stable/auto_examples/decomposition/plot_pca_iris.html
def reduce_dim(data, target_dimensions, pca=None):
    
    # Recursive call for lists
    if(True):
        if isinstance(data, list):
            if(pca is None):
                pca = PCA(n_components=target_dimensions)
                pca.fit(data[0])

            results = []
            for item in data:
                results.append(reduce_dim(item, target_dimensions, pca=pca))
            return results
    
    # Check if is 2-dimensional numpy array
    if not isinstance(data, np.ndarray) or data.ndim != 2:
        raise ValueError(type(data))
    
    if(pca is None):
        pca = PCA(n_components=target_dimensions)
        pca.fit(data)
    return pca.transform(data)

In [3]:
# Set data paths
results_file = os.path.join(config["EXPERIMENTS_DIRECTORY"], "results_reduction_test")
print("Changed results file to", results_file)

# Create data
old_key = "bow_768"
new_key = "bow_50_reduced"
print("Creating key", new_key)
data[new_key] = data[old_key].copy() # copy all

for key in data[new_key]:
    # print(type(data[new_key][key]))  # tuple -> create new ones
    data[new_key][key] = (reduce_dim(data[new_key][key][0], 50), data[new_key][key][0])
    if isinstance(data[new_key][key][0], list):
        print(key, len(data[new_key][key][0][0]), len(data[new_key][key][0][0][1]))
    elif isinstance(data[new_key][key][0], np.ndarray):
        print(key, len(data[new_key][key][0]), len(data[new_key][key][0][1]))

Changed results file to /home/eml4u/EML4U/data/amazon/results_reduction_test
Creating key bow_50_reduced
orig 10000 50
drifted 10000 50
train 10000 50


## Results

In [None]:
results = {}

In [4]:
# Load previous results
if os.path.isfile(results_file):
    with open(results_file, "rb") as handle:
        results = pickle.load(handle)

## Experiments

In [None]:
import time

# Call fit funtion, if not already in results
def default_fit(detector_id, detector, data_id, data, results, force_run):
    if(data_id in results and detector_id in results[data_id] and not force_run):
        return
    
    # Reset results
    results_detector = {}
    
    time_begin = time.time()
        
    detector.fit(data)
    
    results_detector["time_fit"] = time.time() - time_begin
    
    if(data_id not in results):
        results[data_id] = {}
    results[data_id][detector_id] = results_detector

# Compute predictions, if not already in results
def default_detect(detector_id, detector, data_id, data, results, force_run):
    if(data_id in results and detector_id in results[data_id] and
       "predictions" in results[data_id][detector_id] and not force_run):
        return
    
    # Get previous results
    if(data_id in results and detector_id in results[data_id]):
        results_detector = results[data_id][detector_id]
    else:
        results_detector = {}
    
    time_begin = time.time()
    
    results_detector["predictions"] = []
    print(data_id, detector_id, end=" ")
    for p in data:
        results_detector["predictions"].append(detector.predict_proba(p))
        print(len(p) , end=" ")
    print()

    results_detector["time_detect"] = time.time() - time_begin

    if(data_id not in results):
        results[data_id] = {}
    results[data_id][detector_id] = results_detector

In [None]:
from detectors.AlibiKSDetector import AlibiKSDetector
detector_id = "AlibiKSDetector"

In [None]:
data_id = "bow_50"
detector = AlibiKSDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_50_reduced"
detector = AlibiKSDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = AlibiKSDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.AlibiMMDDetector import AlibiMMDDetector
detector_id = "AlibiMMDDetector"

In [None]:
data_id = "bow_50"
detector = AlibiMMDDetector(backend = 'pytorch')
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = AlibiMMDDetector(backend = 'pytorch')
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.CosineDetector import CosineSimilarityDriftDetector
detector_id = "CosineDetector"

In [None]:
data_id = "bow_50"
detector = CosineSimilarityDriftDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = CosineSimilarityDriftDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.FCITDetector import FCITDriftDetector
detector_id = "FCITDetector"

In [None]:
data_id = "bow_50"
detector = FCITDriftDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = FCITDriftDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.KernelTwoSampleDetector import KernelTwoSampleDriftDetector
detector_id = "KernelTwoSampleDetector"

In [None]:
data_id = "bow_50"
detector = KernelTwoSampleDriftDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = KernelTwoSampleDriftDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.AlibiLSDD import AlibiLSDDDetector
detector_id = "AlibiLSDDDetector"

In [None]:
data_id = "bow_50"
detector = AlibiLSDDDetector(backend='pytorch')
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_50_reduced"
detector = AlibiLSDDDetector(backend='pytorch')
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = AlibiLSDDDetector(backend='pytorch')
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.AlibiChiSquaredDetector import AlibiChiSquaredDetector
detector_id = "AlibiChiSquaredDetector"

In [None]:
data_id = "bow_50"
detector = AlibiChiSquaredDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = AlibiChiSquaredDetector()
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.AlibiChiSquaredDetector import AlibiChiSquaredDetector
detector_id = "AlibiChiSquaredDetector-FDR"

In [None]:
data_id = "bow_50"
detector = AlibiChiSquaredDetector(correction = "fdr")
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
data_id = "bow_768"
detector = AlibiChiSquaredDetector(correction = "fdr")
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, False)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, False)

In [None]:
from detectors.CDBDDetector import CDBDDetector
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
detector_id = "CDBDDetector"

In [None]:
data_id = "bow_50"

features = data[data_id]['train'][0]
targets = np.array(data[data_id]['train'][1])[:,1] # take the labels from dictionary, convert to np.array and slice to only get the scores
targets = targets.astype('int')
x_train, x_test, y_train, y_test = train_test_split(features, targets, test_size=0.33, shuffle=False)
model = SVC(kernel='linear', random_state=42) # SVM model
model.fit(x_train, y_train)

detector = CDBDDetector(model)
default_fit   (detector_id, detector, data_id, data[data_id]['orig'][0],    results, True)
default_detect(detector_id, detector, data_id, data[data_id]['drifted'][0], results, True)

## Results

In [None]:
# Save results
with open(results_file, "wb") as handle:
    pickle.dump(results, handle)

In [None]:
# Print runtimes
if(True):
    print("Runtimes (fit and detect) in minutes:")
    from pprint import pprint
    for data_id in results:
        times = {}
        for detector_id in results[data_id]:
            time = 0
            for key in results[data_id][detector_id]:
                if(key == "time_detect" or key == "time_fit"):
                    time += results[data_id][detector_id][key]
            times[detector_id] = time/60
        pprint(sorted(times.items(), key=lambda item: item[1]))

In [5]:
# Print results to compare models

def print_results(detector_id):
    for data_id in results:
        print(data_id, detector_id)
        print(" sum p-values:", np.sum(results[data_id][detector_id]["predictions"]))
        first = np.round(results[data_id][detector_id]["predictions"][:4], 4)
        last = np.round(results[data_id][detector_id]["predictions"][0:4], 4)
        print("", first, "...", last)

if(True):
    print_results("AlibiKSDetector")
    print()
    print_results("AlibiLSDDDetector")

bow_50 AlibiKSDetector
 sum p-values: 4.1668124
 [0.5108 0.4079 0.3725 0.3364] ... [0.5108 0.4079 0.3725 0.3364]
bow_50_reduced AlibiKSDetector
 sum p-values: 6.132953
 [0.509  0.4831 0.5102 0.4493] ... [0.509  0.4831 0.5102 0.4493]
bow_768 AlibiKSDetector
 sum p-values: 6.753817
 [0.5101 0.4989 0.4815 0.4608] ... [0.5101 0.4989 0.4815 0.4608]

bow_50 AlibiLSDDDetector
 sum p-values: 1.4399999678134918
 [0.96 0.48 0.   0.  ] ... [0.96 0.48 0.   0.  ]
bow_50_reduced AlibiLSDDDetector
 sum p-values: 1.4700000137090683
 [0.69 0.18 0.23 0.18] ... [0.69 0.18 0.23 0.18]
bow_768 AlibiLSDDDetector
 sum p-values: 2.5599999595433474
 [0.59 0.43 0.31 0.29] ... [0.59 0.43 0.31 0.29]
