In [1]:
# Replace part of the OUTPUT_PATH to create a new folder 
# for the detection results

OUTPUT_PATH = "Temp/LoanApplications_Offline/"
NEW_OUTPUT_PATH = "Temp/LoanApplications_Offline__DETECTION/"

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# My packages
from source import parse_mxml as pm
from source import log_representation as lr
from source import plots as plts
from source import drift_detection as dd
from source import drift_localization as dl
from source import offline_streaming_clustering as off_sc

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed, parallel_backend

import random
random.seed(42)

import os
import glob

import gc
gc.enable()

from scipy.spatial import distance
from sklearn.base import clone as sk_clone 

from copy import deepcopy

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [3]:
def insensitive_glob(pattern):
    def either(c):
        return '[%s%s]' % (c.lower(), c.upper()) if c.isalpha() else c
    return glob.glob(''.join(map(either, pattern)))

def if_any(string, lista):
    # If the string contains any of the values
    # from the list 'lista'
    for l in lista:
        if l in string:
            return True
    return False

In [4]:
# List log files
logs = insensitive_glob(r"../../../../../../../Datasets/Business_Process_Drift_Logs/Logs/*/*k.MXML")
logs = [x.replace('\\', '/') for x in logs if "2.5" not in x]
# logs = [x for x in logs if "2.5" not in x]

In [5]:
# reference objects and map them to strings in dict 
# used in further methods
objects = {
    "model": {
#         "kmeans__k=6": KMeans(n_clusters=6, random_state=42),
#         "kmeans__k=3": KMeans(n_clusters=3, random_state=42),
#         "kmeans__k=2": KMeans(n_clusters=2, random_state=42),
        "DBSCAN__eps=05ms=5": DBSCAN(eps=0.5, min_samples=5, metric='euclidean'),
#         "DBSCAN__eps=1ms=4": DBSCAN(eps=1, min_samples=4, metric='euclidean'),
#         "DBSCAN__eps=2ms=3": DBSCAN(eps=2, min_samples=3, metric='euclidean'),
        "HDBSCAN__noparams": hdbscan.HDBSCAN(gen_min_span_tree=True, allow_single_cluster=True)
    },
    
    "representation": {
        "activity_binary": lambda x: lr.get_binary_representation(lr.get_traces_as_tokens(x)),
        "activity_frequency": lambda x: lr.get_frequency_representation(lr.get_traces_as_tokens(x)),
        "transitions_binary": lambda x: lr.get_binary_transitions_representation(lr.get_traces_as_tokens(x)),
        "transitions_frequency": lambda x: lr.get_frequency_transitions_representation(lr.get_traces_as_tokens(x)),
        "activity_transitions_frequency": lambda x: pd.concat([lr.get_frequency_transitions_representation(lr.get_traces_as_tokens(x)), lr.get_frequency_representation(lr.get_traces_as_tokens(x))],axis=1),
        "activity_transitions_binary": lambda x: pd.concat([lr.get_binary_transitions_representation(lr.get_traces_as_tokens(x)), lr.get_binary_representation(lr.get_traces_as_tokens(x))],axis=1)
    }
}

In [6]:
# change patterns and they supported representations
activity_binary_drifts = ["cb", "cf", "cm", "fr", "pm", "re", "rp"]
activity_frequency_drifts = activity_binary_drifts + ["cp", "lp"]
transitions_binary_drifts = activity_frequency_drifts + ["cd", "pl", "sw"]
transitions_frequency_drifts = transitions_binary_drifts
activity_transitions_frequency_drifts = transitions_binary_drifts
activity_transitions_binary_drifts = transitions_binary_drifts

### Pipeline Offline Clustering

In [7]:
def read_file_and_run_clustering_pipeline(args, return_result=False):
    """
    Read an event log file, represent it into a feature vector space and
    run the trace clustering method over windows. This method outputs results
    as gzip csv files into the "OUTPUT_PATH" folder, or return the result 
    as DataFrame when return_result = True.
    
    Parameters:
    -----------
        args (dict): Dictionary with the parameters and the log_file path
            requiring the following keys:
                example = {
                    'log': <PATH TO LOG_FILE>,
                    'representation': <KEY TO REPRESENTATIONS IN 'objects'>,
                    'parameters': [{
                        'model': <KEY TO MODEL IN 'objects'>, 
                        'sliding_window': <WHETHER TO USE SLIDING WINDOW>,
                        'window_size': <SIZE OF TRACE WINDOW TO USE>,
                        'sliding_step': <STEP OF SLIDING WINDOW>
                    }
        return_result (bool): Whether to return the result as DataFrame
            
    """
    # Create final dataset
    all_results = pd.DataFrame()
    
    # Treat file name to structure size and log type
    split = args["log"].split("/")
    
    # Parse change pattern name
    tipo_mudanca = split[-2]
    log_name = split[-1][:-5]

    # Parse size of the event_log
    log_size = int(float(log_name.replace(tipo_mudanca, "").replace("k", "")) * 1000)


    try:
        # Read log and apply trace representation technique
        log_read = pm.all_prep(open(args["log"]))
        df = objects["representation"][args["representation"]](log_read)
        
        # Create metric dataset
        all_metrics = pd.DataFrame() 
        
        for p in args["parameters"]:
            print(p)
            
            # If file does not exists, run trace clustering step and export file
            all_metrics = off_sc.run_offline_clustering_window(
                sk_clone(objects["model"][p["model"]]),
                p["window_size"],
                df,
                p["sliding_window"],
                sliding_step=p['sliding_step']
            )
            
            # Set up true drifts indexes and append
            y_true = list(range(int(len(df)/10), len(df), int(len(df)/10)))
            all_metrics["y_true"] = all_metrics.apply(lambda x: y_true, axis = 1)
            
            all_metrics = all_metrics.reset_index()
            
            # Identify columns
            all_metrics['tipo_mudanca'] = tipo_mudanca
            all_metrics['log_size'] = str(log_size)
            all_metrics['model'] = p["model"]
            all_metrics['representation'] = args["representation"]
            all_metrics['window_size'] = str(p["window_size"])
            all_metrics['sliding_window'] = str(p["sliding_window"])
            
        
            # Append results in final dataset
            all_results = all_results.append(all_metrics)
            
            gc.collect()
            
    except Exception as e:
        raise e
    
    all_results = all_results.reset_index(drop=True)
    return all_results

#### Run pipeline for specific case(s)

In [8]:
# read_file_and_run_clustering_pipeline({
#     'log': logs[0],
#     'representation': 'activity_binary',
#     'parameters': [{
#         'model': 'DBSCAN__eps=05ms=5', 
#         'sliding_window': False,
#         'window_size': 150,
#         'sliding_step': 1
#     }]
# }, return_result=True)

In [9]:
#read_file_and_run_clustering_pipeline({
#     'log': logs[0],
#     'representation': 'activity_binary',
#     'parameters': [{
#         'model': 'kmeans__k=6', 
#         'sliding_window': False,
#         'window_size': 200,
#         'sliding_step': 1
#     }]
# }, return_result=True)


### Run Experiments with several parameters combinations

In [10]:
# Trace clustering parameters
grid_parameters = list(ParameterGrid({
    "sliding_window": [False]    
    ,"window_size": [75, 100, 125, 150, 175, 200]
    ,"sliding_step": [1]
    ,"model": [
#         'kmeans__k=6',
#         'kmeans__k=3',
#         'kmeans__k=2',
        "DBSCAN__eps=05ms=5",
#         "DBSCAN__eps=1ms=4",
#         "DBSCAN__eps=2ms=3",
        "HDBSCAN__noparams"
    ] 
}))

# Trace vector representations
grid_logs = list(ParameterGrid([
    {
        "log": [x for x in logs if if_any(x, activity_binary_drifts)],
        "representation": ["activity_binary"]
    },
    {
        "log": [x for x in logs if if_any(x, activity_frequency_drifts)],
        "representation": ["activity_frequency"]
    },
    {
        "log": [x for x in logs if if_any(x, transitions_binary_drifts)],
        "representation": ["transitions_binary"]
    },
    {
        "log": [x for x in logs if if_any(x, transitions_frequency_drifts)],
        "representation": ["transitions_frequency"]
    },
    {
        "log": [x for x in logs if if_any(x, activity_transitions_frequency_drifts)],
        "representation": ["activity_transitions_frequency"]
    },
    {
        "log": [x for x in logs if if_any(x, activity_transitions_binary_drifts)],
        "representation": ["activity_transitions_binary"]
    }
]))

# Combining all parameters
combs = []
for x in grid_logs:
    dic = x.copy()
    dic['parameters'] = grid_parameters 
    
    combs.append(dic)

len(combs), len(grid_parameters), len(combs) * len(grid_parameters) 

(192, 12, 2304)

#### Run parallely

In [12]:
final_resp = pd.DataFrame()
final_resp = final_resp.append(Parallel(n_jobs=-2)(
    delayed(read_file_and_run_clustering_pipeline)(comb,return_result=False) for comb in tqdm_notebook(combs)
))

final_resp = final_resp.reset_index(drop=True)

try:
    os.makedirs(OUTPUT_PATH)
except:
    pass
final_resp.to_pickle(OUTPUT_PATH + "clustering_results" + '.pickle.gzip', compression="gzip")

final_resp.shape

HBox(children=(FloatProgress(value=0.0, max=192.0), HTML(value='')))




(140800, 55)

### Detection Pipeline

In [13]:
# Drift detection parameters
drift_config = list(ParameterGrid([
    {
        "rolling_window": [3, 4]
        ,"std_tolerance": [1, 1.5, 2]
        ,"min_tol": [0.003, 0.005, 0.007, 0.01] 
    }
]))
len(drift_config)

24

In [14]:
def drift_detect_pipeline(drift_comb, return_results=False):
    # Runs the drift detection for every feature
    results = []
    for col in drift_comb.select_dtypes(include=np.number).columns:
        if (col not in ["i","test_id"]):
#         if (col in ["Silhouette"]):
#         if (col not in ["k"] and not col.startswith("diff") ) or col in ["diff_centroids"]:

            drift_comb_temp = drift_comb.copy()
            drift_comb_temp["measure"] = col
            
            print(col)
            
            # Define index as iteration number (i)
            drift_comb_temp.set_index("i", inplace=True)
    
            detected_drifts, info = dd.detect_concept_drift(
                drift_comb_temp, 
                col,
                int(drift_comb_temp["rolling_window"].iloc[0]),
                float(drift_comb_temp["std_tolerance"].iloc[0]),
                float(drift_comb_temp["min_tol"].iloc[0])
            )    
            
            # Calculate classification metrics
            metrics_results = dd.get_metrics(
                detected_drifts,
                drift_comb_temp["y_true"].iloc[0],
                int(drift_comb_temp["window_size"].iloc[0])
            )
            
            drift_comb_temp.reset_index(inplace=True)
            info = pd.DataFrame(info, index=drift_comb_temp.index)
            metrics_results = pd.DataFrame([metrics_results], index=drift_comb_temp.index)

            results = pd.concat([drift_comb_temp,metrics_results,info], axis=1)

        gc.collect()
        
    return results

#### Run pipeline for specific case(s)

In [18]:
# drift_detect_pipeline(drifts_combinations[drifts_combinations["test_id"]==0])

k
Silhouette
DBi
calinski_harabasz_score
validity_index
avg_dist_between_centroids
std_dist_between_centroids
avg_radius
std_radius
avg_dist_intra_cluster
std_dist_intra_cluster
avg_skewness
std_skewness
avg_cluster_std
std_cluster_std
diff_k
adjusted_rand_score
adjusted_mutual_info_score
homogeneity_score
completeness_score
v_measure_score
fowlkes_mallows_score
diff_Silhouette
diff_DBi
diff_calinski_harabasz_score
diff_validity_index
diff_centroids
std_diff_centroids
total_MSE
avg_MSE
count_non_zero_MSE
diff_avg_dist_between_centroids
diff_std_dist_between_centroids
diff_radius
diff_dist_intra_cluster
diff_skewness
diff_cluster_std
relative_validity
diff_relative_validity


Unnamed: 0,i,k,y_pred,Silhouette,DBi,calinski_harabasz_score,validity_index,centroids,avg_dist_between_centroids,std_dist_between_centroids,volume_list,radius_list,dist_intra_cluster_list,skewness_list,cluster_std_list,avg_radius,std_radius,avg_dist_intra_cluster,std_dist_intra_cluster,avg_skewness,std_skewness,avg_cluster_std,std_cluster_std,diff_k,adjusted_rand_score,adjusted_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,fowlkes_mallows_score,diff_Silhouette,diff_DBi,diff_calinski_harabasz_score,diff_validity_index,diff_centroids,std_diff_centroids,total_MSE,avg_MSE,count_non_zero_MSE,diff_avg_dist_between_centroids,diff_std_dist_between_centroids,diff_volume,diff_radius,diff_dist_intra_cluster,diff_skewness,diff_cluster_std,y_true,tipo_mudanca,log_size,model,representation,window_size,sliding_window,relative_validity,diff_relative_validity,min_tol,rolling_window,std_tolerance,test_id,measure,Precision,Recall,F1,Delay,Correct_Predictions,Support,Drifts_Found,Resp,lowers,uppers,means
0,100,4,"[0, 0, 1, 0, 0, 4, 2, 0, 2, 0, 2, 0, 1, 3, 0, 3, 2, 0, 0, 0, 1, 0, 1, 4, 0, 0, 0, 4, 3, 0, 0, 1,...",0.792254,0.823908,124.5982,0.787734,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.181134,0.52181,"[40, 16, 13, 13, 18]","[0.0, 0.0, 0.0, 0.0, 2.313406979295224]","[0.0, 0.0, 0.0, 0.0, 1.843581617841975]","[0.31980107453341566, -0.5455447255899809, -0.5455447255899807, -0.5455447255899806, -0.62493242...","[0.4937279747182558, 0.48237638894271995, 0.48237638894272, 0.48237638894272, 0.4772445792538752]",0.462681,0.925363,0.368716,0.737433,-0.388353,0.35541,0.48362,0.005431,,,,,,,,,,,,,,,,,,,,,,,,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
1,200,5,"[0, 2, 0, 3, 2, 3, 1, 3, 3, 4, 3, 5, 3, 0, 0, 1, 0, 4, 5, 3, 0, 0, 0, 4, 0, 5, 4, 0, 0, 4, 2, 0,...",0.877051,0.697294,193.409123,0.852468,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.272879,0.455361,"[43, 10, 11, 16, 14, 6]","[0.0, 1.9287301521985911, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.894811374526966, 0.0, 0.0, 0.0, 0.0]","[0.3198010745334156, -0.5455447255899807, -0.5455447255899807, -0.5455447255899809, -0.545544725...","[0.49372797471825586, 0.48237638894272, 0.48237638894272, 0.48237638894271995, 0.48237638894272,...",0.321455,0.718795,0.315802,0.706155,-0.40132,0.322495,0.484268,0.00423,1.0,-0.028486,0.015998,0.092984,0.088587,0.090732,0.224782,0.084796,-0.126614,68.810923,0.064735,0.212568,0.425136,0.0,0.0,0.0,0.091746,-0.066449,0,0.0,0.0,0.0,0.0,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
2,300,5,"[0, 0, 1, 0, 2, 0, 5, 1, 3, 2, 0, 4, 2, 3, 5, 3, 0, 0, 3, 0, 0, 4, 0, 2, 2, 5, 0, 4, 5, 0, 0, 4,...",0.873144,0.633062,235.809613,0.869987,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.405256,0.642471,"[45, 8, 11, 14, 9, 13]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.9791815892720934]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.5246453230912211]","[0.3198010745334158, -0.10540925533894575, -0.5455447255899808, -0.5455447255899806, -0.54554472...","[0.4937279747182558, 0.4993069989739546, 0.48237638894272, 0.48237638894272006, 0.48237638894272...",0.329864,0.737597,0.254108,0.568202,-0.37938,0.381236,0.483332,0.012372,0.0,-0.013144,-0.032185,0.064362,0.064851,0.064606,0.243365,-0.003906,-0.064232,42.40049,0.017519,0.39715,0.565915,2.27286,0.119624,13.0,0.132377,0.18711,"[2, -2, 0, -2, -5, 7]",1.27286,0.985809,0.048148,0.0001324928,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
3,400,5,"[0, 1, 2, 3, 3, 2, 4, 3, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 5, 0, 3, 0, 4, 2, 0, 0, 3, 0, 5, 3, 0, 0,...",0.958485,0.259478,930.264286,0.89,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.454185,0.645482,"[39, 5, 11, 19, 15, 11]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.7296634173127678]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.5844024658860173]","[0.3198010745334157, -0.10540925533894574, -0.5455447255899808, -0.5455447255899808, -0.54554472...","[0.4937279747182558, 0.4993069989739546, 0.48237638894272, 0.48237638894272, 0.48237638894271995...",0.288277,0.644607,0.0974,0.217794,-0.34242,0.342177,0.486151,0.007765,0.0,-0.015923,-0.036221,0.059926,0.05838,0.059143,0.231451,0.085341,-0.373584,694.454673,0.020013,0.216303,0.370294,1.836797,0.096674,8.0,0.048928,0.003011,"[-6, -3, 0, 5, 6, -2]",0.010377,0.147343,0.008196,4.767304e-05,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
4,500,5,"[0, 0, 2, 0, 3, 0, 0, 0, 4, 0, 0, 2, 2, 5, 0, 0, 0, 5, 0, 3, 2, 0, 4, 1, 0, 2, 0, 0, 0, 3, 0, 0,...",0.935909,0.485314,347.025882,0.910621,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.352853,0.515914,"[46, 8, 14, 12, 6, 14]","[0.0, 2.2707377655731187, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.2996648299906237, 0.0, 0.0, 0.0, 0.0]","[0.3198010745334157, -0.3198010745334157, -0.5455447255899806, -0.5455447255899808, -0.545544725...","[0.4937279747182558, 0.4937279747182558, 0.48237638894272006, 0.48237638894272, 0.48237638894272...",0.378456,0.846254,0.216611,0.484356,-0.363696,0.316589,0.48616,0.005351,0.0,0.018795,-0.011054,0.080434,0.084092,0.082222,0.262243,-0.022576,0.225836,-583.238403,0.020621,0.154143,0.240255,0.687543,0.036186,13.0,-0.101332,-0.129568,"[7, 3, 3, -7, -9, 3]",1.357998,0.338442,0.008914,1.04762e-05,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
5,600,4,"[1, 0, 0, 0, 0, 0, 2, 0, 3, 0, 4, 4, 2, 2, 3, 4, 0, 3, 2, 0, 2, 4, 1, 2, 0, 1, 0, 2, 1, 1, 0, 0,...",0.768067,0.903741,102.258144,0.776289,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.163529,0.523704,"[44, 12, 11, 14, 19]","[0.0, 0.0, 0.0, 0.0, 2.168778713477528]","[0.0, 0.0, 0.0, 0.0, 1.9550872637143208]","[0.3198010745334158, -0.5455447255899808, -0.5455447255899809, -0.5455447255899807, -0.595385669...","[0.4937279747182558, 0.48237638894272, 0.48237638894272, 0.48237638894272, 0.47921637064389827]",0.433756,0.867511,0.391017,0.782035,-0.382444,0.351653,0.484015,0.005008,-1.0,0.008224,-0.034534,0.045515,0.048145,0.046793,0.2742,-0.167842,0.418427,-244.767738,-0.134332,0.240037,0.480074,0.0,0.0,0.0,-0.189324,0.00779,0,0.0,0.0,0.0,0.0,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
6,700,4,"[0, 0, 0, 0, 1, 2, 0, 2, 2, 2, 4, 0, 1, 2, 3, 0, 0, 0, 0, 4, 0, 0, 2, 4, 4, 3, 2, 0, 4, 0, 0, 4,...",0.861914,0.775206,174.895503,0.807271,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.206437,0.50234,"[50, 14, 12, 11, 13]","[0.0, 0.0, 0.0, 0.0, 2.2347444686667615]","[0.0, 0.0, 0.0, 0.0, 1.7482132592085224]","[0.3198010745334158, -0.5455447255899806, -0.5455447255899808, -0.5455447255899808, -0.545544725...","[0.4937279747182558, 0.48237638894272006, 0.48237638894272, 0.48237638894272, 0.48237638894272]",0.446949,0.893898,0.349643,0.699285,-0.372476,0.346138,0.484647,0.004541,0.0,-0.025368,-0.024821,0.041687,0.043643,0.042642,0.268507,0.093847,-0.128535,72.637359,0.030981,0.064044,0.128089,0.020508,0.001079,13.0,0.042908,-0.021364,"[6, 2, 1, -3, -6]",0.00087,0.008559,0.000497,1.997143e-06,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
7,800,4,"[0, 0, 0, 1, 1, 2, 2, 1, 0, 3, 0, 0, 0, 4, 0, 3, 3, 0, 4, 4, 4, 0, 0, 0, 4, 2, 0, 1, 2, 0, 4, 4,...",0.850358,0.567409,163.833757,0.694177,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.206722,0.543978,"[44, 14, 8, 13, 21]","[0.0, 0.0, 0.0, 0.0, 2.692056004781442]","[0.0, 0.0, 0.0, 0.0, 1.145113433616626]","[0.3198010745334158, -0.5455447255899806, -0.5455447255899809, -0.5455447255899806, -0.567970315...","[0.4937279747182558, 0.48237638894272006, 0.48237638894271995, 0.48237638894272, 0.4809810473890...",0.538411,1.076822,0.229023,0.458045,-0.376961,0.348489,0.484368,0.004711,0.0,-0.00304,-0.048893,0.020743,0.020058,0.020395,0.287069,-0.011557,-0.207797,-11.061745,-0.113093,0.307996,0.403624,2.222609,0.116979,13.0,0.000285,0.041638,"[-6, 0, -4, 2, 8]",0.041827,0.072746,0.000101,3.893956e-07,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
8,900,5,"[2, 0, 0, 0, 0, 0, 2, 2, 2, 0, 3, 3, 1, 4, 5, 4, 1, 0, 0, 5, 0, 0, 3, 0, 1, 5, 0, 0, 3, 0, 0, 0,...",0.876199,0.70437,180.285517,0.854412,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.270518,0.452493,"[50, 10, 11, 13, 8, 8]","[0.0, 1.8110770276274832, 0.0, 0.0, 0.0, 0.0]","[0.0, 1.920846327310384, 0.0, 0.0, 0.0, 0.0]","[0.3198010745334158, -0.5455447255899807, -0.5455447255899809, -0.5455447255899807, -0.545544725...","[0.4937279747182558, 0.48237638894272, 0.48237638894272, 0.48237638894272, 0.48237638894271995, ...",0.301846,0.674949,0.320141,0.715857,-0.40132,0.322495,0.484268,0.00423,1.0,-0.037523,-0.011517,0.071278,0.068527,0.069875,0.256939,0.025841,0.13696,16.45176,0.160235,0.108588,0.217176,0.0,0.0,0.0,0.063796,-0.091485,0,0.0,0.0,0.0,0.0,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,
9,1000,5,"[0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 3, 3, 0, 4, 0, 2, 3, 0, 4, 0, 2, 3, 5, 4, 3, 1, 0, 0, 4, 1,...",0.965,0.360376,673.4536,0.94,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.392114,0.554455,"[47, 6, 10, 12, 13, 12]","[0.0, 2.3570226039551585, 0.0, 0.0, 0.0, 0.0]","[0.0, 0.9428090415820634, 0.0, 0.0, 0.0, 0.0]","[0.31980107453341566, -0.24748737341529128, -0.5455447255899807, -0.5455447255899808, -0.5455447...","[0.4937279747182558, 0.49621528504319135, 0.48237638894272, 0.48237638894272, 0.48237638894272, ...",0.392837,0.87841,0.157135,0.351364,-0.351644,0.319394,0.486575,0.005981,0.0,-0.064764,-0.025398,0.07575,0.073815,0.07477,0.237186,0.088801,-0.343993,493.168083,0.085588,0.158309,0.35399,0.817037,0.043002,11.0,0.121596,0.101962,"[-3, -4, -1, -1, 5, 4]",0.049676,0.159426,0.014806,3.191917e-05,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,100,False,,,0.003,3,1,0,diff_relative_validity,0,0.0,0.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0]",0,[],"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",,,


#### Run parallely

In [16]:
# Read file
clusterings = pd.read_pickle(OUTPUT_PATH + "clustering_results.pickle.gzip", compression='gzip')
clusterings.shape

(140800, 55)

In [17]:
# Combining all parameters
drifts_combinations = pd.DataFrame()
for config in drift_config:
    temp = clusterings.copy()
    temp["min_tol"] = str(config["min_tol"])
    temp["rolling_window"] = str(config["rolling_window"])
    temp["std_tolerance"] = str(config["std_tolerance"])
    drifts_combinations = drifts_combinations.append(temp)

drifts_combinations = drifts_combinations.reset_index(drop=True)
drifts_combinations["test_id"] = drifts_combinations.groupby([
    'tipo_mudanca','log_size','model','representation','window_size','sliding_window'
    ,'min_tol','rolling_window','std_tolerance'
    ]).ngroup()
drifts_combinations

Unnamed: 0,i,k,y_pred,Silhouette,DBi,calinski_harabasz_score,validity_index,centroids,avg_dist_between_centroids,std_dist_between_centroids,volume_list,radius_list,dist_intra_cluster_list,skewness_list,cluster_std_list,avg_radius,std_radius,avg_dist_intra_cluster,std_dist_intra_cluster,avg_skewness,std_skewness,avg_cluster_std,std_cluster_std,diff_k,adjusted_rand_score,adjusted_mutual_info_score,homogeneity_score,completeness_score,v_measure_score,fowlkes_mallows_score,diff_Silhouette,diff_DBi,diff_calinski_harabasz_score,diff_validity_index,diff_centroids,std_diff_centroids,total_MSE,avg_MSE,count_non_zero_MSE,diff_avg_dist_between_centroids,diff_std_dist_between_centroids,diff_volume,diff_radius,diff_dist_intra_cluster,diff_skewness,diff_cluster_std,y_true,tipo_mudanca,log_size,model,representation,window_size,sliding_window,relative_validity,diff_relative_validity,min_tol,rolling_window,std_tolerance,test_id
0,75,4,"[0, 0, 1, 0, 0, 4, 2, 0, 2, 0, 2, 0, 1, 3, 0, 3, 2, 0, 0, 0, 1, 0, 1, 4, 0, 0, 0, 4, 3, 0, 0, 1,...",0.778878,0.840834,82.536541,0.777725,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.179280,0.510825,"[30, 13, 7, 11, 14]","[0.0, 0.0, 0.0, 0.0, 2.201576429631777]","[0.0, 0.0, 0.0, 0.0, 1.9078924663386718]","[0.3198010745334157, -0.5455447255899807, -0.5455447255899808, -0.5455447255899807, -0.579257469...","[0.4937279747182558, 0.48237638894272, 0.48237638894272, 0.48237638894272, 0.4802622382758289]",0.440315,0.880631,0.381578,0.763157,-0.379218,0.349753,0.484224,0.004822,,,,,,,,,,,,,,,,,,,,,,,,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,75,False,,,0.003,3,1,120
1,150,4,"[0, 4, 0, 1, 2, 3, 0, 0, 3, 0, 4, 3, 3, 4, 0, 4, 0, 4, 0, 0, 3, 1, 0, 3, 2, 0, 2, 0, 1, 2, 1, 4,...",0.862840,0.629739,203.783333,0.811190,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.234556,0.553111,"[31, 11, 9, 13, 11]","[0.0, 0.0, 0.0, 0.0, 1.8272501129310708]","[0.0, 0.0, 0.0, 0.0, 1.427373534245738]","[0.31980107453341583, -0.5455447255899809, -0.5455447255899808, -0.5455447255899807, -0.76891740...","[0.4937279747182557, 0.48237638894272, 0.48237638894272, 0.48237638894272, 0.4666972990084803]",0.365450,0.730900,0.285475,0.570949,-0.417150,0.378495,0.481511,0.008613,0.0,-0.005500,-0.003063,0.081617,0.081618,0.081618,0.242518,0.083962,-0.211095,121.246792,0.033465,0.382283,0.480912,4.021572,0.211662,13.0,0.055276,0.042285,"[1, -2, 2, 2, -3]",0.028024,0.046180,0.007194,0.000037,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,75,False,,,0.003,3,1,120
2,225,5,"[0, 2, 0, 0, 3, 0, 5, 4, 4, 3, 4, 0, 2, 0, 2, 0, 1, 0, 0, 1, 5, 0, 0, 3, 3, 0, 5, 0, 0, 2, 0, 0,...",0.895834,0.666191,193.636549,0.862175,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.368069,0.666584,"[32, 7, 12, 11, 5, 8]","[0.0, 0.0, 0.0, 0.0, 0.0, 2.038688303787511]","[0.0, 0.0, 0.0, 0.0, 0.0, 1.5090459872429685]","[0.31980107453341583, -0.1054092553389457, -0.5455447255899808, -0.5455447255899809, -0.54554472...","[0.49372797471825575, 0.49930699897395464, 0.48237638894272, 0.48237638894272, 0.482376388942719...",0.339781,0.759774,0.251508,0.562388,-0.358398,0.357245,0.484999,0.009413,1.0,0.008077,-0.029349,0.079933,0.075259,0.077526,0.252026,0.032994,0.036452,-10.146784,0.050985,0.318424,0.390090,0.000000,0.000000,0.0,0.133512,0.113474,0,0.000000,0.000000,0.000000,0.000000,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,75,False,,,0.003,3,1,120
3,300,4,"[4, 0, 2, 4, 0, 0, 2, 0, 3, 0, 4, 4, 3, 3, 1, 0, 0, 0, 4, 4, 0, 4, 0, 0, 0, 3, 2, 0, 0, 4, 3, 4,...",0.747398,0.850170,83.510329,0.721680,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.386692,0.694441,"[35, 6, 7, 10, 17]","[0.0, 0.0, 0.0, 0.0, 1.5066174676312585]","[0.0, 0.0, 0.0, 0.0, 1.7856182449374065]","[0.3198010745334157, -0.10540925533894573, -0.5455447255899808, -0.5455447255899807, -0.74688811...","[0.4937279747182558, 0.4993069989739546, 0.48237638894272, 0.48237638894272, 0.46840376150562407]",0.301323,0.602647,0.357124,0.714247,-0.324717,0.384515,0.485238,0.010675,-1.0,-0.021615,0.002921,0.101402,0.115660,0.108063,0.251051,-0.148436,0.183979,-110.126220,-0.140495,0.130560,0.261119,0.000000,0.000000,0.0,0.018624,0.027856,0,0.000000,0.000000,0.000000,0.000000,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,75,False,,,0.003,3,1,120
4,375,3,"[0, 3, 1, 2, 2, 1, 3, 2, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 3, 0, 2, 0, 3, 1, 0, 0, 2, 0, 3, 2, 0, 0,...",0.837392,0.453694,140.653398,0.640199,"[[1.0, 0.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0]...",2.287959,0.499409,"[32, 8, 15, 20]","[0.0, 0.0, 0.0, 2.7267196408871968]","[0.0, 0.0, 0.0, 1.2443709219927586]","[0.31980107453341583, -0.5455447255899809, -0.5455447255899809, -0.5455447255899807]","[0.49372797471825575, 0.48237638894271995, 0.48237638894271995, 0.48237638894272]",0.681680,1.180704,0.311093,0.538828,-0.329208,0.374706,0.485214,0.004915,-1.0,0.005818,0.037218,0.100402,0.108870,0.104465,0.297979,0.089994,-0.396476,57.143070,-0.081481,0.588874,0.586899,0.000000,0.000000,0.0,-0.098733,-0.195031,0,0.000000,0.000000,0.000000,0.000000,"[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]",cb,10000,DBSCAN__eps=05ms=5,activity_binary,75,False,,,0.003,3,1,120
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3379195,6800,10,"[9, 7, 7, 7, 6, 8, 7, 1, 0, 8, 1, 0, 1, 0, 7, 10, 9, 6, 4, 0, 1, 7, 4, 0, 0, 8, 6, 2, 7, 9, 6, 0...",0.659765,1.145666,80.038901,0.068968,"[[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.05263157894736842, 0.0, 1.0, 0.0...",3.639031,0.798166,"[57, 23, 23, 6, 8, 12, 13, 19, 22, 10, 7]","[2.118380189210327, 1.8471865901085218, 1.8471865901085218, 1.224744871391589, 2.143303524935281...","[0.22696930598682075, 0.671704214584917, 0.671704214584917, 1.4696938456699071, 0.61237243569579...","[0.876628761144752, 0.8157397277344735, 0.8157397277344736, 0.1970276015597753, 0.19702760155977...","[0.45794161249907306, 0.4629713329897312, 0.4629713329897312, 0.49759127609605963, 0.49759127609...",1.993605,0.895144,1.050711,0.901292,0.352472,0.297716,0.488076,0.016455,-1.0,0.010602,-0.017932,0.117954,0.123101,0.120473,0.140126,-0.087894,0.395516,-54.614301,-0.100716,0.874758,0.845553,0.000000,0.000000,0.0,-0.039312,-0.011276,0,0.000000,0.000000,0.000000,0.000000,"[750, 1500, 2250, 3000, 3750, 4500, 5250, 6000, 6750]",sw,7500,HDBSCAN__noparams,activity_transitions_binary,200,False,0.129335,-0.095502,0.01,4,2,54839
3379196,7000,11,"[2, 7, 6, 4, 1, 2, 7, 0, 9, 3, 0, 0, 1, 1, 7, 3, 2, 8, 9, 2, 2, 2, 10, 2, 0, 3, 3, 8, 11, 2, 6, ...",0.795880,0.685444,160.927175,0.134426,"[[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.022727272727272728, 0.0, 1.0, 0....",3.706580,0.810316,"[44, 20, 27, 11, 7, 6, 21, 17, 20, 10, 10, 7]","[2.185248250738431, 2.1242645786248002, 2.0704333124998056, 0.0, 0.0, 0.0, 3.1453813816898553, 2...","[0.10163945352271772, 0.223606797749979, 0.318528201923047, 0.0, 0.0, 0.0, 2.424223096638363, 0....","[0.8919578008354931, 0.8779725062069957, 0.8657165936665178, 0.19702760155977525, 0.197027601559...","[0.45664534523392153, 0.4578284416334067, 0.45885729872651076, 0.4975912760960597, 0.49759127609...",1.606484,0.991149,0.569890,0.686438,0.336398,0.314842,0.488245,0.017607,1.0,0.007670,0.019592,0.157156,0.147632,0.152245,0.131220,0.136115,-0.460222,80.888274,0.065458,1.563176,1.033410,0.000000,0.000000,0.0,0.067550,0.012151,0,0.000000,0.000000,0.000000,0.000000,"[750, 1500, 2250, 3000, 3750, 4500, 5250, 6000, 6750]",sw,7500,HDBSCAN__noparams,activity_transitions_binary,200,False,0.183169,0.053833,0.01,4,2,54839
3379197,7200,12,"[4, 0, 0, 8, 11, 1, 10, 0, 7, 1, 11, 0, 1, 5, 1, 6, 4, 9, 0, 8, 1, 7, 0, 2, 10, 7, 9, 0, 3, 8, 6...",0.810792,0.620915,180.764153,0.347945,"[[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",3.791309,0.867063,"[51, 20, 22, 7, 7, 13, 11, 10, 9, 18, 8, 10, 14]","[0.0, 2.1242645786248002, 1.8295101634089186, 0.0, 0.0, 2.0640627484613443, 2.032789070454354, 2...","[0.0, 0.223606797749979, 0.6969562527272071, 0.0, 0.0, 0.34401045807689073, 0.4065578140908709, ...","[0.9036961141150637, 0.8779725062069957, 0.8118387533013891, 0.4417706308963744, 0.1970276015597...","[0.4556450995538137, 0.4578284416334067, 0.4632866815817121, 0.4882313567838719, 0.4975912760960...",1.575574,0.878646,0.553013,0.575482,0.340669,0.297730,0.488565,0.016535,1.0,0.017712,-0.000093,0.157085,0.153430,0.155236,0.128622,0.014913,-0.064528,19.836978,0.213519,0.413777,0.608489,0.000000,0.000000,0.0,0.084728,0.056746,0,0.000000,0.000000,0.000000,0.000000,"[750, 1500, 2250, 3000, 3750, 4500, 5250, 6000, 6750]",sw,7500,HDBSCAN__noparams,activity_transitions_binary,200,False,0.372921,0.189752,0.01,4,2,54839
3379198,7400,12,"[2, 2, 11, 9, 0, 1, 8, 11, 7, 12, 5, 0, 6, 1, 8, 0, 8, 7, 2, 5, 1, 9, 0, 12, 5, 4, 0, 9, 8, 7, 1...",0.757902,0.637687,120.828889,0.366343,"[[0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0,...",3.805711,0.878611,"[34, 26, 20, 7, 7, 17, 7, 7, 28, 18, 9, 7, 13]","[0.0, 1.8060549049036765, 1.6770509831248424, 0.0, 0.0, 0.0, 0.0, 0.0, 3.7513602974964497, 1.987...","[0.0, 0.7224219619614705, 0.8826584121709695, 0.0, 0.0, 0.0, 0.0, 0.0, 2.086940497151385, 0.4676...","[0.9036961141150635, 0.8066747063332154, 0.7785161769302641, 0.4417706308963744, 0.1970276015597...","[0.4556450995538138, 0.4637027940718311, 0.4659442010178149, 0.4882313567838719, 0.4975912760960...",1.157960,1.179496,0.522321,0.616245,0.340409,0.279980,0.489044,0.015328,0.0,0.003326,0.015081,0.178534,0.175710,0.177111,0.109009,-0.052890,0.016771,-59.935264,0.018398,0.412309,0.527383,6.389559,0.125285,28.0,0.014402,0.011549,"[-17, 6, -2, 0, 0, 4, -4, -3, 19, 0, 1, -3, -1]",1.264070,0.443006,0.000904,0.000003,"[750, 1500, 2250, 3000, 3750, 4500, 5250, 6000, 6750]",sw,7500,HDBSCAN__noparams,activity_transitions_binary,200,False,0.439871,0.066950,0.01,4,2,54839


In [None]:
drifts_combinations_grouped = drifts_combinations.groupby("test_id")
final_drift_detection = pd.DataFrame()
final_drift_detection = final_drift_detection.append(Parallel(n_jobs=-2)(
    delayed(drift_detect_pipeline)(drift_comb) for i,drift_comb in tqdm_notebook(drifts_combinations_grouped)
))

final_drift_detection = final_drift_detection.reset_index(drop=True)

try:
    os.makedirs(NEW_OUTPUT_PATH)
except:
    pass
final_drift_detection.to_pickle(NEW_OUTPUT_PATH + "drift_detections_results" + '.pickle.gzip', compression="gzip")


final_drift_detection.shape

HBox(children=(FloatProgress(value=0.0, max=55296.0), HTML(value='')))

### Consolidate results

# Results

In [None]:
# Read file
detections = pd.read_pickle(NEW_OUTPUT_PATH + "drift_detections_results.pickle.gzip", compression='gzip')
detections.shape

In [None]:
model = list(objects["model"].keys())[1]
print(model)

# Filter tests
detections_filtered = detections[detections["model"]==model]
print(detections_filtered.shape)

all_results = detections_filtered.set_index("test_id").groupby("test_id").first()

# Group results by all params and get the mean F1 and Delay among all scenarios
# In other words, what combination of params got the best results in all scenarios

params = ["min_tol", "rolling_window", "std_tolerance", "window_size", "measure", "representation"]
scenarios = ["tipo_mudanca", "log_size"]
validation_metrics = ["F1","Delay"]#,"Support","Precision","Recall"]

all_results_grouped = all_results.groupby(params)[validation_metrics].agg(['mean','std','count'])
all_results_grouped.columns = all_results_grouped.columns.map('_'.join)
# all_results_grouped = all_results_grouped[all_results_grouped['F1_count']>=len(activity_binary_drifts)]
all_results_grouped.sort_values(["F1_mean","Delay_mean"], ascending=[False,True], inplace=True)
all_results_grouped = all_results_grouped.reset_index()
all_results_grouped.head(50)

In [None]:
best_results_by_representation = all_results_grouped.reset_index().groupby("representation").first().reset_index()
best_results_by_representation

In [None]:
best_result = pd.merge(left=all_results.reset_index()
            , right=best_results_by_representation
            , how='inner', on=params).set_index("test_id")
results_table = best_result.pivot_table(values=["F1"]#,"Delay"]
                                        , index=["tipo_mudanca"]
                                        , columns=["representation"]
                                        , aggfunc='mean')\
                                    .sort_index(axis='columns',level=[1,0], ascending=[True,False])
results_table

In [None]:
results_table['F1'].plot(kind='bar', figsize=(20,10), fontsize=20)
plt.legend(loc='center left', bbox_to_anchor=(1.0, 0.5), fontsize=20)
plt.show()

In [None]:
results_table_logsize = best_result.pivot_table(values=["F1"]#,"Delay"]
                                                , index=["tipo_mudanca"]
                                                , columns=["representation", "log_size"]
                                                , aggfunc='mean')\
                                    .sort_index(axis='columns',level=[1,0], ascending=[True,False])
results_table_logsize

In [None]:
for row in range(0,len(best_result)):
    best_result_drifts = detections_filtered[detections_filtered["test_id"]==best_result.index[row]]
    best_result_drifts.set_index("i", inplace=True)
    
    plts.plot_deteccao_drift(
        best_result_drifts,
        best_result_drifts.iloc[0]['measure'],
        best_result_drifts.iloc[0]['Drifts_Found'],
        best_result_drifts.iloc[0]['Resp'],
        best_result_drifts['means'],
        best_result_drifts['lowers'],
        best_result_drifts['uppers'],
        save_png=""
    )
    plt.title("Log: " + best_result_drifts.iloc[0]["tipo_mudanca"] + str(best_result_drifts.iloc[0]["log_size"]) 
                  + " - Rep: " + best_result_drifts.iloc[0]["representation"] 
                  + " - Metric: " + best_result_drifts.iloc[0]["measure"]
                  + " - F1: " + str(round(best_result_drifts.iloc[0]["F1"],2))
              , size=30)
    plt.show()

In [None]:
for row in range(0,len(best_result)):

    best_result_log = [x for x in logs if best_result.iloc[row]["tipo_mudanca"] in x 
         and str(best_result.iloc[row]["log_size"]/1000) in 
                       str(float(x.split("/")[-1][:-5].replace("k", "").replace(x.split("/")[-2],"")))]
    
    run_df = off_sc.run_offline_clustering_window(
        objects["model"][best_result.iloc[row]['model']],
        int(best_result.iloc[row]['window_size']),
        objects["representation"][best_result.iloc[row]['representation']](pm.all_prep(open(best_result_log[0]))),
        sliding_window=False,
        sliding_step=1
    )

    drifts, info = dd.detect_concept_drift(
        run_df,
        best_result.iloc[row]['measure'],
        rolling_window=best_result.iloc[row]['rolling_window'],
        std_tolerance=best_result.iloc[row]['std_tolerance'],
        min_tol=best_result.iloc[row]['min_tol']
    )

    plts.plot_deteccao_drift(
        run_df,
        best_result.iloc[row]['measure'],
        best_result.iloc[row]['Drifts_Found'],
        best_result.iloc[row]['Resp'],
        info['means'],
        info['lowers'],
        info['uppers'],
        save_png=""
    )
    plt.title("Log: " + best_result.iloc[row]["tipo_mudanca"] + str(best_result.iloc[row]["log_size"]) 
                  + " - Rep: " + best_result.iloc[row]["representation"] 
                  + " - Metric: " + best_result.iloc[row]["measure"]
                  + " - F1: " + str(round(best_result.iloc[row]["F1"],2))
              , size=30)
    plt.show()

In [None]:
# all_results.to_excel('Resultados_PL.xlsx')