In [None]:
# Output folders (relative to the notebook's working directory).
# OUTPUT_PATH receives the clustering results; NEW_OUTPUT_PATH is the same
# path with a "__DETECTION" suffix and receives the drift-detection results.

OUTPUT_PATH = "Temp/LoanApplications_Offline/"
NEW_OUTPUT_PATH = "Temp/LoanApplications_Offline__DETECTION/"

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm_notebook

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# My packages
from source import parse_mxml as pm
from source import log_representation as lr
from source import plots as plts
from source import drift_detection as dd
from source import drift_localization as dl
from source import offline_streaming_clustering as off_sc

from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed, parallel_backend

import random
random.seed(42)

import os
import glob

import gc
gc.enable()

from scipy.spatial import distance
from sklearn.base import clone as sk_clone 

from copy import deepcopy

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [None]:
def insensitive_glob(pattern):
    """Glob for ``pattern`` while ignoring the case of its letters.

    Every alphabetic character of the pattern is expanded into a two-character
    class (``a`` -> ``[aA]``) before the pattern is handed to ``glob.glob``;
    all other characters are passed through unchanged.
    """
    pieces = []
    for char in pattern:
        if char.isalpha():
            pieces.append('[%s%s]' % (char.lower(), char.upper()))
        else:
            pieces.append(char)
    return glob.glob(''.join(pieces))

def if_any(string, lista):
    """Return True when at least one element of ``lista`` occurs in ``string``.

    Membership is tested with the ``in`` operator, so for a ``str`` this is a
    case-sensitive substring check. An empty ``lista`` yields False.
    """
    return any(candidate in string for candidate in lista)

In [None]:
# Collect every "*k.MXML" event log (case-insensitive match), use "/" as the
# path separator on every platform and leave out the files whose path
# contains "2.5".
logs = [
    path.replace('\\', '/')
    for path in insensitive_glob(r"../../../../../../../Datasets/Business_Process_Drift_Logs/Logs/*/*k.MXML")
    if "2.5" not in path
]

In [None]:
# Registry that maps the string keys used in the experiment grids to the
# actual objects they stand for:
#   - "model": clustering estimators (cloned before every run),
#   - "representation": trace-representation functions from
#     source.log_representation.
# Commented-out model entries are alternative configurations that are
# currently disabled.
objects = {
    "model": {
#         "kmeans__k=6": KMeans(n_clusters=6, random_state=42),
#         "kmeans__k=3": KMeans(n_clusters=3, random_state=42),
#         "kmeans__k=2": KMeans(n_clusters=2, random_state=42),
#         "DBSCAN__eps=05ms=5": DBSCAN(eps=0.5, min_samples=5, metric='euclidean'),
#         "DBSCAN__eps=1ms=4": DBSCAN(eps=1, min_samples=4, metric='euclidean'),
#         "DBSCAN__eps=2ms=3": DBSCAN(eps=2, min_samples=3, metric='euclidean'),
        "HDBSCAN__noparams": hdbscan.HDBSCAN(gen_min_span_tree=True, allow_single_cluster=True)
    },
    
    "representation": {
        "activity_binary": lr.get_binary_representation,
        "activity_frequency": lr.get_frequency_representation,
        
        "transitions_binary": lr.get_binary_transitions_representation,
        "transitions_frequency": lr.get_frequency_transitions_representation,
        
        "activity_tfidf": lr.get_tfidf_representation,
        "transitions_tfidf": lr.get_tfidf_transitions_representation,
        
        "activity_transitions_frequency": lr.get_activity_transitions_frequency_representation,
        "activity_transitions_binary": lr.get_activity_transitions_binary_representation
    }
    # Earlier variant of the "representation" mapping, kept for reference only:
    # each entry tokenised the parsed log itself before building the representation.
#     "representation": {
#         "activity_binary": lambda x: lr.get_binary_representation(lr.get_traces_as_tokens(x)),
#         "activity_frequency": lambda x: lr.get_frequency_representation(lr.get_traces_as_tokens(x)),
#         "transitions_binary": lambda x: lr.get_binary_transitions_representation(lr.get_traces_as_tokens(x)),
#         "transitions_frequency": lambda x: lr.get_frequency_transitions_representation(lr.get_traces_as_tokens(x)),
#         "activity_transitions_frequency": lambda x: pd.concat([lr.get_frequency_transitions_representation(lr.get_traces_as_tokens(x)), lr.get_frequency_representation(lr.get_traces_as_tokens(x))],axis=1),
#         "activity_transitions_binary": lambda x: pd.concat([lr.get_binary_transitions_representation(lr.get_traces_as_tokens(x)), lr.get_binary_representation(lr.get_traces_as_tokens(x))],axis=1)
#     }
}

In [None]:
# Change patterns (identified by their short codes) and the representations
# that support them. Each "<representation>_drifts" list names the change
# patterns whose logs are run with that representation.
activity_binary_drifts = ["cb", "cf", "cm", "fr", "pm", "re", "rp"]
activity_frequency_drifts = activity_binary_drifts + ["cp", "lp"]

transitions_binary_drifts = activity_frequency_drifts + ["cd", "pl", "sw"]
# The remaining names are aliases of transitions_binary_drifts (the very same
# list object, not a copy), so mutating one of them changes all of them.
transitions_frequency_drifts = transitions_binary_drifts

activity_tfidf_drifts = transitions_binary_drifts
transitions_tfidf_drifts = transitions_binary_drifts

activity_transitions_frequency_drifts = transitions_binary_drifts
activity_transitions_binary_drifts = transitions_binary_drifts

### Pipeline Offline Clustering

In [None]:
def read_file_and_run_clustering_pipeline(args, return_result=False):
    """
    Read an event log file, turn its traces into tokens and run the windowed
    trace clustering once for every parameter combination in ``args``.

    The per-window metrics of all combinations are stacked into one DataFrame.
    Each run is tagged with the identifying columns ``tipo_mudanca``,
    ``log_size``, ``model``, ``representation``, ``window_size`` and
    ``sliding_window`` (the last four stored as given / as strings) plus
    ``y_true``, the list of expected drift positions.
    Nothing is written to disk here; persisting the result is up to the caller.

    Parameters:
    -----------
        args (dict): Dictionary with the parameters and the log_file path
            requiring the following keys:
                example = {
                    'log': <PATH TO LOG_FILE, "/"-SEPARATED, ".../<pattern>/<pattern><size>k.MXML">,
                    'representation': <KEY TO REPRESENTATIONS IN 'objects'>,
                    'parameters': [{
                        'model': <KEY TO MODEL IN 'objects'>,
                        'sliding_window': <WHETHER TO USE SLIDING WINDOW>,
                        'window_size': <SIZE OF TRACE WINDOW TO USE>,
                        'sliding_step': <STEP OF SLIDING WINDOW>
                    }]
                }
        return_result (bool): Unused; kept so existing callers keep working.
            The result is always returned.

    Returns:
    --------
        pd.DataFrame: One row per clustering window and parameter combination
            (empty when ``args['parameters']`` is empty).
    """
    # The folder name is the change pattern; the file name is
    # "<pattern><size>k.<ext>", e.g. ".../cb/cb10k.MXML".
    path_parts = args["log"].split("/")
    tipo_mudanca = path_parts[-2]
    log_name = os.path.splitext(path_parts[-1])[0]

    # Size of the event log in traces ("10k" -> 10000, "7.5k" -> 7500)
    log_size = int(float(log_name.replace(tipo_mudanca, "").replace("k", "")) * 1000)

    # Parse and tokenise while the file is open so the handle is always closed
    with open(args["log"]) as log_file:
        log_read = pm.all_prep(log_file)
        tokens = lr.get_traces_as_tokens(log_read)

    # Expected drift positions: one at every 10% of the log
    drift_step = len(tokens) // 10
    y_true = list(range(drift_step, len(tokens), drift_step))

    representation = objects["representation"][args["representation"]]

    results = []
    for p in args["parameters"]:
        print(p)

        # Clone the estimator so runs never share fitted state
        all_metrics = off_sc.run_offline_clustering_window(
            tokens,
            representation,
            sk_clone(objects["model"][p["model"]]),
            p["window_size"],
            p["sliding_window"],
            sliding_step=p['sliding_step']
        )

        # Every row carries the full list of true drift indexes
        all_metrics["y_true"] = all_metrics.apply(lambda _row: y_true, axis=1)

        all_metrics = all_metrics.reset_index()

        # Identify columns
        all_metrics['tipo_mudanca'] = tipo_mudanca
        all_metrics['log_size'] = str(log_size)
        all_metrics['model'] = p["model"]
        all_metrics['representation'] = args["representation"]
        all_metrics['window_size'] = str(p["window_size"])
        all_metrics['sliding_window'] = str(p["sliding_window"])

        results.append(all_metrics)

        gc.collect()

    if not results:
        return pd.DataFrame()

    # Single concat instead of DataFrame.append (removed in pandas 2.0)
    return pd.concat(results).reset_index(drop=True)

#### Run pipeline for specific case(s)

In [None]:
# read_file_and_run_clustering_pipeline({
#     'log': logs[0],
#     'representation': 'activity_binary',
#     'parameters': [{
#         'model': 'DBSCAN__eps=05ms=5', 
#         'sliding_window': False,
#         'window_size': 150,
#         'sliding_step': 1
#     }]
# }, return_result=True)

In [None]:
#read_file_and_run_clustering_pipeline({
#     'log': logs[0],
#     'representation': 'activity_binary',
#     'parameters': [{
#         'model': 'kmeans__k=6', 
#         'sliding_window': False,
#         'window_size': 200,
#         'sliding_step': 1
#     }]
# }, return_result=True)


### Run Experiments with several parameters combinations

In [None]:
# Trace clustering parameters: every combination of the values below is one
# clustering run. The "model" entries are keys of objects["model"].
grid_parameters = list(ParameterGrid({
    "sliding_window": [False],
    "window_size": [75, 100, 125, 150, 200],
    "sliding_step": [1],
    "model": [
        # 'kmeans__k=6',
        # 'kmeans__k=3',
        # 'kmeans__k=2',
        # "DBSCAN__eps=05ms=5",
        # "DBSCAN__eps=1ms=4",
        # "DBSCAN__eps=2ms=3",
        "HDBSCAN__noparams",
    ],
}))

# Trace vector representations
# Each representation is only run on the logs whose path contains one of the
# change-pattern codes it supports.
representation_drifts = {
    "activity_binary": activity_binary_drifts,
    "activity_frequency": activity_frequency_drifts,
    "transitions_binary": transitions_binary_drifts,
    "transitions_frequency": transitions_frequency_drifts,
    "activity_tfidf": activity_tfidf_drifts,
    # This entry used to be labelled "activity_transitions_binary", which
    # skipped transitions_tfidf entirely and ran activity_transitions_binary twice.
    "transitions_tfidf": transitions_tfidf_drifts,
    "activity_transitions_frequency": activity_transitions_frequency_drifts,
    "activity_transitions_binary": activity_transitions_binary_drifts,
}

# One grid entry per (log, representation) pair
grid_logs = list(ParameterGrid([
    {
        "log": [x for x in logs if if_any(x, drifts)],
        "representation": [representation],
    }
    for representation, drifts in representation_drifts.items()
]))

# Attach the full list of clustering parameters to every (log, representation)
# pair; each resulting dict is one unit of work for the clustering pipeline.
combs = [{**entry, 'parameters': grid_parameters} for entry in grid_logs]

len(combs), len(grid_parameters), len(combs) * len(grid_parameters) 

#### Run in parallel

In [None]:
# Run the clustering pipeline for every (log, representation) combination in
# parallel worker processes and stack the returned DataFrames.
clustering_frames = Parallel(n_jobs=-2)(
    delayed(read_file_and_run_clustering_pipeline)(comb, return_result=False)
    for comb in tqdm_notebook(combs)
)

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
final_resp = (
    pd.concat(clustering_frames).reset_index(drop=True)
    if clustering_frames
    else pd.DataFrame()
)

# exist_ok=True tolerates an existing folder but still reports real failures
# (e.g. missing permissions), unlike a bare "except: pass".
os.makedirs(OUTPUT_PATH, exist_ok=True)
final_resp.to_pickle(OUTPUT_PATH + "clustering_results_HDBSCAN__noparams" + '.pickle.gzip', compression="gzip")

final_resp.shape

### Detection Pipeline

In [None]:
# Drift detection parameters: one detection run per combination of the values
# below (smaller min_tol candidates 0.0025 and 0.005 are currently not used).
drift_config = list(ParameterGrid({
    "rolling_window": [3, 4],
    "std_tolerance": [2, 2.5, 3],
    "min_tol": [0.01, 0.025, 0.05],
}))
len(drift_config)

In [None]:
def drift_detect_pipeline(drift_comb, return_results=False):
    """
    Run the drift detection on every numeric measure column of one test.

    ``drift_comb`` holds the clustering windows of a single test (one
    ``test_id``). Every numeric column except ``i`` and ``test_id`` is treated
    as a measure: detection runs on it, classification metrics are computed,
    and the windows are returned once per measure with the column ``measure``
    naming it, followed by the metric and detection-info columns.

    Previously only the block of the last measure was returned because the
    result was overwritten on each iteration; now the blocks of all measures
    are stacked.

    Parameters:
    -----------
        drift_comb (pd.DataFrame): Windows of one test. Must contain the
            columns ``i``, ``y_true``, ``window_size``, ``rolling_window``,
            ``std_tolerance`` and ``min_tol``; the parameter columns are read
            from the first row only.
        return_results (bool): Unused; kept so existing callers keep working.

    Returns:
    --------
        pd.DataFrame: ``len(drift_comb)`` rows per measure, or an empty
            DataFrame when there is no measure column.
    """
    measures = [
        col
        for col in drift_comb.select_dtypes(include=np.number).columns
        if col not in ("i", "test_id")
    ]
    if not measures:
        return pd.DataFrame()

    # The detection parameters are constant within a test: read them once
    rolling_window = int(drift_comb["rolling_window"].iloc[0])
    std_tolerance = float(drift_comb["std_tolerance"].iloc[0])
    min_tol = float(drift_comb["min_tol"].iloc[0])
    window_size = int(drift_comb["window_size"].iloc[0])
    y_true = drift_comb["y_true"].iloc[0]

    per_measure = []
    for col in measures:
        # Work on a fresh frame indexed by the iteration number (i)
        drift_comb_temp = drift_comb.assign(measure=col).set_index("i")

        detected_drifts, not_drifts, info = dd.detect_concept_drift(
            drift_comb_temp,
            col,
            rolling_window,
            std_tolerance,
            min_tol
        )

        # Calculate classification metrics
        metrics_results = dd.get_metrics(
            detected_drifts,
            not_drifts,
            y_true,
            window_size
        )

        drift_comb_temp = drift_comb_temp.reset_index()
        info = pd.DataFrame(info, index=drift_comb_temp.index)
        metrics_results = pd.DataFrame([metrics_results], index=drift_comb_temp.index)

        per_measure.append(pd.concat([drift_comb_temp, metrics_results, info], axis=1))

        gc.collect()

    return pd.concat(per_measure, ignore_index=True)

#### Run in parallel

In [None]:
# Read the clustering results saved by the clustering step (gzip-compressed pickle).
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by this notebook.
clusterings = pd.read_pickle(OUTPUT_PATH + "clustering_results_HDBSCAN__noparams.pickle.gzip", compression='gzip')
clusterings.shape

In [None]:
# Combining all parameters: one copy of the clustering results per detection
# configuration, tagged with that configuration (stored as strings).
# Built with a single pd.concat; DataFrame.append in a loop was quadratic and
# no longer exists in pandas 2.0.
drifts_combinations = pd.concat(
    [
        clusterings.assign(
            min_tol=str(config["min_tol"]),
            rolling_window=str(config["rolling_window"]),
            std_tolerance=str(config["std_tolerance"]),
        )
        for config in tqdm_notebook(drift_config)
    ],
    ignore_index=True,
)

# test_id numbers each distinct (log, clustering setup, detection setup) group
drifts_combinations["test_id"] = drifts_combinations.groupby([
    'tipo_mudanca','log_size','model','representation','window_size','sliding_window'
    ,'min_tol','rolling_window','std_tolerance'
    ]).ngroup()

In [None]:
# Run the drift detection for every test (one group per test_id) in parallel.
drifts_combinations_grouped = drifts_combinations.groupby("test_id")
detection_frames = Parallel(n_jobs=-1)(
    delayed(drift_detect_pipeline)(drift_comb)
    for _, drift_comb in tqdm_notebook(drifts_combinations_grouped)
)

# A test without any numeric measure column yields an empty result; drop those
# so they cannot break the concatenation.
detection_frames = [
    frame for frame in detection_frames
    if isinstance(frame, pd.DataFrame) and not frame.empty
]

# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
final_drift_detection = (
    pd.concat(detection_frames).reset_index(drop=True)
    if detection_frames
    else pd.DataFrame()
)

# exist_ok=True tolerates an existing folder but still reports real failures
os.makedirs(NEW_OUTPUT_PATH, exist_ok=True)
final_drift_detection.to_pickle(NEW_OUTPUT_PATH + "drift_detections_results_HDBSCAN__noparams" + '.pickle.gzip', compression="gzip")


final_drift_detection.shape

#### Run pipeline for specific case(s)

In [None]:
# Spot check: run the detection pipeline on a single test only (test_id 0).
drift_detect_pipeline(drifts_combinations[drifts_combinations["test_id"]==0])

### Consolidate results

# Results

In [None]:
# Read the drift-detection results written by the detection step above.
# The file name must match the one used when saving
# ("drift_detections_results_HDBSCAN__noparams"); the previous name
# "drift_detections_results" is never written by this notebook.
# NOTE(review): unpickling can execute arbitrary code - only load files that
# were produced by this notebook.
detections = pd.read_pickle(NEW_OUTPUT_PATH + "drift_detections_results_HDBSCAN__noparams.pickle.gzip", compression='gzip')
detections.shape

In [None]:
# Model whose detections are analysed: the second configured model as before,
# or the only one when a single model is enabled (indexing [1] on a
# one-element list raised IndexError).
model_names = list(objects["model"].keys())
model = model_names[1] if len(model_names) > 1 else model_names[0]
print(model)

# Filter tests
detections_filtered = detections[detections["model"]==model]
print(detections_filtered.shape)

# One summary row per (test, measure), built from the loaded and filtered
# detections so that the test ids agree with the ones looked up later in
# detections_filtered and the section does not depend on the detection run
# still being in memory.
all_results = (
    detections_filtered
    .groupby(["test_id", "measure"], as_index=False)
    .first()
    .set_index("test_id")
)

# Group results by all params and get the mean F1 and Delay among all scenarios
# In other words, what combination of params got the best results in all scenarios

params = ["min_tol", "rolling_window", "std_tolerance", "window_size", "measure", "representation"]
scenarios = ["tipo_mudanca", "log_size"]
validation_metrics = ["F1","Delay"]#,"Support","Precision","Recall"]

all_results_grouped = all_results.groupby(params)[validation_metrics].agg(['mean','std'])
# Flatten the (metric, statistic) column pairs into "F1_mean", "F1_std", ...
all_results_grouped.columns = all_results_grouped.columns.map('_'.join)
# Best parameter sets first: highest mean F1, ties broken by lowest mean delay
all_results_grouped = (
    all_results_grouped
    .sort_values(["F1_mean","Delay_mean"], ascending=[False,True])
    .reset_index()
)
all_results_grouped.head(50)

In [None]:
# Best parameter set per representation: all_results_grouped is sorted
# best-first, so keep the first row of every representation.
# drop_duplicates keeps whole rows; GroupBy.first() takes the first non-null
# value of each column separately and could combine values from different
# rows (e.g. when a *_std column is NaN).
best_results_by_representation = (
    all_results_grouped
    .reset_index()
    .drop_duplicates(subset="representation", keep="first")
    .sort_values("representation")
    .reset_index(drop=True)
)
best_results_by_representation

In [None]:
# Keep only the tests that used the best parameter set of their representation
best_result = (
    all_results
    .reset_index()
    .merge(best_results_by_representation, how='inner', on=params)
    .set_index("test_id")
)

# Mean F1 per change pattern (rows) and representation (columns)
results_table = best_result.pivot_table(
    values=["F1"],
    index=["tipo_mudanca"],
    columns=["representation"],
    aggfunc='mean',
).sort_index(axis='columns', level=[1,0], ascending=[True,False])
results_table

In [None]:
# Bar chart of the mean F1 per change pattern, one bar per representation;
# the legend is placed outside the axes, to the right.
results_table['F1'].plot.bar(figsize=(20,10), fontsize=20)
plt.legend(fontsize=20, loc='center left', bbox_to_anchor=(1.0, 0.5))
plt.show()

In [None]:
# Same table as above, additionally broken down by log size in the columns
results_table_logsize = best_result.pivot_table(
    values=["F1"],
    index=["tipo_mudanca"],
    columns=["representation", "log_size"],
    aggfunc='mean',
).sort_index(axis='columns', level=[1,0], ascending=[True,False])
results_table_logsize

In [None]:
# Plot the detection of every best-performing test.
for test_id, best_row in best_result.iterrows():
    # Windows of this test for the winning measure only
    selected = (
        (detections_filtered["test_id"] == test_id)
        & (detections_filtered["measure"] == best_row["measure"])
    )
    # set_index returns a new frame; the filtered slice is not modified in place
    best_result_drifts = detections_filtered[selected].set_index("i")

    # Nothing to plot when the loaded detections do not contain this test
    if best_result_drifts.empty:
        continue

    # Test-level values are repeated on every row: read them from the first one
    first_window = best_result_drifts.iloc[0]

    plts.plot_deteccao_drift(
        best_result_drifts,
        first_window['measure'],
        first_window['Drifts_Found'],
        first_window['Resp'],
        best_result_drifts['means'],
        best_result_drifts['lowers'],
        best_result_drifts['uppers'],
        save_png=""
    )
    plt.title("Log: " + first_window["tipo_mudanca"] + str(first_window["log_size"]) 
                  + " - Rep: " + first_window["representation"] 
                  + " - Metric: " + first_window["measure"]
                  + " - F1: " + str(round(first_window["F1"],2))
              , size=30)
    plt.show()

In [None]:
# for row in range(0,len(best_result)):

#     best_result_log = [x for x in logs if best_result.iloc[row]["tipo_mudanca"] in x 
#          and str(best_result.iloc[row]["log_size"]/1000) in 
#                        str(float(x.split("/")[-1][:-5].replace("k", "").replace(x.split("/")[-2],"")))]
    
#     run_df = off_sc.run_offline_clustering_window(
#         objects["model"][best_result.iloc[row]['model']],
#         int(best_result.iloc[row]['window_size']),
#         objects["representation"][best_result.iloc[row]['representation']](pm.all_prep(open(best_result_log[0]))),
#         sliding_window=False,
#         sliding_step=1
#     )

#     drifts, info = dd.detect_concept_drift(
#         run_df,
#         best_result.iloc[row]['measure'],
#         rolling_window=best_result.iloc[row]['rolling_window'],
#         std_tolerance=best_result.iloc[row]['std_tolerance'],
#         min_tol=best_result.iloc[row]['min_tol']
#     )

#     plts.plot_deteccao_drift(
#         run_df,
#         best_result.iloc[row]['measure'],
#         best_result.iloc[row]['Drifts_Found'],
#         best_result.iloc[row]['Resp'],
#         info['means'],
#         info['lowers'],
#         info['uppers'],
#         save_png=""
#     )
#     plt.title("Log: " + best_result.iloc[row]["tipo_mudanca"] + str(best_result.iloc[row]["log_size"]) 
#                   + " - Rep: " + best_result.iloc[row]["representation"] 
#                   + " - Metric: " + best_result.iloc[row]["measure"]
#                   + " - F1: " + str(round(best_result.iloc[row]["F1"],2))
#               , size=30)
#     plt.show()

In [None]:
# all_results.to_excel('Resultados_PL.xlsx')