In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
# plt.rcParams["font.family"] = "Times New Roman"
# plt.rcParams["font.size"] = 16
import seaborn as sns
# sns.set_style("white")

import warnings
# Silences every warning for the whole session, including pandas and
# scikit-learn notices that would otherwise show up in later cells.
warnings.filterwarnings("ignore")

# %load_ext autoreload
# %autoreload 2

# My packages
from source import parse_mxml as pm
from source import log_representation as lr
from source import plots as plts
from source import drift_detection as dd
from source import drift_localization as dl
from source import offline_streaming_clustering as off_sc
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan

from sklearn.base import clone as sk_clone

import random
# Seed both global random number generators. `random.seed` only affects the
# standard-library generator; scikit-learn estimators created without an
# explicit random_state draw from NumPy's global generator instead.
random.seed(42)
np.random.seed(42)

import os
import glob

import gc
gc.enable()

# Widen pandas' display limits so result tables are not truncated.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [None]:
def insensitive_glob(pattern):
    """Glob for `pattern` while ignoring the case of its letters.

    Every alphabetic character of `pattern` is expanded to a two-character
    class holding its lower- and upper-case form (``a`` -> ``[aA]``); all
    other characters are passed through untouched. The expanded pattern is
    handed to ``glob.glob`` and its result list is returned as is.
    """
    pieces = []
    for ch in pattern:
        if ch.isalpha():
            pieces.append('[%s%s]' % (ch.lower(), ch.upper()))
        else:
            pieces.append(ch)
    return glob.glob(''.join(pieces))

def if_any(string, lista):
    """Return True when at least one element of `lista` is contained in `string`.

    Containment is tested with the ``in`` operator, element by element, and
    stops at the first hit. An empty `lista` yields False.
    """
    return any(candidate in string for candidate in lista)

In [None]:
# Collect the candidate MXML logs. glob returns matches in arbitrary,
# filesystem-dependent order, and later cells pick a log by position, so the
# list is sorted to make those positions stable across machines and runs.
# NOTE(review): sorting may change which file a given index refers to -
# re-check the index used when selecting the log.
logs = sorted(insensitive_glob("../../../../../../../Datasets/Business_Process_Drift_Logs/Logs/*/*k.MXML"))

In [None]:
# Drop every path whose text contains "2.5"; the order of the rest is kept.
logs = list(filter(lambda path: "2.5" not in path, logs))

In [None]:
# List the log paths that remain after filtering.
logs

### Read and Prep log file

In [None]:
# Preview a single path by position to decide which log to analyse.
# Positions inspected earlier: logs[39], logs[24].
# NOTE(review): this cell previews index 15, while the loading cell that
# follows reads logs[16] - confirm which index is intended.
logs[15]

In [None]:
# Pick one log, run the project's preparation step on it, and convert its
# traces into tokens.
log = logs[16]
print(log)

log_read = pm.all_prep(log)
tokens = lr.get_traces_as_tokens(log_read)

# Reference drift positions: one at every tenth of the token series,
# starting at the first tenth (position 0 is excluded).
n_traces = len(tokens)
drift_step = n_traces // 10
y_true = list(range(drift_step, n_traces, drift_step))

print(n_traces)
print(len(tokens.unique()))

### Trace Clustering

In [None]:

# "activity_binary": lr.get_binary_representation,
# "activity_frequency": lr.get_frequency_representation,

# "transitions_binary": lr.get_binary_transitions_representation,
# "transitions_frequency": lr.get_frequency_transitions_representation,

# "activity_tfidf": lr.get_tfidf_representation,
# "transitions_tfidf": lr.get_tfidf_transitions_representation,

# "activity_transitions_frequency": lr.get_activity_transitions_frequency_representation,
# "activity_transitions_binary": lr.get_activity_transitions_binary_representation

In [None]:
# Settings handed to the offline clustering run.
clustering_window_size = 150
representation = lr.get_binary_representation
distance_list = [
    "euclidean",
    "hamming",
    "cityblock",
    "jaccard",
    "cosine",
]

In [None]:
# Clustering model passed to the offline clustering run.
# Other DBSCAN settings kept for reference:
#   DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
#   DBSCAN(eps=2, min_samples=3, metric='euclidean')
#   DBSCAN(eps=2.5, min_samples=3, metric='euclidean')
#   DBSCAN(eps=0.05, min_samples=3, metric='hamming')
#   DBSCAN(eps=0.05, min_samples=3, metric='cosine')
model = DBSCAN(
    eps=5,
    min_samples=3,
    metric='manhattan',
)

In [None]:
# model=KMeans(n_clusters=2, random_state=42)

In [None]:
# model= hdbscan.HDBSCAN(metric='euclidean') #, cluster_selection_method ='leaf', min_cluster_size=5, allow_single_cluster=True
# model= hdbscan.HDBSCAN(metric='mahalanobis')
# model= hdbscan.HDBSCAN(metric='manhattan')
# model= hdbscan.HDBSCAN(metric='euclidean'
#                        ,gen_min_span_tree=True
# #                         , allow_single_cluster=True
# #                        , cluster_selection_method='leaf'
# #                         , min_cluster_size=int(clustering_window_size*0.1)
# #                         , min_samples=int(clustering_window_size*0.05)
# #                        , cluster_selection_epsilon=0.5
# #                        
#                       )

# model = hdbscan.HDBSCAN(metric="cosine",algorithm="generic")


In [None]:
# X=representation(tokens).drop_duplicates()
# test=model.fit_predict(X.values)
# test

In [None]:
# Run the offline clustering on the tokens with the representation, model,
# distance list and window size configured in the cells above. The call
# returns two objects, bound here to run_df and X; both are used by the
# plotting, detection and localization cells further down.
# NOTE(review): sliding_window=False presumably means consecutive,
# non-overlapping windows, which would make sliding_step irrelevant -
# confirm against off_sc.run_offline_clustering_window.
run_df, X = off_sc.run_offline_clustering_window(
    tokens,
    representation,
    model,
    distance_list,
    clustering_window_size,
#     activity_binary,
    sliding_window=False,
    sliding_step=1
)

In [None]:
# Inspect the results table returned by the clustering run.
run_df

In [None]:
# Plot every column of run_df on its own figure, with the true drift
# positions drawn as vertical lines.
for column in run_df.columns:
    print(column)
    try:
        run_df[column].plot(figsize=(16, 4), subplots=True)
        plts.plot_drift_vertical_lines(len(tokens), label="True drift")
        plt.legend()
        plt.show()
    except Exception as err:
        # Best effort: some columns presumably hold non-numeric values and
        # cannot be plotted. Report the skipped column instead of hiding the
        # error, and catch Exception (not a bare except) so that interrupting
        # the kernel still stops the loop.
        print(f"  skipped {column!r}: {type(err).__name__}: {err}")
        # Discard any half-drawn figure so it does not leak into later output.
        plt.close("all")

In [None]:
from sklearn.manifold import TSNE

# Project the first clustering window to two dimensions with t-SNE.
# `.copy()` gives an independent frame: the t-SNE coordinates are added as
# new columns below, and assigning onto a bare slice of X is a
# chained-assignment hazard (the resulting warning is hidden by the global
# warnings filter).
df_subset = X.iloc[0:clustering_window_size].copy()

# random_state pins the embedding so the figure is reproducible on re-run.
# NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter` -
# confirm against the installed version.
tsne = TSNE(n_components=2, perplexity=20, n_iter=1000, random_state=42)
tsne_results = tsne.fit_transform(df_subset)

df_subset['tsne-2d-one'] = tsne_results[:, 0]
df_subset['tsne-2d-two'] = tsne_results[:, 1]

plt.figure(figsize=(16, 10))
sns.scatterplot(
    x="tsne-2d-one"
    , y="tsne-2d-two"
#     ,hue="y"
    ,palette=sns.color_palette("hls", 10)
    ,data=df_subset
    ,legend="full"
    ,alpha=0.3
)

### Drift Detection

In [None]:
# Metric name handed to the drift detector and, later, to the detection plot.
metric = "avg_MSE"

# The detector returns three objects: detected drifts, non-drifts, and an
# info mapping that the plotting cell reads ('means', 'lowers', 'uppers').
drifts, not_drifts, info = dd.detect_concept_drift(
    run_df, metric, rolling_window=4, std_tolerance=2, min_tol=0.0025, verbose=False
)

# dd.get_metrics(drifts, not_drifts, y_true, window_size=clustering_window_size)

In [None]:
# Score the detections against the reference drift positions.
margin_error = 2
log_size = len(tokens)

dd.get_metrics(drifts, not_drifts, y_true, clustering_window_size, log_size, margin_error)

In [None]:
# Plot the monitored metric with the detected and reference drifts, using
# the mean / lower / upper series returned by the detector. An empty
# save_png string is passed through unchanged.
plts.plot_deteccao_drift(
    run_df, metric, drifts, y_true,
    info["means"], info["lowers"], info["uppers"],
    save_png="",
)

### Drift Localization

In [None]:
# Show the reference drift positions (one at every tenth of the token series).
y_true

In [None]:
# Show the clustering results again for reference.
run_df

In [None]:
# Run drift localization between the centroids stored under index labels 450
# and 600 of run_df, passing the columns of X as feature names.
# NOTE(review): 450 and 600 are hard-coded labels; they presumably are
# multiples of clustering_window_size (150) - confirm they still exist in
# run_df.index if the window size or the log changes.
dl.localize_drift(
    run_df.centroids.loc[450], 
    run_df.centroids.loc[600], 
    X.columns
)

In [None]:
# Drift localization at the ground-truth drifts: every reference position is
# shifted forward by one clustering window before being passed on.

dl.localize_all_drifts(
    run_df,
    [position + clustering_window_size for position in y_true],
    clustering_window_size,
    X.columns,
)

In [None]:
# Result of drift localization in all predicted drifts.
# Feature names come from X, the representation returned by the clustering
# run, as in the other localization cells. The previous `activity_binary`
# is never defined in this notebook and raised NameError on a fresh kernel.

dl.localize_all_drifts(
    run_df,
    drifts,
    clustering_window_size,
    X.columns
)