In [None]:
import pandas as pd
import numpy as np

from tqdm import tqdm

import matplotlib.pyplot as plt
plt.rcParams["font.family"] = "Times New Roman"
plt.rcParams["font.size"] = 16
import seaborn as sns
sns.set_style("white")

import warnings
warnings.filterwarnings("ignore")

%load_ext autoreload
%autoreload 2

# My packages
from source import parse_mxml as pm
from source import log_representation as lr
from source import plots as plts
from source import drift_detection as dd
from source import drift_localization as dl
from source import offline_streaming_clustering as off_sc
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
import hdbscan

from sklearn.base import clone as sk_clone 

import random
# Seeds only Python's built-in `random` module; nothing in this cell seeds NumPy
# or passes a seed to the clustering estimators.
random.seed(42)

import os
import glob

import gc
gc.enable()

# Raise pandas' display limits so wide/long frames are rendered with less truncation.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 100)

In [None]:
def insensitive_glob(pattern):
    """Run ``glob.glob`` on ``pattern`` with every letter matched in either case.

    Each alphabetic character ``c`` of ``pattern`` is replaced by the character
    class ``[<c.lower()><c.upper()>]``; every other character (wildcards, digits,
    separators, ...) is passed through unchanged.

    Returns the list of paths produced by ``glob.glob`` for the rewritten pattern.
    """
    pieces = []
    for char in pattern:
        if char.isalpha():
            pieces.append('[%s%s]' % (char.lower(), char.upper()))
        else:
            pieces.append(char)
    return glob.glob(''.join(pieces))

def if_any(string, lista):
    """Return True if at least one element of ``lista`` occurs inside ``string``.

    Membership is tested with the ``in`` operator, element by element, and the
    search stops at the first hit. An empty ``lista`` yields False.
    """
    return any(candidate in string for candidate in lista)

In [None]:
# Collect every log whose file name ends in "k.mxml" (any letter case), one
# directory level below Logs/. glob returns matches in file-system order, which
# is not guaranteed to be stable, so sort them: later cells select a log by
# position (logs[0]) and must pick the same file on every run and machine.
logs = sorted(insensitive_glob("../../../../../../../Datasets/Business_Process_Drift_Logs/Logs/*/*k.MXML"))

In [None]:
# Drop every log whose path contains the substring "2.5" (the test applies to
# the whole path string, not only the file name).
logs = [path for path in logs if "2.5" not in path]

### Read and prepare the log file

In [None]:
# Show the path of the log that the next cell loads.
logs[0]
# logs[39]

In [None]:
# Load the first selected log and convert its traces to tokens.
log_read = pm.all_prep(logs[0])
tokens = lr.get_traces_as_tokens(log_read)

# Reference positions at every tenth of the token sequence (position 0 excluded);
# `y_true` is later handed to the drift-detection metrics and plots.
n_tokens = len(tokens)
drift_spacing = int(n_tokens / 10)
y_true = list(range(drift_spacing, n_tokens, drift_spacing))

# Total number of entries, then the number of distinct ones.
print(n_tokens)
print(len(tokens.unique()))

### Vector space representations

In [None]:
# Build each representation of `tokens` offered by the `lr` module, once at the
# activity level and once at the transition level.
activity_binary = lr.get_binary_representation(tokens)
transitions_binary = lr.get_binary_transitions_representation(tokens)

activity_frequency = lr.get_frequency_representation(tokens)
transitions_frequency = lr.get_frequency_transitions_representation(tokens)

activity_tfidf = lr.get_tfidf_representation(tokens)
transitions_tfidf = lr.get_tfidf_transitions_representation(tokens)

# Combined representations: transition columns first, activity columns after.
activity_transitions_frequency = pd.concat(
    [transitions_frequency, activity_frequency],
    axis="columns",
)
activity_transitions_binary = pd.concat(
    [transitions_binary, activity_binary],
    axis="columns",
)

### Trace Clustering - Transitions Binary

In [None]:
# run_df = off_sc.run_offline_clustering_window(
# #     KMeans(n_clusters=3, random_state=42),
#     DBSCAN(eps=2, min_samples=3, metric='euclidean'),
#     75,
#     transitions_binary,
#     sliding_window=False,
#     sliding_step=1
# )

##### Features from the evolution of trace clustering

In [None]:
# run_df['std_diff_centroids'].plot(figsize=(16,4), c='red')
# plts.plot_drift_vertical_lines(len(activity_binary), label="True drift")
# plt.legend();

In [None]:
# run_df['avg_dist_intra_cluster'].plot(figsize=(16,4), c='red')
# plts.plot_drift_vertical_lines(len(activity_binary), label="True drift")
# plt.legend();

### Trace Clustering - Activity Binary

In [None]:
# clustering_window_size = 125

# run_df = off_sc.run_offline_clustering_window(
# #     KMeans(n_clusters=3, random_state=42),
#     DBSCAN(eps=2, min_samples=3, metric='euclidean'),
#     clustering_window_size,
#     activity_binary,
#     sliding_window=False,
#     sliding_step=1
# )

##### Features from the evolution of trace clustering

In [None]:
# run_df['avg_dist_between_centroids'].plot(figsize=(16,4))
# plts.plot_drift_vertical_lines(len(activity_binary), label="True drift")
# plt.legend();

In [None]:
# run_df['Silhouette'].plot(figsize=(16,4))
# plts.plot_drift_vertical_lines(len(activity_binary), label="True drift")
# plt.legend();

### Trace Clustering

In [None]:
# Window size handed to the offline clustering run, the drift metrics and the
# drift localization below.
clustering_window_size = 125

In [None]:
# Alternative estimators that were tried (kept for reference):
# model = DBSCAN(eps=0.1, min_samples=3, metric='euclidean')#, metric='manhattan', 'cosine', 'euclidean')
# model = DBSCAN(eps=0.5, min_samples=3, metric='cosine')#, metric='manhattan', 'cosine', 'euclidean')
# model = DBSCAN(eps=0.5, min_samples=3, metric='euclidean')
# model= hdbscan.HDBSCAN(gen_min_span_tree=True) #, min_cluster_size=5, allow_single_cluster=True

# Clustering estimator used for every window. KMeans initialisation is random,
# and `random.seed(42)` at the top of the notebook does not seed scikit-learn,
# so pin `random_state` to make the clustering (and every metric derived from
# it) reproducible under Restart & Run All.
model = KMeans(n_clusters=2, random_state=42)

In [None]:
# X=vector_representation[0:250]
# test=model.fit_predict(X)
# test

# from DBCV import DBCV
# from scipy.spatial.distance import euclidean
# DBCV.DBCV(X, test, dist_function=euclidean)

In [None]:

# "activity_binary": lr.get_binary_representation,
# "activity_frequency": lr.get_frequency_representation,

# "transitions_binary": lr.get_binary_transitions_representation,
# "transitions_frequency": lr.get_frequency_transitions_representation,

# "activity_tfidf": lr.get_tfidf_representation,
# "transitions_tfidf": lr.get_tfidf_transitions_representation,

# "activity_transitions_frequency": lr.get_activity_transitions_frequency_representation,
# "activity_transitions_binary": lr.get_activity_transitions_binary_representation

In [None]:
# Run the windowed offline clustering on the tokenised traces.
# Positional arguments, in order: the tokens, the representation function
# (activity + transitions, binary), the clustering estimator and the window size.
# NOTE(review): sliding_step=1 is presumably ignored while sliding_window=False -
# confirm against off_sc.run_offline_clustering_window.
run_df = off_sc.run_offline_clustering_window(
    tokens,
    lr.get_activity_transitions_binary_representation,
    model,
    clustering_window_size,
#     activity_binary,
    sliding_window=False,
    sliding_step=1
)

In [None]:
# Display the result of the windowed clustering run.
run_df

In [None]:
# Plot every column of `run_df` in its own figure, with the reference drift
# positions drawn as vertical lines.
for column in run_df.columns:
    print(column)
    try:
        run_df[column].plot(figsize=(16, 4), subplots=True)
        plts.plot_drift_vertical_lines(len(activity_binary), label="True drift")
        plt.legend()
        plt.show()
    except Exception as exc:
        # Best effort: a column that cannot be plotted (presumably non-numeric
        # ones such as `centroids`) is skipped, but the reason is reported
        # instead of being silently discarded. Catching `Exception` rather than
        # using a bare `except` keeps KeyboardInterrupt working.
        print(f"  skipped {column!r}: {type(exc).__name__}: {exc}")
        # Discard any partially built figure so it is not rendered later.
        plt.close("all")

### Drift Detection

In [None]:
# Column of `run_df` that drift detection is run on (alternatives kept commented).
# metric = "avg_cluster_std"
metric = "diff_cluster_std"
# metric= "k"

# Detect drifts on the chosen metric. `info` is read in the plotting cell below
# through its 'means', 'lowers' and 'uppers' entries.
# NOTE(review): the meaning of rolling_window / std_tolerance / min_tol is defined
# in dd.detect_concept_drift - confirm there before tuning.
drifts, not_drifts,  info = dd.detect_concept_drift(
    run_df,
    metric,
    rolling_window=3,
    std_tolerance=3,
    min_tol=0.003,
    verbose=False
)

# Score the detections against the reference positions in `y_true`; as the last
# expression of the cell, the result is displayed.
dd.get_metrics(drifts, not_drifts, y_true, window_size=clustering_window_size)

In [None]:
# Plot the monitored metric together with the detected drifts, the reference
# drifts and the means / lower / upper series returned by the detector.
# NOTE(review): save_png="" presumably means "do not write a PNG" - confirm in
# plts.plot_deteccao_drift.
plts.plot_deteccao_drift(
    run_df,
    metric,
    drifts,
    y_true,
    info['means'],
    info['lowers'],
    info['uppers'],
    save_png=""
)

### Drift Localization

In [None]:
# Compare the centroids stored at run_df index 500 with those at index 625.
# NOTE(review): 500 and 625 are hard-coded and differ by exactly
# clustering_window_size (125); both labels presumably stop existing if the
# window size changes - confirm and derive them from clustering_window_size.
# NOTE(review): run_df was built with the activity+transitions binary
# representation, but the feature names passed here are activity_binary.columns
# only - verify that they match the centroid dimensions.
dl.localize_drift(
    run_df.centroids.loc[500], 
    run_df.centroids.loc[625], 
    activity_binary.columns
)

In [None]:
# Drift localization evaluated at the ground-truth drift positions.
# Every reference position is moved forward by one clustering window before
# being looked up (presumably because run_df rows are labelled by the end of
# their window - TODO confirm against off_sc.run_offline_clustering_window).
y_true_shifted = [position + clustering_window_size for position in y_true]

dl.localize_all_drifts(
    run_df,
    y_true_shifted,
    clustering_window_size,
    activity_binary.columns,
)

In [None]:
# Result of drift localization in all predicted drifts
# (`drifts` comes from dd.detect_concept_drift in the drift-detection cell).
# NOTE(review): run_df was built with the activity+transitions binary
# representation, while the feature names passed here are activity_binary.columns
# only - verify that they match the centroid dimensions.

dl.localize_all_drifts(
    run_df,
    drifts, 
    clustering_window_size,
    activity_binary.columns
)