# Trying out calculations on the BPIC2019 event log

In [1]:
import numpy as np
import pandas as pd

from replearn.eventlog import EventLog

from replearn.embedding_predict import EmbeddingPredict
from replearn.autoencoder import AutoencoderRepresentation
from replearn.doc2vec import Doc2VecRepresentation

from replearn.clustering import Clustering
import random
from replearn.evaluation import Evaluation
import pm4py

## Load event log

In [2]:
# Location of the BPIC 2019 XES file, relative to this notebook
event_log_path = '../logs/BPIC2019'
file_name = 'BPI_Challenge_2019.xes'

# Attribute selection:
#   - case attributes: None means "auto-detect attributes"
#   - event attributes: the activity name ('concept:name') and the 'user' attribute
case_attributes = None
event_attributes = ['concept:name', 'user']

# 'Item Type' serves as the true cluster label
true_cluster_label = 'Item Type'

# Build the EventLog wrapper and read the file
event_log = EventLog(
    file_name,
    case_attributes=case_attributes,
    event_attributes=event_attributes,
    true_cluster_label=true_cluster_label,
)
# NOTE(review): the meaning of the second positional argument (False) is defined
# by EventLog.load and is not visible in this notebook -- confirm before changing.
event_log.load('/'.join((event_log_path, file_name)), False)



parsing log, completed traces ::   0%|          | 0/251734 [00:00<?, ?it/s]

In [4]:
# Filter out the cases whose "Item Type" is "Standard" (retain=False drops the
# matching traces instead of keeping them).
# The values argument is passed as a list: pm4py expects a collection of values,
# and a bare string would be matched by substring (`in`) rather than by equality.
event_log._event_log = pm4py.filter_trace_attribute_values(
    event_log._event_log,
    "Item Type",
    ["Standard"],
    retain=False,
)
# Re-run preprocessing on the filtered log
event_log.preprocess()

In [5]:
# method def: Autoencoder = 1, Trace2Vec = 2, Case2Vec(event) = 3, Case2Vec(event+case) = 4, LSTMClust = 5, GRUClust = 6
def getResults(method=6, vector_size=32, cluster_alg='agglomerative', dist='cosine',
               *, n_epochs=10, n_batch_size=64, n_clusters=5, infer_epochs=50):
    """Learn a trace representation, cluster it, and evaluate the clusters.

    Works on the notebook-level ``event_log`` object.

    Parameters
    ----------
    method : int
        Representation to use: 1 = Autoencoder, 2 = Trace2Vec,
        3 = Case2Vec(event), 4 = Case2Vec(event+case), 5 = LSTMClust,
        6 = GRUClust.
    vector_size : int
        Size handed to the representation model (``encoder_dim`` for the
        autoencoder, ``vector_size`` for Doc2Vec, ``embedding_dim`` and
        ``gru_dim`` for the recurrent models).
    cluster_alg : str
        Clustering algorithm name, forwarded to ``Clustering.cluster``.
    dist : str
        Distance name, forwarded to ``Clustering.cluster``.
    n_epochs : int, keyword-only
        Training epochs for the representation model.
    n_batch_size : int, keyword-only
        Batch size for the autoencoder and the recurrent models.
    n_clusters : int, keyword-only
        Number of clusters requested from the clustering step and passed to
        the cluster evaluation.
    infer_epochs : int, keyword-only
        ``epochs`` argument of ``Doc2VecRepresentation.predict`` (methods 2-4).

    Returns
    -------
    dict
        Keys ``'Method'``, ``'Fitness'``, ``'Precision'``, ``'Simplicity'``
        (positions 0, 1, 2 of ``Evaluation.evaluate_clusters``' result) and
        ``'F1-BCubed'`` (position 2 of ``Clustering.evaluate``' result).

    Raises
    ------
    ValueError
        If ``method`` is not one of 1-6. Previously any unknown value silently
        ran the GRU model and was recorded under the wrong method id.
    """
    if method not in (1, 2, 3, 4, 5, 6):
        raise ValueError(
            "method must be one of 1 (Autoencoder), 2 (Trace2Vec), "
            "3 (Case2Vec event), 4 (Case2Vec event+case), 5 (LSTMClust), "
            "6 (GRUClust); got %r" % (method,)
        )

    # (append_case_attr, append_event_attr) for each Doc2Vec-based method
    doc2vec_flags = {2: (False, False), 3: (False, True), 4: (True, True)}
    # value of the `rnn` argument for each recurrent method
    rnn_kinds = {5: 'LSTM', 6: 'gru'}

    if method == 1:
        # get sequences from event log as one-hot feature vector;
        # only the second dimension is needed, as the model's input size
        sequences = event_log.event_attributes_flat_onehot_features_2d
        # init and train autoencoder
        autoencoder = AutoencoderRepresentation(event_log)
        autoencoder.build_model(sequences.shape[1], encoder_dim=vector_size)
        autoencoder.fit(batch_size=n_batch_size, epochs=n_epochs, verbose=True)
        # get feature vector
        feature_vector = autoencoder.predict()

    elif method in doc2vec_flags:
        append_case_attr, append_event_attr = doc2vec_flags[method]
        # train doc2vec model
        doc2vec = Doc2VecRepresentation(event_log)
        doc2vec.build_model(append_case_attr=append_case_attr, append_event_attr=append_event_attr,
                            vector_size=vector_size, concat=True, epochs=n_epochs)
        doc2vec.fit()
        # infer the vector from the model
        feature_vector = doc2vec.predict(epochs=infer_epochs)

    else:
        # init and train the recurrent model (LSTM for method 5, GRU for method 6)
        predictor = EmbeddingPredict(event_log)
        predictor.build_model(embedding_dim=vector_size, gru_dim=vector_size, rnn=rnn_kinds[method])
        predictor.fit(epochs=n_epochs, batch_size=n_batch_size, verbose=True)
        # get feature vector; the other two returned values are not used here
        _, feature_vector, _ = predictor.predict()

    # cluster feature vector
    cluster_analysis = Clustering(event_log)
    cluster_analysis.cluster(feature_vector, cluster_alg, n_clusters, dist)

    cluster_result = cluster_analysis.evaluate()
    print("F1-BCubed =", cluster_result[2])

    # evaluate the predicted clusters
    evaluation_a = Evaluation(event_log)
    results = evaluation_a.evaluate_clusters(n_clusters, cluster_analysis._pred_labels)

    data = {'Method': method, 'Fitness': results[0], 'Precision': results[1], 'Simplicity': results[2], 'F1-BCubed': cluster_result[2]}
    print("Replay results =", data)

    return data

In [6]:
# LSTMClust (method 5), vector_size=8
resultsLSTM = getResults(
    method=5,
    vector_size=8,
    cluster_alg='agglomerative',
    dist='cosine',
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1-BCubed = 0.7956694370905629




replaying log with TBR, completed variants ::   0%|          | 0/257 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/686 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/151 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/509 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/345 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1273 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/303 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/963 [00:00<?, ?it/s]

Replay results = {'Method': 5, 'Fitness': 0.6969116397435655, 'Precision': 0.7620515005438527, 'Simplicity': 0.46265066667973836, 'F1-BCubed': 0.7956694370905629}


In [7]:
# GRUClust (method 6), vector_size=64
resultsGRU = getResults(
    method=6,
    vector_size=64,
    cluster_alg='agglomerative',
    dist='cosine',
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


F1-BCubed = 0.7286312127726361




replaying log with TBR, completed variants ::   0%|          | 0/288 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/467 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1192 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/158531 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/2479 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/20612 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/270 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1108 [00:00<?, ?it/s]

Replay results = {'Method': 6, 'Fitness': 0.7341363391900212, 'Precision': 0.6825263253032546, 'Simplicity': 0.4585341376642275, 'F1-BCubed': 0.7286312127726361}


In [8]:
# Autoencoder (method 1), vector_size=32
resultsAutoencoder = getResults(
    method=1,
    vector_size=32,
    cluster_alg='agglomerative',
    dist='cosine',
)

  super(Adam, self).__init__(name, **kwargs)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
F1-BCubed = 0.5247398035684203




replaying log with TBR, completed variants ::   0%|          | 0/18 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/3 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/80 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/50 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/2611 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/173711 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/56 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/85 [00:00<?, ?it/s]

Replay results = {'Method': 1, 'Fitness': 0.7193770897159211, 'Precision': 0.682379405651207, 'Simplicity': 0.44398504473778244, 'F1-BCubed': 0.5247398035684203}


In [9]:
# Trace2Vec (method 2), vector_size=8
resultsTrace2Vec = getResults(
    method=2,
    vector_size=8,
    cluster_alg='agglomerative',
    dist='cosine',
)

F1-BCubed = 0.6462336186324773




replaying log with TBR, completed variants ::   0%|          | 0/232 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1068 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/736 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/8602 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/476 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/8137 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/460 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1855 [00:00<?, ?it/s]

Replay results = {'Method': 2, 'Fitness': 0.7683096131282454, 'Precision': 0.7573858114856874, 'Simplicity': 0.4802708344315793, 'F1-BCubed': 0.6462336186324773}


In [10]:
# Case2Vec(event) (method 3), vector_size=4; this run uses dist='euclid' rather than 'cosine'
resultsCase2Vec_event = getResults(
    method=3,
    vector_size=4,
    cluster_alg='agglomerative',
    dist='euclid',
)

F1-BCubed = 0.542980527021172




replaying log with TBR, completed variants ::   0%|          | 0/236 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/4112 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1759 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/33842 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/1793 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/130297 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/565 [00:00<?, ?it/s]

replaying log with TBR, completed variants ::   0%|          | 0/12471 [00:00<?, ?it/s]

Replay results = {'Method': 3, 'Fitness': 0.5445497595231859, 'Precision': 0.35074663743819645, 'Simplicity': 0.285358403086877, 'F1-BCubed': 0.542980527021172}


In [12]:
# Case2Vec(event+case) (method 4), vector_size=3, dist='euclid'.
# NOTE(review): the recorded output of this cell is
# "TypeError: sequence item 11: expected str instance, bool found", so
# resultsCase2Vec_event_case is never assigned in the saved run. Presumably a
# boolean attribute value is being joined as a string somewhere in the
# Doc2Vec path -- confirm in the replearn code.
resultsCase2Vec_event_case = getResults(
    method=4,
    vector_size=3,
    cluster_alg='agglomerative',
    dist='euclid',
)

TypeError: sequence item 11: expected str instance, bool found

In [13]:
# Collect the per-method result dicts into one table (one row per method).
# resultsCase2Vec_event_case is not part of this list.
allResults = pd.DataFrame(
    [
        resultsAutoencoder,
        resultsGRU,
        resultsTrace2Vec,
        resultsCase2Vec_event,
        resultsLSTM,
    ]
)

In [14]:
# Write the results table as a semicolon-separated, UTF-8 CSV without the index column
allResults.to_csv(
    'BPIC_2019_results.csv',
    sep=';',
    index=False,
    encoding='utf-8',
)