In [1]:
from Controller import TrainingController, TraceClassifierController
from Parameters import TrainingParameters
from Utils.SaveUtils import load_parameters
from Utils.PrintUtils import print_big
import tensorflow as tf
import json
from dice import EventLogDiCE
import numpy as np
import pandas as pd
import time
from Models import DiCEBinaryDefferentiable
from itertools import chain
from IPython.core.display import display, HTML
from Utils.DiCEHelpers import generate_fake_df, get_trace_with_id, get_longest_trace_row, remove_trail_steps, print_model_prediction_result, remove_tags_for_query_instance

In [2]:
folder_path = "./SavedModels/%s" % (
    "0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443" #AOW
)

In [3]:
### Initialise controllers
parameters_json = load_parameters(folder_path=folder_path)
parameters = TrainingParameters(**parameters_json)
tf.random.set_seed(parameters.dataset_split_seed)
np.random.seed(parameters.dataset_split_seed)
parameters.load_model_folder_path = folder_path
trainer = TrainingController(parameters = parameters)
trainer.show_model_info()


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Preprocessed data loaded successfully: ./datasets/preprocessed/BPI_Challenge_2012_with_resource/AOW 

| Model loaded successfully from: ./SavedModels/0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443  
Model: "baseline_lstm_with_resource"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  832       
_________________________________________________________________
embedding_1 (Embedding)      multiple                  2304      
_________________________________________________________________
lstm (LSTM)                  multiple                  24832     
_________________________________________________________________
lstm_1 (LSTM)                multiple                  33024     
_________________________________________________________________
lstm_2 (

In [4]:
########### Get example data from trainer ###########
ordered_test_idx = (list(trainer.test_dataset.unbatch().as_numpy_iterator()))
ordered_test_idx.sort()
print_big(len(ordered_test_idx), "Test set length")


| 1309 


In [5]:
########## Get longest delcined trace for testing ##########
declined_df = get_trace_with_id(trainer.dataset.df.iloc[ordered_test_idx], trainer.model.vocab.vocab_to_index('A_DECLINED_COMPLETE'))
longest_declined_trace_row = get_longest_trace_row(declined_df)
longest_declined_trace_row

Unnamed: 0,trace,trace_vocab,caseid,amount,resource,resource_orig
8464,"[2, 12, 9, 10, 20, 22, 3, 17, 8, 15, 19, 22, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",200028,5800.0,"[70, 53, 53, 15, 15, 48, 37, 37, 37, 37, 37, 3...","[<SOS>, 112, 112, 10863, 10863, 11169, 11003, ..."


In [6]:
_, example_activities, _, example_resources, example_amount, _ = trainer.dataset.collate_fn([longest_declined_trace_row.index[0]])

In [7]:
 print("========================================Trace========================================")
 print(trainer.model.vocab.list_of_index_to_vocab_2d(example_activities))

[['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete do

In [8]:
tail_length_to_remove = 45
example_idx_activities, example_idx_resources = remove_trail_steps(example_activities, example_resources, tail_length_to_remove)
example_vocab_activities = trainer.model.vocab.list_of_index_to_vocab_2d(example_idx_activities)[0]
print("========================================Trace after substraction========================================")
print(example_vocab_activities)

['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE']


In [9]:
##### Get model output
example_activities_input = tf.constant(example_idx_activities)
example_resources_input = tf.constant(example_idx_resources)
example_amount_input = tf.constant(example_amount)

predicted_df = print_model_prediction_result(trainer.model, example_activities_input, example_resources_input, example_amount_input)


| Predicted activity with highest probability (0.51) is "O_CREATED_COMPLETE" 



<PAD>                                      0.000113
<EOS>                                      0.001951
<SOS>                                      0.000052
A_ACCEPTED_COMPLETE                        0.001554
A_ACTIVATED_COMPLETE                       0.000067
A_APPROVED_COMPLETE                        0.000172
A_CANCELLED_COMPLETE                       0.034913
A_DECLINED_COMPLETE                        0.001319
A_FINALIZED_COMPLETE                       0.003324
A_PARTLYSUBMITTED_COMPLETE                 0.000104
A_PREACCEPTED_COMPLETE                     0.000086
A_REGISTERED_COMPLETE                      0.000229
A_SUBMITTED_COMPLETE                       0.001744
O_ACCEPTED_COMPLETE                        0.000169
O_CANCELLED_COMPLETE                       0.000922
O_CREATED_COMPLETE                         0.506674
O_DECLINED_COMPLETE                        0.000688
O_SELECTED_COMPLETE               

In [10]:
predicted_df

Unnamed: 0,<PAD>,<EOS>,<SOS>,A_ACCEPTED_COMPLETE,A_ACTIVATED_COMPLETE,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_DECLINED_COMPLETE,A_FINALIZED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,...,O_DECLINED_COMPLETE,O_SELECTED_COMPLETE,O_SENT_BACK_COMPLETE,O_SENT_COMPLETE,W_Afhandelen leads_COMPLETE,W_Beoordelen fraude_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen offertes_COMPLETE,W_Valideren aanvraag_COMPLETE
0,0.000113,0.001951,5.2e-05,0.001554,6.7e-05,0.000172,0.034913,0.001319,0.003324,0.000104,...,0.000688,0.156691,0.003712,0.256108,0.00221,0.002508,0.018834,0.000858,0.002904,0.002096


In [11]:
no_need_tags = ['<EOS>', '<SOS>', '<PAD>']
example_idx_activities_no_tag, example_idx_resources_no_tag =  remove_tags_for_query_instance(example_idx_activities, example_idx_resources, trainer.model.vocab.tags_idx(), [trainer.model.resources.index(tag) for tag in no_need_tags])

example_vocab_trace_no_tag = trainer.model.vocab.list_of_index_to_vocab(example_idx_activities_no_tag)
example_vocab_resource_no_tag = [trainer.model.resources[r]  for r in example_idx_resources_no_tag]

example_trace_len_no_tag = len(example_vocab_trace_no_tag)
#### Determine feature names for DiCE ####
activity_feature_names = np.array(["activity_step_%d" % (i+1) for i in range(example_trace_len_no_tag)])
resource_feature_names = np.array(["resource_step_%d" % (i+1) for i in range(example_trace_len_no_tag)])

possible_activities = [ a for a in list(trainer.model.vocab.vocabs) if  not a in no_need_tags]
possible_resources =  [ r for r in list(trainer.model.resources) if  not r in no_need_tags]

possbile_amount = [min(trainer.dataset.df["amount"]), max(trainer.dataset.df["amount"])]

print_big(example_vocab_trace_no_tag ,title="Example activities without tags", num_marks = 30)
print_big(example_vocab_resource_no_tag ,title="Example resources without tags", num_marks = 30)
print_big(example_amount, title="=================Amount=================", num_marks=16)


| ['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE'] 

| ['112', '112', '10863', '10863', '11169', '11003', '11003', '11003'] 

| [5800.0] 


In [12]:
############ Setting up desired activity ############
desired_activity = 'A_DECLINED_COMPLETE' # A_DECLINED_COMPLETE, A_APPROVED_COMPLETE
print_big(desired_activity, "Desired activity")

dice_binary_model = DiCEBinaryDefferentiable(
    model=trainer.model,
    vocab=trainer.model.vocab,
    resources= trainer.model.resources,
    desired=trainer.model.vocab.vocab_to_index(desired_activity),
    trace_length = len(example_vocab_trace_no_tag),
    sos_idx_activity=trainer.model.vocab.vocab_to_index("<SOS>"),
    sos_idx_resource= trainer.model.resources.index('<SOS>'),
    amount_min = possbile_amount[0],
    amount_max = possbile_amount[1],
    possible_resources=possible_resources,
    possible_activities=possible_activities 
)


| A_DECLINED_COMPLETE 


In [13]:
fake_df = generate_fake_df(5000, activity_feature_names, resource_feature_names, possible_activities, possible_resources, possbile_amount, example_trace_len_no_tag)

In [14]:
scenario_folder_path = "./SavedModels/%s" % "0.9860_BPI2012WithResource_BaselineLSTMWithResource_2021-06-25 21:03:20.326179"
scenario_parameter_json = load_parameters(folder_path=scenario_folder_path)
scenario_parameter = TrainingParameters(**scenario_parameter_json)
parameters.load_model_folder_path = scenario_folder_path
scenarioController = TraceClassifierController(parameters = scenario_parameter)
scenarioController.show_model_info()


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Preprocessed data loaded successfully: ./datasets/preprocessed/BPI_Challenge_2012_valid_trace/All 
Model: "baseline_lstm_with_resource_valid_trace_cf"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      multiple                  832       
_________________________________________________________________
embedding_3 (Embedding)      multiple                  9216      
_________________________________________________________________
lstm_4 (LSTM)                multiple                  8320      
_________________________________________________________________
lstm_5 (LSTM)                multiple                  8320      
_________________________________________________________________
lstm_6 (LSTM)                multiple                  20608     
___________________________________________________________

In [15]:
dice = EventLogDiCE(scenarioController.model.activity_vocab, scenarioController.model.resource_vocab, possbile_amount, possible_activities, possible_resources, dice_binary_model, scenarioController.model)

In [16]:
_ = dice.run_pls(example_amount_input.numpy(), example_idx_activities_no_tag, example_idx_resources_no_tag, use_valid_cf_only=False, max_iter=50, scenario_weight=200, distance_loss_weight=2,lr=0.5)


| Prediction: [0] | Desired: [1] 

| Multiply! 


StopIteration: 

In [19]:
# get example from training set and test it.
# use the same input to feed in the model to see if it's the problem with embedding matrix multiplication.

In [22]:
scenarioController.model(example_activities_input, example_resources_input, example_amount_input)[0]

<tf.Tensor: shape=(1, 9, 1), dtype=float32, numpy=
array([[[-862.7488 ],
        [-862.749  ],
        [-862.7507 ],
        [-862.75134],
        [-862.7527 ],
        [-862.7536 ],
        [-862.75464],
        [-862.75543],
        [-862.7565 ]]], dtype=float32)>

In [23]:
example_activities_input

<tf.Tensor: shape=(1, 9), dtype=int32, numpy=array([[ 2, 12,  9, 10, 20, 22,  3, 17,  8]], dtype=int32)>

In [18]:
dice.scenario_out

<tf.Tensor: shape=(1, 9, 1), dtype=float32, numpy=
array([[[-862.7488 ],
        [-862.7478 ],
        [-862.74744],
        [-862.74744],
        [-862.7477 ],
        [-862.74817],
        [-862.74744],
        [-862.74695],
        [-862.74634]]], dtype=float32)>

In [18]:
tf.one_hot(dice.activity_vocab.sos_idx(), depth= len(dice.activity_vocab))[tf.newaxis: ]

<tf.Tensor: shape=(26,), dtype=float32, numpy=
array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32)>

In [17]:
### Check the first step has little loss since it should be a valid trace
# raise Stop in the first input.
# then feed the model input to the scenario identifier. to see if the scenario see this a valid input.

In [17]:
dice.activity_vocab.sos_idx()

2