In [1]:
from Controller import TrainingController, ExplainingController
from Utils.SaveUtils import load_parameters
from Parameters import TrainingParameters, PredictingParameters
import tensorflow as tf
from IPython.core.display import display, HTML
import json
import numpy as np
import pandas as pd
from Parameters.Enums import TracePermutationStrategies
from Models import DiCEBinaryOutputModelWithResource
from itertools import chain

import dice_ml
from dice_ml.utils import helpers

# Report the TensorFlow build in use and whether eager mode is active (the
# output recorded in this notebook shows eager execution reported as True).
print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly())

# Seed NumPy and TensorFlow with one shared value so stochastic steps repeat.
seed = 123
np.random.seed(seed)
tf.random.set_seed(seed)

TF version:  2.4.0-rc0
Eager execution enabled:  True


In [2]:
# Directory of the saved model used throughout this notebook (tagged "AOW" by the author).
folder_path = "./SavedModels/%s" % "0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443"

In [3]:
# Load the parameter dictionary stored under folder_path and rebuild the
# training parameters object from it.
parameters_json = load_parameters(folder_path=folder_path)
parameters = TrainingParameters(**parameters_json)

# Re-seed both libraries with the dataset split seed carried by the loaded
# parameters (this supersedes the seed set in the first cell).
split_seed = parameters.dataset_split_seed
tf.random.set_seed(split_seed)
np.random.seed(split_seed)

# Point both parameter objects at the same saved-model folder.
predicting_parameters = PredictingParameters()
parameters.load_model_folder_path = predicting_parameters.load_model_folder_path = folder_path
 

In [4]:
# Build the training-side controller from the loaded parameters; the output
# recorded below shows it loading the preprocessed dataset and the saved model.
trainer = TrainingController(parameters = parameters)


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Preprocessed data loaded successfully: ./datasets/preprocessed/BPI_Challenge_2012_with_resource/AOW 

| Model loaded successfully from: ./SavedModels/0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443  


In [5]:
# Build the explaining-side controller from the same training parameters plus
# the predicting parameters; the output recorded below shows it loading the
# saved model from the same folder.
explainer = ExplainingController(parameters=parameters, predicting_parameters= predicting_parameters)


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Model loaded successfully from: ./SavedModels/0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443  


In [6]:
########### Get example data from trainer ###########
# Flatten the batched test dataset into individual elements and keep them in
# ascending order.
ordered_test_idx = sorted(trainer.test_dataset.unbatch().as_numpy_iterator())
print("Test set length: %d" %(len(ordered_test_idx)))

Test set length: 1309


In [7]:
# Scan the ordered test indices for traces that contain the
# 'A_DECLINED_COMPLETE' activity. Every matching index is collected in
# all_decline_idxs, and the index whose collated trace is longest is tracked
# in longest_fail_idx / longest_fail_length.
longest_fail_idx = None
longest_fail_length = 0
all_decline_idxs = []
declined_idx = trainer.model.vocab.vocab_to_index('A_DECLINED_COMPLETE')
for idx in ordered_test_idx:
    # Collate a single-case batch; only the activity matrix is inspected here.
    caseids, example_data, example_lengths, example_resources, example_amount, _ = trainer.dataset.collate_fn([idx])
    if declined_idx not in example_data[0]:
        continue
    all_decline_idxs.append(idx)
    # The first match always becomes the current longest; later matches
    # replace it only when their trace is strictly longer.
    if longest_fail_idx is None or example_data.shape[1] > longest_fail_length:
        longest_fail_idx = idx
        longest_fail_length = example_data.shape[1]

In [8]:
# Collate the single case at dataset index 8464 as the running example.
# NOTE(review): 8464 is hard-coded; longest_fail_idx computed by the scan in
# the previous cell is not referenced here — confirm whether this index was
# taken from it.
caseids, example_data, example_lengths, example_resources, example_amount, _ = trainer.dataset.collate_fn([8464])

In [9]:
# Shape of the collated activity matrix for the example case
# (the recorded output is (1, 54)).
example_data.shape

(1, 54)

In [10]:
# Decode the example's activity indices back into their vocabulary strings
# for inspection.
explainer.vocab.list_of_index_to_vocab_2d(example_data)

[['<SOS>',
  'A_SUBMITTED_COMPLETE',
  'A_PARTLYSUBMITTED_COMPLETE',
  'A_PREACCEPTED_COMPLETE',
  'W_Afhandelen leads_COMPLETE',
  'W_Completeren aanvraag_COMPLETE',
  'A_ACCEPTED_COMPLETE',
  'O_SELECTED_COMPLETE',
  'A_FINALIZED_COMPLETE',
  'O_CREATED_COMPLETE',
  'O_SENT_COMPLETE',
  'W_Completeren aanvraag_COMPLETE',
  'O_SELECTED_COMPLETE',
  'O_CANCELLED_COMPLETE',
  'O_CREATED_COMPLETE',
  'O_SENT_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'O_CANCELLED_COMPLETE',
  'O_SELECTED_COMPLETE',
  'O_CREATED_COMPLETE',
  'O_SENT_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'O_SENT_BACK_COMPLETE',
  'W_Nabellen offertes_COMPLETE',
  'W_Valideren aanvraag_COMPLETE',
  'W_Nabellen incomplete dossiers_COMPLETE',
  'W_Nabellen incomplete dossiers_COMPLETE',
  'W_Nabellen incomplete dossiers_COMPLETE',
  'W_Nabellen incomplete dossiers_COMPLETE',
  'W

In [11]:
# Number of trailing positions to cut from the example before prediction.
remove_trail_steps = 2


def _drop_trailing_steps(batch, n_steps):
    """Return a one-row array holding batch[0] without its last n_steps entries."""
    return np.array([batch[0][:-n_steps]])


example_idx_trace = _drop_trailing_steps(example_data, remove_trail_steps)
example_idx_resources = _drop_trailing_steps(example_resources, remove_trail_steps)
# Vocabulary strings of the truncated activity trace (first and only row).
example_vocab_trace = trainer.model.vocab.list_of_index_to_vocab_2d(example_idx_trace)[0]

In [12]:
# Wrap the truncated trace, its resources and the case amount as constant
# tensors, in that order.
example_input, example_resources_input, example_amount_input = map(
    tf.constant,
    (example_idx_trace, example_idx_resources, example_amount),
)

In [13]:
# Run the loaded model on the truncated example. Only the first returned
# value is kept; the second is discarded.
out, _ = explainer.model(example_input, input_resources = example_resources_input, amount = example_amount_input)

In [14]:
# Pick the model output at the last position of the truncated trace and lay it
# out as a one-row table with one column per vocabulary entry.
last_step = len(example_vocab_trace) - 1
predicted_vocab_distributions = tf.gather(out, last_step, axis=1)
predicted_vocab_distributions_df = pd.DataFrame(
    predicted_vocab_distributions.numpy().tolist(),
    columns=explainer.model.vocab.vocabs,
)

# Index and vocabulary string of the highest-scoring entry.
max_arg = tf.math.argmax(predicted_vocab_distributions, axis=-1).numpy()[0]
max_prob_vocab = explainer.model.vocab.index_to_vocab(max_arg)
max_prob = predicted_vocab_distributions[0][max_arg].numpy()

print("Predicted activity with highest probability (%.2f) is \"%s\"" % (max_prob, max_prob_vocab))
display(predicted_vocab_distributions_df)
print(predicted_vocab_distributions_df.iloc[0])

Predicted activity with highest probability (1.00) is "A_DECLINED_COMPLETE"


Unnamed: 0,<PAD>,<EOS>,<SOS>,A_ACCEPTED_COMPLETE,A_ACTIVATED_COMPLETE,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_DECLINED_COMPLETE,A_FINALIZED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,...,O_DECLINED_COMPLETE,O_SELECTED_COMPLETE,O_SENT_BACK_COMPLETE,O_SENT_COMPLETE,W_Afhandelen leads_COMPLETE,W_Beoordelen fraude_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen offertes_COMPLETE,W_Valideren aanvraag_COMPLETE
0,3.966854e-09,6.010553e-08,1.303227e-07,0.000144,4.8e-05,0.000111,0.001052,0.997226,1.35893e-07,1.647052e-09,...,9.847422e-10,3.4e-05,3e-06,1.512201e-10,0.00021,6e-06,0.000151,0.000177,1e-06,2.5e-05


<PAD>                                      3.966854e-09
<EOS>                                      6.010553e-08
<SOS>                                      1.303227e-07
A_ACCEPTED_COMPLETE                        1.441261e-04
A_ACTIVATED_COMPLETE                       4.792645e-05
A_APPROVED_COMPLETE                        1.108822e-04
A_CANCELLED_COMPLETE                       1.052391e-03
A_DECLINED_COMPLETE                        9.972256e-01
A_FINALIZED_COMPLETE                       1.358930e-07
A_PARTLYSUBMITTED_COMPLETE                 1.647052e-09
A_PREACCEPTED_COMPLETE                     6.247066e-04
A_REGISTERED_COMPLETE                      1.890604e-05
A_SUBMITTED_COMPLETE                       1.207367e-07
O_ACCEPTED_COMPLETE                        1.657744e-04
O_CANCELLED_COMPLETE                       1.054919e-06
O_CREATED_COMPLETE                         1.086533e-07
O_DECLINED_COMPLETE                        9.847422e-10
O_SELECTED_COMPLETE                        3.429

In [15]:
# Positions of the special marker entries inside the resource list; these are
# filtered out of the example's resources in the next cell.
indexes_to_remove_from_resource = [
    explainer.resources.index(tag) for tag in ('<PAD>', '<SOS>', '<EOS>')
]

In [16]:
# Activities: drop every index reported by the vocabulary's tags_idx(), then
# decode what remains into vocabulary strings.
example_idx_trace_without_tags = [
    activity_idx
    for activity_idx in example_idx_trace[0]
    if activity_idx not in explainer.vocab.tags_idx()
]
example_vocab_trace_without_tags = explainer.model.vocab.list_of_index_to_vocab(
    example_idx_trace_without_tags
)

# Resources: drop the <PAD>/<SOS>/<EOS> positions, then look up the names.
example_idx_resources_without_tags = [
    resource_idx
    for resource_idx in example_idx_resources[0]
    if resource_idx not in indexes_to_remove_from_resource
]
example_vocab_resource_without_tags = [
    explainer.resources[resource_idx]
    for resource_idx in example_idx_resources_without_tags
]

# Print each cleaned component under its banner.
for banner, payload in (
    ("=============Example activities without tags=============", example_vocab_trace_without_tags),
    ("=============Example resources without tags=============", example_vocab_resource_without_tags),
    ("=================Amount=================", example_amount),
):
    print(banner)
    print(payload)

['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COM

In [17]:
# One categorical feature per trace position, for activities and for resources.
n_trace_steps = len(example_vocab_trace_without_tags)

# Activity features: every position may take any vocabulary entry that does
# not contain "<" (i.e. not written like a <...> marker).
activity_feature_names = np.array(
    ["step_%d_activity" % step for step in range(1, n_trace_steps + 1)]
)
without_tags_vocabs = [vocab for vocab in explainer.vocab.vocabs if "<" not in vocab]
activity_cat_vars_without_tag = {
    name: without_tags_vocabs for name in activity_feature_names
}

# Resource features: same construction over the resource list.
resource_feature_names = np.array(
    ["step_%d_resource" % step for step in range(1, n_trace_steps + 1)]
)
without_tags_resources = [resource for resource in explainer.resources if "<" not in resource]
resources_cat_vars_without_tag = {
    name: without_tags_resources for name in resource_feature_names
}

In [18]:
############ Setting up desired activity ############
# Options noted by the author: A_DECLINED_COMPLETE, A_APPROVED_COMPLETE
desired_activity = 'A_APPROVED_COMPLETE'
print("Desired activity is \"%s\"" % desired_activity)

Desired activity is "A_APPROVED_COMPLETE"


In [19]:
# Merge the per-step categorical value lists (activities first, then
# resources) and append the numeric range allowed for the amount feature.
all_features_and_range = dict(activity_cat_vars_without_tag)
all_features_and_range.update(resources_cat_vars_without_tag)
all_features_and_range["amount"] = [0, 99999.0]

In [20]:
# Wrap the loaded model in the project's DiCE adapter, passing the desired
# activity's index, the tag-free trace length and value lists, the <SOS>
# indices for activities and resources, and the amount bounds.
dice_binary_model = DiCEBinaryOutputModelWithResource(
    explainer.model,
    explainer.vocab,
    desired=explainer.vocab.vocab_to_index(desired_activity),
    trace_length=len(example_vocab_trace_without_tags),
    without_tags_vocabs=without_tags_vocabs,
    without_tags_resources=without_tags_resources,
    resources=explainer.resources,
    sos_idx_activity=explainer.vocab.vocab_to_index("<SOS>"),
    sos_idx_resource=explainer.resources.index('<SOS>'),
    amount_min=all_features_and_range['amount'][0],
    amount_max=all_features_and_range['amount'][1],
)

In [21]:
# Describe the feature space to DiCE from metadata only (value lists/ranges),
# with 'amount' as the single continuous feature and "predicted" as outcome.
d = dice_ml.Data(
    features=all_features_and_range,
    outcome_name="predicted",
    continuous_features=['amount'],
)
# Register the adapter model under the TF2 backend and build the explainer.
m = dice_ml.Model(
    model=dice_binary_model,
    backend="TF2",
)
exp = dice_ml.Dice(d, m)

In [22]:
# Column order for the query row: all activity steps, all resource steps,
# then the amount.
feature_names = list(
    chain(
        activity_feature_names.tolist(),
        resource_feature_names.tolist(),
        ['amount'],
    )
)

In [23]:
# Single query row: activities, then resources, then the amount — the same
# order as feature_names. The three parts are joined with `+`.
query_instance = [
    example_vocab_trace_without_tags
    + example_vocab_resource_without_tags
    + example_amount
]
example_df = pd.DataFrame(data=query_instance, columns=feature_names)

In [25]:
# Ask DiCE for a single counterfactual of the example row: every feature may
# vary, the target is the class opposite to the current prediction, and the
# search runs for between min_iter and max_iter steps with the "RandomInitCF"
# algorithm. The commented-out keyword arguments below are tuning options the
# author left disabled. The recorded run took about two minutes on CPU.
dice_exp = exp.generate_counterfactuals(
        example_df,
        total_CFs=1,
        verbose=True,
        min_iter=100,
        max_iter=3000,
        desired_class="opposite",
        features_to_vary="all",
        algorithm = "RandomInitCF", # DiverseCF, RandomInitCF
        # proximity_weight=0.1, #0.5,
        # diversity_weight=50000,#1.0,
        # init_near_query_instance=True,
        # tie_random = True,
        # categorical_penalty = 1,
        # learning_rate=0.0005,
    )

INFO:root: MAD is not given for feature amount, so using 1.0 as MAD instead.
step 1,  loss=6469.06
step 51,  loss=0.544609
step 101,  loss=0.0280896
Diverse Counterfactuals found! total time taken: 02 min 18 sec


In [33]:
#### Check what permutations are fed into the model. ####
# Stack every trace the adapter recorded once, up front: the stacked array
# does not depend on the step index, so rebuilding it per step was wasted work.
# NOTE(review): np.squeeze also drops the leading axis when only one trace was
# recorded, which would break the [:, step] indexing below — confirm.
received_traces = np.squeeze(np.array(dice_binary_model.all_trace))

# For each step, the distinct activities that were fed to the model there.
all_received_activities = []
for step in range(len(example_vocab_trace_without_tags)):
    distinct_activity_idxs = list(set(received_traces[:, step].tolist()))
    all_received_activities.append(
        explainer.model.vocab.list_of_index_to_vocab(distinct_activity_idxs)
    )
print(all_received_activities)

[['W_Nabellen offertes_COMPLETE', 'A_SUBMITTED_COMPLETE'], ['A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'A_SUBMITTED_COMPLETE'], ['A_PREACCEPTED_COMPLETE', 'O_SENT_COMPLETE'], ['O_SELECTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'O_ACCEPTED_COMPLETE'], ['A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_CANCELLED_COMPLETE'], ['O_SELECTED_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_SUBMITTED_COMPLETE'], ['O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'W_Valideren aanvraag_COMPLETE'], ['A_FINALIZED_COMPLETE', 'A_CANCELLED_COMPLETE', 'O_CANCELLED_COMPLETE'], ['O_DECLINED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE'], ['O_DECLINED_COMPLETE', 'O_SENT_COMPLETE', 'O_ACCEPTED_COMPLETE'], ['A_SUBMITTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE'], ['O_DECLINED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_REGISTERED_COMPLETE'], ['A_ACCEPTED_COMPLETE', 'A_SUBMITTED_COMPLETE', 'O_CANCELLED_COMPLETE'], ['O_SENT_COMPLETE', 'O_ACCEPTED_COMPLETE',

In [34]:
#### Check what permutations are fed into the model. ####
# Stack every resource sequence the adapter recorded once, up front: the
# stacked array does not depend on the step index, so rebuilding it per step
# was wasted work.
# NOTE(review): np.squeeze also drops the leading axis when only one sequence
# was recorded, which would break the [:, step] indexing below — confirm.
received_resources = np.squeeze(np.array(dice_binary_model.all_resource))

# For each step, the distinct resources that were fed to the model there.
all_received_resources = []
for step in range(len(example_vocab_trace_without_tags)):
    distinct_resource_idxs = list(set(received_resources[:, step].tolist()))
    all_received_resources.append(
        [explainer.resources[resource_idx] for resource_idx in distinct_resource_idxs]
    )
print(all_received_resources)

[['10138', '11269', '112'], ['112', '10861', '11299'], ['10880', '10779', '10863'], ['10809', '11181', '10863'], ['11169', '11269', '11002'], ['11202', '11003', '11122'], ['10880', '10861', '11003'], ['10779', '10125', '11003'], ['10982', '11079', '11003'], ['10931', '11003', '10629'], ['11203', '11180', '11003'], ['10972', '11304', '11003'], ['10914', '11309', '11003'], ['10982', '11003', '11120'], ['10809', '11003', '10629'], ['10929', '11079', '11003'], ['10124', '11003', '10910', '11299'], ['11119', '11003', '11019'], ['10914', '10124', '11269', '11003'], ['10779', '10124', '11079', '11003'], ['112', '11111', '11003'], ['10932', '10859', '11003'], ['10138', '11120', '10913'], ['11180', '10138', '10935'], ['11304', '11203', '11000'], ['11203', '10899', '10912'], ['10899', '112', '11201'], ['10899', '11269'], ['10880', '11202', '10899'], ['10933', '112', '11121'], ['11000', '11189', '11019'], ['11179', '11181', '10861'], ['10125', '10909', '10228'], ['11049', '10909', '10609'], ['113

In [35]:
########## Checking if all the permutation are the same (Shouldn't be) ##########
# Compare the COMPLETE recorded inputs, every column included. The earlier
# expression sliced off column 0 with [:, 1:], which made this check identical
# to the "except amount" check in the following cell.
# The input at position 1 is the reference, as before. Positions 2.. are
# compared against it, and position 0 is still left out of the comparison.
# Only the first row of each element-wise comparison is inspected, as before.
all_cf_are_same = all(
    all((dice_binary_model.all_cf_input[1] == dice_binary_model.all_cf_input[i]).tolist()[0])
    for i in range(1, len(dice_binary_model.all_cf_input))
)
print("All inputs are the same: %s" % (all_cf_are_same))

All inputs are the same: False


In [36]:
########## Checking if all the trace & resource (except amount) are the same (Shouldn't be) ##########
# Column 0 is sliced away before comparing — presumably that column carries
# the amount in the recorded input layout; TODO confirm.
# The input at position 1 is the reference; only the first row of each
# element-wise comparison is inspected, and the result for position 0 is
# discarded before the final reduction.
first_row_matches = []
for position in range(len(dice_binary_model.all_cf_input)):
    columns_equal = (
        dice_binary_model.all_cf_input[1][:, 1:]
        == dice_binary_model.all_cf_input[position][:, 1:]
    )
    first_row_matches.append(all(columns_equal.tolist()[0]))
all_trace_and_resource_are_same = all(first_row_matches[1:])
print("All trace and resource are the same: %s" % (all_trace_and_resource_are_same))

All trace and resource are the same: False


In [37]:
# Render the query instance and the generated counterfactual as tables, with
# show_only_changes=True and the sparse variant of the table turned off.
dice_exp.visualize_as_dataframe(show_only_changes=True, display_sparse_df=False)

Query instance (original outcome : 1)


Unnamed: 0,step_1_activity,step_2_activity,step_3_activity,step_4_activity,step_5_activity,step_6_activity,step_7_activity,step_8_activity,step_9_activity,step_10_activity,...,step_44_resource,step_45_resource,step_46_resource,step_47_resource,step_48_resource,step_49_resource,step_50_resource,step_51_resource,amount,predicted
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,W_Afhandelen leads_COMPLETE,W_Completeren aanvraag_COMPLETE,A_ACCEPTED_COMPLETE,O_SELECTED_COMPLETE,A_FINALIZED_COMPLETE,O_CREATED_COMPLETE,O_SENT_COMPLETE,...,11000,11169,11122,10982,11003,11003,11049,10972,5800.0,0.997



Diverse Counterfactual set without sparsity correction since only metadata about each  feature is available (new outcome:  0.0


Unnamed: 0,step_1_activity,step_2_activity,step_3_activity,step_4_activity,step_5_activity,step_6_activity,step_7_activity,step_8_activity,step_9_activity,step_10_activity,...,step_44_resource,step_45_resource,step_46_resource,step_47_resource,step_48_resource,step_49_resource,step_50_resource,step_51_resource,amount,predicted
0,W_Nabellen offertes_COMPLETE,A_SUBMITTED_COMPLETE,O_SENT_COMPLETE,O_ACCEPTED_COMPLETE,O_CANCELLED_COMPLETE,O_SELECTED_COMPLETE,W_Valideren aanvraag_COMPLETE,O_CANCELLED_COMPLETE,O_DECLINED_COMPLETE,O_ACCEPTED_COMPLETE,...,10982,10779,-,112,10889,10910,11120,10629,6185.0,0.0


In [38]:
######## Print the example trace ########
# First (and only) row of the query frame, as a plain list.
example_row = example_df.iloc[0]
print([*example_row])

['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COM

In [39]:
######## Print the counterfactual trace ########
# First counterfactual row with its final column left out, as a plain list.
counterfactual_row = dice_exp.final_cfs_df.iloc[0]
print([*counterfactual_row[:-1]])

['W_Nabellen offertes_COMPLETE', 'A_SUBMITTED_COMPLETE', 'O_SENT_COMPLETE', 'O_ACCEPTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_DECLINED_COMPLETE', 'O_ACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'O_DECLINED_COMPLETE', 'A_SUBMITTED_COMPLETE', 'O_SENT_COMPLETE', 'O_SENT_COMPLETE', 'A_ACTIVATED_COMPLETE', 'A_ACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_APPROVED_COMPLETE', 'O_CREATED_COMPLETE', 'A_DECLINED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_DECLINED_COMPLETE', 'A_ACTIVATED_COMPLETE', 'A_REGISTERED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'A_SUBMITTED_COMPLETE', 'O_CREATED_COMPLETE', 'A_SUBMITTED_COMPLETE', 'W_Beoordelen fraude_COMPLETE', 'O_SELECTED_COMPLETE', 'A_ACCEPTED_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'A_CANCELLED_COMPLETE', 'A_REGISTERED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_CREATED_COMPLETE', 'A_P