In [1]:
from Controller import TrainingController, ExplainingController
from Utils.SaveUtils import load_parameters
from Parameters import TrainingParameters, PredictingParameters
import tensorflow as tf
from IPython.core.display import display, HTML
import json
import numpy as np
import pandas as pd
from Parameters.Enums import TracePermutationStrategies

# import DiCE
import dice_ml
from dice_ml.utils import helpers # helper functions


In [2]:
a_important_activities = ["A_ACTIVATED_COMPLETE", "A_APPROVED_COMPLETE", "A_REGISTERED_COMPLETE", "A_DECLINED_COMPLETE"]
o_important_activities = ["O_ACCEPTED_COMPLETE", "O_DECLINED_COMPLETE"]
all_important_vocabs = a_important_activities+o_important_activities

In [3]:
folder_path = "./SavedModels/%s" % (
# "0.8569_BPI2012_BaseLineLSTMModel_2021-05-28 19:02:39.442554" # OW
# "0.8324_BPI2012_BaseLineLSTMModel_2021-06-05 15:19:51.992793" # W
"0.8613_BPI2012_BaseLineLSTMModel_2021-06-07 17:36:01.075556" # AOW
)

In [4]:
parameters_json = load_parameters(folder_path=folder_path)
parameters = TrainingParameters(**parameters_json)
tf.random.set_seed(parameters.dataset_split_seed)
np.random.seed(parameters.dataset_split_seed)
parameters.load_model_folder_path = folder_path
predicting_parameters = PredictingParameters()
predicting_parameters.load_model_folder_path = folder_path

In [5]:
trainer = TrainingController(parameters = parameters)


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Preprocessed data loaded successfully: ./datasets/preprocessed/BPI_Challenge_2012/AOW 

| Model loaded successfully from: ./SavedModels/0.8613_BPI2012_BaseLineLSTMModel_2021-06-07 17:36:01.075556  


In [6]:
all_last_steps =  [i[-2] for i in trainer.dataset.df["trace"]]
last_df = pd.DataFrame(all_last_steps, columns=["last_step"])
ending_vocab_counts = {}
ending_value_count_dict = dict(last_df["last_step"].value_counts())
for k in ending_value_count_dict:
    vocab_key = trainer.model.vocab.index_to_vocab(k)
    ending_vocab_counts[vocab_key] = ending_value_count_dict[k]

In [7]:
ending_vocab_counts

{'A_DECLINED_COMPLETE': 3429,
 'W_Valideren aanvraag_COMPLETE': 2745,
 'W_Afhandelen leads_COMPLETE': 2234,
 'W_Completeren aanvraag_COMPLETE': 1939,
 'W_Nabellen offertes_COMPLETE': 1289,
 'A_CANCELLED_COMPLETE': 655,
 'W_Nabellen incomplete dossiers_COMPLETE': 452,
 'O_CANCELLED_COMPLETE': 279,
 'W_Beoordelen fraude_COMPLETE': 57,
 'W_Wijzigen contractgegevens_SCHEDULE': 4,
 'W_Valideren aanvraag_START': 2,
 'W_Nabellen offertes_START': 1,
 'A_REGISTERED_COMPLETE': 1}

In [8]:
print("Most common ending activities: ")
print(trainer.model.vocab.list_of_index_to_vocab(list(last_df["last_step"].value_counts().keys())))

Most common ending activities: 
['A_DECLINED_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'A_CANCELLED_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'O_CANCELLED_COMPLETE', 'W_Beoordelen fraude_COMPLETE', 'W_Wijzigen contractgegevens_SCHEDULE', 'W_Valideren aanvraag_START', 'W_Nabellen offertes_START', 'A_REGISTERED_COMPLETE']


In [9]:
trainer.model.vocab.vocab_dict

{'<EOS>': 1,
 '<SOS>': 2,
 'A_ACCEPTED_COMPLETE': 3,
 'A_ACTIVATED_COMPLETE': 4,
 'A_APPROVED_COMPLETE': 5,
 'A_CANCELLED_COMPLETE': 6,
 'A_DECLINED_COMPLETE': 7,
 'A_FINALIZED_COMPLETE': 8,
 'A_PARTLYSUBMITTED_COMPLETE': 9,
 'A_PREACCEPTED_COMPLETE': 10,
 'A_REGISTERED_COMPLETE': 11,
 'A_SUBMITTED_COMPLETE': 12,
 'O_ACCEPTED_COMPLETE': 13,
 'O_CANCELLED_COMPLETE': 14,
 'O_CREATED_COMPLETE': 15,
 'O_DECLINED_COMPLETE': 16,
 'O_SELECTED_COMPLETE': 17,
 'O_SENT_BACK_COMPLETE': 18,
 'O_SENT_COMPLETE': 19,
 'W_Afhandelen leads_COMPLETE': 20,
 'W_Afhandelen leads_SCHEDULE': 21,
 'W_Afhandelen leads_START': 22,
 'W_Beoordelen fraude_COMPLETE': 23,
 'W_Beoordelen fraude_SCHEDULE': 24,
 'W_Beoordelen fraude_START': 25,
 'W_Completeren aanvraag_COMPLETE': 26,
 'W_Completeren aanvraag_SCHEDULE': 27,
 'W_Completeren aanvraag_START': 28,
 'W_Nabellen incomplete dossiers_COMPLETE': 29,
 'W_Nabellen incomplete dossiers_SCHEDULE': 30,
 'W_Nabellen incomplete dossiers_START': 31,
 'W_Nabellen offertes

In [10]:
explainer = ExplainingController(parameters=parameters, predicting_parameters= predicting_parameters)


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Model loaded successfully from: ./SavedModels/0.8613_BPI2012_BaseLineLSTMModel_2021-06-07 17:36:01.075556  


In [11]:
tags_to_remove = ["<PAD>", "<EOS>", "<SOS>"]
index_to_remove = explainer.model.vocab.list_of_vocab_to_index(tags_to_remove)

In [12]:
########### Get example data from trainer ###########
trainer.test_dataset.unbatch()
ordered_test_idx = (list(trainer.test_dataset.unbatch().as_numpy_iterator()))
ordered_test_idx.sort()
print("Test set length: %d" %(len(ordered_test_idx)))

Test set length: 1308


In [13]:
## Get an completed example
index_from_test = 9

example_idx_trace = np.array([ 2, 12,  9, 10, 27, 28, 26, 28, 26, 28,  3,  8, 17, 15, 19, 33, 26,\
       34, 32, 34, 32, 34, 18, 36, 32, 37, 35, 37, 30, 35, 31, 29, 31, 29,\
       31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31,\
       29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29,\
       31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31, 29, 31,\
       13,  5, 11 ])
# example_idx_trace = trainer.dataset.collate_fn([ordered_test_idx[index_from_test]])[1][0]
example_vocab_trace = explainer.model.vocab.list_of_index_to_vocab(example_idx_trace.tolist())

### Remove tags
example_idx_trace_without_tags = [ i for i in  example_idx_trace if not i in index_to_remove]
example_vocab_trace_without_tags = explainer.model.vocab.list_of_index_to_vocab(example_idx_trace_without_tags)

print(example_vocab_trace_without_tags)

['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_SCHEDULE', 'W_Completeren aanvraag_START', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_START', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_START', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_SCHEDULE', 'W_Completeren aanvraag_COMPLETE', 'W_Nabellen offertes_START', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_START', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_START', 'O_SENT_BACK_COMPLETE', 'W_Valideren aanvraag_SCHEDULE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_START', 'W_Valideren aanvraag_COMPLETE', 'W_Valideren aanvraag_START', 'W_Nabellen incomplete dossiers_SCHEDULE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_START', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_START', 'W_Nabel

In [14]:
### Set up arguments for CF
feature_names = np.array(["step_%d" % (i+1) for i in range(len(example_vocab_trace_without_tags))])
without_tags_vocabs = [ v for v in list(explainer.vocab.vocab_dict.keys()) if not "<" in v]
cat_vars_without_tag = {}
for k in feature_names:
    cat_vars_without_tag[k] = without_tags_vocabs

In [31]:
class ExtractingLastTimeStampProbDistributionLayer(tf.keras.Model):
    '''
    It's a new model classifying where the destination is prefered.
    '''
    def __init__(self, explainer: ExplainingController, desired: int, trace_length: int, without_tags_vocabs):
        super(ExtractingLastTimeStampProbDistributionLayer, self).__init__()
        self.explainer = explainer
        self.desired = desired
        self.trace_length = trace_length

    def call(self, input):
        '''
        Input will be one-hot encoded tensor.
        '''

        ### Get real input from the one-hot encoded tensor.
        input = tf.argmax(tf.stack(tf.split(input,self.trace_length, axis=-1,), axis = 1), axis = -1)
        # self.input_before_convert = input
        # print("=========Before Converting=========")
        # print(input)
        # print("=======================")

        ### transfer to the input with tags.
        input = tf.constant(explainer.vocab.list_of_vocab_to_index_2d([[without_tags_vocabs[idx] for idx in tf.squeeze(input).numpy()]]), dtype=tf.int64)
        # self.input_after_convert = input
        # print("=========After Converting=========")
        # print(input)
        # print("=======================")

        # print("=========Input.Shape=========")
        # print(input.shape)
        # print("=======================")

        ## Concate the <SOS> tag in the first step.
        input = tf.concat([tf.constant([[2]], dtype=tf.int64) ,  input], axis=-1)

        ## Feed to the model
        out = explainer.model(input)

        ## Take the activty with max possibility.
        out = tf.argmax(out[0][:, -1, :], axis = -1)

        ## Determine whether the 
        return tf.expand_dims(tf.cast(out == self.desired, dtype=tf.float32), axis = 0)


In [16]:
all_important_vocabs

['A_ACTIVATED_COMPLETE',
 'A_APPROVED_COMPLETE',
 'A_REGISTERED_COMPLETE',
 'A_DECLINED_COMPLETE',
 'O_ACCEPTED_COMPLETE',
 'O_DECLINED_COMPLETE']

In [17]:
desire_vocabs = all_important_vocabs[3]

In [18]:
desire_vocabs

'A_DECLINED_COMPLETE'

In [32]:
last_dist_model = ExtractingLastTimeStampProbDistributionLayer(explainer, desired=explainer.vocab.vocab_to_index(desire_vocabs), trace_length = len(example_vocab_trace_without_tags), without_tags_vocabs = without_tags_vocabs)

In [35]:
d = dice_ml.Data(features=cat_vars_without_tag, outcome_name="predicted",continuous_features = [])
m = dice_ml.Model(model= last_dist_model, backend="TF2")
exp = dice_ml.Dice(d, m)

tf.Tensor(
[[12 28 23 16 21 21 19 18 13 18 25 30 10 16 30  1 33 35 16 11 35 13 32  2
  20 26 16 10  8 11  1 11 23 30 33 16 30 30 29 18 23 16 18 25 14 25  3 28
   7 25 12 25 35 28 31  0  9 18  0  1 28 18 18 18  4  6  3 10 26  1 20  0
  16 34 14 17  4 31 13  2 30 11 16 25 13 10 10]], shape=(1, 87), dtype=int64)
tf.Tensor(
[[15 31 26 19 24 24 22 21 16 21 28 33 13 19 33  4 36 38 19 14 38 16 35  5
  23 29 19 13 11 14  4 14 26 33 36 19 33 33 32 21 26 19 21 28 17 28  6 31
  10 28 15 28 38 31 34  3 12 21  3  4 31 21 21 21  7  9  6 13 29  4 23  3
  19 37 17 20  7 34 16  5 33 14 19 28 16 13 13]], shape=(1, 87), dtype=int64)
(1, 87)


In [21]:
example_vocab_trace_without_tags

['A_SUBMITTED_COMPLETE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'A_PREACCEPTED_COMPLETE',
 'W_Completeren aanvraag_SCHEDULE',
 'W_Completeren aanvraag_START',
 'W_Completeren aanvraag_COMPLETE',
 'W_Completeren aanvraag_START',
 'W_Completeren aanvraag_COMPLETE',
 'W_Completeren aanvraag_START',
 'A_ACCEPTED_COMPLETE',
 'A_FINALIZED_COMPLETE',
 'O_SELECTED_COMPLETE',
 'O_CREATED_COMPLETE',
 'O_SENT_COMPLETE',
 'W_Nabellen offertes_SCHEDULE',
 'W_Completeren aanvraag_COMPLETE',
 'W_Nabellen offertes_START',
 'W_Nabellen offertes_COMPLETE',
 'W_Nabellen offertes_START',
 'W_Nabellen offertes_COMPLETE',
 'W_Nabellen offertes_START',
 'O_SENT_BACK_COMPLETE',
 'W_Valideren aanvraag_SCHEDULE',
 'W_Nabellen offertes_COMPLETE',
 'W_Valideren aanvraag_START',
 'W_Valideren aanvraag_COMPLETE',
 'W_Valideren aanvraag_START',
 'W_Nabellen incomplete dossiers_SCHEDULE',
 'W_Valideren aanvraag_COMPLETE',
 'W_Nabellen incomplete dossiers_START',
 'W_Nabellen incomplete dossiers_COMPLETE',
 'W_Nabellen incom

In [22]:
example_df = pd.DataFrame([example_vocab_trace_without_tags], columns= feature_names)

In [23]:
example_df

Unnamed: 0,step_1,step_2,step_3,step_4,step_5,step_6,step_7,step_8,step_9,step_10,...,step_78,step_79,step_80,step_81,step_82,step_83,step_84,step_85,step_86,step_87
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,W_Completeren aanvraag_SCHEDULE,W_Completeren aanvraag_START,W_Completeren aanvraag_COMPLETE,W_Completeren aanvraag_START,W_Completeren aanvraag_COMPLETE,W_Completeren aanvraag_START,A_ACCEPTED_COMPLETE,...,W_Nabellen incomplete dossiers_START,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen incomplete dossiers_START,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen incomplete dossiers_START,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen incomplete dossiers_START,O_ACCEPTED_COMPLETE,A_APPROVED_COMPLETE,A_REGISTERED_COMPLETE


In [24]:
dice_exp = exp.generate_counterfactuals(example_df,
 total_CFs=1,
  desired_class="opposite",
   min_iter=100, max_iter=5000,
    # permitted_range= cat_vars_without_tag
    tie_random = True,
    categorical_penalty = 20,
    )

tf.Tensor(
[[ 0  0  0  0  0  0  0  0 25 32  7 32 22 12  8  6 23 17  8 27 21 33 29  6
  14 29 20  7  8 20 11  5  2 35  3 31 20  3 27 19 21  9  4 31  2 12 32 23
   8 12 35 17 32 32 29 16  5  9 16 27 26  4 17 17 33 12  9 17 13 28 23 12
   3  1  1 32 26 29 20  7 11  7  6  0 32  2 23]], shape=(1, 87), dtype=int64)
tf.Tensor(
[[ 3  3  3  3  3  3  3  3 28 35 10 35 25 15 11  9 26 20 11 30 24 36 32  9
  17 32 23 10 11 23 14  8  5 38  6 34 23  6 30 22 24 12  7 34  5 15 35 26
  11 15 38 20 35 35 32 19  8 12 19 30 29  7 20 20 36 15 12 20 16 31 26 15
   6  4  4 35 29 32 23 10 14 10  9  3 35  5 26]], shape=(1, 87), dtype=int64)
(1, 87)
tf.Tensor(
[[ 0  0  0  0  0  0  0  0 25 18  7 29 18 15 14 31 20 19 30 30 20 32 13 22
  12 27  6  7 20  5  8  3 32 15 18 14  2 31 21 26  7 35 19 27 22 11 21  7
  33  0 27 27 26 22  7  0  5  5 33 20 10  9 19  1 25 18 20 16 13  2  0  8
   7 31  7  8 27 23 23  8 33  0  5 31 35  3  5]], shape=(1, 87), dtype=int64)
tf.Tensor(
[[ 3  3  3  3  3  3  3  3 28 21 10 32 21 18 17 3

In [25]:
dice_exp.visualize_as_dataframe(show_only_changes=True, display_sparse_df=False)

Query instance (original outcome : 0)


Unnamed: 0,step_1,step_2,step_3,step_4,step_5,step_6,step_7,step_8,step_9,step_10,...,step_79,step_80,step_81,step_82,step_83,step_84,step_85,step_86,step_87,predicted
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,W_Completeren aanvraag_SCHEDULE,W_Completeren aanvraag_START,W_Completeren aanvraag_COMPLETE,W_Completeren aanvraag_START,W_Completeren aanvraag_COMPLETE,W_Completeren aanvraag_START,A_ACCEPTED_COMPLETE,...,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen incomplete dossiers_START,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen incomplete dossiers_START,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen incomplete dossiers_START,O_ACCEPTED_COMPLETE,A_APPROVED_COMPLETE,A_REGISTERED_COMPLETE,0.0



Diverse Counterfactual set without sparsity correction since only metadata about each  feature is available (new outcome:  1.0


Unnamed: 0,step_1,step_2,step_3,step_4,step_5,step_6,step_7,step_8,step_9,step_10,...,step_79,step_80,step_81,step_82,step_83,step_84,step_85,step_86,step_87,predicted
0,-,-,-,-,-,-,A_ACCEPTED_COMPLETE,A_ACCEPTED_COMPLETE,-,W_Nabellen offertes_SCHEDULE,...,W_Completeren aanvraag_COMPLETE,A_CANCELLED_COMPLETE,W_Nabellen offertes_COMPLETE,W_Beoordelen fraude_START,O_SENT_COMPLETE,W_Beoordelen fraude_COMPLETE,-,W_Afhandelen leads_START,A_ACCEPTED_COMPLETE,1.0


In [26]:
list(example_df.iloc[0])

['A_SUBMITTED_COMPLETE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'A_PREACCEPTED_COMPLETE',
 'W_Completeren aanvraag_SCHEDULE',
 'W_Completeren aanvraag_START',
 'W_Completeren aanvraag_COMPLETE',
 'W_Completeren aanvraag_START',
 'W_Completeren aanvraag_COMPLETE',
 'W_Completeren aanvraag_START',
 'A_ACCEPTED_COMPLETE',
 'A_FINALIZED_COMPLETE',
 'O_SELECTED_COMPLETE',
 'O_CREATED_COMPLETE',
 'O_SENT_COMPLETE',
 'W_Nabellen offertes_SCHEDULE',
 'W_Completeren aanvraag_COMPLETE',
 'W_Nabellen offertes_START',
 'W_Nabellen offertes_COMPLETE',
 'W_Nabellen offertes_START',
 'W_Nabellen offertes_COMPLETE',
 'W_Nabellen offertes_START',
 'O_SENT_BACK_COMPLETE',
 'W_Valideren aanvraag_SCHEDULE',
 'W_Nabellen offertes_COMPLETE',
 'W_Valideren aanvraag_START',
 'W_Valideren aanvraag_COMPLETE',
 'W_Valideren aanvraag_START',
 'W_Nabellen incomplete dossiers_SCHEDULE',
 'W_Valideren aanvraag_COMPLETE',
 'W_Nabellen incomplete dossiers_START',
 'W_Nabellen incomplete dossiers_COMPLETE',
 'W_Nabellen incom

In [27]:
list(dice_exp.final_cfs_df.iloc[0][:-1])

['A_SUBMITTED_COMPLETE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'A_PREACCEPTED_COMPLETE',
 'W_Completeren aanvraag_SCHEDULE',
 'W_Completeren aanvraag_START',
 'W_Completeren aanvraag_COMPLETE',
 'A_ACCEPTED_COMPLETE',
 'A_ACCEPTED_COMPLETE',
 'W_Completeren aanvraag_START',
 'W_Nabellen offertes_SCHEDULE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'W_Completeren aanvraag_COMPLETE',
 'W_Nabellen offertes_COMPLETE',
 'W_Nabellen incomplete dossiers_COMPLETE',
 'A_CANCELLED_COMPLETE',
 'W_Nabellen incomplete dossiers_SCHEDULE',
 'W_Afhandelen leads_COMPLETE',
 'W_Nabellen offertes_SCHEDULE',
 'A_REGISTERED_COMPLETE',
 'W_Completeren aanvraag_COMPLETE',
 'A_SUBMITTED_COMPLETE',
 'W_Nabellen incomplete dossiers_SCHEDULE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'O_DECLINED_COMPLETE',
 'W_Nabellen incomplete dossiers_COMPLETE',
 'W_Afhandelen leads_COMPLETE',
 'O_CREATED_COMPLETE',
 'W_Nabellen offertes_SCHEDULE',
 'W_Completeren aanvraag_SCHEDULE',
 'O_SENT_BACK_COMPLETE',
 'W_Wijzigen contractgegevens_SCHEDULE',


In [28]:
#   explainer.model.vocab.list_of_index_to_vocab([without_tags_vocabs.index(a) for a in list(dice_exp.final_cfs_df.iloc[1][:-1])])

In [29]:
list(dice_exp.final_cfs_df.iloc[1])

IndexError: single positional indexer is out-of-bounds