In [1]:
from Controller import TrainingController, ExplainingController
from Utils.SaveUtils import load_parameters
from Parameters import TrainingParameters, PredictingParameters
import tensorflow as tf
from IPython.core.display import display, HTML
import json
import numpy as np
import pandas as pd
from Parameters.Enums import TracePermutationStrategies
from Models import DiCEBinaryDefferientiable
from itertools import chain

import dice_ml
from dice_ml.utils import helpers

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False
seed = 123
tf.random.set_seed(seed)
np.random.seed(seed)

TF version:  2.4.0-rc0
Eager execution enabled:  True


In [2]:
folder_path = "./SavedModels/%s" % ("0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443" # AOW
)

In [3]:
parameters_json = load_parameters(folder_path=folder_path)
parameters = TrainingParameters(**parameters_json)
tf.random.set_seed(parameters.dataset_split_seed)
np.random.seed(parameters.dataset_split_seed)
parameters.load_model_folder_path = folder_path
predicting_parameters = PredictingParameters()
predicting_parameters.load_model_folder_path = folder_path
 

In [4]:
trainer = TrainingController(parameters = parameters)


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Preprocessed data loaded successfully: ./datasets/preprocessed/BPI_Challenge_2012_with_resource/AOW 

| Model loaded successfully from: ./SavedModels/0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443  


In [5]:
explainer = ExplainingController(parameters=parameters, predicting_parameters= predicting_parameters)


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Model loaded successfully from: ./SavedModels/0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443  


In [6]:
########### Get example data from trainer ###########
ordered_test_idx = (list(trainer.test_dataset.unbatch().as_numpy_iterator()))
ordered_test_idx.sort()
print("Test set length: %d" %(len(ordered_test_idx)))

Test set length: 1309


In [7]:
longest_fail_idx = None
longest_fail_length = 0
all_decline_idxs = []
declined_idx = trainer.model.vocab.vocab_to_index('A_DECLINED_COMPLETE')
for idx in ordered_test_idx:
    caseids, example_data, example_lengths, example_resources, example_amount, _= trainer.dataset.collate_fn([idx])
    if declined_idx in example_data[0]:
        all_decline_idxs.append(idx)
        if type(longest_fail_idx) == type(None):
            longest_fail_idx = idx
            longest_fail_length = example_data.shape[1]
        else:
            if example_data.shape[1] > longest_fail_length:
                longest_fail_idx = idx
                longest_fail_length = example_data.shape[1]

In [8]:
caseids, example_data, example_lengths, example_resources, example_amount, _ = trainer.dataset.collate_fn([longest_fail_idx])

In [9]:
remove_trail_steps = 4
example_idx_trace = np.array([example_data[0][:-remove_trail_steps]])
example_idx_resources = np.array([example_resources[0][:-remove_trail_steps]])
example_vocab_trace = trainer.model.vocab.list_of_index_to_vocab_2d(example_idx_trace)[0]

In [10]:
print(example_vocab_trace)

['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dos

In [11]:
example_input = tf.constant(example_idx_trace)
example_resources_input = tf.constant(example_idx_resources)
example_amount_input = tf.constant(example_amount)

In [12]:
out, _ = explainer.model(example_input, input_resources = example_resources_input, amount = example_amount_input)

In [13]:
predicted_vocab_distributions = tf.gather(out, len(example_vocab_trace)-1, axis=1)
predicted_vocab_distributions_df = pd.DataFrame(predicted_vocab_distributions.numpy().tolist(),columns=explainer.model.vocab.vocabs)
max_arg = tf.math.argmax(predicted_vocab_distributions, axis=-1).numpy()[0]
max_prob_vocab = explainer.model.vocab.index_to_vocab(max_arg)
print("Predicted activity with highest probability (%.2f) is \"%s\"" % (predicted_vocab_distributions[0][max_arg].numpy(), max_prob_vocab))
display(predicted_vocab_distributions_df)
print(predicted_vocab_distributions_df.iloc[0])

Predicted activity with highest probability (0.63) is "W_Nabellen incomplete dossiers_COMPLETE"


Unnamed: 0,<PAD>,<EOS>,<SOS>,A_ACCEPTED_COMPLETE,A_ACTIVATED_COMPLETE,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_DECLINED_COMPLETE,A_FINALIZED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,...,O_DECLINED_COMPLETE,O_SELECTED_COMPLETE,O_SENT_BACK_COMPLETE,O_SENT_COMPLETE,W_Afhandelen leads_COMPLETE,W_Beoordelen fraude_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen offertes_COMPLETE,W_Valideren aanvraag_COMPLETE
0,3.4e-05,0.002423,7.6e-05,0.001958,0.025987,0.040202,0.064902,0.029624,0.000463,7.7e-05,...,0.005079,0.01691,0.002994,0.000137,6e-06,6.4e-05,0.003706,0.631747,0.00057,0.112999


<PAD>                                      0.000034
<EOS>                                      0.002423
<SOS>                                      0.000076
A_ACCEPTED_COMPLETE                        0.001958
A_ACTIVATED_COMPLETE                       0.025987
A_APPROVED_COMPLETE                        0.040202
A_CANCELLED_COMPLETE                       0.064902
A_DECLINED_COMPLETE                        0.029624
A_FINALIZED_COMPLETE                       0.000463
A_PARTLYSUBMITTED_COMPLETE                 0.000077
A_PREACCEPTED_COMPLETE                     0.000112
A_REGISTERED_COMPLETE                      0.021333
A_SUBMITTED_COMPLETE                       0.000658
O_ACCEPTED_COMPLETE                        0.029949
O_CANCELLED_COMPLETE                       0.007966
O_CREATED_COMPLETE                         0.000026
O_DECLINED_COMPLETE                        0.005079
O_SELECTED_COMPLETE                        0.016910
O_SENT_BACK_COMPLETE                       0.002994
O_SENT_COMPL

In [14]:
indexes_to_remove_from_resource = [explainer.resources.index('<PAD>'), explainer.resources.index('<SOS>'),explainer.resources.index('<EOS>')]

In [15]:
example_idx_trace_without_tags = [ i for i in  example_idx_trace[0] if not i in explainer.vocab.tags_idx()]
example_vocab_trace_without_tags = explainer.model.vocab.list_of_index_to_vocab(example_idx_trace_without_tags)

example_idx_resources_without_tags = [ i for i in  example_idx_resources[0] if not i in indexes_to_remove_from_resource ]

example_vocab_resource_without_tags = [
   explainer.resources[r]  for r in example_idx_resources_without_tags
]

example_trace_len = len(example_vocab_trace_without_tags)

print("=============Example activities without tags=============")
print(example_vocab_trace_without_tags)

print("=============Example resources without tags=============")
print(example_vocab_resource_without_tags)

print("=================Amount=================")
print(example_amount)

['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COM

In [16]:
#### Determine feature name ####
activity_feature_names = np.array(["activity_step_%d" % (i+1) for i in range(len(example_vocab_trace_without_tags))])

resource_feature_names = np.array(["resource_step_%d" % (i+1) for i in range(len(example_vocab_trace_without_tags))])

In [17]:
############ Setting up desired activity ############
desired_activity = 'A_DECLINED_COMPLETE' # A_DECLINED_COMPLETE, A_APPROVED_COMPLETE
print("Desired activity is \"%s\"" %(desired_activity))

Desired activity is "A_DECLINED_COMPLETE"


In [18]:
### all possible activities in the first step
trainer.dataset.df.head(5)

Unnamed: 0,trace,trace_vocab,caseid,amount,resource,resource_orig
0,"[2, 12, 9, 10, 3, 17, 8, 15, 19, 22, 24, 24, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173688,20000.0,"[70, 53, 53, 53, 14, 14, 14, 14, 14, 71, 71, 2...","[<SOS>, 112, 112, 112, 10862, 10862, 10862, 10..."
1,"[2, 12, 9, 10, 22, 3, 8, 17, 15, 19, 22, 24, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173691,5000.0,"[70, 53, 53, 53, 71, 14, 14, 14, 14, 14, 71, 7...","[<SOS>, 112, 112, 112, UNKNOWN, 10862, 10862, ..."
2,"[2, 12, 9, 10, 22, 22, 22, 3, 17, 8, 15, 19, 2...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173694,7000.0,"[70, 53, 53, 53, 22, 71, 55, 55, 55, 55, 55, 5...","[<SOS>, 112, 112, 112, 10912, UNKNOWN, 11201, ..."
3,"[2, 12, 9, 7, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173697,15000.0,"[70, 53, 53, 53, 69]","[<SOS>, 112, 112, 112, <EOS>]"
4,"[2, 12, 9, 7, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173700,5000.0,"[70, 53, 53, 53, 69]","[<SOS>, 112, 112, 112, <EOS>]"


In [19]:
dont_need = ['<EOS>', '<SOS>', '<PAD>']
possible_resources =  [ r for r in list(trainer.model.resources) if  not r in dont_need]
possible_activities = [ a for a in list(trainer.model.vocab.vocabs) if  not a in dont_need]

In [20]:
possbile_amount = [min(trainer.dataset.df["amount"]), max(trainer.dataset.df["amount"])]

In [21]:
fake_dataset_size = 5000
fake_df = pd.DataFrame([])
### Create a fake dataset
for i in range (example_trace_len):
    # fake_df[activity_feature_names[i]] = np.random.choice(possible_activities[i], fake_dataset_size)
    fake_df[activity_feature_names[i]] = np.random.choice(possible_activities, fake_dataset_size)

for i in range (example_trace_len):
    fake_df[resource_feature_names[i]] =np.random.choice(possible_resources, fake_dataset_size)

fake_df['amount'] = np.random.uniform(possbile_amount[0], possbile_amount[1], (fake_dataset_size,))

fake_df['predicted'] = np.random.choice([0, 1], fake_dataset_size)

In [22]:
fake_df.head(5)

Unnamed: 0,activity_step_1,activity_step_2,activity_step_3,activity_step_4,activity_step_5,activity_step_6,activity_step_7,activity_step_8,activity_step_9,activity_step_10,...,resource_step_42,resource_step_43,resource_step_44,resource_step_45,resource_step_46,resource_step_47,resource_step_48,resource_step_49,amount,predicted
0,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_CANCELLED_COMPLETE,O_DECLINED_COMPLETE,O_CREATED_COMPLETE,O_ACCEPTED_COMPLETE,A_FINALIZED_COMPLETE,A_SUBMITTED_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,A_PREACCEPTED_COMPLETE,...,10138,10933,10972,11309,11019,11181,11254,10910,51853.978311,1
1,A_FINALIZED_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,O_SENT_COMPLETE,W_Beoordelen fraude_COMPLETE,O_SENT_COMPLETE,O_SELECTED_COMPLETE,A_APPROVED_COMPLETE,O_CREATED_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,...,10889,11269,UNKNOWN,11121,10859,10912,10789,11189,89959.165717,0
2,A_ACTIVATED_COMPLETE,O_SENT_COMPLETE,W_Valideren aanvraag_COMPLETE,W_Valideren aanvraag_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,O_DECLINED_COMPLETE,A_REGISTERED_COMPLETE,A_ACCEPTED_COMPLETE,W_Afhandelen leads_COMPLETE,O_CANCELLED_COMPLETE,...,11203,10912,112,11049,10880,11319,10889,10809,83750.426267,1
3,A_DECLINED_COMPLETE,O_DECLINED_COMPLETE,A_CANCELLED_COMPLETE,W_Nabellen offertes_COMPLETE,O_SENT_BACK_COMPLETE,A_ACCEPTED_COMPLETE,O_CREATED_COMPLETE,A_REGISTERED_COMPLETE,O_ACCEPTED_COMPLETE,W_Valideren aanvraag_COMPLETE,...,11003,11203,10914,11189,10809,10929,10913,11003,78396.800106,0
4,A_SUBMITTED_COMPLETE,A_APPROVED_COMPLETE,W_Valideren aanvraag_COMPLETE,O_SENT_COMPLETE,W_Beoordelen fraude_COMPLETE,A_REGISTERED_COMPLETE,W_Beoordelen fraude_COMPLETE,O_CREATED_COMPLETE,O_ACCEPTED_COMPLETE,A_CANCELLED_COMPLETE,...,11179,11019,10971,10609,10125,11254,11254,11002,53551.286699,0


In [23]:
d = dice_ml.Data(dataframe=fake_df, outcome_name="predicted",continuous_features = ['amount'])

In [24]:
dice_binary_model = DiCEBinaryDefferientiable(
    explainer.model, 
    explainer.vocab,
    desired=explainer.vocab.vocab_to_index(desired_activity),
    trace_length = len(example_vocab_trace_without_tags),
    resources= explainer.resources,
    sos_idx_activity=explainer.vocab.vocab_to_index("<SOS>"),
    sos_idx_resource= explainer.resources.index('<SOS>'),
    amount_min = possbile_amount[0],
    amount_max = possbile_amount[1],
    possible_resources=possible_resources,
    possible_activities=possible_activities 
)


In [25]:
m = dice_ml.Model(model=dice_binary_model, backend="TF2")

In [26]:
### Create a fake data frame for all the possibility for them.
exp = dice_ml.Dice(d, m)

In [31]:
feature_names = activity_feature_names.tolist() + resource_feature_names.tolist() + ['amount']

In [33]:
query_instance = [example_vocab_trace_without_tags + example_vocab_resource_without_tags + example_amount]
example_df = pd.DataFrame(query_instance, columns= feature_names)

In [34]:
example_df

Unnamed: 0,activity_step_1,activity_step_2,activity_step_3,activity_step_4,activity_step_5,activity_step_6,activity_step_7,activity_step_8,activity_step_9,activity_step_10,...,resource_step_41,resource_step_42,resource_step_43,resource_step_44,resource_step_45,resource_step_46,resource_step_47,resource_step_48,resource_step_49,amount
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,W_Afhandelen leads_COMPLETE,W_Completeren aanvraag_COMPLETE,A_ACCEPTED_COMPLETE,O_SELECTED_COMPLETE,A_FINALIZED_COMPLETE,O_CREATED_COMPLETE,O_SENT_COMPLETE,...,11181,11181,11181,11000,11169,11122,10982,11003,11003,5800.0


In [49]:
## Problem of weight propagation?
dice_exp = exp.generate_counterfactuals(
        example_df,
        total_CFs=1,
        verbose=True,
        min_iter=100,
        max_iter=500,
        # desired_class="opposite",
        # features_to_vary=['amount'],
        # yloss_type= "log_loss" # log_loss, hinge_loss, l2_loss
        # algorithm = "DiverseCF", # DiverseCF, RandomInitCF
        # proximity_weight=0.5, #0.5,
        # diversity_weight=1,#1.0,
        # init_near_query_instance=True,
        # tie_random = True,
        # categorical_penalty = 1,
        # learning_rate=0.0005,
    )

step 1,  loss=879.289
step 51,  loss=13.0575
step 101,  loss=12.9385
step 151,  loss=12.9382
step 201,  loss=12.9381
step 251,  loss=12.9381
step 301,  loss=12.9381
step 351,  loss=12.9381
step 401,  loss=12.9381


KeyboardInterrupt: 

In [26]:
#### Check what permutations are fed into the model. ####
all_received_activities = []
for i in range(len(example_vocab_trace_without_tags)):
    trace = explainer.model.vocab.list_of_index_to_vocab(list(set((np.squeeze(np.array(dice_binary_model.all_trace))[:, i]).tolist())))
    all_received_activities.append(trace)
print(all_received_activities)

[['A_SUBMITTED_COMPLETE'], ['A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE'], ['A_PREACCEPTED_COMPLETE']]


In [27]:
#### Check what permutations are fed into the model. ####
all_received_resources = []
for i in range(len(example_vocab_trace_without_tags)):
    res = [ explainer.resources[r] for r in list(set((np.squeeze(np.array(dice_binary_model.all_resource))[:, i]).tolist()))]
    all_received_resources.append(res)
print(all_received_resources)

[['112', '10971'], ['10859', '112'], ['10863', '11201']]


In [28]:
########## Checking if all the permutation are the same (Shouldn't be) ##########
all_cf_are_same = all([all((dice_binary_model.all_cf_input[1][:, 1:] == dice_binary_model.all_cf_input[i][:, 1:]).tolist()[0]) for i in range(len(dice_binary_model.all_cf_input))][1:])
print("All inputs are the same: %s" % (all_cf_are_same))

All inputs are the same: True


In [29]:
########## Checking if all the trace & resource (except amount) are the same (Shouldn't be) ##########
all_trace_and_resource_are_same = all([all((dice_binary_model.all_cf_input[1][:, 1:] == dice_binary_model.all_cf_input[i][:, 1:]).tolist()[0]) for
 i in range(len(dice_binary_model.all_cf_input))][1:])
print("All trace and resource are the same: %s" % (all_trace_and_resource_are_same))

All trace and resource are the same: True


In [30]:
dice_exp.visualize_as_dataframe(show_only_changes=True, display_sparse_df=False)

Query instance (original outcome : 0)


Unnamed: 0,step_1_activity,step_2_activity,step_3_activity,step_1_resource,step_2_resource,step_3_resource,amount,predicted
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,112,112,10863,5800.0,0.0



No counterfactuals found!


In [31]:
######## Print the example trace ########
print(list(example_df.iloc[0]))

['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', '112', '112', '10863', 5800.0]


In [32]:
######## Print the counterfactual trace ########
print(list(dice_exp.final_cfs_df.iloc[0][:-1]))

IndexError: single positional indexer is out-of-bounds

In [None]:
dice_exp.final_cfs_df.iloc[0][:-1]