In [1]:
from parameters.dataset import BPI2012Parameters
from dataset import BPI2012Dataset
from utils.print import print_block
import tensorflow as tf
import numpy as np
from model import LSTMPredNextModel
from utils.print import print_block
from model import OriginalDiCEWrapper
import dice_ml
from utils.bpi2012 import print_model_prediction_result, remove_tags_for_seq, get_example_data_with_removed_tail, generate_fake_df
import pandas as pd

In [2]:
# Build the BPI 2012 dataset from a parameter object constructed with no arguments.
dataset_params = BPI2012Parameters()
dataset = BPI2012Dataset(dataset_params)


| Preprocessed data loaded successfully: ./data/preprocessed/BPI_Challenge_2012_with_resource/AOW_CompleteOnly 


In [3]:
# Take one trace from the dataset and cut off its tail. The helper returns the
# remaining prefix (as vocab strings and as indices), the trace amount, and the
# ground-truth activity for that prefix.
#
# For trace 52 the value of `tail_length_to_remove` appears to map to the
# ground-truth activity as follows (consistent with the full trace printed by
# this cell):
#    4 -> A_APPROVED_COMPLETE
#   12 -> A_FINALIZED_COMPLETE
#   13 -> A_ACCEPTED_COMPLETE
example_data = get_example_data_with_removed_tail(dataset, trace_idx=52, tail_length_to_remove=4)

(
    example_vocab_activities,
    example_idx_activities,
    example_vocab_resources,
    example_idx_resources,
    example_amount,
    ground_truth_vocab,
) = example_data



| [['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_ACCEPTED_COMPLETE', 'A_APPROVED_COMPLETE', 'A_REGISTERED_COMPLETE', 'A_ACTIVATED_COMPLETE', 'W_Valideren aanvraag_COMPLETE']] 

| [['<SOS>', '112', '112', '112', '11180', '11201', '11201', '11201', '11201', '11201', '11201', '11201', '11049', '11049', '10629', '10629', '10629', '10629', '10629']] 

| [15500.0] 

| ['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SENT_B

In [4]:
# Name of the saved checkpoint directory under ./SavedModels.
saved_model_name = "0.8175_LSTMPredNextModel_AOW_CompleteOnly_2021-07-01 20'45'16.353467"
pred_model = LSTMPredNextModel.load("./SavedModels/" + saved_model_name)

# Run one forward pass on the model's own example input before printing the
# summary (presumably needed so the layers are built — TODO confirm).
example_input = pred_model.get_example_input()
_ = pred_model(**example_input)
pred_model.summary()


| Model parameters loaded successfully from: ./SavedModels/0.8175_LSTMPredNextModel_AOW_CompleteOnly_2021-07-01 20'45'16.353467  

| Vocab loaded successfully from: ./SavedModels/0.8175_LSTMPredNextModel_AOW_CompleteOnly_2021-07-01 20'45'16.353467  

| Model loaded successfully from: ./SavedModels/0.8175_LSTMPredNextModel_AOW_CompleteOnly_2021-07-01 20'45'16.353467  
Model: "LSTMPredNextModel"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  832       
_________________________________________________________________
embedding_1 (Embedding)      multiple                  9216      
_________________________________________________________________
lstm (LSTM)                  multiple                  24832     
_________________________________________________________________
lstm_1 (LSTM)                multiple                  33024     
________

In [5]:
##### Get model output #####
# Wrap the example's index sequences and amount as TensorFlow constants.
example_activities_input, example_resources_input, example_amount_input = (
    tf.constant(values)
    for values in (example_idx_activities, example_idx_resources, example_amount)
)

# Prints the most probable next activity and returns the per-activity
# probabilities (displayed in the next cell).
predicted_df = print_model_prediction_result(
    pred_model,
    example_activities_input,
    example_resources_input,
    example_amount_input,
)


| Predicted activity with highest probability (0.44) is "A_REGISTERED_COMPLETE" 



<PAD>                                      3.822472e-06
<EOS>                                      1.483298e-06
<SOS>                                      7.272783e-06
A_ACCEPTED_COMPLETE                        4.374774e-07
A_ACTIVATED_COMPLETE                       1.337052e-01
A_APPROVED_COMPLETE                        4.277948e-01
A_CANCELLED_COMPLETE                       2.484289e-07
A_DECLINED_COMPLETE                        9.023657e-06
A_FINALIZED_COMPLETE                       1.034867e-04
A_PARTLYSUBMITTED_COMPLETE                 1.634951e-04
A_PREACCEPTED_COMPLETE                     1.066932e-06
A_REGISTERED_COMPLETE                      4.373636e-01
A_SUBMITTED_COMPLETE                       8.535364e-06
O_ACCEPTED_COMPLETE                        2.829449e-04
O_CANCELLED_COMPLETE                       3.999644e-04
O_CREATED_COMPLETE                         7.831535e-06
O_DECLINED_COMPLETE

In [6]:
# Display the frame returned by print_model_prediction_result (one column per activity in the vocab).
predicted_df

Unnamed: 0,<PAD>,<EOS>,<SOS>,A_ACCEPTED_COMPLETE,A_ACTIVATED_COMPLETE,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_DECLINED_COMPLETE,A_FINALIZED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,...,O_DECLINED_COMPLETE,O_SELECTED_COMPLETE,O_SENT_BACK_COMPLETE,O_SENT_COMPLETE,W_Afhandelen leads_COMPLETE,W_Beoordelen fraude_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen offertes_COMPLETE,W_Valideren aanvraag_COMPLETE
0,4e-06,1e-06,7e-06,4.374774e-07,0.133705,0.427795,2.484289e-07,9e-06,0.000103,0.000163,...,1.5e-05,7.61081e-07,6.009959e-07,8.271413e-07,1.095829e-07,8e-06,7.371925e-07,4e-06,8.101372e-07,0.000116


In [7]:
# Special vocabulary tokens that must not appear as DiCE feature values.
no_need_tags = ['<EOS>', '<SOS>', '<PAD>']

# Candidate feature values: the model's vocabularies with the special tokens removed.
possible_activities = [
    activity for activity in pred_model.activity_vocab.vocabs
    if activity not in no_need_tags
]
possible_resources = [
    resource for resource in pred_model.resource_vocab.vocabs
    if resource not in no_need_tags
]

# The example trace with the special tokens stripped.
example_vocab_activities_no_tag = remove_tags_for_seq(example_vocab_activities, no_need_tags)
example_vocab_resources_no_tag = remove_tags_for_seq(example_vocab_resources, no_need_tags)

# Re-index the example against the tag-free lists above: each value becomes its
# position in `possible_activities` / `possible_resources`.
example_idx_activities_no_tag = [
    possible_activities.index(activity) for activity in example_vocab_activities_no_tag
]
example_idx_resources_no_tag = [
    possible_resources.index(resource) for resource in example_vocab_resources_no_tag
]

example_trace_len_no_tag = len(example_vocab_activities_no_tag)

#### Determine feature names for DiCE ####
# One activity column and one resource column per step, numbered from 1.
step_numbers = range(1, example_trace_len_no_tag + 1)
activity_feature_names = np.array(["activity_step_%d" % step for step in step_numbers])
resource_feature_names = np.array(["resource_step_%d" % step for step in step_numbers])

# [min, max] of the amount column over the whole dataset.
# NOTE: the misspelled name `possbile_amount` is kept because later cells reference it.
amount_column = dataset.df["amount"]
possbile_amount = [min(amount_column), max(amount_column)]

print_block(example_vocab_activities_no_tag, title="Example activities without tags", num_marks=30)
print_block(example_vocab_resources_no_tag, title="Example resources without tags", num_marks=30)
print_block(example_amount, title="=================Amount=================", num_marks=16)


| ['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_ACCEPTED_COMPLETE'] 

| ['112', '112', '112', '11180', '11201', '11201', '11201', '11201', '11201', '11201', '11201', '11049', '11049', '10629'] 

| [15500.0] 


In [8]:
############ Setting up desired activity ############
# The counterfactual target is the ground-truth next activity of the example.
# Other activity names noted by the author: A_DECLINED_COMPLETE, A_APPROVED_COMPLETE
desired_activity = ground_truth_vocab
print_block(desired_activity, "Desired activity")

# The wrapper takes the desired activity as a vocabulary index, not as a string.
desired_activity_idx = pred_model.activity_vocab.vocab_to_index(desired_activity)

# Wrap the prediction model so it can be handed to dice_ml.Model in a later cell.
dice_binary_model = OriginalDiCEWrapper(
    model=pred_model,
    activity_vocab=pred_model.activity_vocab,
    resource_vocab=pred_model.resource_vocab,
    desired=desired_activity_idx,
    trace_length=example_trace_len_no_tag,
    possible_amount=possbile_amount,
    possible_resources=possible_resources,
    possible_activities=possible_activities,
)



| A_APPROVED_COMPLETE 


In [9]:
# Seed the global NumPy and TensorFlow generators so that Restart & Run All
# reproduces the same synthetic data and explainer setup.
# NOTE(review): assumes generate_fake_df and DiCE draw from these generators — confirm.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Number of synthetic rows handed to DiCE as its reference dataframe.
NUM_FAKE_ROWS = 5000

fake_df = generate_fake_df(
    NUM_FAKE_ROWS,
    activity_feature_names,
    resource_feature_names,
    possible_activities,
    possible_resources,
    possbile_amount,
    example_trace_len_no_tag,
)

# DiCE data interface: "predicted" is the outcome column and `amount` is the only
# continuous feature (all other columns are therefore treated as categorical).
d = dice_ml.Data(dataframe=fake_df, outcome_name="predicted", continuous_features=['amount'])

# DiCE model interface around the wrapped prediction model, using the TF2 backend.
m = dice_ml.Model(model=dice_binary_model, backend="TF2")

# Explainer used by the later generate_counterfactuals cell.
exp = dice_ml.Dice(d, m)

In [10]:
### Prepare input df
# Column order: all activity steps, then all resource steps, then the amount.
feature_names = [
    *activity_feature_names.tolist(),
    *resource_feature_names.tolist(),
    'amount',
]

# A single query row laid out in the same order as `feature_names`.
query_row = example_vocab_activities_no_tag + example_vocab_resources_no_tag + example_amount
query_instance = [query_row]

example_df = pd.DataFrame(query_instance, columns=feature_names)

In [11]:
# Display the one-row query frame that is passed to DiCE.
example_df

Unnamed: 0,activity_step_1,activity_step_2,activity_step_3,activity_step_4,activity_step_5,activity_step_6,activity_step_7,activity_step_8,activity_step_9,activity_step_10,...,resource_step_6,resource_step_7,resource_step_8,resource_step_9,resource_step_10,resource_step_11,resource_step_12,resource_step_13,resource_step_14,amount
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Completeren aanvraag_COMPLETE,A_ACCEPTED_COMPLETE,A_FINALIZED_COMPLETE,O_SELECTED_COMPLETE,O_CREATED_COMPLETE,O_SENT_COMPLETE,...,11201,11201,11201,11201,11201,11201,11049,11049,10629,15500.0


In [12]:
# Display the example's amount (a one-element list).
example_amount

[15500.0]

In [13]:
# Every feature may be changed by the search: the amount plus all per-step
# activity and resource columns.
features_to_vary = ['amount'] + activity_feature_names.tolist() + resource_feature_names.tolist()

# Optional arguments that were tried and are currently left at their defaults:
#   min_iter=100, max_iter=500
#   desired_class="opposite"
#   yloss_type="log_loss"            (log_loss, hinge_loss, l2_loss)
#   algorithm="DiverseCF"            (DiverseCF, RandomInitCF)
#   proximity_weight=0.5, diversity_weight=1
#   init_near_query_instance=True, tie_random=True
#   categorical_penalty=1, learning_rate=0.0005
dice_exp = exp.generate_counterfactuals(
    example_df,
    total_CFs=3,
    verbose=True,  # prints the loss periodically during the search
    features_to_vary=features_to_vary,
)

step 1,  loss=256.16
step 51,  loss=16.4047
step 101,  loss=6.94313
step 151,  loss=5.9864
step 201,  loss=5.6078
step 251,  loss=5.44624
step 301,  loss=5.3714
step 351,  loss=5.33176
step 401,  loss=5.30849
step 451,  loss=5.29136
step 501,  loss=5.2772
step 551,  loss=5.26555
step 601,  loss=5.2567
step 651,  loss=5.25117
step 701,  loss=5.24654
step 751,  loss=5.24247
step 801,  loss=5.23889
step 851,  loss=5.2357
step 901,  loss=5.23278
step 951,  loss=5.23016
step 1001,  loss=5.22776
step 1051,  loss=5.22607
step 1101,  loss=5.22463
step 1151,  loss=5.22332
step 1201,  loss=5.2221
step 1251,  loss=5.221
step 1301,  loss=5.22
step 1351,  loss=5.2191
step 1401,  loss=4236.6
step 1451,  loss=742.22
step 1501,  loss=324.959
step 1551,  loss=188.58
step 1601,  loss=125.137
step 1651,  loss=92.2858
step 1701,  loss=72.208
step 1751,  loss=57.7812
step 1801,  loss=47.2398
step 1851,  loss=39.5181
step 1901,  loss=33.8948
step 1951,  loss=29.4731
step 2001,  loss=25.8291
step 2051,  loss

In [14]:
# Show the query instance and any counterfactuals, printing only changed values.
# NOTE(review): in the recorded output the query's `predicted` value is 9.884
# (original outcome 10), i.e. not a value in [0, 1] — confirm that
# OriginalDiCEWrapper returns the kind of output DiCE expects.
dice_exp.visualize_as_dataframe(
    show_only_changes=True,
    display_sparse_df=False,
)

Query instance (original outcome : 10)


Unnamed: 0,activity_step_1,activity_step_2,activity_step_3,activity_step_4,activity_step_5,activity_step_6,activity_step_7,activity_step_8,activity_step_9,activity_step_10,...,resource_step_7,resource_step_8,resource_step_9,resource_step_10,resource_step_11,resource_step_12,resource_step_13,resource_step_14,amount,predicted
0,A_SUBMITTED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,A_PREACCEPTED_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Completeren aanvraag_COMPLETE,A_ACCEPTED_COMPLETE,A_FINALIZED_COMPLETE,O_SELECTED_COMPLETE,O_CREATED_COMPLETE,O_SENT_COMPLETE,...,11201,11201,11201,11201,11201,11049,11049,10629,15500.0,9.884



No counterfactuals found!


In [15]:
######## Print the example trace ########
# The single query row as a flat list: activities, then resources, then amount.
original_row = example_df.iloc[0]
print_block(list(original_row), "Original", num_marks=50)


| ['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_ACCEPTED_COMPLETE', '112', '112', '112', '11180', '11201', '11201', '11201', '11201', '11201', '11201', '11201', '11049', '11049', '10629', 15500.0] 


In [16]:
######## Print the counterfactual trace ########
# Guard against both "no frame at all" and "empty frame": `len(None)` would
# raise a TypeError if the explainer leaves `final_cfs_df` unset when the
# search finds nothing (dice_ml's own visualisation performs the same check).
counterfactuals_df = dice_exp.final_cfs_df
if counterfactuals_df is not None and len(counterfactuals_df) > 0:
    # Print the first counterfactual only; the last column is dropped
    # (presumably the "predicted" outcome column — verify against the frame).
    print_block(list(counterfactuals_df.iloc[0][:-1]), "Counterfactual", num_marks=50)
else:
    print_block("Not found!", "Counterfactual")


| Not found! 


In [17]:
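# Timing notes per activity (presumably the run time of the counterfactual
# search for that desired activity — TODO confirm):
# 5550.7s -> A_ACCEPTED_COMPLETE
# 5667.7s -> A_FINALIZED_COMPLETE
# 7062.4s -> A_APPROVED_COMPLETE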
