In [1]:
from Controller import TrainingController
from Parameters import TrainingParameters
from Utils.SaveUtils import load_parameters
from Utils.PrintUtils import print_big
import tensorflow as tf
import json
import numpy as np
import pandas as pd
from Models import DiCEBinaryDefferentiable
from itertools import chain
from IPython.core.display import display, HTML
from Utils.DiCEHelpers import generate_fake_df, get_trace_with_id, get_longest_trace_row, remove_trail_steps, print_model_prediction_result, remove_tags_for_query_instance

# import dice_ml
# from dice_ml.utils import helpers

print('TF version: ', tf.__version__)
print('Eager execution enabled: ', tf.executing_eagerly()) # False

TF version:  2.4.0-rc0
Eager execution enabled:  True


In [2]:
folder_path = "./SavedModels/%s" % (
    "0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443" #AOW
)

In [3]:
### Initialise controllers
parameters_json = load_parameters(folder_path=folder_path)
parameters = TrainingParameters(**parameters_json)
tf.random.set_seed(parameters.dataset_split_seed)
np.random.seed(parameters.dataset_split_seed)
parameters.load_model_folder_path = folder_path
trainer = TrainingController(parameters = parameters)
trainer.show_model_info()


| Running on /job:localhost/replica:0/task:0/device:CPU:0  

| Preprocessed data loaded successfully: ./datasets/preprocessed/BPI_Challenge_2012_with_resource/AOW 

| Model loaded successfully from: ./SavedModels/0.8264_BPI2012WithResource_BaselineLSTMWithResource_2021-06-18 06:11:10.009443  
Model: "baseline_lstm_with_resource"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  832       
_________________________________________________________________
embedding_1 (Embedding)      multiple                  2304      
_________________________________________________________________
lstm (LSTM)                  multiple                  24832     
_________________________________________________________________
lstm_1 (LSTM)                multiple                  33024     
_________________________________________________________________
lstm_2 (

In [4]:
########### Get example data from trainer ###########
ordered_test_idx = (list(trainer.test_dataset.unbatch().as_numpy_iterator()))
ordered_test_idx.sort()
print_big(len(ordered_test_idx), "Test set length")


| 1309 


In [5]:
########## Get longest delcined trace for testing ##########
declined_df = get_trace_with_id(trainer.dataset.df.iloc[ordered_test_idx], trainer.model.vocab.vocab_to_index('A_DECLINED_COMPLETE'))
longest_declined_trace_row = get_longest_trace_row(declined_df)
longest_declined_trace_row

Unnamed: 0,trace,trace_vocab,caseid,amount,resource,resource_orig
8464,"[2, 12, 9, 10, 20, 22, 3, 17, 8, 15, 19, 22, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",200028,5800.0,"[70, 53, 53, 15, 15, 48, 37, 37, 37, 37, 37, 3...","[<SOS>, 112, 112, 10863, 10863, 11169, 11003, ..."


In [6]:
_, example_activities, _, example_resources, example_amount, _ = trainer.dataset.collate_fn([longest_declined_trace_row.index[0]])

In [7]:
 print("========================================Trace========================================")
 print(trainer.model.vocab.list_of_index_to_vocab_2d(example_activities))

[['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Afhandelen leads_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'O_SELECTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_CANCELLED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'W_Valideren aanvraag_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete dossiers_COMPLETE', 'W_Nabellen incomplete do

In [8]:
example_idx_activities, example_idx_resources = remove_trail_steps(example_activities, example_resources, 50)
example_vocab_activities = trainer.model.vocab.list_of_index_to_vocab_2d(example_idx_activities)[0]
print("========================================Trace after substraction========================================")
print(example_vocab_activities)

['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE']


In [9]:
##### Get model output
example_activities_input = tf.constant(example_idx_activities)
example_resources_input = tf.constant(example_idx_resources)
example_amount_input = tf.constant(example_amount)

predicted_df = print_model_prediction_result(trainer.model, example_activities_input, example_resources_input, example_amount_input)


| Predicted activity with highest probability (0.36) is "W_Afhandelen leads_COMPLETE" 



<PAD>                                      0.000377
<EOS>                                      0.005197
<SOS>                                      0.001689
A_ACCEPTED_COMPLETE                        0.113344
A_ACTIVATED_COMPLETE                       0.001507
A_APPROVED_COMPLETE                        0.000362
A_CANCELLED_COMPLETE                       0.081236
A_DECLINED_COMPLETE                        0.127201
A_FINALIZED_COMPLETE                       0.003128
A_PARTLYSUBMITTED_COMPLETE                 0.004422
A_PREACCEPTED_COMPLETE                     0.050626
A_REGISTERED_COMPLETE                      0.000840
A_SUBMITTED_COMPLETE                       0.017736
O_ACCEPTED_COMPLETE                        0.000727
O_CANCELLED_COMPLETE                       0.000166
O_CREATED_COMPLETE                         0.013158
O_DECLINED_COMPLETE                        0.000213
O_SELECTED_COMPLETE      

In [10]:
predicted_df

Unnamed: 0,<PAD>,<EOS>,<SOS>,A_ACCEPTED_COMPLETE,A_ACTIVATED_COMPLETE,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_DECLINED_COMPLETE,A_FINALIZED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,...,O_DECLINED_COMPLETE,O_SELECTED_COMPLETE,O_SENT_BACK_COMPLETE,O_SENT_COMPLETE,W_Afhandelen leads_COMPLETE,W_Beoordelen fraude_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen offertes_COMPLETE,W_Valideren aanvraag_COMPLETE
0,0.000377,0.005197,0.001689,0.113344,0.001507,0.000362,0.081236,0.127201,0.003128,0.004422,...,0.000213,0.028422,0.000866,0.01163,0.362581,0.008908,0.154157,0.007841,0.001002,0.002665


In [11]:
no_need_tags = ['<EOS>', '<SOS>', '<PAD>']
example_idx_activities_no_tag, example_idx_resources_no_tag =  remove_tags_for_query_instance(example_idx_activities, example_idx_resources, trainer.model.vocab.tags_idx(), [trainer.model.resources.index(tag) for tag in no_need_tags])

example_vocab_trace_no_tag = trainer.model.vocab.list_of_index_to_vocab(example_idx_activities_no_tag)
example_vocab_resource_no_tag = [trainer.model.resources[r]  for r in example_idx_resources_no_tag]

example_trace_len_no_tag = len(example_vocab_trace_no_tag)
#### Determine feature names for DiCE ####
activity_feature_names = np.array(["activity_step_%d" % (i+1) for i in range(example_trace_len_no_tag)])
resource_feature_names = np.array(["resource_step_%d" % (i+1) for i in range(example_trace_len_no_tag)])

possible_activities = [ a for a in list(trainer.model.vocab.vocabs) if  not a in no_need_tags]
possible_resources =  [ r for r in list(trainer.model.resources) if  not r in no_need_tags]

possbile_amount = [min(trainer.dataset.df["amount"]), max(trainer.dataset.df["amount"])]

print_big(example_vocab_trace_no_tag ,title="Example activities without tags", num_marks = 30)
print_big(example_vocab_resource_no_tag ,title="Example resources without tags", num_marks = 30)
print_big(example_amount, title="=================Amount=================", num_marks=16)


| ['A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE'] 

| ['112', '112', '10863'] 

| [5800.0] 


In [12]:
############ Setting up desired activity ############
desired_activity = 'A_DECLINED_COMPLETE' # A_DECLINED_COMPLETE, A_APPROVED_COMPLETE
print_big(desired_activity, "Desired activity")

dice_binary_model = DiCEBinaryDefferentiable(
    model=trainer.model,
    vocab=trainer.model.vocab,
    resources= trainer.model.resources,
    desired=trainer.model.vocab.vocab_to_index(desired_activity),
    trace_length = len(example_vocab_trace_no_tag),
    sos_idx_activity=trainer.model.vocab.vocab_to_index("<SOS>"),
    sos_idx_resource= trainer.model.resources.index('<SOS>'),
    amount_min = possbile_amount[0],
    amount_max = possbile_amount[1],
    possible_resources=possible_resources,
    possible_activities=possible_activities 
)


| A_DECLINED_COMPLETE 


In [13]:
fake_df = generate_fake_df(5000, activity_feature_names, resource_feature_names, possible_activities, possible_resources, possbile_amount, example_trace_len_no_tag)

In [14]:
def transform_to_ohe_normalized_input(activities, resources,  possible_activities, possible_resources):
    activity_cf = tf.one_hot(activities, depth=len(possible_activities))
    resource_cf = tf.one_hot(resources, depth=len(
        possible_resources
    ))

    return activity_cf, resource_cf

In [15]:
ohe_activity_cf, ohe_resource_cf = transform_to_ohe_normalized_input(example_idx_activities_no_tag, example_idx_resources_no_tag, possible_activities, possible_resources)

In [16]:
dice_binary_model(
    [
        example_amount_input,
        ohe_activity_cf,
        ohe_resource_cf,
    ]
)

Custom input


<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.00796507]], dtype=float32)>

In [23]:
class EventLogDiCE():
    def __init__(self, possible_activities, possible_resources, possible_amount, dice_model):
        self.possible_activities = possible_activities 
        self.possible_amount = possible_amount
        self.possible_resources = possible_resources
        self.dice_model = dice_model

    def min_max_scale_amount(self, input_amount, inverse=False):
        min_a = self.possible_amount[0]
        max_a = self.possible_amount[1]
        min_max_range = (max_a - min_a)
        if inverse:
            return (input_amount * min_max_range ) + min_a
        else:
            return input_amount - min_a / min_max_range

    def get_valid_cf(self, amount_cf, ohe_activity_cf, ohe_resource_cf):
        return tf.clip_by_value(amount_cf, self.possible_amount[0], self.possible_amount[1]) ,tf.one_hot(tf.argmax(ohe_activity_cf, axis= -1), depth=len(self.possible_activities)), tf.one_hot(tf.argmax(ohe_resource_cf, axis= -1), depth=len(self.possible_resources))


    def generate_counterfactual(self, query_activities, query_resources, query_amount, max_iter = 1000, verbose_freq = 50, lr = 0.005):

        ### Get the input for dice model (has to be differentiable)
        ohe_activity_cf, ohe_resource_cf = transform_to_ohe_normalized_input(query_activities, query_resources, self.possible_activities, self.possible_resources)

        ohe_activity_backup = ohe_activity_cf.numpy()
        ohe_resource_backup = ohe_resource_cf.numpy()
        amount_backup = query_amount.numpy()

        ## Create the cf variable 
        amount_cf = tf.Variable(query_amount)
        ohe_activity_cf = tf.Variable(ohe_activity_cf)
        ohe_resource_cf = tf.Variable(ohe_resource_cf)
        self.amount_cf = amount_cf

        ## Get current prediction.
        prediction = round(self.dice_model(
            [
                amount_cf,
                ohe_activity_cf,
                ohe_resource_cf
            ]
        ).numpy()[0, 0])
        
        print_big(prediction, "Original Prediction")

        desired_pred = 1 - prediction

        print_big(desired_pred, "Desired Prediction")

        ## init optimizer
        optim = tf.keras.optimizers.Adam(learning_rate=lr)

        for i in range(max_iter):
            if i % verbose_freq == 0 and i != 0:
                print_big(f"Current Loss [{loss.numpy()}]", f"Step {i}")

            with tf.GradientTape() as tape:
                ### Get prediction from cf
                cf_output = self.dice_model(
                    [
                        amount_cf,
                        ohe_activity_cf,
                        ohe_resource_cf
                    ]
                )

                ### Using hinge loss since we have cat data
                class_loss = tf.keras.metrics.hinge(desired_pred, cf_output)
                self.class_loss = class_loss
                self.cf_output = cf_output

                activity_distance_loss = tf.reduce_sum(tf.pow((ohe_activity_cf - ohe_activity_backup), 2))
                resources_distance_loss = tf.reduce_sum(tf.pow(ohe_resource_cf - ohe_resource_backup, 2))
                amount_distance_loss = self.min_max_scale_amount(tf.pow(amount_cf - amount_backup,2))
                distance_loss = activity_distance_loss + resources_distance_loss + amount_distance_loss
                self.distance_loss = distance_loss

                ### Categorical contraint
                activity_cat_loss = tf.pow(tf.reduce_sum(ohe_activity_cf, axis=1) - 1, 2)
                resource_cat_loss = tf.pow(tf.reduce_sum(ohe_resource_cf, axis=1) - 1, 2)
                cat_loss = tf.reduce_sum(activity_cat_loss + resource_cat_loss)

                # self.cat_loss = cat_loss
                loss = cf_output
                # loss = class_loss #  + distance_loss + cat_loss
                self.loss = loss

            ### Get gradient
            # grad = tape.gradient(loss, [amount_cf, ohe_activity_cf, ohe_resource_cf])
            grad = tape.gradient(loss, [ amount_cf ,ohe_activity_cf, ohe_resource_cf])
            self.grad = grad

            ### Update CF to this direction
            optim.apply_gradients(zip(grad, [amount_cf, ohe_activity_cf, ohe_resource_cf]))

            ### Get a valid version of CF
            temp_amount_cf, temp_ohe_activity_cf, temp_ohe_resource_cf = self.get_valid_cf(amount_cf, ohe_activity_cf, ohe_resource_cf)

            #### Get prediction

            cf_pred = round(self.dice_model(
            [
                temp_amount_cf,
                temp_ohe_activity_cf,
                temp_ohe_resource_cf
            ]
            ).numpy()[0, 0])

            if (cf_pred == desired_pred):
                return amount_cf, ohe_activity_cf, ohe_resource_cf
        
    

In [24]:
dice = EventLogDiCE(possible_activities, possible_resources, possbile_amount, dice_binary_model)

In [25]:
dice.generate_counterfactual(example_idx_activities_no_tag, example_idx_resources_no_tag,example_amount_input)

Custom input

| 0 

| 1 
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input
Custom input


KeyboardInterrupt: 

In [68]:
dice.grad

[None, None, None]

In [34]:
dice.loss

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[-0.9920349]], dtype=float32)>

In [22]:
dice.cat_loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [23]:
dice.class_loss

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.9920349], dtype=float32)>

In [24]:
dice.distance_loss

<tf.Tensor: shape=(), dtype=float32, numpy=0.0>

In [215]:
dice.loss

<tf.Tensor: shape=(1,), dtype=float32, numpy=array([0.92006016], dtype=float32)>

In [216]:
dice.grad

[<tf.Tensor: shape=(1,), dtype=float32, numpy=array([nan], dtype=float32)>,
 <tf.Tensor: shape=(8, 23), dtype=float32, numpy=
 array([[ 0.02666074,  0.00151159,  0.00955772, -0.00159479,  0.02104962,
          0.02017802,  0.01435035, -0.008002  , -0.008176  , -0.01699463,
         -0.00565872,  0.00833758,  0.02154505,  0.02183243, -0.01677009,
          0.00957994, -0.00315624,  0.01214932, -0.01439261,  0.00698711,
         -0.01069965,  0.00640788, -0.01145178],
        [ 0.03675214,  0.00269707,  0.01209273, -0.00078596,  0.03306829,
          0.02909263,  0.01737041, -0.01339286, -0.00795034, -0.02580699,
         -0.00672944,  0.00869185,  0.02560805,  0.03053433, -0.02055812,
          0.01240346, -0.00538529,  0.01592245, -0.02185494,  0.00698798,
         -0.014915  ,  0.00777805, -0.01679251],
        [ 0.04881603,  0.00254673,  0.01466103,  0.00074222,  0.05076082,
          0.03938919,  0.01925157, -0.02115978, -0.00724413, -0.0359022 ,
         -0.00716058,  0.00869203,  