In [4]:
from parameters.dataset import BPI2012Parameters
from dataset import BPI2012Dataset
from parameters.enum import BPI2012ActivityType
from utils.bpi2012 import print_model_prediction_result, remove_tags_for_seq, get_example_data_with_removed_tail, generate_fake_df


In [36]:
dataset = BPI2012Dataset(BPI2012Parameters(
    include_complete_only=True,
    include_types=[BPI2012ActivityType.A, BPI2012ActivityType.O, BPI2012ActivityType.W],
)) 


| Preprocessed data loaded successfully: ./data/preprocessed/BPI_Challenge_2012_with_resource/AOW_CompleteOnly 


In [35]:
# estimate how many features we will have.
all_activity_resource_combinations= []
for instance in dataset.df.iloc:
    for a, r in zip(instance["activity_vocab"], instance["resource_vocab"]):
        combination = f"{a}_{r}"
        if not (combination) in all_activity_resource_combinations:
            all_activity_resource_combinations.append(combination)

print(f"All {len(all_activity_resource_combinations)} combinations.")

## Too many combination. We better not doing this.

All 615 combinations.


In [None]:
# We don't need padding anymore.

In [81]:
trace_a = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']
trace_b = ['a', 'e', 'h']

In [None]:
### Calculate distance for both trace.


In [78]:
# features.
new_count_df = []

for instance in dataset.df.iloc:
    count_map = {}
    for a in dataset.activity_vocab.vocabs:
        count_map[a] = instance.activity_vocab.count(a)

    new_count_df.append(count_map)
        # We count it.


In [79]:
import pandas as pd

In [80]:
pd.DataFrame(new_count_df)

Unnamed: 0,<PAD>,<EOS>,<SOS>,A_ACCEPTED_COMPLETE,A_ACTIVATED_COMPLETE,A_APPROVED_COMPLETE,A_CANCELLED_COMPLETE,A_DECLINED_COMPLETE,A_FINALIZED_COMPLETE,A_PARTLYSUBMITTED_COMPLETE,...,O_DECLINED_COMPLETE,O_SELECTED_COMPLETE,O_SENT_BACK_COMPLETE,O_SENT_COMPLETE,W_Afhandelen leads_COMPLETE,W_Beoordelen fraude_COMPLETE,W_Completeren aanvraag_COMPLETE,W_Nabellen incomplete dossiers_COMPLETE,W_Nabellen offertes_COMPLETE,W_Valideren aanvraag_COMPLETE
0,0,1,1,1,1,1,0,0,1,1,...,0,1,1,1,0,0,1,0,3,1
1,0,1,1,1,1,1,0,0,1,1,...,0,2,1,2,0,0,2,0,4,3
2,0,1,1,1,1,1,0,0,1,1,...,0,3,1,3,0,0,4,0,11,1
3,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13082,0,1,1,1,0,0,0,0,1,1,...,0,2,1,2,0,0,2,0,4,0
13083,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
13084,0,1,1,0,0,0,0,1,0,1,...,0,0,0,0,1,0,0,0,0,0
13085,0,1,1,1,0,0,0,0,1,1,...,0,1,0,1,1,0,1,0,1,0


In [55]:
dataset.activity_vocab.vocabs

['<PAD>',
 '<EOS>',
 '<SOS>',
 'A_ACCEPTED_COMPLETE',
 'A_ACTIVATED_COMPLETE',
 'A_APPROVED_COMPLETE',
 'A_CANCELLED_COMPLETE',
 'A_DECLINED_COMPLETE',
 'A_FINALIZED_COMPLETE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'A_PREACCEPTED_COMPLETE',
 'A_REGISTERED_COMPLETE',
 'A_SUBMITTED_COMPLETE',
 'O_ACCEPTED_COMPLETE',
 'O_CANCELLED_COMPLETE',
 'O_CREATED_COMPLETE',
 'O_DECLINED_COMPLETE',
 'O_SELECTED_COMPLETE',
 'O_SENT_BACK_COMPLETE',
 'O_SENT_COMPLETE',
 'W_Afhandelen leads_COMPLETE',
 'W_Beoordelen fraude_COMPLETE',
 'W_Completeren aanvraag_COMPLETE',
 'W_Nabellen incomplete dossiers_COMPLETE',
 'W_Nabellen offertes_COMPLETE',
 'W_Valideren aanvraag_COMPLETE']

In [76]:
# from above cell we know W can be in 6 positions.

# What's last A milestones

W_activities = [
    'W_Afhandelen leads_COMPLETE',
    'W_Beoordelen fraude_COMPLETE',
    'W_Completeren aanvraag_COMPLETE',
    'W_Nabellen incomplete dossiers_COMPLETE',
    'W_Nabellen offertes_COMPLETE',
    'W_Valideren aanvraag_COMPLETE'
]

A_activities = [
    'A_ACCEPTED_COMPLETE',
    'A_ACTIVATED_COMPLETE',
    'A_APPROVED_COMPLETE',
    'A_CANCELLED_COMPLETE',
    'A_DECLINED_COMPLETE',
    'A_FINALIZED_COMPLETE',
    'A_PARTLYSUBMITTED_COMPLETE',
    'A_PREACCEPTED_COMPLETE',
    'A_REGISTERED_COMPLETE',
]

W_position_map = {}
# Initialise W map

for w_a in W_activities:
    W_position_map[w_a] = []
    # The element should be start_end. And we only consider "A"


for instance in dataset.df.iloc:
    instance_activities = instance["activity_vocab"]
    for idx, a in enumerate(instance_activities):
        if a.startswith("W_"):
            # I have to check the starting milestone.
            for start in instance_activities[0: idx][::-1]:
                if start.startswith("A_"):
                    starting_A = start
                    break

            # Ending milestone
            for end in instance_activities[idx+1:]:
                if end.startswith("A_"):
                    ending_A = end
                    break

            inMiddleOf = f"{starting_A}_&_{ending_A}"
            
            if not ( inMiddleOf in W_position_map[a]):
                W_position_map[a].append(inMiddleOf)
            


In [77]:
W_position_map # Just count the existance count.

{'W_Afhandelen leads_COMPLETE': ['A_PREACCEPTED_COMPLETE_&_A_DECLINED_COMPLETE',
  'A_PREACCEPTED_COMPLETE_&_A_CANCELLED_COMPLETE',
  'A_PARTLYSUBMITTED_COMPLETE_&_A_PREACCEPTED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_ACTIVATED_COMPLETE',
  'A_PREACCEPTED_COMPLETE_&_A_ACCEPTED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_APPROVED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_DECLINED_COMPLETE',
  'A_PARTLYSUBMITTED_COMPLETE_&_A_DECLINED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_CANCELLED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_REGISTERED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_ACCEPTED_COMPLETE',
  'A_PARTLYSUBMITTED_COMPLETE_&_A_CANCELLED_COMPLETE',
  'A_PREACCEPTED_COMPLETE_&_A_PREACCEPTED_COMPLETE',
  'A_PREACCEPTED_COMPLETE_&_A_ACTIVATED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_FINALIZED_COMPLETE',
  'A_PREACCEPTED_COMPLETE_&_A_APPROVED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_PREACCEPTED_COMPLETE'],
 'W_Beoordelen fraude_COMPLETE': ['A_PREACCEPTED_COMPLETE_&_A_DECLINED_COMPLETE',
  'A_DECLINED_COMPLETE_&_A_DECL

In [69]:
[1, 2,3,4,5,6][0:][::-1]

[6, 5, 4, 3, 2, 1]

In [46]:
milestones=[
        "A_SUBMITTED_COMPLETE",
        "A_PARTLYSUBMITTED_COMPLETE",
        "A_PREACCEPTED_COMPLETE",
        "A_ACCEPTED_COMPLETE",
        "A_FINALIZED_COMPLETE",
        # "O_SELECTED_COMPLETE",
        # "O_CREATED_COMPLETE",
        # "O_SENT_COMPLETE",
        # "O_SENT_BACK_COMPLETE",
        "A_APPROVED_COMPLETE",
        "A_ACTIVATED_COMPLETE",
        "A_REGISTERED_COMPLETE",
    ]

### Check that all milestones will not repeat





In [47]:
for instance in dataset.df.iloc:
    for m in milestones:
        if instance["activity_vocab"].count(m) > 1:
            print(m)
            print("============")
            print(instance["activity_vocab"])
            print("============")
            print(instance["activitay_vocab"].count(m))
            raise StopIteration

## All the A instance will not repeat.
## So we have to check what's the last instance of A.

### Question:
## For resource, how we manage it?
## 1. It has too much comination so we can't make it a categorical data.
## 2. If we don't use resource. We lost some of the information that may help us.
## But how can we use resource here.
## Resource is highly related to the activity.
## Let's see what each resource can do
## 3. 

In [49]:
# what each resource can do
resource_do_activity = {}
for instance in dataset.df.iloc:
    for a, r in zip(instance["activity_vocab"], instance["resource_vocab"]):
        if not (r in resource_do_activity.keys()):
            resource_do_activity[r] = []

        if not (a in resource_do_activity[r]):
            resource_do_activity[r].append(a)

In [53]:
## Each resource can do.
import numpy as np
print(np.mean([len(v) for v in resource_do_activity.values()]))

12.366197183098592


In [8]:
dataset.df

Unnamed: 0,activity,activity_vocab,caseid,amount,resource,resource_vocab
0,"[2, 12, 9, 10, 3, 17, 8, 15, 19, 22, 24, 24, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173688,20000.0,"[70, 53, 53, 53, 14, 14, 14, 14, 14, 71, 71, 2...","[<SOS>, 112, 112, 112, 10862, 10862, 10862, 10..."
1,"[2, 12, 9, 10, 22, 3, 8, 17, 15, 19, 22, 24, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173691,5000.0,"[70, 53, 53, 53, 71, 14, 14, 14, 14, 14, 71, 7...","[<SOS>, 112, 112, 112, UNKNOWN, 10862, 10862, ..."
2,"[2, 12, 9, 10, 22, 22, 22, 3, 17, 8, 15, 19, 2...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173694,7000.0,"[70, 53, 53, 53, 22, 71, 55, 55, 55, 55, 55, 5...","[<SOS>, 112, 112, 112, 10912, UNKNOWN, 11201, ..."
3,"[2, 12, 9, 7, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173697,15000.0,"[70, 53, 53, 53, 69]","[<SOS>, 112, 112, 112, <EOS>]"
4,"[2, 12, 9, 7, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",173700,5000.0,"[70, 53, 53, 53, 69]","[<SOS>, 112, 112, 112, <EOS>]"
...,...,...,...,...,...,...
13082,"[2, 12, 9, 10, 22, 3, 8, 17, 15, 19, 22, 17, 1...","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",214364,5000.0,"[70, 53, 53, 53, 48, 37, 37, 37, 37, 37, 37, 3...","[<SOS>, 112, 112, 112, 11169, 11003, 11003, 11..."
13083,"[2, 12, 9, 7, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",214367,500.0,"[70, 53, 53, 53, 69]","[<SOS>, 112, 112, 112, <EOS>]"
13084,"[2, 12, 9, 7, 20, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",214370,20000.0,"[70, 53, 53, 48, 48, 69]","[<SOS>, 112, 112, 11169, 11169, <EOS>]"
13085,"[2, 12, 9, 10, 20, 3, 17, 8, 15, 19, 22, 24, 1]","[<SOS>, A_SUBMITTED_COMPLETE, A_PARTLYSUBMITTE...",214373,8500.0,"[70, 53, 53, 48, 48, 28, 28, 28, 28, 28, 28, 4...","[<SOS>, 112, 112, 11169, 11169, 10933, 10933, ..."


False

In [20]:
dataset.activity_vocab.vocabs

['<PAD>',
 '<EOS>',
 '<SOS>',
 'A_ACCEPTED_COMPLETE',
 'A_ACTIVATED_COMPLETE',
 'A_APPROVED_COMPLETE',
 'A_CANCELLED_COMPLETE',
 'A_DECLINED_COMPLETE',
 'A_FINALIZED_COMPLETE',
 'A_PARTLYSUBMITTED_COMPLETE',
 'A_PREACCEPTED_COMPLETE',
 'A_REGISTERED_COMPLETE',
 'A_SUBMITTED_COMPLETE',
 'O_ACCEPTED_COMPLETE',
 'O_CANCELLED_COMPLETE',
 'O_CREATED_COMPLETE',
 'O_DECLINED_COMPLETE',
 'O_SELECTED_COMPLETE',
 'O_SENT_BACK_COMPLETE',
 'O_SENT_COMPLETE',
 'W_Afhandelen leads_COMPLETE',
 'W_Beoordelen fraude_COMPLETE',
 'W_Completeren aanvraag_COMPLETE',
 'W_Nabellen incomplete dossiers_COMPLETE',
 'W_Nabellen offertes_COMPLETE',
 'W_Valideren aanvraag_COMPLETE']

In [22]:
print("The activity that everyone have")
for v in dataset.activity_vocab.vocabs:
    if len((dataset.df[dataset.df['activity_vocab'].apply(
        lambda a: v in a)])) == len(dataset.df):
        print(v)


<EOS>
<SOS>
A_PARTLYSUBMITTED_COMPLETE
A_SUBMITTED_COMPLETE


In [25]:
len(dataset.resource_vocab.vocabs) # We got 72 different resource

72

In [7]:
(
    example_vocab_activities,
    example_idx_activities,
    example_vocab_resources,
    example_idx_resources,
    example_amount,
    ground_truth_vocab
) = get_example_data_with_removed_tail(
    dataset,
    trace_idx=52,
    tail_length_to_remove=4
) 


| [['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SENT_BACK_COMPLETE', 'W_Nabellen offertes_COMPLETE', 'O_ACCEPTED_COMPLETE', 'A_APPROVED_COMPLETE', 'A_REGISTERED_COMPLETE', 'A_ACTIVATED_COMPLETE', 'W_Valideren aanvraag_COMPLETE']] 

| [['<SOS>', '112', '112', '112', '11180', '11201', '11201', '11201', '11201', '11201', '11201', '11201', '11049', '11049', '10629', '10629', '10629', '10629', '10629']] 

| [15500.0] 

| ['<SOS>', 'A_SUBMITTED_COMPLETE', 'A_PARTLYSUBMITTED_COMPLETE', 'A_PREACCEPTED_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'A_ACCEPTED_COMPLETE', 'A_FINALIZED_COMPLETE', 'O_SELECTED_COMPLETE', 'O_CREATED_COMPLETE', 'O_SENT_COMPLETE', 'W_Completeren aanvraag_COMPLETE', 'O_SENT_B