In [16]:
# imports
import pandas as pd
import datetime
import re

In [17]:
# This notebook can be run separately from the deliverable tool.
if (('df' not in globals()) or ('df_test' not in globals())):
    df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
    
    df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

    # Defining database-specific variables
    case_column = "case concept:name"
    registration_time_column = "case REG_DATE"
    event_column = "event concept:name"
    timestamp_column = "event time:timestamp"
    timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
    timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"

    # Names of columns we will add in this notebook
    position_column = "Position"
    baseline_la_next_event_column = "Baseline Last Event Prediction for Next Activity" # added in the baseline notebook
    baseline_la_next_timestamp_column = "Baseline Last Event Prediction for Next Timestamp" # added in the baseline notebook
    baseline_pos_next_event_column = "Baseline Last Pos Prediction for Next Activity" # added in the baseline notebook
    baseline_pos_next_timestamp_column = "Baseline Last Pos Prediction for Next Timestamp" # added in the baseline notebook

    
    # We fill in the Position column that shows which position is a certain event in the trace (the first event is 1)
    df[position_column] = df.groupby([case_column]).cumcount()+1
    df_test[position_column] = df_test.groupby([case_column]).cumcount()+1

    # Basic data preprocessing of the timestamps
    df[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df[registration_time_column]]
    df[registration_time_column] = [datetime.datetime.strptime(date, timeformat_registration) for date in df[registration_time_column]]
    df[timestamp_column] = [datetime.datetime.strptime(date, timeformat_timestamp) for date in df[timestamp_column]]
    df = df.sort_values(by=[case_column, timestamp_column]).reset_index() # sort values by user and time of event

    # Basic data preprocessing of the timestamps
    df_test[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df_test[registration_time_column]]
    df_test[registration_time_column] = [datetime.datetime.strptime(date, timeformat_registration) for date in df_test[registration_time_column]]
    df_test[timestamp_column] = [datetime.datetime.strptime(date, timeformat_timestamp) for date in df_test[timestamp_column]]
    df_test = df_test.sort_values(by=[case_column, timestamp_column]).reset_index() # sort values by user and time of event

df.head(10)

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Position,Baseline Last Event Prediction for Next Activity,Baseline Last Event Prediction for Next Timestamp,Baseline Last Pos Prediction for Next Activity,Baseline Last Pos Prediction for Next Timestamp
0,0,0,173688,2011-10-01 00:38:44,20000,A_SUBMITTED,COMPLETE,2011-10-01 00:38:44.546,1,A_PARTLYSUBMITTED,2011-10-01 00:38:45.128640462,A_PARTLYSUBMITTED,2011-10-01 00:38:45.128640462
1,1,1,173688,2011-10-01 00:38:44,20000,A_PARTLYSUBMITTED,COMPLETE,2011-10-01 00:38:44.880,2,W_Afhandelen leads,2011-10-01 00:39:19.982779539,W_Afhandelen leads,2011-10-01 00:39:19.982779539
2,2,2,173688,2011-10-01 00:38:44,20000,A_PREACCEPTED,COMPLETE,2011-10-01 00:39:37.906,3,W_Completeren aanvraag,2011-10-01 00:39:38.408771753,W_Afhandelen leads,2011-10-01 03:30:53.567069032
3,3,3,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,SCHEDULE,2011-10-01 00:39:38.875,4,W_Completeren aanvraag,2011-10-01 17:58:08.032379243,W_Completeren aanvraag,2011-10-01 03:55:31.785088057
4,89,4,173688,2011-10-01 00:38:44,20000,W_Completeren aanvraag,START,2011-10-01 11:36:46.437,5,W_Completeren aanvraag,2011-10-02 04:55:15.594379243,W_Completeren aanvraag,2011-10-01 11:54:36.483873248
5,94,5,173688,2011-10-01 00:38:44,20000,A_ACCEPTED,COMPLETE,2011-10-01 11:42:43.308,6,O_SELECTED,2011-10-01 11:46:27.513278604,W_Completeren aanvraag,2011-10-01 15:21:10.772730102
6,98,7,173688,2011-10-01 00:38:44,20000,A_FINALIZED,COMPLETE,2011-10-01 11:45:09.243,7,O_CREATED,2011-10-01 11:45:11.557131461,W_Completeren aanvraag,2011-10-01 12:49:36.989151176
7,99,6,173688,2011-10-01 00:38:44,20000,O_SELECTED,COMPLETE,2011-10-01 11:45:09.243,8,O_CREATED,2011-10-01 11:45:11.199693633,W_Completeren aanvraag,2011-10-01 15:50:33.483132306
8,100,8,173688,2011-10-01 00:38:44,20000,O_CREATED,COMPLETE,2011-10-01 11:45:11.197,9,O_SENT,2011-10-01 11:45:11.257123109,W_Completeren aanvraag,2011-10-01 14:35:59.469570225
9,101,9,173688,2011-10-01 00:38:44,20000,O_SENT,COMPLETE,2011-10-01 11:45:11.380,10,W_Nabellen offertes,2011-10-01 11:45:11.587961660,W_Completeren aanvraag,2011-10-01 15:40:21.236745516


In [18]:
# Code Explanation: We subtract the timestamps and then shift the results up so the difference from n to n+1 is on row n.
# Only if both events are in the same trace should include the row in the mean() calculation.
shifted_deltatimes = df[timestamp_column].diff().shift(periods=-1)[df[case_column].shift(periods=-1) == df[case_column]]

# The mean() function will return NaT if the input is empty, we replace this with pd.Timedelta(0)
def replacenat(timedelta):
    if (pd.isna(timedelta)):
        return pd.Timedelta(0)
    else:
        return timedelta

# list of unique events in the data
unique_events = df[event_column].unique()

In [19]:
# dictionary to store the most common (mode) event following the key event
dict_common_next_event = {event: df[(df[case_column].shift(periods=-1) == df[case_column]) & (df[event_column].shift(periods=1) == event)][event_column].mode()[0] for event in unique_events}

In [20]:
# We apply the dictionary to the dataset to get the baseline event prediction
df[baseline_la_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else None) for event in df[event_column]]

#df[df[case_column] == 185548]

In [21]:
# dictionary to store the average time for each event
dict_time_per_event = {event: replacenat(shifted_deltatimes[df[event_column] == event].mean()) for event in unique_events}

In [22]:
# apply the average time to the dataframe to get the baseline time prediction
df[baseline_la_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df[event_column], df[timestamp_column])]

#df[df[case_column] == 185548]

In [23]:
# the dictionary dict_events_after_pos stores the counts of all events that occur after each position (if there are multiple modes we pick the first event that is encountered)
# for example dict_events_after_pos[3]['W_Completeren aanvraag'] gives the counts of how often 'W_Completeren aanvraag' occurred after position 3
# there will not be any prediction for the highest position in the dataset: None

dict_events_after_pos = {}

def count_events_after_pos(event_name, position):
    if (position-1 in dict_events_after_pos):
        if (event_name in dict_events_after_pos[position-1]):
            dict_events_after_pos[position-1][event_name] += 1
        else:
            dict_events_after_pos[position-1][event_name] = 1
    else:
        dict_events_after_pos[position-1] = {}
        dict_events_after_pos[position-1][event_name] = 1

[count_events_after_pos(event, pos) for event, pos in zip(df[event_column], df[position_column])]

len(dict_events_after_pos)

175

In [24]:
# We apply the baseline based on position to the dataset
df[baseline_pos_next_event_column] = [(max(dict_events_after_pos[pos], key=dict_events_after_pos[pos].get) if (pos in dict_events_after_pos) else None) for pos in df[position_column]]

#df[df[case_column] == 185548]

In [25]:
# dictionary to store the average time after each position
dict_time_per_pos = {pos: replacenat(shifted_deltatimes[df[position_column] == pos].mean()) for pos in range(1,len(dict_events_after_pos)+1)}
#dict_time_per_pos

In [26]:
# Applying the baseline based on position time prediction to df
df[baseline_pos_next_timestamp_column] = [time + dict_time_per_pos[pos] if (pos in dict_time_per_pos) else time for time,pos in zip(df[timestamp_column], df[position_column])]

df[df[case_column] == 185548]

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Position,Baseline Last Event Prediction for Next Activity,Baseline Last Event Prediction for Next Timestamp,Baseline Last Pos Prediction for Next Activity,Baseline Last Pos Prediction for Next Timestamp
78435,64935,16333760626688,185548,2011-11-15 13:42:45,20000,A_SUBMITTED,COMPLETE,2011-11-15 13:42:45.593,1,A_PARTLYSUBMITTED,2011-11-15 13:42:46.175640462,A_PARTLYSUBMITTED,2011-11-15 13:42:46.175640462
78436,64936,16333760626689,185548,2011-11-15 13:42:45,20000,A_PARTLYSUBMITTED,COMPLETE,2011-11-15 13:42:45.889,2,W_Afhandelen leads,2011-11-15 13:43:20.991779539,W_Afhandelen leads,2011-11-15 13:43:20.991779539
78437,64938,16333760626690,185548,2011-11-15 13:42:45,20000,A_PREACCEPTED,COMPLETE,2011-11-15 13:43:31.963,3,W_Completeren aanvraag,2011-11-15 13:43:32.465771753,W_Afhandelen leads,2011-11-15 16:34:47.624069032
78438,64939,16333760626691,185548,2011-11-15 13:42:45,20000,W_Completeren aanvraag,SCHEDULE,2011-11-15 13:43:32.557,4,W_Completeren aanvraag,2011-11-16 07:02:01.714379243,W_Completeren aanvraag,2011-11-15 16:59:25.467088057
78439,64989,16333760626692,185548,2011-11-15 13:42:45,20000,W_Completeren aanvraag,START,2011-11-15 13:59:33.696,5,W_Completeren aanvraag,2011-11-16 07:18:02.853379243,W_Completeren aanvraag,2011-11-15 14:17:23.742873248
...,...,...,...,...,...,...,...,...,...,...,...,...,...
78605,165456,16333760626858,185548,2011-11-15 13:42:45,20000,W_Nabellen incomplete dossiers,COMPLETE,2012-01-17 10:54:42.893,171,W_Nabellen incomplete dossiers,2012-01-17 18:08:57.358645946,W_Nabellen incomplete dossiers,2012-01-17 11:20:51.763000000
78606,165539,16333760626859,185548,2011-11-15 13:42:45,20000,W_Nabellen incomplete dossiers,START,2012-01-17 11:20:51.763,172,W_Nabellen incomplete dossiers,2012-01-17 18:35:06.228645946,O_CANCELLED,2012-01-17 11:25:56.714000000
78607,165555,16333760626861,185548,2011-11-15 13:42:45,20000,O_CANCELLED,COMPLETE,2012-01-17 11:25:56.714,173,O_SELECTED,2012-01-17 11:25:58.074857292,A_CANCELLED,2012-01-17 11:25:56.714000000
78608,165556,16333760626860,185548,2011-11-15 13:42:45,20000,A_CANCELLED,COMPLETE,2012-01-17 11:25:56.714,174,A_SUBMITTED,2012-01-17 11:28:58.061432222,W_Nabellen incomplete dossiers,2012-01-17 11:26:00.120000000


In [27]:
# Applying the baseline predictions to the test dataset
df_test[baseline_la_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else None) for event in df_test[event_column]]

df_test[baseline_la_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df_test[event_column], df_test[timestamp_column])]

df_test[baseline_pos_next_event_column] = [(max(dict_events_after_pos[pos], key=dict_events_after_pos[pos].get) if (pos in dict_events_after_pos) else None) for pos in df_test[position_column]]

df_test[baseline_pos_next_timestamp_column] = [time + dict_time_per_pos[pos] if (pos in dict_time_per_pos) else time for time,pos in zip(df_test[timestamp_column], df_test[position_column])]

df_test[df_test[case_column] == 206327]

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Position,Baseline Last Event Prediction for Next Activity,Baseline Last Event Prediction for Next Timestamp,Baseline Last Pos Prediction for Next Activity,Baseline Last Pos Prediction for Next Timestamp
3,3,44968307589120,206327,2012-02-03 17:23:41,6000,A_SUBMITTED,COMPLETE,2012-02-03 17:23:41.949,1,A_PARTLYSUBMITTED,2012-02-03 17:23:42.531640462,A_PARTLYSUBMITTED,2012-02-03 17:23:42.531640462
4,4,44968307589121,206327,2012-02-03 17:23:41,6000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:23:42.504,2,W_Afhandelen leads,2012-02-03 17:24:17.606779539,W_Afhandelen leads,2012-02-03 17:24:17.606779539
5,5,44968307589122,206327,2012-02-03 17:23:41,6000,A_PREACCEPTED,COMPLETE,2012-02-03 17:24:23.379,3,W_Completeren aanvraag,2012-02-03 17:24:23.881771753,W_Afhandelen leads,2012-02-03 20:15:39.040069032
6,6,44968307589123,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,SCHEDULE,2012-02-03 17:24:24.052,4,W_Completeren aanvraag,2012-02-04 10:42:53.209379243,W_Completeren aanvraag,2012-02-03 20:40:16.962088057
7,162,44968307589124,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,START,2012-02-03 20:52:43.090,5,W_Completeren aanvraag,2012-02-04 14:11:12.247379243,W_Completeren aanvraag,2012-02-03 21:10:33.136873248
8,167,44968307589125,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,COMPLETE,2012-02-03 21:01:19.935,6,W_Completeren aanvraag,2012-02-04 14:19:49.092379243,W_Completeren aanvraag,2012-02-04 00:39:47.399730102
9,168,44968307589126,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,START,2012-02-03 21:01:31.267,7,W_Completeren aanvraag,2012-02-04 14:20:00.424379243,W_Completeren aanvraag,2012-02-03 22:05:59.013151176
10,169,44968307589127,206327,2012-02-03 17:23:41,6000,A_DECLINED,COMPLETE,2012-02-03 21:06:24.391,8,A_SUBMITTED,2012-02-03 21:06:27.833911034,W_Completeren aanvraag,2012-02-04 01:11:48.631132306
11,170,44968307589128,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,COMPLETE,2012-02-03 21:06:28.094,9,W_Completeren aanvraag,2012-02-04 14:24:57.251379243,W_Completeren aanvraag,2012-02-03 23:57:16.366570225


In [28]:
# Accuracy % of baseline last activity event predictions:

training_event_accuracy = len(df[(df[baseline_la_next_event_column].shift(periods=1)==df[event_column]) & (df[case_column].shift(periods=1) == df[case_column])]) * 100 / len(df[df[case_column].shift(periods=1) == df[case_column]])

test_event_accuracy = len(df_test[(df_test[baseline_la_next_event_column].shift(periods=1)==df_test[event_column]) & (df_test[case_column].shift(periods=1) == df_test[case_column])]) * 100 / len(df_test[df_test[case_column].shift(periods=1) == df_test[case_column]])

training_event_accuracy, test_event_accuracy

(64.9130980638327, 64.06370976661874)

In [29]:
# Mean Absolute Error of baseline last activity time predictions:

training_time_MAE = abs(df[timestamp_column] - df[baseline_la_next_timestamp_column].shift(periods=1))[df[case_column].shift(periods=1) == df[case_column]].mean()

test_time_MAE = abs(df_test[timestamp_column] - df_test[baseline_la_next_timestamp_column].shift(periods=1))[df_test[case_column].shift(periods=1) == df_test[case_column]].mean()

training_time_MAE, test_time_MAE

(Timedelta('0 days 16:20:59.739888261'),
 Timedelta('0 days 14:43:13.416911984'))