In [1]:
# imports
import pandas as pd
import pm4py

In [2]:
# This notebook can be run separately from the deliverable tool.
if ('df' not in globals()):
    df_total = pm4py.convert_to_dataframe(pm4py.read_xes("data/BPI_Challenge_2012.xes"))

    # Percentage cut-off points for training, validation and testing datasets
    training_percentage = 0.70
    validation_percentage = 0.80

    # Defining database-specific variables
    case_column = "case:concept:name"
    registration_time_column = "case:REG_DATE"
    event_column = "concept:name"
    timestamp_column = "time:timestamp"
    amount_column = 'case:AMOUNT_REQ'
    row_nr_column = 'row_nr'
    index_column = "index"
    #timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
    timeformat_timestamp = "%Y-%m-%d %H:%M:%S.%f"
    # additional info for the timePrediction_onTheFly, set to None if you don't want to use it
    lifecycle_column = 'lifecycle:transition'
    amount_column_in_data = True

    # Names of columns we will add in this notebook
    position_column = "Position"
    baseline_la_next_event_column = "Baseline Last Event Prediction for Next Activity" # added in the baseline notebook
    baseline_la_next_timestamp_column = "Baseline Last Event Prediction for Next Timestamp" # added in the baseline notebook
    baseline_pos_next_event_column = "Baseline Last Pos Prediction for Next Activity" # added in the baseline notebook
    baseline_pos_next_timestamp_column = "Baseline Last Pos Prediction for Next Timestamp" # added in the baseline notebook

    # Basic data preprocessing of the timestamps and reg dates
    df_total[timestamp_column] = [pd.Timestamp(time).astimezone(None) for time in df_total[timestamp_column]]
    df_total[registration_time_column]= [pd.Timestamp(reg_time).astimezone(None) for reg_time in df_total[registration_time_column]]
    
    # Turning the case and amount values back into numbers instead of strings
    df_total[case_column] = [int(case) for case in df_total[case_column]]
    if (amount_column_in_data):
        df_total[amount_column] = [int(amount) for amount in df_total[amount_column]]

    df_total = df_total.sort_values(by=[timestamp_column]).reset_index()
    # We fill in the Position column that shows which position is a certain event in the trace (the first event is 1)
    df_total[position_column] = df_total.groupby([case_column]).cumcount()+1

    training_cutoff_timestamp = df_total.iloc[[int(training_percentage*len(df_total))]].iloc[0][timestamp_column]
    validation_cutoff_timestamp = df_total.iloc[[int(validation_percentage*len(df_total))]].iloc[0][timestamp_column]

    dict_overlapping_cases = {}

    def find_overlapping_cases(case, timestamp):
        templist = [timestamp < training_cutoff_timestamp, timestamp > training_cutoff_timestamp, timestamp < validation_cutoff_timestamp, timestamp > validation_cutoff_timestamp]
        if (case in dict_overlapping_cases):
            dict_overlapping_cases[case] = [(a or b) for a,b in zip(templist, dict_overlapping_cases[case])]
        else:
            dict_overlapping_cases[case] = templist

    [find_overlapping_cases(case,time) for case,time in zip(df_total[case_column],df_total[timestamp_column])]

    def case_overlaps_a_timestamp(case):
        return (dict_overlapping_cases[case][0] and dict_overlapping_cases[case][1]) or (dict_overlapping_cases[case][2] and dict_overlapping_cases[case][3])

    list_overlapping_cases = [case for case in dict_overlapping_cases.keys() if case_overlaps_a_timestamp(case)]

    # splitting the data and removing all events belonging to cases that overlap the cutoff timestamps
    df = df_total.iloc[:int(training_percentage*len(df_total))]
    df_validation = df_total.iloc[int(training_percentage*len(df_total)):int(validation_percentage*len(df_total))]
    df_test = df_total.iloc[int(validation_percentage*len(df_total)):]

    df = df[[(case not in list_overlapping_cases) for case in df[case_column]]]
    df_validation = df_validation[[(case not in list_overlapping_cases) for case in df_validation[case_column]]]
    df_test = df_test[[(case not in list_overlapping_cases) for case in df_test[case_column]]]

df.head(10)


parsing log, completed traces :: 100%|██████████| 13087/13087 [00:09<00:00, 1405.59it/s]


Unnamed: 0,index,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,Position
0,0,112,COMPLETE,A_SUBMITTED,2011-09-30 22:38:44.546,2011-09-30 22:38:44.546,173688,20000,1
1,1,112,COMPLETE,A_PARTLYSUBMITTED,2011-09-30 22:38:44.880,2011-09-30 22:38:44.546,173688,20000,2
2,2,112,COMPLETE,A_PREACCEPTED,2011-09-30 22:39:37.906,2011-09-30 22:38:44.546,173688,20000,3
3,3,112,SCHEDULE,W_Completeren aanvraag,2011-09-30 22:39:38.875,2011-09-30 22:38:44.546,173688,20000,4
4,26,112,COMPLETE,A_SUBMITTED,2011-10-01 06:08:58.256,2011-10-01 06:08:58.256,173691,5000,1
5,27,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 06:09:02.195,2011-10-01 06:08:58.256,173691,5000,2
6,28,112,COMPLETE,A_PREACCEPTED,2011-10-01 06:09:56.648,2011-10-01 06:08:58.256,173691,5000,3
7,29,112,SCHEDULE,W_Completeren aanvraag,2011-10-01 06:09:59.578,2011-10-01 06:08:58.256,173691,5000,4
10,124,112,COMPLETE,A_SUBMITTED,2011-10-01 06:11:08.866,2011-10-01 06:11:08.865,173697,15000,1
11,125,112,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 06:11:09.035,2011-10-01 06:11:08.865,173697,15000,2


In [3]:
# This notebook requires the data is sorted in this specific format
df = df.sort_values(by=[case_column, timestamp_column])
df_validation = df_validation.sort_values(by=[case_column, timestamp_column])
df_test = df_test.sort_values(by=[case_column, timestamp_column])

In [4]:
# Code Explanation: We subtract the timestamps and then shift the results up so the difference from n to n+1 is on row n.
# Only if both events are in the same trace should include the row in the mean() calculation.
shifted_deltatimes = df[timestamp_column].diff().shift(periods=-1)[df[case_column].shift(periods=-1) == df[case_column]]

# The mean() function will return NaT if the input is empty, we replace this with pd.Timedelta(0)
def replacenat(timedelta):
    if (pd.isna(timedelta)):
        return pd.Timedelta(0)
    else:
        return timedelta

# list of unique events in the data
unique_events = df[event_column].unique()

In [5]:
# dictionary to store the most common (mode) event following the key event
dict_common_next_event = {event: df[(df[case_column].shift(periods=-1) == df[case_column]) & (df[event_column].shift(periods=1) == event)][event_column].mode()[0] for event in unique_events}

In [6]:
# We apply the dictionary to the dataset to get the baseline event prediction
df[baseline_la_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else None) for event in df[event_column]]

#df[df[case_column] == 185548]

In [7]:
# dictionary to store the average time for each event
dict_time_per_event = {event: replacenat(shifted_deltatimes[df[event_column] == event].mean()) for event in unique_events}

In [8]:
# apply the average time to the dataframe to get the baseline time prediction
df[baseline_la_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df[event_column], df[timestamp_column])]

#df[df[case_column] == 185548]

In [9]:
# the dictionary dict_events_after_pos stores the counts of all events that occur after each position (if there are multiple modes we pick the first event that is encountered)
# for example dict_events_after_pos[3]['W_Completeren aanvraag'] gives the counts of how often 'W_Completeren aanvraag' occurred after position 3
# there will not be any prediction for the highest position in the dataset: None

dict_events_after_pos = {}

def count_events_after_pos(event_name, position):
    if (position-1 in dict_events_after_pos):
        if (event_name in dict_events_after_pos[position-1]):
            dict_events_after_pos[position-1][event_name] += 1
        else:
            dict_events_after_pos[position-1][event_name] = 1
    else:
        dict_events_after_pos[position-1] = {}
        dict_events_after_pos[position-1][event_name] = 1

[count_events_after_pos(event, pos) for event, pos in zip(df[event_column], df[position_column])]

len(dict_events_after_pos)

175

In [10]:
# We apply the baseline based on position to the dataset
df[baseline_pos_next_event_column] = [(max(dict_events_after_pos[pos], key=dict_events_after_pos[pos].get) if (pos in dict_events_after_pos) else None) for pos in df[position_column]]

#df[df[case_column] == 185548]

In [11]:
# dictionary to store the average time after each position
dict_time_per_pos = {pos: replacenat(shifted_deltatimes[df[position_column] == pos].mean()) for pos in range(1,len(dict_events_after_pos)+1)}
#dict_time_per_pos

In [16]:
# Applying the baseline based on position time prediction to df
df[baseline_pos_next_timestamp_column] = [time + dict_time_per_pos[pos] if (pos in dict_time_per_pos) else time for time,pos in zip(df[timestamp_column], df[position_column])]

df[df[case_column] == 185548]

Unnamed: 0,index,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,Position,Baseline Last Event Prediction for Next Activity,Baseline Last Event Prediction for Next Timestamp,Baseline Last Pos Prediction for Next Activity,Baseline Last Pos Prediction for Next Timestamp
64935,78435,112,COMPLETE,A_SUBMITTED,2011-11-15 12:42:45.593,2011-11-15 12:42:45.592,185548,20000,1,A_PARTLYSUBMITTED,2011-11-15 12:42:46.164857703,A_PARTLYSUBMITTED,2011-11-15 12:42:46.164857703
64936,78436,112,COMPLETE,A_PARTLYSUBMITTED,2011-11-15 12:42:45.889,2011-11-15 12:42:45.592,185548,20000,2,W_Afhandelen leads,2011-11-15 12:43:21.091944045,W_Afhandelen leads,2011-11-15 12:43:21.091944045
64938,78437,112,COMPLETE,A_PREACCEPTED,2011-11-15 12:43:31.963,2011-11-15 12:42:45.592,185548,20000,3,W_Completeren aanvraag,2011-11-15 12:43:32.469518229,W_Afhandelen leads,2011-11-15 15:35:17.232165022
64939,78438,112,SCHEDULE,W_Completeren aanvraag,2011-11-15 12:43:32.557,2011-11-15 12:42:45.592,185548,20000,4,W_Completeren aanvraag,2011-11-16 04:59:47.289742381,W_Completeren aanvraag,2011-11-15 15:47:32.685714106
64989,78439,11189,START,W_Completeren aanvraag,2011-11-15 12:59:33.696,2011-11-15 12:42:45.592,185548,20000,5,W_Completeren aanvraag,2011-11-16 05:15:48.428742381,W_Completeren aanvraag,2011-11-15 13:18:41.789873962
...,...,...,...,...,...,...,...,...,...,...,...,...,...
165456,78605,,COMPLETE,W_Nabellen incomplete dossiers,2012-01-17 09:54:42.893,2011-11-15 12:42:45.592,185548,20000,171,W_Nabellen incomplete dossiers,2012-01-17 16:18:52.242973221,W_Nabellen incomplete dossiers,2012-01-17 10:20:51.763000000
165539,78606,10910,START,W_Nabellen incomplete dossiers,2012-01-17 10:20:51.763,2011-11-15 12:42:45.592,185548,20000,172,W_Nabellen incomplete dossiers,2012-01-17 16:45:01.112973221,A_CANCELLED,2012-01-17 10:25:56.714000000
165555,78607,10910,COMPLETE,A_CANCELLED,2012-01-17 10:25:56.714,2011-11-15 12:42:45.592,185548,20000,173,O_CANCELLED,2012-01-17 10:27:48.758042407,O_CANCELLED,2012-01-17 10:25:56.714000000
165556,78608,10910,COMPLETE,O_CANCELLED,2012-01-17 10:25:56.714,2011-11-15 12:42:45.592,185548,20000,174,O_SELECTED,2012-01-17 10:27:19.622127777,W_Nabellen incomplete dossiers,2012-01-17 10:26:00.120000000


In [21]:
# Applying the baseline predictions to the test dataset
df_test[baseline_la_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else None) for event in df_test[event_column]]

df_test[baseline_la_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df_test[event_column], df_test[timestamp_column])]

df_test[baseline_pos_next_event_column] = [(max(dict_events_after_pos[pos], key=dict_events_after_pos[pos].get) if (pos in dict_events_after_pos) else None) for pos in df_test[position_column]]

df_test[baseline_pos_next_timestamp_column] = [time + dict_time_per_pos[pos] if (pos in dict_time_per_pos) else time for time,pos in zip(df_test[timestamp_column], df_test[position_column])]

#df_test[df_test[case_column] == 207867]

Unnamed: 0,index,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,Position,Baseline Last Event Prediction for Next Activity,Baseline Last Event Prediction for Next Timestamp,Baseline Last Pos Prediction for Next Activity,Baseline Last Pos Prediction for Next Timestamp
209801,224419,112,COMPLETE,A_SUBMITTED,2012-02-09 14:53:44.864,2012-02-09 14:53:44.863,207867,7500,1,A_PARTLYSUBMITTED,2012-02-09 14:53:45.435857703,A_PARTLYSUBMITTED,2012-02-09 14:53:45.435857703
209802,224420,112,COMPLETE,A_PARTLYSUBMITTED,2012-02-09 14:53:46.192,2012-02-09 14:53:44.863,207867,7500,2,W_Afhandelen leads,2012-02-09 14:54:21.394944045,W_Afhandelen leads,2012-02-09 14:54:21.394944045
209806,224421,112,SCHEDULE,W_Afhandelen leads,2012-02-09 14:54:24.885,2012-02-09 14:53:44.863,207867,7500,3,W_Afhandelen leads,2012-02-09 17:20:12.346444455,W_Afhandelen leads,2012-02-09 17:46:10.154165022
209815,224422,11201,START,W_Afhandelen leads,2012-02-09 14:55:43.718,2012-02-09 14:53:44.863,207867,7500,4,W_Afhandelen leads,2012-02-09 17:21:31.179444455,W_Completeren aanvraag,2012-02-09 17:59:43.846714106
209819,224423,11201,COMPLETE,W_Afhandelen leads,2012-02-09 14:59:41.144,2012-02-09 14:53:44.863,207867,7500,5,W_Afhandelen leads,2012-02-09 17:25:28.605444455,W_Completeren aanvraag,2012-02-09 15:18:49.237873962
209820,224424,11009,START,W_Afhandelen leads,2012-02-09 14:59:48.825,2012-02-09 14:53:44.863,207867,7500,6,W_Afhandelen leads,2012-02-09 17:25:36.286444455,W_Completeren aanvraag,2012-02-09 18:00:10.223733982
209821,224425,11009,COMPLETE,A_PREACCEPTED,2012-02-09 15:00:04.880,2012-02-09 14:53:44.863,207867,7500,7,W_Completeren aanvraag,2012-02-09 15:00:05.386518229,W_Completeren aanvraag,2012-02-09 15:59:21.513123116
209822,224426,11009,SCHEDULE,W_Completeren aanvraag,2012-02-09 15:00:04.924,2012-02-09 14:53:44.863,207867,7500,8,W_Completeren aanvraag,2012-02-10 07:16:19.656742381,W_Completeren aanvraag,2012-02-09 18:27:19.581407947
209823,224427,11009,COMPLETE,W_Afhandelen leads,2012-02-09 15:00:06.263,2012-02-09 14:53:44.863,207867,7500,9,W_Afhandelen leads,2012-02-09 17:25:53.724444455,W_Completeren aanvraag,2012-02-09 17:36:36.466666589
209824,224428,11009,START,W_Completeren aanvraag,2012-02-09 15:00:17.422,2012-02-09 14:53:44.863,207867,7500,10,W_Completeren aanvraag,2012-02-10 07:16:32.154742381,W_Completeren aanvraag,2012-02-09 18:45:54.002186086


In [22]:
# Applying the baseline predictions to the validation dataset
df_validation[baseline_la_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else None) for event in df_validation[event_column]]

df_validation[baseline_la_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df_validation[event_column], df_validation[timestamp_column])]

df_validation[baseline_pos_next_event_column] = [(max(dict_events_after_pos[pos], key=dict_events_after_pos[pos].get) if (pos in dict_events_after_pos) else None) for pos in df_validation[position_column]]

df_validation[baseline_pos_next_timestamp_column] = [time + dict_time_per_pos[pos] if (pos in dict_time_per_pos) else time for time,pos in zip(df_validation[timestamp_column], df_validation[position_column])]

#df_validation[df_validation[case_column] == 204164]

Unnamed: 0,index,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ,Position,Baseline Last Event Prediction for Next Activity,Baseline Last Event Prediction for Next Timestamp,Baseline Last Pos Prediction for Next Activity,Baseline Last Pos Prediction for Next Timestamp
183607,199066,112,COMPLETE,A_SUBMITTED,2012-01-26 10:25:29.187,2012-01-26 10:25:29.187,204164,5000,1,A_PARTLYSUBMITTED,2012-01-26 10:25:29.758857703,A_PARTLYSUBMITTED,2012-01-26 10:25:29.758857703
183608,199067,112,COMPLETE,A_PARTLYSUBMITTED,2012-01-26 10:25:30.017,2012-01-26 10:25:29.187,204164,5000,2,W_Afhandelen leads,2012-01-26 10:26:05.219944045,W_Afhandelen leads,2012-01-26 10:26:05.219944045
183611,199068,112,SCHEDULE,W_Afhandelen leads,2012-01-26 10:26:08.967,2012-01-26 10:25:29.187,204164,5000,3,W_Afhandelen leads,2012-01-26 12:51:56.428444455,W_Afhandelen leads,2012-01-26 13:17:54.236165022
183764,199069,11000,START,W_Afhandelen leads,2012-01-26 10:56:04.244,2012-01-26 10:25:29.187,204164,5000,4,W_Afhandelen leads,2012-01-26 13:21:51.705444455,W_Completeren aanvraag,2012-01-26 14:00:04.372714106
183775,199070,11000,COMPLETE,A_DECLINED,2012-01-26 10:58:24.086,2012-01-26 10:25:29.187,204164,5000,5,A_SUBMITTED,2012-01-26 10:58:27.553965572,W_Completeren aanvraag,2012-01-26 11:17:32.179873962
...,...,...,...,...,...,...,...,...,...,...,...,...,...
209718,224385,112,COMPLETE,A_PARTLYSUBMITTED,2012-02-09 14:26:39.338,2012-02-09 14:26:38.926,207861,6000,2,W_Afhandelen leads,2012-02-09 14:27:14.540944045,W_Afhandelen leads,2012-02-09 14:27:14.540944045
209719,224386,112,SCHEDULE,W_Afhandelen leads,2012-02-09 14:27:13.112,2012-02-09 14:26:38.926,207861,6000,3,W_Afhandelen leads,2012-02-09 16:53:00.573444455,W_Afhandelen leads,2012-02-09 17:18:58.381165022
209729,224387,11201,START,W_Afhandelen leads,2012-02-09 14:32:50.461,2012-02-09 14:26:38.926,207861,6000,4,W_Afhandelen leads,2012-02-09 16:58:37.922444455,W_Completeren aanvraag,2012-02-09 17:36:50.589714106
209731,224388,11201,COMPLETE,A_DECLINED,2012-02-09 14:35:38.574,2012-02-09 14:26:38.926,207861,6000,5,A_SUBMITTED,2012-02-09 14:35:42.041965572,W_Completeren aanvraag,2012-02-09 14:54:46.667873962


In [23]:
# Accuracy % of baseline last activity event predictions:
training_event_accuracy = len(df[(df[baseline_la_next_event_column].shift(periods=1)==df[event_column]) & (df[case_column].shift(periods=1) == df[case_column])]) * 100 / len(df[df[case_column].shift(periods=1) == df[case_column]])

validation_event_accuracy = len(df_validation[(df_validation[baseline_la_next_event_column].shift(periods=1)==df_validation[event_column]) & (df_validation[case_column].shift(periods=1) == df_validation[case_column])]) * 100 / len(df_validation[df_validation[case_column].shift(periods=1) == df_validation[case_column]])

test_event_accuracy = len(df_test[(df_test[baseline_la_next_event_column].shift(periods=1)==df_test[event_column]) & (df_test[case_column].shift(periods=1) == df_test[case_column])]) * 100 / len(df_test[df_test[case_column].shift(periods=1) == df_test[case_column]])

training_event_accuracy, validation_event_accuracy, test_event_accuracy

(64.24752669855205, 59.11699779249448, 64.18195075438892)

In [24]:
# Mean Absolute Error of baseline last activity time predictions:
training_time_MAE = abs(df[timestamp_column] - df[baseline_la_next_timestamp_column].shift(periods=1))[df[case_column].shift(periods=1) == df[case_column]].mean()

validation_time_MAE = abs(df_validation[timestamp_column] - df_validation[baseline_la_next_timestamp_column].shift(periods=1))[df_validation[case_column].shift(periods=1) == df_validation[case_column]].mean()

test_time_MAE = abs(df_test[timestamp_column] - df_test[baseline_la_next_timestamp_column].shift(periods=1))[df_test[case_column].shift(periods=1) == df_test[case_column]].mean()

training_time_MAE, validation_time_MAE, test_time_MAE

(Timedelta('0 days 14:53:49.471772992'),
 Timedelta('0 days 06:49:34.299488526'),
 Timedelta('0 days 13:56:11.855148595'))