In [1]:
# imports
import pandas as pd
from datetime import datetime
import re

In [2]:
# This notebook can be run separately from the deliverable tool.
if (('df' not in globals()) or ('df_test' not in globals())):
    df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
    
    df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

    # Defining database-specific variables
    case_column = "case concept:name"
    registration_time_column = "case REG_DATE"
    event_column = "event concept:name"
    timestamp_column = "event time:timestamp"
    timeformat_registration = "%Y-%m-%dT%H:%M:%S" # new time format
    timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"

    # Names of columns we will add in this notebook
    position_column = "Position"
    baseline_next_event_column = "Baseline Prediction for Next Activity"
    baseline_next_timestamp_column = "Baseline Prediction for Next Timestamp"

df.head(10)

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
5,4294967297,173691,2011-10-01T08:08:58.256+02:00,5000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:09:02.195
6,4294967298,173691,2011-10-01T08:08:58.256+02:00,5000,A_PREACCEPTED,COMPLETE,01-10-2011 08:09:56.648
7,4294967299,173691,2011-10-01T08:08:58.256+02:00,5000,W_Completeren aanvraag,SCHEDULE,01-10-2011 08:09:59.578
8,8589934592,173694,2011-10-01T08:10:30.287+02:00,7000,A_SUBMITTED,COMPLETE,01-10-2011 08:10:30.287
9,8589934593,173694,2011-10-01T08:10:30.287+02:00,7000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:10:30.591


In [3]:
# Basic data preprocessing of the timestamps
df[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df[registration_time_column]]
df[registration_time_column] = [datetime.strptime(date, timeformat_registration) for date in df[registration_time_column]]
df[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in df[timestamp_column]]
df = df.sort_values(by=[case_column, timestamp_column]).reset_index() # sort values by user and time of event
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214377 entries, 0 to 214376
Data columns (total 8 columns):
 #   Column                      Non-Null Count   Dtype         
---  ------                      --------------   -----         
 0   index                       214377 non-null  int64         
 1   eventID                     214377 non-null  int64         
 2   case concept:name           214377 non-null  int64         
 3   case REG_DATE               214377 non-null  datetime64[ns]
 4   case AMOUNT_REQ             214377 non-null  int64         
 5   event concept:name          214377 non-null  object        
 6   event lifecycle:transition  214377 non-null  object        
 7   event time:timestamp        214377 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(4), object(2)
memory usage: 13.1+ MB


In [4]:
# Basic data preprocessing of the timestamps
df_test[registration_time_column]= [re.sub('\..*|\+.*','',a,flags=re.DOTALL) for a in df_test[registration_time_column]]
df_test[registration_time_column] = [datetime.strptime(date, timeformat_registration) for date in df_test[registration_time_column]]
df_test[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in df_test[timestamp_column]]
df_test = df_test.sort_values(by=[case_column, timestamp_column]).reset_index() # sort values by user and time of event
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 47823 entries, 0 to 47822
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   index                       47823 non-null  int64         
 1   eventID                     47823 non-null  int64         
 2   case concept:name           47823 non-null  int64         
 3   case REG_DATE               47823 non-null  datetime64[ns]
 4   case AMOUNT_REQ             47823 non-null  int64         
 5   event concept:name          47823 non-null  object        
 6   event lifecycle:transition  47823 non-null  object        
 7   event time:timestamp        47823 non-null  datetime64[ns]
dtypes: datetime64[ns](2), int64(4), object(2)
memory usage: 2.9+ MB


In [5]:
# We fill in the Position column that shows which position is a certain event in the trace (the first event is 1)
df[position_column] = df.groupby([case_column]).cumcount()+1
df_test[position_column] = df_test.groupby([case_column]).cumcount()+1

In [6]:
# Code Explanation: We subtract the timestamps and then shift the results up so the difference from n to n+1 is on row n.
# Only if both events are in the same trace should include the row in the mean() calculation.
shifted_deltatimes = df[timestamp_column].diff().shift(periods=-1)[df[case_column].shift(periods=-1) == df[case_column]]

# The mean() function will return NaT if the input is empty, we replace this with pd.Timedelta(0)
def replacenat(timedelta):
    if (pd.isna(timedelta)):
        return pd.Timedelta(0)
    else:
        return timedelta

# list of unique events in the data
unique_events = df[event_column].unique()

In [7]:
# dictionary to store the most common (mode) event following the key event
dict_common_next_event = {event: df[(df[case_column].shift(periods=-1) == df[case_column]) & (df[event_column].shift(periods=1) == event)][event_column].mode()[0] for event in unique_events}

In [8]:
# We apply the dictionary to the dataset to get the baseline event prediction
df[baseline_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else "-") for event in df[event_column]]

df[df[case_column] == 185548]

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Position,Baseline Prediction for Next Activity
78435,64935,16333760626688,185548,2011-11-15 13:42:45,20000,A_SUBMITTED,COMPLETE,2011-11-15 13:42:45.593,1,A_PARTLYSUBMITTED
78436,64936,16333760626689,185548,2011-11-15 13:42:45,20000,A_PARTLYSUBMITTED,COMPLETE,2011-11-15 13:42:45.889,2,W_Afhandelen leads
78437,64938,16333760626690,185548,2011-11-15 13:42:45,20000,A_PREACCEPTED,COMPLETE,2011-11-15 13:43:31.963,3,W_Completeren aanvraag
78438,64939,16333760626691,185548,2011-11-15 13:42:45,20000,W_Completeren aanvraag,SCHEDULE,2011-11-15 13:43:32.557,4,W_Completeren aanvraag
78439,64989,16333760626692,185548,2011-11-15 13:42:45,20000,W_Completeren aanvraag,START,2011-11-15 13:59:33.696,5,W_Completeren aanvraag
...,...,...,...,...,...,...,...,...,...,...
78605,165456,16333760626858,185548,2011-11-15 13:42:45,20000,W_Nabellen incomplete dossiers,COMPLETE,2012-01-17 10:54:42.893,171,W_Nabellen incomplete dossiers
78606,165539,16333760626859,185548,2011-11-15 13:42:45,20000,W_Nabellen incomplete dossiers,START,2012-01-17 11:20:51.763,172,W_Nabellen incomplete dossiers
78607,165555,16333760626861,185548,2011-11-15 13:42:45,20000,O_CANCELLED,COMPLETE,2012-01-17 11:25:56.714,173,O_SELECTED
78608,165556,16333760626860,185548,2011-11-15 13:42:45,20000,A_CANCELLED,COMPLETE,2012-01-17 11:25:56.714,174,A_SUBMITTED


In [9]:
# dictionary to store the average time for each event
dict_time_per_event = {event: replacenat(shifted_deltatimes[df[event_column] == event].mean()) for event in unique_events}

In [10]:
# apply the average time to the dataframe to get the baseline time prediction
df[baseline_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df[event_column], df[timestamp_column])]

df[df[case_column] == 185548]

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Position,Baseline Prediction for Next Activity,Baseline Prediction for Next Timestamp
78435,64935,16333760626688,185548,2011-11-15 13:42:45,20000,A_SUBMITTED,COMPLETE,2011-11-15 13:42:45.593,1,A_PARTLYSUBMITTED,2011-11-15 13:42:46.175640462
78436,64936,16333760626689,185548,2011-11-15 13:42:45,20000,A_PARTLYSUBMITTED,COMPLETE,2011-11-15 13:42:45.889,2,W_Afhandelen leads,2011-11-15 13:43:20.991779539
78437,64938,16333760626690,185548,2011-11-15 13:42:45,20000,A_PREACCEPTED,COMPLETE,2011-11-15 13:43:31.963,3,W_Completeren aanvraag,2011-11-15 13:43:32.465771753
78438,64939,16333760626691,185548,2011-11-15 13:42:45,20000,W_Completeren aanvraag,SCHEDULE,2011-11-15 13:43:32.557,4,W_Completeren aanvraag,2011-11-16 07:02:01.714379243
78439,64989,16333760626692,185548,2011-11-15 13:42:45,20000,W_Completeren aanvraag,START,2011-11-15 13:59:33.696,5,W_Completeren aanvraag,2011-11-16 07:18:02.853379243
...,...,...,...,...,...,...,...,...,...,...,...
78605,165456,16333760626858,185548,2011-11-15 13:42:45,20000,W_Nabellen incomplete dossiers,COMPLETE,2012-01-17 10:54:42.893,171,W_Nabellen incomplete dossiers,2012-01-17 18:08:57.358645946
78606,165539,16333760626859,185548,2011-11-15 13:42:45,20000,W_Nabellen incomplete dossiers,START,2012-01-17 11:20:51.763,172,W_Nabellen incomplete dossiers,2012-01-17 18:35:06.228645946
78607,165555,16333760626861,185548,2011-11-15 13:42:45,20000,O_CANCELLED,COMPLETE,2012-01-17 11:25:56.714,173,O_SELECTED,2012-01-17 11:25:58.074857292
78608,165556,16333760626860,185548,2011-11-15 13:42:45,20000,A_CANCELLED,COMPLETE,2012-01-17 11:25:56.714,174,A_SUBMITTED,2012-01-17 11:28:58.061432222


In [11]:
# Applying the baseline predictions to the test dataset
df_test[baseline_next_event_column] = [(dict_common_next_event[event] if (event in dict_common_next_event) else "-") for event in df_test[event_column]]

df_test[baseline_next_timestamp_column] = [(time + dict_time_per_event[event] if (event in dict_time_per_event) else time) for event,time in zip(df_test[event_column], df_test[timestamp_column])]

df_test[df_test[case_column] == 206327]

Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,Position,Baseline Prediction for Next Activity,Baseline Prediction for Next Timestamp
3,3,44968307589120,206327,2012-02-03 17:23:41,6000,A_SUBMITTED,COMPLETE,2012-02-03 17:23:41.949,1,A_PARTLYSUBMITTED,2012-02-03 17:23:42.531640462
4,4,44968307589121,206327,2012-02-03 17:23:41,6000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:23:42.504,2,W_Afhandelen leads,2012-02-03 17:24:17.606779539
5,5,44968307589122,206327,2012-02-03 17:23:41,6000,A_PREACCEPTED,COMPLETE,2012-02-03 17:24:23.379,3,W_Completeren aanvraag,2012-02-03 17:24:23.881771753
6,6,44968307589123,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,SCHEDULE,2012-02-03 17:24:24.052,4,W_Completeren aanvraag,2012-02-04 10:42:53.209379243
7,162,44968307589124,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,START,2012-02-03 20:52:43.090,5,W_Completeren aanvraag,2012-02-04 14:11:12.247379243
8,167,44968307589125,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,COMPLETE,2012-02-03 21:01:19.935,6,W_Completeren aanvraag,2012-02-04 14:19:49.092379243
9,168,44968307589126,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,START,2012-02-03 21:01:31.267,7,W_Completeren aanvraag,2012-02-04 14:20:00.424379243
10,169,44968307589127,206327,2012-02-03 17:23:41,6000,A_DECLINED,COMPLETE,2012-02-03 21:06:24.391,8,A_SUBMITTED,2012-02-03 21:06:27.833911034
11,170,44968307589128,206327,2012-02-03 17:23:41,6000,W_Completeren aanvraag,COMPLETE,2012-02-03 21:06:28.094,9,W_Completeren aanvraag,2012-02-04 14:24:57.251379243


In [12]:
# Accuracy % of event predictions:

training_event_accuracy = len(df[(df[baseline_next_event_column].shift(periods=1)==df[event_column]) & (df[case_column].shift(periods=1) == df[case_column])]) * 100 / len(df[df[case_column].shift(periods=1) == df[case_column]])

test_event_accuracy = len(df_test[(df_test[baseline_next_event_column].shift(periods=1)==df_test[event_column]) & (df_test[case_column].shift(periods=1) == df_test[case_column])]) * 100 / len(df_test[df_test[case_column].shift(periods=1) == df_test[case_column]])

training_event_accuracy, test_event_accuracy

(64.9130980638327, 64.06370976661874)

In [13]:
# Mean Absolute Error of time predictions:

training_time_MAE = abs(df[timestamp_column] - df[baseline_next_timestamp_column].shift(periods=1))[df[case_column].shift(periods=1) == df[case_column]].mean()

test_time_MAE = abs(df_test[timestamp_column] - df_test[baseline_next_timestamp_column].shift(periods=1))[df_test[case_column].shift(periods=1) == df_test[case_column]].mean()

training_time_MAE, test_time_MAE

(Timedelta('0 days 16:20:59.739888261'),
 Timedelta('0 days 14:43:13.416911984'))