In [None]:
# imports
import pandas as pd
from datetime import datetime
import re

In [None]:
# Standalone mode: when the deliverable tool has not already placed both frames in the
# namespace, read them from disk and define the configuration names used further down.
if not ('df' in globals() and 'df_test' in globals()):
    df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
    df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

    # Dataset-specific column names
    case_column, event_column = "case concept:name", "event concept:name"
    registration_time_column, timestamp_column = "case REG_DATE", "event time:timestamp"

    # strptime-style formats used when the two time columns are parsed
    timeformat_registration = "%Y-%m-%dT%H:%M:%S"
    timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"

    # Columns that this notebook adds to the frames
    position_column = "Position"
    baseline_next_event_column = "Baseline Prediction for Next Activity"
    baseline_next_timestamp_column = "Baseline Prediction for Next Timestamp"

df.head(10)

In [None]:
# Basic data preprocessing of the timestamps (training set)
# Cut the registration date at the first '.' or '+' (presumably fractional seconds and the
# UTC offset) so that what is left matches `timeformat_registration`. The pattern is a raw
# string because '\.' and '\+' are invalid escape sequences in a plain string literal.
df[registration_time_column] = df[registration_time_column].str.replace(r'\..*|\+.*', '', regex=True, flags=re.DOTALL)
# Parse each column with one vectorised call instead of a Python-level strptime per row.
df[registration_time_column] = pd.to_datetime(df[registration_time_column], format=timeformat_registration)
df[timestamp_column] = pd.to_datetime(df[timestamp_column], format=timeformat_timestamp)
# Sort by case and then by event time; reset_index() keeps the previous row labels in an
# "index" column, exactly as before.
df = df.sort_values(by=[case_column, timestamp_column]).reset_index()
df.info()

In [None]:
# Basic data preprocessing of the timestamps (test set)
# Cut the registration date at the first '.' or '+' (presumably fractional seconds and the
# UTC offset) so that what is left matches `timeformat_registration`. The pattern is a raw
# string because '\.' and '\+' are invalid escape sequences in a plain string literal.
df_test[registration_time_column] = df_test[registration_time_column].str.replace(r'\..*|\+.*', '', regex=True, flags=re.DOTALL)
# Parse each column with one vectorised call instead of a Python-level strptime per row.
df_test[registration_time_column] = pd.to_datetime(df_test[registration_time_column], format=timeformat_registration)
df_test[timestamp_column] = pd.to_datetime(df_test[timestamp_column], format=timeformat_timestamp)
# Sort by case and then by event time; reset_index() keeps the previous row labels in an
# "index" column, exactly as before.
df_test = df_test.sort_values(by=[case_column, timestamp_column]).reset_index()
df_test.info()

In [None]:
# Number the events inside every trace: cumcount() is zero-based within each case, so adding
# one makes the first event of a trace position 1.
df[position_column] = df.groupby(case_column).cumcount().add(1)
df_test[position_column] = df_test.groupby(case_column).cumcount().add(1)

In [None]:
# For row n, take the timestamp of row n+1 minus the timestamp of row n, so the time until
# the following event sits on the row of the event it follows.
# Rows whose following row belongs to a different case (and the very last row) are dropped,
# so only within-trace gaps take part in the later mean() calculation.
shifted_deltatimes = (
    df[timestamp_column].shift(periods=-1) - df[timestamp_column]
)[df[case_column] == df[case_column].shift(periods=-1)]

def replacenat(timedelta):
    """Return `timedelta` unchanged, or a zero-length pd.Timedelta when it is missing.

    mean() yields NaT for an empty input; this turns such a missing value (anything for
    which pd.isna() is true) into pd.Timedelta(0).
    """
    return pd.Timedelta(0) if pd.isna(timedelta) else timedelta

# Every distinct event name that occurs in the training data, in order of first appearance
unique_events = df.loc[:, event_column].unique()

In [None]:
# dictionary to store the most common (mode) event following the key event
# The following event is looked up inside each case (groupby + shift), so a transition never
# pairs the last event of one case with the first event of the next case, and the final
# transition of every case is counted. The last event of a case has no successor (NaN) and is
# dropped before counting. An event that never has a successor gets no entry here, so the
# lookups that apply this dictionary fall back to "-" for it.
# On ties, mode() returns the candidates sorted and the first one is taken.
dict_common_next_event = (
    df.assign(_next_event=df.groupby(case_column)[event_column].shift(periods=-1))
    .dropna(subset=["_next_event"])
    .groupby(event_column)["_next_event"]
    .agg(lambda followers: followers.mode()[0])
    .to_dict()
)

In [None]:
# Baseline event prediction: look every event up in the dictionary of most common followers;
# events without an entry get "-".
df[baseline_next_event_column] = [dict_common_next_event.get(current_event, "-") for current_event in df[event_column]]

# Inspect one example trace
df.loc[df[case_column] == 185548]

In [None]:
# Average time until the following event, per event name. An event with no recorded
# within-trace gap has a NaT mean, which replacenat turns into a zero Timedelta.
dict_time_per_event = {
    event: replacenat(shifted_deltatimes.loc[df[event_column].eq(event)].mean())
    for event in unique_events
}

In [None]:
# Baseline time prediction: the event's own timestamp plus the average gap recorded for that
# event; an event without a recorded average keeps its timestamp unchanged.
df[baseline_next_timestamp_column] = [
    (stamp if current_event not in dict_time_per_event else stamp + dict_time_per_event[current_event])
    for current_event, stamp in zip(df[event_column], df[timestamp_column])
]

# Inspect one example trace
df.loc[df[case_column] == 185548]

In [None]:
# Apply the dictionaries learned on the training data to the test set.
# Next event: dictionary lookup, "-" for events without an entry.
df_test[baseline_next_event_column] = [dict_common_next_event.get(current_event, "-") for current_event in df_test[event_column]]

# Next timestamp: own timestamp plus the average gap for the event, unchanged when no average exists.
df_test[baseline_next_timestamp_column] = [
    (stamp if current_event not in dict_time_per_event else stamp + dict_time_per_event[current_event])
    for current_event, stamp in zip(df_test[event_column], df_test[timestamp_column])
]

# Inspect one example trace
df_test.loc[df_test[case_column] == 206327]

In [None]:
# Accuracy % of event predictions:

def _baseline_event_accuracy(frame):
    """Percentage of within-case rows whose event equals the previous row's baseline prediction.

    Only rows that share their case with the preceding row are counted, so the first event
    of every case is left out of both the numerator and the denominator.
    """
    follows_same_case = frame[case_column].shift(periods=1) == frame[case_column]
    predicted_correctly = frame[baseline_next_event_column].shift(periods=1) == frame[event_column]
    return len(frame[predicted_correctly & follows_same_case]) * 100 / len(frame[follows_same_case])


training_event_accuracy = _baseline_event_accuracy(df)

test_event_accuracy = _baseline_event_accuracy(df_test)

training_event_accuracy, test_event_accuracy

In [None]:
# Mean Absolute Error of time predictions:

def _baseline_time_mae(frame):
    """Mean absolute gap between each timestamp and the previous row's predicted timestamp.

    Only rows that share their case with the preceding row take part in the mean.
    """
    follows_same_case = frame[case_column].shift(periods=1) == frame[case_column]
    prediction_error = frame[timestamp_column] - frame[baseline_next_timestamp_column].shift(periods=1)
    return prediction_error.abs()[follows_same_case].mean()


training_time_MAE = _baseline_time_mae(df)

test_time_MAE = _baseline_time_mae(df_test)

training_time_MAE, test_time_MAE