In [1]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np

# to make pretty plots
import plotly
import plotly.express as px
import plotly.graph_objects as go

# to play with time :-)
import datetime
import time

In [2]:
# This notebook can be run separately from the deliverable tool.
if (('df' not in globals()) or ('df_test' not in globals())):
    ### IMPORT THE DATA ###
    df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
    df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

    # Defining database-specific variables
    case_column = "case concept:name"
    event_column = "event concept:name"
    timestamp_column = "event time:timestamp"
    timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"

In [3]:
row_nr_column = 'row_nr'
additionalInfo = ['event lifecycle:transition']

# do you want any error plots?
printPlots = False
# do you want to write the plots to html?
writePlots = False


# events that are influenced by concurency
eventList = ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_ACCEPTED']

# variables
weight_newtimePassed = 0.1

# make a list of the columns you need
columns = [case_column, event_column, timestamp_column]
columns.extend(additionalInfo)

# you can get rid of all the other columns, to make things faster
dataset = df[columns]
dataset_test = df_test[columns]
dataset[row_nr_column] = dataset.index
dataset_test[row_nr_column] = dataset_test.index

print(dataset.head())

   case concept:name      event concept:name     event time:timestamp  \
0             173688             A_SUBMITTED  01-10-2011 00:38:44.546   
1             173688       A_PARTLYSUBMITTED  01-10-2011 00:38:44.880   
2             173688           A_PREACCEPTED  01-10-2011 00:39:37.906   
3             173688  W_Completeren aanvraag  01-10-2011 00:39:38.875   
4             173691             A_SUBMITTED  01-10-2011 08:08:58.256   

  event lifecycle:transition  row_nr  
0                   COMPLETE       0  
1                   COMPLETE       1  
2                   COMPLETE       2  
3                   SCHEDULE       3  
4                   COMPLETE       4  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
### ON THE FLY ###
# This has two practical implications that we should be aware of
# 1) You can't preprocess the data, when a new event comes in, you have to be able to process it on the spot
# 2) You don't have any historical data, so your predictions can only rely on the things you have seen before

# This means that the data needs to be sorted on timestamp_column
# you need to first convert to a datetime, because otherwise you'll sort the strings
# and since the day of the month is the first part of the string, you will first get all firsts of all months, etc.
dataset.loc[:,timestamp_column] = pd.to_datetime(dataset.loc[:,timestamp_column], format=timeformat_timestamp)
dataset = dataset.sort_values(by=timestamp_column)

dataset_test.loc[:,timestamp_column] = pd.to_datetime(dataset_test.loc[:,timestamp_column], format=timeformat_timestamp)
dataset_test = dataset_test.sort_values(by=timestamp_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [5]:
### STORAGE ###
# we want to keep track of things that happen within one instance (also called trace or case)

# instanceStorage stores: key=instance, values= [event_column, timestamp_column, predictedDuration],
# to look up the previous event in this instance, and it's predicted TimePassed
instanceStorage = {}

# we also want to keep track of the predictionError
# errorStorage stores: key=unique integer, values= [previousEvent, previousTimestamp, predictionError, predictionErrorRatio, predictedTimePassed, timePassed]
errorStorage = {}


# we also want to add a log feature => we don't have to save this seperatly, just needed to check if having it makes sense.
# how many events of the same activty are running at the same time might be interesting
# concurentEvents = {}

def updateStorage(case, event, timestamp, row_nr, predictedDuration):
    """
    :param case: string; the case_id, so which instance this event belongs to
    :param event: string; the id of the event we're evaluating
    :param timestamp: pandas datetime; the time of the event that we're evaluating
    :param row_nr: int; unique id in the dataset
    :param predictedDuration: float; the predicted time for this event
    (which you need to store to calculate the error when you know the real time passed when the next event comes in)
    :return: previousEvent, timePassed; string, pandas datetime delta;
    the id of the previous event and the time that passed since that event, you need this information to update the model
    """

    # if this instance is already in the instanceStorage, we have a previous event
    if case in instanceStorage:

        # find the previous event
        previousEvent = instanceStorage[case][0]
        previousRow_nr = instanceStorage[case][3]

        # calculate the time passed between the previous event and this event
        thisTimestamp = pd.to_datetime(timestamp, format=timeformat_timestamp)
        previousTimestamp = pd.to_datetime(instanceStorage[case][1], format=timeformat_timestamp)
        timePassed = (thisTimestamp - previousTimestamp)/ datetime.timedelta(minutes=1)

        # since we have also saved the predicted time between the previous event and this event,
        # we can now also compute the difference

        predictedTimePassed = instanceStorage[case][2]
        if predictedTimePassed != None:
            predictionError = abs(predictedTimePassed - timePassed)
            if timePassed > 0:
                predictionErrorRatio = predictionError/timePassed
            else:
                predictionErrorRatio = None
        else:
            predictionError = None
            predictionErrorRatio = None
            predictedTimePassed = None


        # the key is now the row number of the event that is in the values
        errorStorage[previousRow_nr] = [previousEvent, previousTimestamp, predictionError, predictionErrorRatio, predictedTimePassed, timePassed]

    # otherwise we don't have a previous event
    else:
        previousEvent = None
        timePassed = None

    # key = case_column (so which case we're looking at), and the values are the activity and the timestamp
    instanceStorage[case] = [event, timestamp, predictedDuration, row_nr]


    # update concurentEvents -> I only used this once to be able to make pretty plots showing if it would have effect or not
    # howManyConcurentEvents = [x[0] for x in instanceStorage.values()].count(previousEvent)
    # concurentEvents[row_nr] = [previousEvent, howManyConcurentEvents, timePassed, row_nr]

    return previousEvent, timePassed



In [6]:
### THE MODEL ###
# We start with only one feature: the historical average time between two events based on the previous event
# You don't need a second model for the test set, since you train your model on the training-set and then use it on the test-set

predictionModel = {}

def getPrediction(event):
    """
    :param event: string; the name of the activity for which you want a prediction of time until the next event
    :return: float (or None); predicted time until the next event in this instance, if there is already data on this activity,
    otherwise None
    """

    # if this activity is already in the predictionModel, then return the predicted duration
    # try & except works slightly faster then if key in dict
    # but only for valid keys, since you skip the if statement
    try:
        return predictionModel[event]

    except:
        return None



def updatePredictionModel(this_event_id, timePassed, weight_newtimePassed):
    """
    :param this_event_id: string; the slicing-id of the current event
    :param timePassed: datatime delta; the time between the previous event in this instance and the current event
    :param weight_newtimePassed: float; how much weight should the new timePassed have in the update
    :return: None
    It will update the global variable predictionModel
    """

    if timePassed != None:

        # if this activity is already in the predictionModel, update the model
        # try & except works slightly faster then if key in dict
        # but only for valid keys, since you skip the if statement
        try:
            predictionModel[this_event_id] = timePassed*weight_newtimePassed + predictionModel[this_event_id]*(1-weight_newtimePassed)

        except:
            predictionModel[this_event_id] = timePassed


In [7]:
### RUNNING THE MODEL ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly

def processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True):
    """
    :param case: string; the case_id, so which instance this event belongs to
    :param event: string; the id of the event we're evaluating
    :param additional: string or number; something additional that you want to slice on
    :param timestamp: pandas datetime; the time of the event that we're evaluating
    :param row_nr: int; unique id in the dataset
    :param includeAdditional: bool, if you want to include additional information to slice on or you want the baseline model
    :return: None
    It updates the global variables predictionModel, instanceStorage and errorStorage
    """

    # step 1: preProcess the incoming event -> create an event_id that indicates the dataSlice that you want to use
    # The event-id is where you want to slice the data on before you take the average timePassed
    # If you don't want to include any additional information, this is just the event itself
    # otherwise include it in the event_id to make sure that you slice on everything you want to slice on
    if includeAdditional == True:

        if event in eventList:
            # you want to find how many times this activity occurs without any additional information
            howManyConcurentEvents = [x[0].split('_XX_')[0] for x in instanceStorage.values()].count(event)
            event_id = event+'_XX_'+additional+'_XX_'+str(howManyConcurentEvents)
        else:
            event_id = event+'_XX_'+additional
    else:
        event_id = event


    # step 2: predict
    # look up if this event is already in predictionModel
    # if it is, use the predictedDuration as current prediction
    # otherwise predict: unknown
    predictedDuration = getPrediction(event_id)

    # step 3: loop up historical data
    # you only know things of the past, so you want to check if there is a previous event in the instanceStorage
    # if there is, you can calculate the duration between that event and this one
    previousEvent, timePassed = updateStorage(case, event_id , timestamp, row_nr, predictedDuration)

    # step 4: update predictionModel
    updatePredictionModel(previousEvent, timePassed, weight_newtimePassed)



### Run over all lines =>
# let's first do a simple for-loop, maybe I can improve the running time by using .apply later on
#mini_dataset = dataset.loc[0:1000, :]
#print(mini_dataset)
# Start the stopwatch / counter

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
[processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True) for case,event,additional,timestamp,row_nr in zip(dataset[case_column],dataset[event_column], dataset[additionalInfo].iloc[:,0], dataset[timestamp_column], dataset[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program in seconds:", end_time-start_time)


dict: Elapsed time during the whole program in seconds: 116.8125


concurent_df = pd.DataFrame.from_dict(concurentEvents, orient='index', columns=['event_id', 'concurent_events', 'timePassed','row_nr'])
print(concurent_df.head())


In [8]:
### THE PREDICTION ERROR ###
# plot different things to get more insight
# first the difference in time passed
# then the log of the difference in time passed
# then the ratio between the difference in time passed and the real time passed

# flatten the error dictionary to a dataframe
error_df = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
error_df.loc[:,'timestamp'] = pd.to_datetime(error_df.loc[:,'timestamp'])
error_df = error_df.sort_values(by='timestamp')
error_df['event_column_basic'] = error_df.event_column.str.split('_XX_').str.get(0)

# calculate the log of the prediction error
error_df['log_predictionError'] = [np.log2(x) for x in error_df['predictionError']]

# calculate statistics
mae = np.mean(error_df['predictionError'])
print('The mean absolute error on the training-set is: ' + str(round(mae, 2)) + ' minutes.')


  


The mean absolute error on the training-set is: 725.8 minutes.


In [9]:
### HISTOGRAMS ###

if printPlots == True:
    # make the prediction error plot
    fig= px.line(error_df, x='timestamp', y='predictionError', color='event_column')
    fig.update_layout(title='Prediction error (difference in timePassed between real timePassed an predicted timePassed) per activity')
    fig.show()
    if writePlots == True:
        fig.write_html("images/predictionError_withboth.html")

    # make the log prediction error plot
    fig= px.line(error_df, x='timestamp', y='log_predictionError', color='event_column')
    fig.update_layout(title='Prediction error ratio (log of the difference in timePassed) per activity')
    fig.show()
    if writePlots == True:
        fig.write_html("images/log_predictionError_withboth.html")

    # make the ratio prediction error plot
    fig= px.line(error_df, x='timestamp', y='predictionErrorRatio', color='event_column')
    fig.update_layout(title='Prediction error ratio (difference in timePassed divided by real timePassed) per activity')
    fig.show()
    if writePlots==True:
        fig.write_html("images/predictionErrorRatio_withboth.html")

In [10]:
### BOXPLOTS ###

if printPlots == True:
    fig = px.box(error_df, y='predictionError', color='event_column_basic')
    fig.show()
    if writePlots == True:
        fig.write_html('images/box_predictionError_withboth.html')


    fig = px.box(error_df, y='predictionErrorRatio', color='event_column_basic')
    fig.show()
    if writePlots == True:
        fig.write_html('images/box_predictionErrorRatio_withboth.html')


In [11]:
if printPlots == True:
    for activity in eventList:
        fig=px.box(error_df[error_df['event_column_basic'] == activity], y='predictionError', color='event_column')
        fig.show()
        if writePlots == True:
            fig.write_html('images/box_predictionError_splitForConcurentEvents.html')

# there is a difference in error distribution, so it looks like it does have an effect.
# I don't save the timePassed anywhere, so I can't check on that. But in the InfluenceOnTime file I have tested that there is indeed a
# different distribution

In [12]:
### RUN ON THE TEST SET ###
# You can use the model that you have created during the train, so you don't have to start with empty model
# but you do have to start with an empty storage

instanceStorage = {}
errorStorage = {}

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
[processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True) for case,event,additional,timestamp,row_nr in zip(dataset_test[case_column],dataset_test[event_column], dataset_test[additionalInfo].iloc[:,0], dataset_test[timestamp_column], dataset_test[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program on the testset in seconds:", end_time-start_time)



dict: Elapsed time during the whole program on the testset in seconds: 6.84375


In [13]:
### THE PREDICTION ERROR ###
# plot different things to get more insight
# first the difference in time passed
# then the log of the difference in time passed
# then the ratio between the difference in time passed and the real time passed

# flatten the error dictionary to a dataframe
error_df_test = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
error_df_test.loc[:,'timestamp'] = pd.to_datetime(error_df_test.loc[:,'timestamp'])
error_df_test = error_df_test.sort_values(by='timestamp')
error_df_test['event_column_basic'] = error_df_test.event_column.str.split('_XX_').str.get(0)

# calculate the log of the prediction error
error_df_test['log_predictionError'] = [np.log2(x) for x in error_df_test['predictionError']]

# calculate statistics
mae = np.mean(error_df_test['predictionError'])
print('The mean absolute error on the test-set is: ' + str(round(mae, 2)) + ' minutes.')


The mean absolute error on the test-set is: 619.86 minutes.


In [22]:
### MERGE ORIGINAL DATAFRAME WITH THE ERROR DF ###

total_df = df.join(error_df)

# check if this works properly
# but this should be a left join on the dataframe, and the index should match

              eventID   case concept:name                  case REG_DATE  \
214375  35858681954366             199678  2012-01-10T19:16:52.800+01:00   
214376  35858681954367             199678  2012-01-10T19:16:52.800+01:00   

        case AMOUNT_REQ   event concept:name event lifecycle:transition  \
214375            30000  W_Nabellen offertes                      START   
214376            30000  W_Nabellen offertes                   COMPLETE   

           event time:timestamp  
214375  14-03-2012 15:36:15.299  
214376  14-03-2012 15:40:34.231  
                        event_column               timestamp  predictionError  \
214375  W_Nabellen offertes_XX_START 2012-03-14 15:36:15.299         1.428251   

        predictionErrorRatio  predictedTimePassed  timePassed  \
214375              0.330956             2.887283    4.315533   

         event_column_basic  log_predictionError  
214375  W_Nabellen offertes             0.514249  


In [14]:
### BOXPLOTS ###

if printPlots == True:
    fig = px.box(error_df_test, y='predictionError', color='event_column_basic', title='PredictionError on the test-dataset')
    fig.show()
    if writePlots == True:
        fig.write_html('images/TEST_box_predictionError_withboth.html')


    fig = px.box(error_df_test, y='predictionErrorRatio', color='event_column_basic', title='PredictionErrorRatio on the test-dataset')
    fig.show()
    if writePlots == True:
        fig.write_html('images/TEST_box_predictionErrorRatio_withboth.html')

In [15]:
print(error_df_test.head(5))

### MERGE ORIGINAL DATAFRAME WITH THE ERROR DF ###

total_df_test = df_test.join(error_df_test)

# check if this works properly
# but this should be a left join on the dataframe, and the index should match

                         event_column               timestamp  \
0        A_SUBMITTED_XX_COMPLETE_XX_0 2012-02-03 17:17:11.047   
1  A_PARTLYSUBMITTED_XX_COMPLETE_XX_0 2012-02-03 17:17:11.323   
3        A_SUBMITTED_XX_COMPLETE_XX_0 2012-02-03 17:23:41.949   
4  A_PARTLYSUBMITTED_XX_COMPLETE_XX_0 2012-02-03 17:23:42.504   
5      A_PREACCEPTED_XX_COMPLETE_XX_0 2012-02-03 17:24:23.379   

   predictionError  predictionErrorRatio  predictedTimePassed  timePassed  \
0         0.004439              0.964925             0.009039    0.004600   
1         0.008243              0.015630             0.535593    0.527350   
3         0.000655              0.070834             0.008595    0.009250   
4         0.146482              0.215019             0.534768    0.681250   
5         0.002926              0.260872             0.008291    0.011217   

  event_column_basic  log_predictionError  
0        A_SUBMITTED            -7.815662  
1  A_PARTLYSUBMITTED            -6.922686  
3        A_SUB

# to make an accurate prediction, it is important to know how much variance there is when you only take the previous event into account.
# because if there is still a lot of variance, you can never make an accurate prediction.

activities = pd.unique(error_df['event_column_basic'])


for activity in activities:

    print(activity)
    subset = error_df[error_df['event_column_basic'] == activity]['predictionError']
    print(np.mean(subset))
    print(np.std(subset))
    print(' ')

concurent_df[event_column] = concurent_df.event_id.str.split('_XX_').str.get(0)
concurent_df2 = concurent_df.dropna()

for activity in pd.unique(concurent_df2[event_column]):

    print(activity)
    fig= px.box(concurent_df2[concurent_df2[event_column] == activity], y='timePassed', color='concurent_events')
    fig.show()
    fig.write_html(activity+'_concurentEvents.html')

concurent_df2.to_csv('data/concurent_df.csv')