In [1]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np

# to make pretty plots
import plotly
import plotly.express as px
import plotly.graph_objects as go

# to play with time :-)
import datetime
import time

In [2]:
# This notebook can be run separately from the deliverable tool.
if (('df' not in globals()) or ('df_test' not in globals())):
    ### IMPORT THE DATA ###

    # there are 3 datasets

    # df is the test set, this is used to train the model
    df = pd.read_csv('final data/BPI_Challenge_2012-training.csv')

    # df_validation is the validation set, this is used to train the parameters, in our case the learningRate (how much our model should learn from a new event)
    df_validation = pd.read_csv('final data/BPI_Challenge_2012-validation.csv')

    # df_test is the unseen test ste, this is used to test how well the model works
    df_test = pd.read_csv('final data/BPI_Challenge_2012-test.csv')


    # Defining database-specific variables
    case_column = "case concept:name"
    event_column = "event concept:name"
    timestamp_column = "event time:timestamp"
    timeformat_timestamp = "%Y-%m-%d %H:%M:%S.%f"
    lifecycle_column = 'event lifecycle:transition'

    # set the row_number column
    row_nr_column = 'row_nr'


    # do you want to see the evaluation of the error?
    errorEval = True
    # do you want any error plots? (only available if errorEval == True)
    printPlots = False
    # do you want to write the plots to html? (only available if both errorEval and printPlots == True)
    writePlots = False

else:
    # if you run this from the global tool, you won't see the evaluation anyways, so why compute it :-)
    # but if you want, you can just set this to True
    errorEval = False

In [3]:
# test if all columns are present:
def checkIfColumnsArePresent (myDataFrame,listOfColumns):
    """
    :param myDataFrame: the dataframe in which you want to check if the columns are present
    :param listOfColumns: the list of columns that you want to check if they are present
    :return: all Columns in the listOfColumns that are present in myDataFrame
    """

    allColumns = set(myDataFrame.columns)
    if not set(listOfColumns).issubset(allColumns):
        print (f"{' and '.join(set(listOfColumns).difference(allColumns))} are not available in the dataframe")
        return list(set(listOfColumns).intersection(allColumns))
    else:
        print ("All columns are available in the dataframe")
        return listOfColumns



# variables
learningRate = 0.1

# make a list of the columns you need
columns = [case_column, event_column, timestamp_column]
if lifecycle_column != None:
    columns.append(lifecycle_column)

# you can get rid of all the other columns, to make things faster
# but you first have to check if they are there:
# you only have to do this on one of the dataframes,
# since they should all be subsets of the same big dataframe
columns = checkIfColumnsArePresent (df,columns)

dataset = df[columns]
dataset_validation = df_validation[columns]
dataset_test = df_test[columns]

dataset[row_nr_column] = dataset.index
dataset_validation[row_nr_column] = dataset_validation.index
dataset_test[row_nr_column] = dataset_test.index

print(dataset.head())

All columns are available in the dataframe
   case concept:name      event concept:name     event time:timestamp  \
0             173688             A_SUBMITTED  2011-10-01 00:38:44.546   
1             173688       A_PARTLYSUBMITTED  2011-10-01 00:38:44.880   
2             173688           A_PREACCEPTED  2011-10-01 00:39:37.906   
3             173688  W_Completeren aanvraag  2011-10-01 00:39:38.875   
4             173691             A_SUBMITTED  2011-10-01 08:08:58.256   

  event lifecycle:transition  row_nr  
0                   COMPLETE       0  
1                   COMPLETE       1  
2                   COMPLETE       2  
3                   SCHEDULE       3  
4                   COMPLETE       4  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [4]:
### ON THE FLY ###
# This has two practical implications that we should be aware of
# 1) You can't preprocess the data, when a new event comes in, you have to be able to process it on the spot
# 2) You don't have any historical data, so your predictions can only rely on the things you have seen before

# This means that the data needs to be sorted on timestamp_column
# you need to first convert to a datetime, because otherwise you'll sort the strings
# and since the day of the month is the first part of the string, you will first get all firsts of all months, etc.
dataset.loc[:,timestamp_column] = pd.to_datetime(dataset.loc[:,timestamp_column], format=timeformat_timestamp)
dataset = dataset.sort_values(by=timestamp_column)

dataset_test.loc[:,timestamp_column] = pd.to_datetime(dataset_test.loc[:,timestamp_column], format=timeformat_timestamp)
dataset_test = dataset_test.sort_values(by=timestamp_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [5]:
### STORAGE ###
# we want to keep track of things that happen within one instance (also called trace or case)

# instanceStorage stores: key=instance, values= [event_column, timestamp_column, predictedDuration],
# to look up the previous event in this instance, and it's predicted TimePassed
instanceStorage = {}

# we also want to keep track of the predictionError
# errorStorage stores: key=unique integer, values= [previousEvent, previousTimestamp, predictionError, predictionErrorRatio, predictedTimePassed, timePassed]
errorStorage = {}


def updateStorage(case, event, event_base, timestamp, row_nr, predictedDuration):
    """
    :param case: string; the case_id, so which instance this event belongs to
    :param event: string; the id of the event we're evaluating
    :param event_base: string; the name of the event as is in the original database
    :param timestamp: pandas datetime; the time of the event that we're evaluating
    :param row_nr: int; unique id in the dataset
    :param predictedDuration: float; the predicted time for this event
    (which you need to store to calculate the error when you know the real time passed when the next event comes in)
    :return: previousEvent, timePassed; string, pandas datetime delta;
    the id of the previous event and the time that passed since that event, you need this information to update the model
    """

    # if this instance is already in the instanceStorage, we have a previous event
    if case in instanceStorage:

        # find the previous event
        previousEvent = instanceStorage[case][0]
        previousRow_nr = instanceStorage[case][3]

        # calculate the time passed between the previous event and this event
        thisTimestamp = pd.to_datetime(timestamp, format=timeformat_timestamp)
        previousTimestamp = pd.to_datetime(instanceStorage[case][1], format=timeformat_timestamp)
        timePassed = (thisTimestamp - previousTimestamp)/ datetime.timedelta(minutes=1)

        # since we have also saved the predicted time between the previous event and this event,
        # we can now also compute the difference

        predictedTimePassed = instanceStorage[case][2]
        if predictedTimePassed != None:
            predictionError = abs(predictedTimePassed - timePassed)
            if timePassed > 0:
                predictionErrorRatio = predictionError/timePassed
            else:
                predictionErrorRatio = None
        else:
            predictionError = None
            predictionErrorRatio = None
            predictedTimePassed = None


        # the key is now the row number of the event that is in the values
        errorStorage[previousRow_nr] = [previousEvent, previousTimestamp, predictionError, predictionErrorRatio, predictedTimePassed, timePassed]

    # otherwise we don't have a previous event
    else:
        previousEvent = None
        timePassed = None

    # key = case_column (so which case we're looking at), and the values are the activity and the timestamp
    instanceStorage[case] = [event, timestamp, predictedDuration, row_nr, event_base]


    return previousEvent, timePassed



In [6]:
### THE MODEL ###
# We start with only one feature: the historical average time between two events based on the previous event
# You don't need a second model for the test set, since you train your model on the training-set and then use it on the test-set

predictionModel = {}

def getPrediction(event):
    """
    :param event: string; the name of the activity for which you want a prediction of time until the next event
    :return: float (or None); predicted time until the next event in this instance, if there is already data on this activity,
    otherwise None
    """

    # if this activity is already in the predictionModel, then return the predicted duration
    # try & except works slightly faster then if key in dict
    # but only for valid keys, since you skip the if statement
    try:
        return predictionModel[event]

    except:
        return None



def updatePredictionModel(this_event_id, timePassed, learningRate):
    """
    :param this_event_id: string; the slicing-id of the current event
    :param timePassed: datatime delta; the time between the previous event in this instance and the current event
    :param learningRate: float; how much weight should the new timePassed have in the update
    :return: None
    It will update the global variable predictionModel
    """

    if timePassed != None:

        # if this activity is already in the predictionModel, update the model
        # try & except works slightly faster then if key in dict
        # but only for valid keys, since you skip the if statement
        try:
            predictionModel[this_event_id] = timePassed*learningRate + predictionModel[this_event_id]*(1-learningRate)

        except:
            predictionModel[this_event_id] = timePassed


In [7]:
### RUNNING THE MODEL ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly
# This is the function that runs the model and outputs the predictedTimeStamp for the next event

def processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True, returnPrediction=False):
    """
    :param case: string; the case_id, so which instance this event belongs to
    :param event: string; the id of the event we're evaluating
    :param additional: string or number; something additional that you want to slice on
    :param timestamp: pandas datetime; the time of the event that we're evaluating
    :param row_nr: int; unique id in the dataset
    :param includeAdditional: bool, if you want to include additional information to slice on or you want the baseline model
    :return: None
    It updates the global variables predictionModel, instanceStorage and errorStorage
    """

    # step 1: preProcess the incoming event -> create an event_id that indicates the dataSlice that you want to use
    # The event-id is where you want to slice the data on before you take the average timePassed
    # If you don't want to include any additional information, this is just the event itself
    # otherwise include it in the event_id to make sure that you slice on everything you want to slice on
    if includeAdditional == True:

        howManyConcurentEvents = [x[4] for x in instanceStorage.values()].count(event)

        #howManyConcurentEvents = [x[0].split('_XX_')[0] for x in instanceStorage.values()].count(event)
        if howManyConcurentEvents > 3:
            # when an event is the last event in an instance, it won't get out of the instance storage
            # so this number can get really high for events that are common to be the last one
            # even though they are not actually running at the same time. Therefore I will top it off at 3.
            howManyConcurentEvents = 3
        event_id = event+'_XX_'+additional+'_XX_'+str(howManyConcurentEvents)

    else:
        event_id = event


    # step 2: predict
    # look up if this event is already in predictionModel
    # if it is, use the predictedDuration as current prediction
    # otherwise predict: unknown
    predictedDuration = getPrediction(event_id)

    # step 3: loop up historical data
    # you only know things of the past, so you want to check if there is a previous event in the instanceStorage
    # if there is, you can calculate the duration between that event and this one
    previousEvent, timePassed = updateStorage(case, event_id , event, timestamp, row_nr, predictedDuration)

    # step 4: update predictionModel
    updatePredictionModel(previousEvent, timePassed, learningRate)

    # step 5: return the prediction of the next timestamp
    if returnPrediction == True:
        try:
            # we can't predict the next timestamp if we don't have enough historical data yet to make prediction of the timeDelta
            # so if the timeDelta is still None, we will have to return None
            return timestamp+datetime.timedelta(minutes=predictedDuration)
        except:
            return None



In [8]:
### TRAIN THE MODEL ON THE TRAIN DATASET ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly
# This actually runs the model

### Run over all lines =>
# Start the stopwatch / counter

start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
df["OnTheFly Prediction for Next Timestamp"] = [processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True, returnPrediction=True) for case,event,additional,timestamp,row_nr in zip(dataset[case_column],dataset[event_column], dataset[additionalInfo], dataset[timestamp_column], dataset[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the training the model in seconds:", end_time-start_time)

dict: Elapsed time during the training the model in seconds: 73.515625


In [9]:
if errorEval == True:

    ### THE PREDICTION ERROR ###
    # plot different things to get more insight
    # first the difference in time passed
    # then the log of the difference in time passed
    # then the ratio between the difference in time passed and the real time passed

    # flatten the error dictionary to a dataframe
    error_df = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
    error_df.loc[:,'timestamp'] = pd.to_datetime(error_df.loc[:,'timestamp'])
    error_df = error_df.sort_values(by='timestamp')
    error_df['event_column_basic'] = error_df.event_column.str.split('_XX_').str.get(0)

    # calculate the log of the prediction error
    error_df['log_predictionError'] = [np.log2(x) for x in error_df['predictionError']]

    # calculate statistics
    mae = np.mean(error_df['predictionError'])
    print('The mean absolute error on the training-set is: ' + str(round(mae, 2)) + ' minutes.')


The mean absolute error on the training-set is: 649.94 minutes.


  app.launch_new_instance()


In [10]:
if errorEval == True:


    if printPlots == True:

        ### HISTOGRAMS ###

        # make the prediction error plot
        fig= px.line(error_df, x='timestamp', y='predictionError', color='event_column')
        fig.update_layout(title='Prediction error (difference in timePassed between real timePassed an predicted timePassed) per activity')
        fig.show()
        if writePlots == True:
            fig.write_html("images/predictionError_withboth.html")

        # make the log prediction error plot
        fig= px.line(error_df, x='timestamp', y='log_predictionError', color='event_column')
        fig.update_layout(title='Prediction error ratio (log of the difference in timePassed) per activity')
        fig.show()
        if writePlots == True:
            fig.write_html("images/log_predictionError_withboth.html")

        # make the ratio prediction error plot
        fig= px.line(error_df, x='timestamp', y='predictionErrorRatio', color='event_column')
        fig.update_layout(title='Prediction error ratio (difference in timePassed divided by real timePassed) per activity')
        fig.show()
        if writePlots==True:
            fig.write_html("images/predictionErrorRatio_withboth.html")


        ### BOXPLOTS ###
        fig = px.box(error_df, y='predictionError', color='event_column_basic')
        fig.show()
        if writePlots == True:
            fig.write_html('images/box_predictionError_withboth.html')

        fig = px.box(error_df, y='predictionErrorRatio', color='event_column_basic')
        fig.show()
        if writePlots == True:
            fig.write_html('images/box_predictionErrorRatio_withboth.html')

In [11]:
if errorEval == True:

    if printPlots == True:

        eventList = np.unique(error_df['event_column_basic'])

        for activity in eventList:
            fig=px.box(error_df[error_df['event_column_basic'] == activity], y='predictionError', color='event_column')
            fig.show()
            if writePlots == True:
                fig.write_html('images/box_predictionError_splitForConcurentEvents.html')

    # there is a difference in error distribution, so it looks like it does have an effect.
    # I don't save the timePassed anywhere, so I can't check on that. But in the InfluenceOnTime file I have tested that there is indeed a
    # different distribution

In [12]:
### SET THE PARAMETERS ON THE VALIDATION SET ###


learningRateList = [0.01, 0.1, 0.5]
maeList = []

# Start the stopwatch / counter
start_time = time.process_time()

for learningRate in learningRateList:
    # You can use the model that you have created during the train, so you don't have to start with empty model
    # but you do have to start with an empty storage
    instanceStorage = {}
    errorStorage = {}

    # running the on-the-fly model for each row
    # I have tested it, ... for ... in ... does iterate over all events in order
    # so we're not using information from the future.
    df_validation["OnTheFly Prediction for Next Timestamp"] = [processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True, returnPrediction=True) for case,event,additional,timestamp,row_nr in zip(dataset_validation[case_column],dataset_validation[event_column], dataset_validation[additionalInfo], dataset_validation[timestamp_column], dataset_validation[row_nr_column])]

    error_df_validation = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
    maeList.append(np.mean(error_df_validation['predictionError']))

# let the learningRate to the learningRate with the lowest mae
learningRate = learningRateList[np.argmin(maeList)]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the setting of the parameters on the validation set took ", end_time-start_time , "seconds.")
print("The new learning rate is ", learningRate)


dict: Elapsed time during the setting of the parameters on the validation set took  19.9375 seconds.
The new learning rate is  0.01


In [13]:
### RUN ON THE TEST SET ###
# You can use the model that you have created during the train, so you don't have to start with empty model
# but you do have to start with an empty storage

instanceStorage = {}
errorStorage = {}

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
df_test["OnTheFly Prediction for Next Timestamp"] = [processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True, returnPrediction=True) for case,event,additional,timestamp,row_nr in zip(dataset_test[case_column],dataset_test[event_column], dataset_test[lifecycle_column], dataset_test[timestamp_column], dataset_test[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program on the testset in seconds:", end_time-start_time)



dict: Elapsed time during the whole program on the testset in seconds: 5.140625


In [14]:
if errorEval == True:

    ### THE PREDICTION ERROR ###
    # plot different things to get more insight
    # first the difference in time passed
    # then the log of the difference in time passed
    # then the ratio between the difference in time passed and the real time passed

    # flatten the error dictionary to a dataframe
    error_df_test = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
    error_df_test.loc[:,'timestamp'] = pd.to_datetime(error_df_test.loc[:,'timestamp'])
    error_df_test = error_df_test.sort_values(by='timestamp')
    error_df_test['event_column_basic'] = error_df_test.event_column.str.split('_XX_').str.get(0)

    # calculate the log of the prediction error
    error_df_test['log_predictionError'] = [np.log2(x) for x in error_df_test['predictionError']]

    # calculate statistics
    mae = np.mean(error_df_test['predictionError'])
    print('The mean absolute error on the test-set is: ' + str(round(mae, 2)) + ' minutes.')


The mean absolute error on the test-set is: 573.57 minutes.


In [15]:
if errorEval == True:
    ### BOXPLOTS ###

    if printPlots == True:
        fig = px.box(error_df_test, y='predictionError', color='event_column_basic', title='PredictionError on the test-dataset')
        fig.show()
        if writePlots == True:
            fig.write_html('images/TEST_box_predictionError_withboth.html')


        fig = px.box(error_df_test, y='predictionErrorRatio', color='event_column_basic', title='PredictionErrorRatio on the test-dataset')
        fig.show()
        if writePlots == True:
            fig.write_html('images/TEST_box_predictionErrorRatio_withboth.html')