In [1]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np

# to make pretty plots
import plotly
import plotly.express as px
import plotly.graph_objects as go

# to play with time :-)
import datetime
import time

In [68]:
# This notebook can be run separately from the deliverable tool.
#if (('df' not in globals()) or ('df_test' not in globals())):
    ### IMPORT THE DATA ###
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

# Defining database-specific variables
case_column = "case concept:name"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
lifecycle_column = 'event lifecycle:transition'
amount_column = 'case AMOUNT_REQ'
df[amount_column] = [round(x,-4) for x in df[amount_column]]
df_test[amount_column] = [round(x,-4) for x in df_test[amount_column]]
#df[event_column] = [x if x!='A_SUBMITTED' else 'End_case/Begin_case' for x in df[event_column]]

In [114]:
480/sum(df[df[amount_column]>20000].sort_values([case_column,timestamp_column])[event_column].value_counts())

0.014631469853075656

In [117]:
df[df[amount_column]>20000].sort_values([case_column,timestamp_column])[event_column].value_counts()

W_Nabellen offertes               7096
W_Completeren aanvraag            6627
W_Nabellen incomplete dossiers    3810
W_Valideren aanvraag              3009
W_Afhandelen leads                1731
A_SUBMITTED                       1273
A_PARTLYSUBMITTED                 1273
A_PREACCEPTED                      904
O_SELECTED                         897
O_CREATED                          897
O_SENT                             897
A_DECLINED                         651
A_ACCEPTED                         611
A_FINALIZED                        596
O_CANCELLED                        514
O_SENT_BACK                        441
A_CANCELLED                        359
A_APPROVED                         257
A_ACTIVATED                        257
A_REGISTERED                       257
O_ACCEPTED                         257
O_DECLINED                         120
W_Beoordelen fraude                 71
W_Wijzigen contractgegevens          1
Name: event concept:name, dtype: int64

In [70]:
row_nr_column = 'row_nr'
additionalInfo = ['event lifecycle:transition']

# do you want any error plots?
printPlots = False
# do you want to write the plots to html?
writePlots = False


# events that are influenced by concurency
eventList = ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_ACCEPTED']

# variables
weight_newtimePassed = 0.1

# make a list of the columns you need
columns = [case_column, event_column, lifecycle_column, amount_column, timestamp_column]
#columns.extend(additionalInfo)
all_events_dict = {}
for event in list(df[event_column].unique()):
    all_events_dict[event] = 0

# you can get rid of all the other columns, to make things faster
dataset = df[columns]
dataset_test = df_test[columns]
dataset[row_nr_column] = dataset.index
dataset_test[row_nr_column] = dataset_test.index

print(dataset.head())
print(all_events_dict)

   case concept:name      event concept:name event lifecycle:transition  \
0             173688             A_SUBMITTED                   COMPLETE   
1             173688       A_PARTLYSUBMITTED                   COMPLETE   
2             173688           A_PREACCEPTED                   COMPLETE   
3             173688  W_Completeren aanvraag                   SCHEDULE   
4             173691             A_SUBMITTED                   COMPLETE   

   case AMOUNT_REQ     event time:timestamp  row_nr  
0            20000  01-10-2011 00:38:44.546       0  
1            20000  01-10-2011 00:38:44.880       1  
2            20000  01-10-2011 00:39:37.906       2  
3            20000  01-10-2011 00:39:38.875       3  
4                0  01-10-2011 08:08:58.256       4  
{'A_SUBMITTED': 0, 'A_PARTLYSUBMITTED': 0, 'A_PREACCEPTED': 0, 'W_Completeren aanvraag': 0, 'A_DECLINED': 0, 'W_Afhandelen leads': 0, 'A_ACCEPTED': 0, 'O_SELECTED': 0, 'A_FINALIZED': 0, 'O_CREATED': 0, 'O_SENT': 0, 'W_Nabelle

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[row_nr_column] = dataset.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_test[row_nr_column] = dataset_test.index


In [71]:
### ON THE FLY ###
# This has two practical implications that we should be aware of
# 1) You can't preprocess the data, when a new event comes in, you have to be able to process it on the spot
# 2) You don't have any historical data, so your predictions can only rely on the things you have seen before

# This means that the data needs to be sorted on timestamp_column
# you need to first convert to a datetime, because otherwise you'll sort the strings
# and since the day of the month is the first part of the string, you will first get all firsts of all months, etc.
dataset.loc[:,timestamp_column] = pd.to_datetime(dataset.loc[:,timestamp_column], format=timeformat_timestamp)
dataset = dataset.sort_values(by=timestamp_column)

dataset_test.loc[:,timestamp_column] = pd.to_datetime(dataset_test.loc[:,timestamp_column], format=timeformat_timestamp)
dataset_test = dataset_test.sort_values(by=timestamp_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [72]:
s = {'error':-1, 'ok':1}
max(s, key=s.get)
str('None')+'k'

'Nonek'

In [159]:
### STORAGE ###
# we want to keep track of things that happen within one instance (also called trace or case)

# instanceStorage stores: key=instance, values= [event_column, timestamp_column, predictedDuration],
# to look up the previous event in this instance, and it's predicted TimePassed
instanceStorage = {}

# we also want to keep track of the predictionError
# errorStorage stores: key=unique integer, values= [previousEvent, previousTimestamp, predictionError, predictionErrorRatio, predictedTimePassed, timePassed]
errorStorage = {}

predictionError = 0
predictionErrorRatio=0
# we also want to add a log feature => we don't have to save this seperatly, just needed to check if having it makes sense.
# how many events of the same activty are running at the same time might be interesting
# concurentEvents = {}

def updateStorage(case, event, row_nr, amount, predictionError):


    # if this instance is already in the instanceStorage, we have a previous event
    if case in instanceStorage:

        # find the previous event
        previousEvent = instanceStorage[case][0]
        previousLifecycle = instanceStorage[case][4]
        predictedEvent = instanceStorage[case][7]
        previousEvent_id = instanceStorage[case][2]



        if predictedEvent == event:
            predictionError +=1




        # this is the rownumber of the current event, not of the previous one, but that doesn't matter,
        # I only need a unique key
        errorStorage[row_nr] = [previousEvent, previousLifecycle, predictionError,predictedEvent, event]

    # otherwise we don't have a previous event
    else:
        previousEvent = None
        previousEvent_id = None
        previousLifecycle = None
    event_id = str(event)+'_XX_'+str(amount)+'_XX_'+str(previousEvent)+'_XX_'+str(previousLifecycle)


    # key = case_column (so which case we're looking at), and the values are the activity and the timestamp
    #instanceStorage[case] = [event, previousEvent, event_id, previousEvent_id, lifecycle, previousLifecycle, amount, predictedEvent]


    # update concurentEvents -> I only used this once to be able to make pretty plots showing if it would have effect or not
    # howManyConcurentEvents = [x[0] for x in instanceStorage.values()].count(previousEvent)
    # concurentEvents[row_nr] = [previousEvent, howManyConcurentEvents, timePassed, row_nr]

    return previousEvent, previousEvent_id, previousLifecycle, event_id



In [160]:
### THE MODEL ###
# We start with only one feature: the historical average time between two events based on the previous event
# You don't need a second model for the test set, since you train your model on the training-set and then use it on the test-set

predictionModel = {}

def getPrediction(event_id):
    """

    """

    # if this activity is already in the predictionModel, then return the predicted duration
    # try & except works slightly faster then if key in dict
    # but only for valid keys, since you skip the if statement
    try:
        return max(predictionModel[event_id],key=predictionModel[event_id].get)

    except:
        return 'A_PARTLYSUBMITTED'



def updatePredictionModel(this_event_id, event):
    """

    """
    all_events_dict={'A_SUBMITTED': 0, 'A_PARTLYSUBMITTED': 0, 'A_PREACCEPTED': 0, 'W_Completeren aanvraag': 0, 'A_DECLINED': 0, 'W_Afhandelen leads': 0, 'A_ACCEPTED': 0, 'O_SELECTED': 0, 'A_FINALIZED': 0, 'O_CREATED': 0, 'O_SENT': 0, 'W_Nabellen offertes': 0, 'O_CANCELLED': 0, 'A_CANCELLED': 0, 'W_Beoordelen fraude': 0, 'O_SENT_BACK': 0, 'W_Valideren aanvraag': 0, 'W_Nabellen incomplete dossiers': 0, 'O_ACCEPTED': 0, 'A_APPROVED': 0, 'A_ACTIVATED': 0, 'A_REGISTERED': 0, 'O_DECLINED': 0, 'W_Wijzigen contractgegevens': 0, 'End_of_case':0}

    if this_event_id != None:

        # if this activity is already in the predictionModel, update the model
        # try & except works slightly faster then if key in dict
        # but only for valid keys, since you skip the if statement
        try:
            predictionModel[this_event_id][event] +=1

        except:
            predictionModel[this_event_id] = all_events_dict
            predictionModel[this_event_id][event] +=1



In [161]:
### RUNNING THE MODEL ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly

def processEvent(case, event, amount, lifecycle, row_nr):
    """

    """

    # step 1: preProcess the incoming event -> create an event_id that indicates the dataSlice that you want to use
    # The event-id is where you want to slice the data on before you take the average timePassed
    # If you don't want to include any additional information, this is just the event itself
    # otherwise include it in the event_id to make sure that you slice on everything you want to slice on




    # step 2: predict
    # look up if this event is already in predictionModel
    # if it is, use the predictedDuration as current prediction
    # otherwise predict: unknown
    previousEvent, previousEvent_id, previousLifecycle, event_id = updateStorage(case, event, row_nr, amount, predictionError)
    predictedEvent = getPrediction(event_id)

    # step 3: loop up historical data
    # you only know things of the past, so you want to check if there is a previous event in the instanceStorage
    # if there is, you can calculate the duration between that event and this one
    #case, event, event_id, row_nr, predictedEvent, amount, predictionError

    instanceStorage[case] = [event, previousEvent, event_id, previousEvent_id, lifecycle, previousLifecycle, amount, predictedEvent]
    # step 4: update predictionModel
    #this_event_id, previousEvent, event
    updatePredictionModel(previousEvent_id, event)



### Run over all lines =>
# let's first do a simple for-loop, maybe I can improve the running time by using .apply later on
#mini_dataset = dataset.loc[0:1000, :]
#print(mini_dataset)
# Start the stopwatch / counter

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
[processEvent(case, event, amount, lifecycle, row_nr) for case, event, amount, lifecycle, row_nr in zip(dataset[case_column],dataset[event_column], dataset[amount_column], dataset[lifecycle_column], dataset[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program in seconds:", end_time-start_time)


dict: Elapsed time during the whole program in seconds: 1.671875


In [162]:
predictionModel

{'A_SUBMITTED_XX_20000_XX_None_XX_None': {'A_SUBMITTED': 0,
  'A_PARTLYSUBMITTED': 2355,
  'A_PREACCEPTED': 0,
  'W_Completeren aanvraag': 0,
  'A_DECLINED': 0,
  'W_Afhandelen leads': 0,
  'A_ACCEPTED': 0,
  'O_SELECTED': 0,
  'A_FINALIZED': 0,
  'O_CREATED': 0,
  'O_SENT': 0,
  'W_Nabellen offertes': 0,
  'O_CANCELLED': 0,
  'A_CANCELLED': 0,
  'W_Beoordelen fraude': 0,
  'O_SENT_BACK': 0,
  'W_Valideren aanvraag': 0,
  'W_Nabellen incomplete dossiers': 0,
  'O_ACCEPTED': 0,
  'A_APPROVED': 0,
  'A_ACTIVATED': 0,
  'A_REGISTERED': 0,
  'O_DECLINED': 0,
  'W_Wijzigen contractgegevens': 0,
  'End_of_case': 0},
 'A_PARTLYSUBMITTED_XX_20000_XX_A_SUBMITTED_XX_COMPLETE': {'A_SUBMITTED': 0,
  'A_PARTLYSUBMITTED': 0,
  'A_PREACCEPTED': 1059,
  'W_Completeren aanvraag': 0,
  'A_DECLINED': 411,
  'W_Afhandelen leads': 873,
  'A_ACCEPTED': 0,
  'O_SELECTED': 0,
  'A_FINALIZED': 0,
  'O_CREATED': 0,
  'O_SENT': 0,
  'W_Nabellen offertes': 0,
  'O_CANCELLED': 0,
  'A_CANCELLED': 0,
  'W_Beoordele

In [163]:
instanceStorage

{173688: ['W_Valideren aanvraag',
  'O_ACCEPTED',
  'W_Valideren aanvraag_XX_20000_XX_O_ACCEPTED_XX_COMPLETE',
  'O_ACCEPTED_XX_20000_XX_A_REGISTERED_XX_COMPLETE',
  'COMPLETE',
  'COMPLETE',
  20000,
  'A_PARTLYSUBMITTED'],
 173691: ['W_Valideren aanvraag',
  'A_ACTIVATED',
  'W_Valideren aanvraag_XX_0_XX_A_ACTIVATED_XX_COMPLETE',
  'A_ACTIVATED_XX_0_XX_A_REGISTERED_XX_COMPLETE',
  'COMPLETE',
  'COMPLETE',
  0,
  'A_PARTLYSUBMITTED'],
 173694: ['W_Wijzigen contractgegevens',
  'W_Valideren aanvraag',
  'W_Wijzigen contractgegevens_XX_10000_XX_W_Valideren aanvraag_XX_COMPLETE',
  'W_Valideren aanvraag_XX_10000_XX_A_APPROVED_XX_COMPLETE',
  'SCHEDULE',
  'COMPLETE',
  10000,
  'W_Wijzigen contractgegevens'],
 173697: ['A_DECLINED',
  'A_PARTLYSUBMITTED',
  'A_DECLINED_XX_20000_XX_A_PARTLYSUBMITTED_XX_COMPLETE',
  'A_PARTLYSUBMITTED_XX_20000_XX_A_SUBMITTED_XX_COMPLETE',
  'COMPLETE',
  'COMPLETE',
  20000,
  'A_PARTLYSUBMITTED'],
 173700: ['A_DECLINED',
  'A_PARTLYSUBMITTED',
  'A_DECLI

In [164]:
errorStorage

{1: ['A_SUBMITTED', 'COMPLETE', 1, 'A_PARTLYSUBMITTED', 'A_PARTLYSUBMITTED'],
 2: ['A_PARTLYSUBMITTED', 'COMPLETE', 0, 'A_PARTLYSUBMITTED', 'A_PREACCEPTED'],
 3: ['A_PREACCEPTED',
  'COMPLETE',
  0,
  'A_PARTLYSUBMITTED',
  'W_Completeren aanvraag'],
 5: ['A_SUBMITTED', 'COMPLETE', 1, 'A_PARTLYSUBMITTED', 'A_PARTLYSUBMITTED'],
 6: ['A_PARTLYSUBMITTED', 'COMPLETE', 0, 'A_PARTLYSUBMITTED', 'A_PREACCEPTED'],
 7: ['A_PREACCEPTED',
  'COMPLETE',
  0,
  'A_PARTLYSUBMITTED',
  'W_Completeren aanvraag'],
 9: ['A_SUBMITTED', 'COMPLETE', 1, 'A_PARTLYSUBMITTED', 'A_PARTLYSUBMITTED'],
 11: ['A_SUBMITTED', 'COMPLETE', 1, 'A_PARTLYSUBMITTED', 'A_PARTLYSUBMITTED'],
 12: ['A_PARTLYSUBMITTED',
  'COMPLETE',
  0,
  'A_PARTLYSUBMITTED',
  'A_PREACCEPTED'],
 13: ['A_PREACCEPTED',
  'COMPLETE',
  0,
  'A_PARTLYSUBMITTED',
  'W_Completeren aanvraag'],
 14: ['A_PARTLYSUBMITTED', 'COMPLETE', 0, 'A_PREACCEPTED', 'A_DECLINED'],
 16: ['A_SUBMITTED', 'COMPLETE', 1, 'A_PARTLYSUBMITTED', 'A_PARTLYSUBMITTED'],
 17: 

In [172]:
dataset_test = dataset_test.sort_values([case_column,timestamp_column])
dataset_test['y'] = dataset_test[event_column].shift(periods= -1)
dataset_test['previous_event']=0
dataset_test['previous_lifecycle']=0
all_ids = list(dataset_test[case_column].unique())

for ids in all_ids:
    df2= dataset_test[dataset_test[case_column]==ids].copy()
    dataset_test.loc[dataset_test[case_column]==ids, 'previous_event']= df2[event_column].shift(periods=1)
    dataset_test.loc[dataset_test[case_column]==ids, 'previous_lifecycle']= df2['event lifecycle:transition'].shift(periods=1)
dataset_test.loc[dataset_test['previous_event'].isnull(), 'previous_event']=None
dataset_test.loc[dataset_test['previous_lifecycle'].isnull(), 'previous_lifecycle']=None
dataset_test.reset_index(drop=True, inplace=True)

In [173]:
def getPrediction_test(event, amount, previousEvent, previousLifecycle):

    event_id = str(event)+'_XX_'+str(amount)+'_XX_'+str(previousEvent)+'_XX_'+str(previousLifecycle)


    # if this activity is already in the predictionModel, then return the predicted duration
    # try & except works slightly faster then if key in dict
    # but only for valid keys, since you skip the if statement
    try:
        return max(predictionModel[event_id],key=predictionModel[event_id].get)

    except:
        return 'W_Nabellen offertes'


In [174]:
predictions = [getPrediction_test(event, amount, previousEvent, previousLifecycle) for event, amount, previousEvent, previousLifecycle in zip(dataset_test[event_column], dataset_test[amount_column], dataset_test['previous_event'], dataset_test['previous_lifecycle'])]

In [175]:
predictionModel.keys()

dict_keys(['A_SUBMITTED_XX_20000_XX_None_XX_None', 'A_PARTLYSUBMITTED_XX_20000_XX_A_SUBMITTED_XX_COMPLETE', 'A_PREACCEPTED_XX_20000_XX_A_PARTLYSUBMITTED_XX_COMPLETE', 'A_SUBMITTED_XX_0_XX_None_XX_None', 'A_PARTLYSUBMITTED_XX_0_XX_A_SUBMITTED_XX_COMPLETE', 'A_PREACCEPTED_XX_0_XX_A_PARTLYSUBMITTED_XX_COMPLETE', 'A_SUBMITTED_XX_10000_XX_None_XX_None', 'A_PARTLYSUBMITTED_XX_10000_XX_A_SUBMITTED_XX_COMPLETE', 'A_PREACCEPTED_XX_10000_XX_A_PARTLYSUBMITTED_XX_COMPLETE', 'A_SUBMITTED_XX_30000_XX_None_XX_None', 'A_PARTLYSUBMITTED_XX_30000_XX_A_SUBMITTED_XX_COMPLETE', 'A_SUBMITTED_XX_40000_XX_None_XX_None', 'A_PARTLYSUBMITTED_XX_40000_XX_A_SUBMITTED_XX_COMPLETE', 'A_PREACCEPTED_XX_40000_XX_A_PARTLYSUBMITTED_XX_COMPLETE', 'W_Afhandelen leads_XX_30000_XX_A_PARTLYSUBMITTED_XX_COMPLETE', 'W_Afhandelen leads_XX_30000_XX_W_Afhandelen leads_XX_SCHEDULE', 'A_PREACCEPTED_XX_30000_XX_W_Afhandelen leads_XX_START', 'W_Completeren aanvraag_XX_30000_XX_A_PREACCEPTED_XX_COMPLETE', 'W_Afhandelen leads_XX_20000_X

In [176]:
dataset_test['y']

0         A_PARTLYSUBMITTED
1                A_DECLINED
2               A_SUBMITTED
3         A_PARTLYSUBMITTED
4             A_PREACCEPTED
                ...        
47818    W_Afhandelen leads
47819    W_Afhandelen leads
47820            A_DECLINED
47821    W_Afhandelen leads
47822                   NaN
Name: y, Length: 47823, dtype: object

In [177]:
count=0
for i in range(len(predictions)):
    if predictions[i]==dataset_test['y'][i]:
        count+=1
print(f'accuracy: {count/len(predictions)}')


accuracy: 0.775066390648851


concurent_df = pd.DataFrame.from_dict(concurentEvents, orient='index', columns=['event_id', 'concurent_events', 'timePassed','row_nr'])
print(concurent_df.head())


In [None]:
### THE PREDICTION ERROR ###
# plot different things to get more insight
# first the difference in time passed
# then the log of the difference in time passed
# then the ratio between the difference in time passed and the real time passed

# flatten the error dictionary to a dataframe
error_df = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
error_df.loc[:,'timestamp'] = pd.to_datetime(error_df.loc[:,'timestamp'])
error_df = error_df.sort_values(by='timestamp')
error_df['event_column_basic'] = error_df.event_column.str.split('_XX_').str.get(0)

# calculate the log of the prediction error
error_df['log_predictionError'] = [np.log2(x) for x in error_df['predictionError']]

# calculate statistics
mae = np.mean(error_df['predictionError'])
print('The mean absolute error on the training-set is: ' + str(round(mae, 2)) + ' minutes.')


In [None]:
### HISTOGRAMS ###

if printPlots == True:
    # make the prediction error plot
    fig= px.line(error_df, x='timestamp', y='predictionError', color='event_column')
    fig.update_layout(title='Prediction error (difference in timePassed between real timePassed an predicted timePassed) per activity')
    fig.show()
    if writePlots == True:
        fig.write_html("images/predictionError_withboth.html")

    # make the log prediction error plot
    fig= px.line(error_df, x='timestamp', y='log_predictionError', color='event_column')
    fig.update_layout(title='Prediction error ratio (log of the difference in timePassed) per activity')
    fig.show()
    if writePlots == True:
        fig.write_html("images/log_predictionError_withboth.html")

    # make the ratio prediction error plot
    fig= px.line(error_df, x='timestamp', y='predictionErrorRatio', color='event_column')
    fig.update_layout(title='Prediction error ratio (difference in timePassed divided by real timePassed) per activity')
    fig.show()
    if writePlots==True:
        fig.write_html("images/predictionErrorRatio_withboth.html")

In [None]:
### BOXPLOTS ###

if printPlots == True:
    fig = px.box(error_df, y='predictionError', color='event_column_basic')
    fig.show()
    if writePlots == True:
        fig.write_html('images/box_predictionError_withboth.html')


    fig = px.box(error_df, y='predictionErrorRatio', color='event_column_basic')
    fig.show()
    if writePlots == True:
        fig.write_html('images/box_predictionErrorRatio_withboth.html')


In [None]:
if printPlots == True:
    for activity in eventList:
        fig=px.box(error_df[error_df['event_column_basic'] == activity], y='predictionError', color='event_column')
        fig.show()
        if writePlots == True:
            fig.write_html('images/box_predictionError_splitForConcurentEvents.html')

# there is a difference in error distribution, so it looks like it does have an effect.
# I don't save the timePassed anywhere, so I can't check on that. But in the InfluenceOnTime file I have tested that there is indeed a
# different distribution

In [None]:
### RUN ON THE TEST SET ###
# You can use the model that you have created during the train, so you don't have to start with empty model
# but you do have to start with an empty storage

instanceStorage = {}
errorStorage = {}

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
[processEvent(case, event, additional, timestamp, row_nr, includeAdditional=True) for case,event,additional,timestamp,row_nr in zip(dataset_test[case_column],dataset_test[event_column], dataset_test[additionalInfo].iloc[:,0], dataset_test[timestamp_column], dataset_test[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program on the testset in seconds:", end_time-start_time)



In [None]:
### THE PREDICTION ERROR ###
# plot different things to get more insight
# first the difference in time passed
# then the log of the difference in time passed
# then the ratio between the difference in time passed and the real time passed

# flatten the error dictionary to a dataframe
error_df_test = pd.DataFrame.from_dict(errorStorage, orient='index', columns=['event_column', 'timestamp', 'predictionError', 'predictionErrorRatio', 'predictedTimePassed', 'timePassed'])
error_df_test.loc[:,'timestamp'] = pd.to_datetime(error_df_test.loc[:,'timestamp'])
error_df_test = error_df_test.sort_values(by='timestamp')
error_df_test['event_column_basic'] = error_df_test.event_column.str.split('_XX_').str.get(0)

# calculate the log of the prediction error
error_df_test['log_predictionError'] = [np.log2(x) for x in error_df_test['predictionError']]

# calculate statistics
mae = np.mean(error_df_test['predictionError'])
print('The mean absolute error on the test-set is: ' + str(round(mae, 2)) + ' minutes.')


In [None]:
### BOXPLOTS ###

if printPlots == True:
    fig = px.box(error_df_test, y='predictionError', color='event_column_basic', title='PredictionError on the test-dataset')
    fig.show()
    if writePlots == True:
        fig.write_html('images/TEST_box_predictionError_withboth.html')


    fig = px.box(error_df_test, y='predictionErrorRatio', color='event_column_basic', title='PredictionErrorRatio on the test-dataset')
    fig.show()
    if writePlots == True:
        fig.write_html('images/TEST_box_predictionErrorRatio_withboth.html')

In [None]:
print(error_df_test.head(5))


# to make an accurate prediction, it is important to know how much variance there is when you only take the previous event into account.
# because if there is still a lot of variance, you can never make an accurate prediction.

activities = pd.unique(error_df['event_column_basic'])


for activity in activities:

    print(activity)
    subset = error_df[error_df['event_column_basic'] == activity]['predictionError']
    print(np.mean(subset))
    print(np.std(subset))
    print(' ')

concurent_df[event_column] = concurent_df.event_id.str.split('_XX_').str.get(0)
concurent_df2 = concurent_df.dropna()

for activity in pd.unique(concurent_df2[event_column]):

    print(activity)
    fig= px.box(concurent_df2[concurent_df2[event_column] == activity], y='timePassed', color='concurent_events')
    fig.show()
    fig.write_html(activity+'_concurentEvents.html')

concurent_df2.to_csv('data/concurent_df.csv')