In [7]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np

# to make pretty plots
import plotly
import plotly.express as px
import plotly.graph_objects as go

# to play with time :-)
import datetime
import time

dict_keys(['__name__', '__doc__', '__package__', '__loader__', '__spec__', '__builtin__', '__builtins__', '_ih', '_oh', '_dh', 'In', 'Out', 'get_ipython', 'exit', 'quit', '_', '__', '___', 'DataFrame', 'MultiIndex', 'print_columns', '_i', '_ii', '_iii', '_i1', '_1', 'sys', 'remove_imported_pydev_package', '_pydevd_bundle', 'pydev_jupyter_vars', '_i2', '_i3', '_3', '_i4', '_4', '_i5', '_i6', '_6', '_i7', 'pd', 'np', 'plotly', 'px', 'go', 'datetime', 'time', '_i8', '_8', '_i9', 'df', 'df_test', 'case_column', 'event_column', 'timestamp_column', 'timeformat_timestamp', 'lifecycle_column', 'amount_column', 'row_nr_column', 'pred_event_otf', '_i10'])

In [9]:
### IMPORT THE DATA ###

if (('df' not in globals()) or ('df_test' not in globals())):
    ### IMPORT THE DATA ###
    df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
    df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

# Defining database-specific variables
case_column = "case concept:name"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
lifecycle_column = 'event lifecycle:transition'
amount_column = 'case AMOUNT_REQ'
row_nr_column = 'row_nr'
pred_event_otf = "pred_event_otf"

In [291]:
#make a list with all possible events
all_events=list(df[event_column].unique())

In [292]:
# get most common event
df_most_common_event =pd.DataFrame(df[event_column].value_counts())
df_most_common_event.reset_index(inplace=True)
most_common_event = df_most_common_event.iloc[0,0]

In [293]:
# additional columns that you want to include
additionalInfo = [amount_column]

# events that are influenced by concurency
eventList = ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_ACCEPTED']

# make a list of the columns you need
columns = [case_column, event_column, lifecycle_column]
columns.extend(additionalInfo)


# you can get rid of all the other columns, to make things faster
dataset = df[columns]
dataset_test = df_test[columns]
dataset[row_nr_column] = dataset.index
dataset_test[row_nr_column] = dataset_test.index

# round the amount column on 10000
if amount_column in list(dataset):
    dataset[amount_column] = [round(x,-4) for x in dataset[amount_column]]
    dataset_test[amount_column] = [round(x,-4) for x in dataset_test[amount_column]]
print(dataset.head())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[row_nr_column] = dataset.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_test[row_nr_column] = dataset_test.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[amount_column] = [round(x,-4) for x in dataset[amount_column]]
A value is trying to be set on a copy of a sli

   case concept:name      event concept:name event lifecycle:transition  \
0             173688             A_SUBMITTED                   COMPLETE   
1             173688       A_PARTLYSUBMITTED                   COMPLETE   
2             173688           A_PREACCEPTED                   COMPLETE   
3             173688  W_Completeren aanvraag                   SCHEDULE   
4             173691             A_SUBMITTED                   COMPLETE   

   case AMOUNT_REQ  row_nr  
0            20000       0  
1            20000       1  
2            20000       2  
3            20000       3  
4                0       4  


In [294]:
### STORAGE ###
# we want to keep track of things that happen within one case

# caseStorage stores: key=instance, values= [event, previousEvent, event_id, previousEvent_id, lifecycle, previousLifecycle, amount, predictedEvent]
# to look up the previous event in this instance, and it's predicted TimePassed
caseStorage = {}

# we also want to keep track of the predictionError
# errorStorage stores: key=unique integer, values= [previousEvent, previousLifecycle, predictionError,predictedEvent, event]
errorStorage = {}

# we also want to add a log feature => we don't have to save this seperatly, just needed to check if having it makes sense.
# how many events of the same activty are running at the same time might be interesting
# concurentEvents = {}

def getFeatures(case, event):
    """
    This function extracts extra info of incomming event from the caseStorage
    """

    # if the case is already in storage, info of previous event is extracted from caseStorage
    if case in caseStorage:

        previousEvent = caseStorage[case][0]
        previousLifecycle = caseStorage[case][4]
        predictedEvent = caseStorage[case][7]
        previousEvent_id = caseStorage[case][2]
        previousRow_nr = caseStorage[case][8]

        # if prediction of previous event is correct, change predictionError to 1 and add the event to errorstorage
        if predictedEvent == event:
            predictionError = 1
        else:
            predictionError = 0
        errorStorage[previousRow_nr] = [previousEvent, previousLifecycle, predictionError,predictedEvent, event]

    # if it is a new case set info of previous event to None
    else:
        previousEvent = None
        previousEvent_id = None
        previousLifecycle = None

    if event in eventList:
        # you want to find how many times this activity occurs without any additional information
        howManyConcurentEvents = [x[0] for x in caseStorage.values()].count(event)
    else:
        howManyConcurentEvents = ''

    return previousEvent, previousEvent_id, previousLifecycle, howManyConcurentEvents



In [295]:
### THE MODEL ###
# We start with only one feature: the historical average time between two events based on the previous event
# You don't need a second model for the test set, since you train your model on the training-set and then use it on the test-set

predictionModel = {}

def getPrediction(event_id):
    """
    gets the prediction of a certain event_id
    """

    try:
        # get the event of an event_id with the highest occurrence
        return max(predictionModel[event_id],key=predictionModel[event_id].get)

    except:
        # if event_id not in predictionModel return the most frequent event in train data as prediction
        return most_common_event



def updatePredictionModel(this_event_id, event):
    """
    updates PredictionModel with the true event of a event_id
    """
    # makes dict with all events as key and 0 as value
    all_events_dict=dict([(event,0) for event in all_events])

    if this_event_id != None:

        # if event_id already in predictionModel update it by +1 to the true event
        try:
            predictionModel[this_event_id][event] +=1
        # if event_id not in predictionModel, add it to the dict with the all_events_dict as key and update it by +1 tot the true event
        except:
            predictionModel[this_event_id] = all_events_dict
            predictionModel[this_event_id][event] +=1



In [296]:
### RUNNING THE MODEL ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly

def processEvent(case, event, lifecycle, row_nr, amount=None):
    """
    the input is the the raw info of a case
    the function updates the predictionModel, caseStorage and the errorStorage
    """
    #gets info of the incomming event
    previousEvent, previousEvent_id, previousLifecycle, howManyConcurentEvents = getFeatures(case, event)

    # makes the event_id based on the features and is used to make the prediction
    event_id = str(event)+'_XX_'+str(amount)+'_XX_'+str(previousEvent)+'_XX_'+str(previousLifecycle)+'_XX_'+str(howManyConcurentEvents)

    # gets the prediction of event_id from the predictionModel
    predictedEvent = getPrediction(event_id)

    # update storage
    caseStorage[case] = [event, previousEvent, event_id, previousEvent_id, lifecycle, previousLifecycle, amount, predictedEvent, row_nr]

    # update predictionModel
    updatePredictionModel(previousEvent_id, event)






### Run over all lines =>

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row

[processEvent(case, event, lifecycle, row_nr, amount) for case, event, lifecycle, row_nr, amount in zip(dataset[case_column],dataset[event_column], dataset[lifecycle_column], dataset[row_nr_column], dataset[amount_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program in seconds:", end_time-start_time)


dict: Elapsed time during the whole program in seconds: 18.421875


In [297]:
# count keeps track on how many the model was correct and divides by the amount of predictions made
count_train=0
predictions_index_train = []
for key in errorStorage:
    count_train += errorStorage[key][2]
    predictions_index_train.append([key,errorStorage[key][3]])
len_set_train = len(errorStorage)
train_acc_event_otf = count_train/len_set_train
print(f'train accuracy:{train_acc_event_otf}')
for case in caseStorage:
    predictions_index_train.append([caseStorage[case][8], None])
print(f'length df = {len(df)}, length predictions = {len(predictions_index_train)}')
train_pred_event_otf = [x[1] for x in sorted(predictions_index_train)]

train accuracy:0.8166329913490398
length df = 214377, length predictions = 214377


In [298]:
# empty errorStorage and run model on test set
errorStorage = {}
caseStorage = {}
[processEvent(case, event, lifecycle, row_nr, amount) for case, event, lifecycle, row_nr, amount in zip(dataset_test[case_column],dataset_test[event_column], dataset_test[lifecycle_column], dataset_test[row_nr_column], dataset[amount_column])]

# count keeps track on how many the model was correct and divides by the amount of predictions made

count_test=0
predictions_index_test = []
for key in errorStorage:
    count_test += errorStorage[key][2]
    predictions_index_test.append([key, errorStorage[key][3]])
len_set_test = len(errorStorage)
test_acc_event_otf = count_test/len_set_test
print(f'test accuracy:{test_acc_event_otf}')
for case in caseStorage:
    predictions_index_test.append([caseStorage[case][8], None])
print(f'length df = {len(df_test)}, length predictions = {len(predictions_index_test)}')
test_pred_event_otf = [x[1] for x in sorted(predictions_index_test)]

test accuracy:0.8156398628470302
length df = 47823, length predictions = 47823


In [300]:
df[pred_event_otf] = train_pred_event_otf
df_test[pred_event_otf] = test_pred_event_otf
df_test

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,pred,pred_event_otf
0,44964012621824,206324,2012-02-03T17:17:11.047+01:00,0,A_SUBMITTED,COMPLETE,03-02-2012 17:17:11.047,A_PARTLYSUBMITTED,A_PARTLYSUBMITTED
1,44964012621825,206324,2012-02-03T17:17:11.047+01:00,0,A_PARTLYSUBMITTED,COMPLETE,03-02-2012 17:17:11.323,A_PREACCEPTED,A_PREACCEPTED
2,44964012621826,206324,2012-02-03T17:17:11.047+01:00,0,A_DECLINED,COMPLETE,03-02-2012 17:17:42.964,,
3,44968307589120,206327,2012-02-03T17:23:41.949+01:00,10000,A_SUBMITTED,COMPLETE,03-02-2012 17:23:41.949,A_PARTLYSUBMITTED,A_PARTLYSUBMITTED
4,44968307589121,206327,2012-02-03T17:23:41.949+01:00,10000,A_PARTLYSUBMITTED,COMPLETE,03-02-2012 17:23:42.504,W_Afhandelen leads,W_Afhandelen leads
...,...,...,...,...,...,...,...,...,...
47818,54666343743523,213276,2012-02-27T14:12:41.868+01:00,20000,W_Nabellen incomplete dossiers,START,14-03-2012 15:59:28.309,W_Nabellen incomplete dossiers,W_Nabellen incomplete dossiers
47819,54666343743524,213276,2012-02-27T14:12:41.868+01:00,20000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 16:00:09.680,,
47820,49495203119136,209595,2012-02-15T10:10:36.503+01:00,10000,W_Nabellen offertes,START,14-03-2012 16:02:03.883,,
47821,52342766436386,211624,2012-02-21T23:38:40.044+01:00,40000,W_Nabellen incomplete dossiers,START,14-03-2012 16:04:46.192,W_Nabellen incomplete dossiers,W_Nabellen incomplete dossiers
