In [298]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np

# to make pretty plots
import plotly
import plotly.express as px
import plotly.graph_objects as go

# to play with time :-)
import datetime
import time

In [299]:
# This notebook can be run separately from the deliverable tool.
#if (('df' not in globals()) or ('df_test' not in globals())):
    ### IMPORT THE DATA ###
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')
df_test = pd.read_csv('data/BPI_Challenge_2012-test.csv')

# Defining database-specific variables
case_column = "case concept:name"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
timeformat_timestamp = "%d-%m-%Y %H:%M:%S.%f"
lifecycle_column = 'event lifecycle:transition'
amount_column = 'case AMOUNT_REQ'
df[amount_column] = [round(x,-4) for x in df[amount_column]]
df_test[amount_column] = [round(x,-4) for x in df_test[amount_column]]
all_events=list(df[event_column].unique())
#df[event_column] = [x if x!='A_SUBMITTED' else 'End_case/Begin_case' for x in df[event_column]]

In [300]:
dict_all_events = dict([(event,0) for event in all_events])
dict_all_events

{'A_SUBMITTED': 0,
 'A_PARTLYSUBMITTED': 0,
 'A_PREACCEPTED': 0,
 'W_Completeren aanvraag': 0,
 'A_DECLINED': 0,
 'W_Afhandelen leads': 0,
 'A_ACCEPTED': 0,
 'O_SELECTED': 0,
 'A_FINALIZED': 0,
 'O_CREATED': 0,
 'O_SENT': 0,
 'W_Nabellen offertes': 0,
 'O_CANCELLED': 0,
 'A_CANCELLED': 0,
 'W_Beoordelen fraude': 0,
 'O_SENT_BACK': 0,
 'W_Valideren aanvraag': 0,
 'W_Nabellen incomplete dossiers': 0,
 'O_ACCEPTED': 0,
 'A_APPROVED': 0,
 'A_ACTIVATED': 0,
 'A_REGISTERED': 0,
 'O_DECLINED': 0,
 'W_Wijzigen contractgegevens': 0}

In [301]:
#W_Nabellen offertes
#W_Completeren aanvraag
#W_Nabellen incomplete
df[event_column].value_counts()


W_Nabellen offertes               43880
W_Completeren aanvraag            43480
W_Nabellen incomplete dossiers    21075
W_Valideren aanvraag              17089
W_Afhandelen leads                13662
A_SUBMITTED                       10469
A_PARTLYSUBMITTED                 10469
A_DECLINED                         6152
A_PREACCEPTED                      5884
O_SELECTED                         5686
O_CREATED                          5686
O_SENT                             5686
A_ACCEPTED                         4099
A_FINALIZED                        4024
O_CANCELLED                        3120
O_SENT_BACK                        2812
A_CANCELLED                        2419
A_APPROVED                         1871
A_ACTIVATED                        1871
A_REGISTERED                       1871
O_ACCEPTED                         1868
O_DECLINED                          668
W_Beoordelen fraude                 524
W_Wijzigen contractgegevens          12
Name: event concept:name, dtype: int64

In [302]:
row_nr_column = 'row_nr'
additionalInfo = ['event lifecycle:transition']

# do you want any error plots?
printPlots = False
# do you want to write the plots to html?
writePlots = False


# events that are influenced by concurency
eventList = ['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED', 'O_SELECTED', 'A_FINALIZED', 'O_ACCEPTED']

# variables
weight_newtimePassed = 0.1

# make a list of the columns you need
columns = [case_column, event_column, lifecycle_column, amount_column, timestamp_column]
#columns.extend(additionalInfo)
all_events_dict = {}
for event in list(df[event_column].unique()):
    all_events_dict[event] = 0

# you can get rid of all the other columns, to make things faster
dataset = df[columns]
dataset_test = df_test[columns]
dataset[row_nr_column] = dataset.index
dataset_test[row_nr_column] = dataset_test.index

print(dataset.head())
print(all_events_dict)

   case concept:name      event concept:name event lifecycle:transition  \
0             173688             A_SUBMITTED                   COMPLETE   
1             173688       A_PARTLYSUBMITTED                   COMPLETE   
2             173688           A_PREACCEPTED                   COMPLETE   
3             173688  W_Completeren aanvraag                   SCHEDULE   
4             173691             A_SUBMITTED                   COMPLETE   

   case AMOUNT_REQ     event time:timestamp  row_nr  
0            20000  01-10-2011 00:38:44.546       0  
1            20000  01-10-2011 00:38:44.880       1  
2            20000  01-10-2011 00:39:37.906       2  
3            20000  01-10-2011 00:39:38.875       3  
4                0  01-10-2011 08:08:58.256       4  
{'A_SUBMITTED': 0, 'A_PARTLYSUBMITTED': 0, 'A_PREACCEPTED': 0, 'W_Completeren aanvraag': 0, 'A_DECLINED': 0, 'W_Afhandelen leads': 0, 'A_ACCEPTED': 0, 'O_SELECTED': 0, 'A_FINALIZED': 0, 'O_CREATED': 0, 'O_SENT': 0, 'W_Nabelle

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset[row_nr_column] = dataset.index
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset_test[row_nr_column] = dataset_test.index


In [303]:
### ON THE FLY ###
# This has two practical implications that we should be aware of
# 1) You can't preprocess the data, when a new event comes in, you have to be able to process it on the spot
# 2) You don't have any historical data, so your predictions can only rely on the things you have seen before

# This means that the data needs to be sorted on timestamp_column
# you need to first convert to a datetime, because otherwise you'll sort the strings
# and since the day of the month is the first part of the string, you will first get all firsts of all months, etc.
dataset.loc[:,timestamp_column] = pd.to_datetime(dataset.loc[:,timestamp_column], format=timeformat_timestamp)
dataset = dataset.sort_values(by=timestamp_column)

dataset_test.loc[:,timestamp_column] = pd.to_datetime(dataset_test.loc[:,timestamp_column], format=timeformat_timestamp)
dataset_test = dataset_test.sort_values(by=timestamp_column)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [304]:
### STORAGE ###
# we want to keep track of things that happen within one instance (also called trace or case)

# instanceStorage stores: key=instance, values= [event_column, timestamp_column, predictedDuration],
# to look up the previous event in this instance, and it's predicted TimePassed
instanceStorage = {}

# we also want to keep track of the predictionError
# errorStorage stores: key=unique integer, values= [previousEvent, previousTimestamp, predictionError, predictionErrorRatio, predictedTimePassed, timePassed]


predictionError = 0
# we also want to add a log feature => we don't have to save this seperatly, just needed to check if having it makes sense.
# how many events of the same activty are running at the same time might be interesting
# concurentEvents = {}

def getFeatures(case, event, row_nr, amount, predictionError):


    if case in instanceStorage:

        previousEvent = instanceStorage[case][0]
        previousLifecycle = instanceStorage[case][4]
        predictedEvent = instanceStorage[case][7]
        previousEvent_id = instanceStorage[case][2]



        if predictedEvent == event:
            predictionError +=1

        errorStorage[row_nr] = [previousEvent, previousLifecycle, predictionError,predictedEvent, event]

    else:
        previousEvent = None
        previousEvent_id = None
        previousLifecycle = None

    if event in eventList:
        # you want to find how many times this activity occurs without any additional information
        howManyConcurentEvents = [x[0] for x in instanceStorage.values()].count(event)
    else:
        howManyConcurentEvents = ''

        # the features that are in the event_id are used to make the prediction
    event_id = str(event)+'_XX_'+str(amount)+'_XX_'+str(previousEvent)+'_XX_'+str(previousLifecycle)+'_XX_'+str(howManyConcurentEvents)

    return previousEvent, previousEvent_id, previousLifecycle, event_id



In [305]:
### THE MODEL ###
# We start with only one feature: the historical average time between two events based on the previous event
# You don't need a second model for the test set, since you train your model on the training-set and then use it on the test-set

predictionModel = {}

def getPrediction(event_id):

    try:
        return max(predictionModel[event_id],key=predictionModel[event_id].get)

    except:
        return 'W_Completeren aanvraag'



def updatePredictionModel(this_event_id, event):

    all_events_dict=dict([(event,0) for event in all_events])

    if this_event_id != None:

        # if this activity is already in the predictionModel, update the model
        # try & except works slightly faster then if key in dict
        # but only for valid keys, since you skip the if statement
        try:
            predictionModel[this_event_id][event] +=1

        except:
            predictionModel[this_event_id] = all_events_dict
            predictionModel[this_event_id][event] +=1



In [306]:
### RUNNING THE MODEL ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly

def processEvent(case, event, amount, lifecycle, row_nr):

    previousEvent, previousEvent_id, previousLifecycle, event_id = getFeatures(case, event, row_nr, amount, predictionError)
    predictedEvent = getPrediction(event_id)

    # update storage
    instanceStorage[case] = [event, previousEvent, event_id, previousEvent_id, lifecycle, previousLifecycle, amount, predictedEvent]

    updatePredictionModel(previousEvent_id, event)



### Run over all lines =>
# let's first do a simple for-loop, maybe I can improve the running time by using .apply later on
#mini_dataset = dataset.loc[0:1000, :]
#print(mini_dataset)
# Start the stopwatch / counter

# Start the stopwatch / counter
start_time = time.process_time()

# running the on-the-fly model for each row
# I have tested it, ... for ... in ... does iterate over all events in order
# so we're not using information from the future.
errorStorage = {}
[processEvent(case, event, amount, lifecycle, row_nr) for case, event, amount, lifecycle, row_nr in zip(dataset[case_column],dataset[event_column], dataset[amount_column], dataset[lifecycle_column], dataset[row_nr_column])]

# Stop the stopwatch / counter
end_time = time.process_time()

print("dict: Elapsed time during the whole program in seconds:", end_time-start_time)


dict: Elapsed time during the whole program in seconds: 13.40625


In [307]:
count=0
for i in errorStorage.values():
    count+=i[2]
len_set = max(errorStorage.keys())+1
print(f'train accuracy:{count/len_set}')


train accuracy:0.7769536843971135


In [308]:
predictionModel

{'A_SUBMITTED_XX_20000_XX_None_XX_None_XX_0': {'A_SUBMITTED': 0,
  'A_PARTLYSUBMITTED': 2354,
  'A_PREACCEPTED': 0,
  'W_Completeren aanvraag': 0,
  'A_DECLINED': 0,
  'W_Afhandelen leads': 0,
  'A_ACCEPTED': 0,
  'O_SELECTED': 0,
  'A_FINALIZED': 0,
  'O_CREATED': 0,
  'O_SENT': 0,
  'W_Nabellen offertes': 0,
  'O_CANCELLED': 0,
  'A_CANCELLED': 0,
  'W_Beoordelen fraude': 0,
  'O_SENT_BACK': 0,
  'W_Valideren aanvraag': 0,
  'W_Nabellen incomplete dossiers': 0,
  'O_ACCEPTED': 0,
  'A_APPROVED': 0,
  'A_ACTIVATED': 0,
  'A_REGISTERED': 0,
  'O_DECLINED': 0,
  'W_Wijzigen contractgegevens': 0},
 'A_PARTLYSUBMITTED_XX_20000_XX_A_SUBMITTED_XX_COMPLETE_XX_0': {'A_SUBMITTED': 0,
  'A_PARTLYSUBMITTED': 0,
  'A_PREACCEPTED': 992,
  'W_Completeren aanvraag': 0,
  'A_DECLINED': 392,
  'W_Afhandelen leads': 816,
  'A_ACCEPTED': 0,
  'O_SELECTED': 0,
  'A_FINALIZED': 0,
  'O_CREATED': 0,
  'O_SENT': 0,
  'W_Nabellen offertes': 0,
  'O_CANCELLED': 0,
  'A_CANCELLED': 0,
  'W_Beoordelen fraude': 

In [309]:
errorStorage = {}
[processEvent(case, event, amount, lifecycle, row_nr) for case, event, amount, lifecycle, row_nr in zip(dataset_test[case_column],dataset_test[event_column], dataset_test[amount_column], dataset_test[lifecycle_column], dataset_test[row_nr_column])]

count=0
for i in errorStorage.values():
    count+=i[2]
len_set = max(errorStorage.keys())+1
print(f'test accuracy:{count/len_set}')

test accuracy:0.7755682412228425


In [272]:
### THIS CELL DOES NEEDS UPDATING BEFORE BEING USED, PRINTED ACCURACY IS NOT CORRECT  ###


dataset_test = dataset_test.sort_values([case_column,timestamp_column])
dataset_test['y'] = dataset_test[event_column].shift(periods= -1)
dataset_test['previous_event']=0
dataset_test['previous_lifecycle']=0
all_ids = list(dataset_test[case_column].unique())

for ids in all_ids:
    df2= dataset_test[dataset_test[case_column]==ids].copy()
    dataset_test.loc[dataset_test[case_column]==ids, 'previous_event']= df2[event_column].shift(periods=1)
    dataset_test.loc[dataset_test[case_column]==ids, 'previous_lifecycle']= df2['event lifecycle:transition'].shift(periods=1)
dataset_test.loc[dataset_test['previous_event'].isnull(), 'previous_event']=None
dataset_test.loc[dataset_test['previous_lifecycle'].isnull(), 'previous_lifecycle']=None
dataset_test.reset_index(drop=True, inplace=True)

def getPrediction_test(event, amount, previousEvent, previousLifecycle):

    event_id = str(event)+'_XX_'+str(amount)+'_XX_'+str(previousEvent)+'_XX_'+str(previousLifecycle)


    # if this activity is already in the predictionModel, then return the predicted duration
    # try & except works slightly faster then if key in dict
    # but only for valid keys, since you skip the if statement
    try:
        return max(predictionModel[event_id],key=predictionModel[event_id].get)

    except:
        return 'W_Completeren aanvraag'

predictions = [getPrediction_test(event, amount, previousEvent, previousLifecycle) for event, amount, previousEvent, previousLifecycle in zip(dataset_test[event_column], dataset_test[amount_column], dataset_test['previous_event'], dataset_test['previous_lifecycle'])]
count=0

for i in range(len(predictions)):
    if predictions[i]==dataset_test['y'][i]:
        count+=1
print(f'accuracy: {count/len(predictions)}')

accuracy: 0.6484745833594714
