In [1]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np

import plotly
import plotly.express as px
import plotly.graph_objects as go

In [2]:
### IMPORT THE DATA ###
df = pd.read_csv('data/BPI_Challenge_2012-training.csv')

# select columns of interest
instance_ID = "case concept:name"
event_ID = "event concept:name"
timestamp = "event time:timestamp"
additionalInfo = []

# variables
timestampFormat = "%d-%m-%Y %H:%M:%S.%f"
weight_newtimePassed = 0.1

# make a list of the columns you need
columns = [instance_ID, event_ID, timestamp]
columns.extend(additionalInfo)

# you can get rid of all the other columns, to make things faster
dataset = df[columns]

print(dataset.head())

   case concept:name      event concept:name     event time:timestamp
0             173688             A_SUBMITTED  01-10-2011 00:38:44.546
1             173688       A_PARTLYSUBMITTED  01-10-2011 00:38:44.880
2             173688           A_PREACCEPTED  01-10-2011 00:39:37.906
3             173688  W_Completeren aanvraag  01-10-2011 00:39:38.875
4             173691             A_SUBMITTED  01-10-2011 08:08:58.256


In [3]:
### ON THE FLY ###
# This has two practical implications that we should be aware of
# 1) You can't preprocess the data, when a new event comes in, you have to be able to process it on the spot
# 2) You don't have any historical data, so your predictions can only rely on the things you have seen before

# This means that the data needs to be sorted on timestamp
# you need to first convert to a datetime, because otherwise you'll sort the strings
# and since the day of the month is the first part of the string, you will first get all firsts of all months, etc.
dataset.loc[:,timestamp] = pd.to_datetime(dataset.loc[:,timestamp], format=timestampFormat)
dataset = dataset.sort_values(by=timestamp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [4]:
### STORAGE ###
# we want to keep track of things that happen within one instance (also called trace or case)

instanceStorage = {}

def updateInstanceStorage(event, instance_ID, event_ID, timestamp, timestampFormat, instanceStorage):

    this_instance_id = event[instance_ID]

    if this_instance_id in instanceStorage:
        previousEvent = instanceStorage[this_instance_id][0]
        this_timestamp = pd.to_datetime(event[timestamp], format=timestampFormat)
        previous_timestamp = pd.to_datetime(instanceStorage[this_instance_id][1], format=timestampFormat)
        timePassed = this_timestamp - previous_timestamp

    else:
        previousEvent = None
        timePassed = None

    instanceStorage[this_instance_id] = [event[event_ID], event[timestamp]]

    return previousEvent, timePassed, instanceStorage

In [5]:
print(dataset.loc[0,:])
previousEvent, timePassed, instanceStorage = updateInstanceStorage(dataset.loc[0,:], instance_ID, event_ID, timestamp, timestampFormat, instanceStorage)

print(previousEvent, timePassed, instanceStorage)

case concept:name                           173688
event concept:name                     A_SUBMITTED
event time:timestamp    2011-10-01 00:38:44.546000
Name: 0, dtype: object
None None {173688: ['A_SUBMITTED', Timestamp('2011-10-01 00:38:44.546000')]}


In [6]:
### THE MODEL ###
# We start with only one feature: the historical average time between two events based on the previous event

predictionModel = {}

def getPrediction(this_event_id, predictionModel):

    # if this event is already in the predictionModel, then return the predicted duration
    if this_event_id in predictionModel:
        return predictionModel[this_event_id]

    # otherwise you don't have an estimate yet, so just return None
    else:
        return None


def updatePredictionModel(this_event_id, timePassed, weight_newtimePassed):

    if timePassed != None:

        # if this event is already in the predictionModel, update the model
        if this_event_id in predictionModel:
            predictionModel[this_event_id] = timePassed*weight_newtimePassed + predictionModel[this_event_id]*(1-weight_newtimePassed)

        # otherwise, just just the new value as the best estimate for the next time you see this activty
        else:
            predictionModel[this_event_id] = timePassed

    return predictionModel

In [7]:
### RUNNING THE MODEL ###
# We have to run the model line by line, we can't do things in parallel, otherwise it's not on the fly

### Run over all lines =>
# let's first do a simple for-loop, maybe I can improve this to a .apply later on

def processEvent(event, event_ID, predictionModel, instanceStorage):

    # step 1: predict
    # look up if this event is already in predictionModel
    # if it is, use the predictedDuration as current prediction
    # otherwise predict: unknown
    predictedDuration =  getPrediction(event[event_ID], predictionModel)

    # step 2: loop up historical data
    # you only know things of the past, so you want to check if there is a previous event in the instanceStorage
    # if there is, you can calculate the duration between that event and this one
    previousEvent, timePassed, instanceStorage = updateInstanceStorage(event, instance_ID, event_ID, timestamp, timestampFormat, instanceStorage)

    # step 3: update predictionModel
    predictionModel = updatePredictionModel(previousEvent, timePassed, weight_newtimePassed)

    return  instanceStorage, predictionModel, predictedDuration

mini_dataset = dataset.loc[0:10, :]
print(mini_dataset)

for index, row in mini_dataset.iterrows():

    instanceStorage, predictionModel, mini_dataset.loc[index,'predictedDuration'] = processEvent(row, event_ID, predictionModel, instanceStorage)

print(mini_dataset)

    case concept:name      event concept:name    event time:timestamp
0              173688             A_SUBMITTED 2011-10-01 00:38:44.546
1              173688       A_PARTLYSUBMITTED 2011-10-01 00:38:44.880
2              173688           A_PREACCEPTED 2011-10-01 00:39:37.906
3              173688  W_Completeren aanvraag 2011-10-01 00:39:38.875
4              173691             A_SUBMITTED 2011-10-01 08:08:58.256
5              173691       A_PARTLYSUBMITTED 2011-10-01 08:09:02.195
6              173691           A_PREACCEPTED 2011-10-01 08:09:56.648
7              173691  W_Completeren aanvraag 2011-10-01 08:09:59.578
8              173694             A_SUBMITTED 2011-10-01 08:10:30.287
9              173694       A_PARTLYSUBMITTED 2011-10-01 08:10:30.591
10             173697             A_SUBMITTED 2011-10-01 08:11:08.866
    case concept:name      event concept:name    event time:timestamp  \
0              173688             A_SUBMITTED 2011-10-01 00:38:44.546   
1             

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
