In [1]:
import numpy as np
import pandas as pd
import os

# Seed NumPy's global random generator so the np.random draws made later in
# this notebook are repeatable from a fresh kernel.
np.random.seed(42)

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Global matplotlib defaults: font size for axis labels and for the x/y tick
# labels of every figure created after this point.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

## Load data
Here we load the data and make sure that the entire data set is contained in one dataframe, so that it can be split manually. The manual split is done later so that we can guarantee that all events used for training take place before the events in the test set, which avoids leaking future data into training. The data is sorted by its timestamp values.

In [2]:
# Directory that holds the BPI Challenge 2012 CSV files. Set the BPI_DATA_DIR
# environment variable to point at a local copy of the data; the original
# location is kept as the fallback so existing setups keep working unchanged.
path = os.environ.get(
    "BPI_DATA_DIR",
    r"C:/Users/20193555/DBL Process Mining/Data/BPI Challenge 2012/",
)
df_train = pd.read_csv(os.path.join(path, 'BPI_Challenge_2012-training.csv'))
df_test = pd.read_csv(os.path.join(path, 'BPI_Challenge_2012-test.csv'))

# Stack both files into a single frame so the train/test split can be redone
# manually later on.
df_data = pd.concat([df_train, df_test])

In [3]:
# remove whitespace at beginning and end of column name (done first so the
# timestamp column can be addressed by its clean name below)
df_data.columns = df_data.columns.str.strip()

# Sort the events chronologically and renumber the rows.
# - sort_values returns a new frame, so the result must be assigned back;
#   calling it without assignment leaves df_data unsorted.
# - The timestamps are day-first strings ("01-10-2011 00:38:44.546"), so they
#   are parsed to datetimes for the sort key only; a plain string sort would
#   order by day-of-month first. The column itself keeps its original values.
# - mergesort is stable, so events with identical timestamps keep their
#   original relative order.
df_data = df_data.sort_values(
    by='event time:timestamp',
    key=lambda col: pd.to_datetime(col, dayfirst=True),
    kind='mergesort',
).reset_index(drop=True)

In [4]:
df_data

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
...,...,...,...,...,...,...,...
262195,54666343743523,213276,2012-02-27T14:12:41.868+01:00,15000,W_Nabellen incomplete dossiers,START,14-03-2012 15:59:28.309
262196,54666343743524,213276,2012-02-27T14:12:41.868+01:00,15000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 16:00:09.680
262197,49495203119136,209595,2012-02-15T10:10:36.503+01:00,13000,W_Nabellen offertes,START,14-03-2012 16:02:03.883
262198,52342766436386,211624,2012-02-21T23:38:40.044+01:00,35000,W_Nabellen incomplete dossiers,START,14-03-2012 16:04:46.192


## Preprocessing and feature engineering

In [5]:
# Short aliases for the verbose column labels used throughout the notebook.
# Columns that already exist in the loaded data:
cases, reg_date, amount_req = "case concept:name", "case REG_DATE", "case AMOUNT_REQ"
event_name, lifecycle, tmstmp = (
    "event concept:name",
    "event lifecycle:transition",
    "event time:timestamp",
)
# Labels for columns that are derived during feature engineering:
nxt_event, dtime = "next event", "delta time"

In [6]:
# Report how often each lifecycle transition occurs.
# The counts are computed once and printed for every distinct value (in order
# of first appearance), instead of assuming exactly three values and
# re-filtering the whole frame for each of them.
lifecycle_counts = df_data[lifecycle].value_counts()
for transition in df_data[lifecycle].unique():
    # value_counts skips missing values, so fall back to 0 for those
    print(f'{transition} happens {lifecycle_counts.get(transition, 0)}')

COMPLETE happens 164506
SCHEDULE happens 26318
START happens 71376


In [7]:
# Keep only the COMPLETE lifecycle events. The value is named explicitly
# rather than taken from unique()[0], which only equals 'COMPLETE' as long as
# the first row of df_data happens to be a COMPLETE event.
# .copy() gives an independent frame that can safely get new columns later.
first_clean = df_data[df_data[lifecycle] == "COMPLETE"].copy()

In [8]:
first_clean[lifecycle].unique()

array(['COMPLETE'], dtype=object)

In [9]:
# Number of distinct case identifiers, i.e. traces, left after filtering.
# dropna=False counts a missing identifier as its own value, like unique().
print(
    'The number of unique cases/traces in the dataset is '
    f'{first_clean[cases].nunique(dropna=False)}'
)

The number of unique cases/traces in the dataset is 13087


In [10]:
# NOTE: despite its name, this list holds the unique *case identifiers*
# (one entry per trace), not event names. It is built from the `cases` column.
lst_events = first_clean[cases].unique().tolist()

In [11]:
# Create the "next event" column, empty for now. It is initialised with
# object dtype because it will later hold event-name strings; a float64 NaN
# column would have to be upcast on assignment, which recent pandas versions
# flag as deprecated.
first_clean[nxt_event] = pd.Series(np.nan, index=first_clean.index, dtype=object)

In [12]:
first_clean[first_clean[cases] == 173688]

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,next event
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546,
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880,
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906,
94,5,173688,2011-10-01T00:38:44.546+02:00,20000,A_ACCEPTED,COMPLETE,01-10-2011 11:42:43.308,
98,7,173688,2011-10-01T00:38:44.546+02:00,20000,A_FINALIZED,COMPLETE,01-10-2011 11:45:09.243,
99,6,173688,2011-10-01T00:38:44.546+02:00,20000,O_SELECTED,COMPLETE,01-10-2011 11:45:09.243,
100,8,173688,2011-10-01T00:38:44.546+02:00,20000,O_CREATED,COMPLETE,01-10-2011 11:45:11.197,
101,9,173688,2011-10-01T00:38:44.546+02:00,20000,O_SENT,COMPLETE,01-10-2011 11:45:11.380,
103,11,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,COMPLETE,01-10-2011 11:45:13.917,
118,13,173688,2011-10-01T00:38:44.546+02:00,20000,W_Nabellen offertes,COMPLETE,01-10-2011 12:17:08.924,


In [13]:
# test environment: the first 1000 rows as a small sandbox.
# An explicit copy is taken so that experimenting on temp_data can neither
# modify first_clean nor trigger pandas' SettingWithCopyWarning.
temp_data = first_clean.iloc[:1000].copy()
test_lst = temp_data[cases].unique().tolist()

In [14]:
def next_event(data, lst, case, nxt, name):
    """Add, for every event, the name of the event that follows it in its trace.

    Rows are grouped by the ``case`` column; within each group the value of
    the ``name`` column of the following row (in the frame's current row
    order) is written to the ``nxt`` column. The last event of a trace gets
    NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Event log; it is modified in place.
    lst : iterable
        Case identifiers to process. Rows of other cases keep whatever value
        they already have in ``nxt`` (NaN if the column did not exist).
    case : str
        Name of the column holding the case identifier.
    nxt : str
        Name of the column that receives the next event.
    name : str
        Name of the column holding the event name.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``nxt`` filled in.
    """
    # Rows to update. A missing case id never matched `data[case] == i` in a
    # per-case loop, so such rows are excluded explicitly.
    selected = data[case].isin(list(lst)) & data[case].notna()

    # One grouped shift replaces a Python loop that rescanned the whole frame
    # for every single case.
    following = data.groupby(case, sort=False)[name].shift(-1)

    if nxt in data.columns:
        # keep existing values for rows outside `lst`, take the shifted
        # event everywhere else
        data[nxt] = data[nxt].where(~selected, following)
    else:
        data[nxt] = following.where(selected)
    return data

In [15]:
# Fill the "next event" column for every case in lst_events. next_event
# modifies first_clean in place and returns it, so the cell output below is
# the updated first_clean frame.
next_event(first_clean, lst_events, cases, nxt_event, event_name)

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,next event
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546,A_PARTLYSUBMITTED
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880,A_PREACCEPTED
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906,A_ACCEPTED
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256,A_PARTLYSUBMITTED
5,4294967297,173691,2011-10-01T08:08:58.256+02:00,5000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 08:09:02.195,A_PREACCEPTED
...,...,...,...,...,...,...,...,...
262190,52342766436385,211624,2012-02-21T23:38:40.044+01:00,35000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 15:56:07.999,W_Nabellen incomplete dossiers
262192,55465207660584,213855,2012-02-28T20:01:24.679+01:00,37500,W_Nabellen offertes,COMPLETE,14-03-2012 15:57:49.624,
262194,53846004989989,212689,2012-02-25T11:24:12.200+01:00,2000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 15:58:57.002,
262196,54666343743524,213276,2012-02-27T14:12:41.868+01:00,15000,W_Nabellen incomplete dossiers,COMPLETE,14-03-2012 16:00:09.680,


In [16]:
# temp_data["delta"] = np.nan
# temp_data["time in seconds"] = temp_data[reg_date]

In [17]:
# tmp = temp_data[temp_data[cases] == 173688].copy()

In [18]:
# tmp

In [19]:
# temp_data.loc[temp_data[cases] == 173688, nxt_event] = temp_data.loc[temp_data[cases] == 173688, event_name].shift(-1)

In [20]:
# temp_data[temp_data[cases] == 173688]

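Here we will split the data. First, a small demonstration on two sorted toy arrays shows that `train_test_split` with `shuffle=False` keeps the original order and takes the test set from the end of the data.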

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
# Two toy arrays of 25 random integers in [1, 100], each put in ascending
# order, used to demonstrate an ordered split. x is drawn before y.
x = np.random.randint(low=1, high=101, size=25)
x.sort()
y = np.random.randint(low=1, high=101, size=25)
y.sort()


In [23]:
x, y

(array([  2,   2,   3,  15,  21,  21,  22,  24,  30,  33,  38,  52,  53,
         60,  61,  64,  72,  75,  75,  83,  87,  88,  88,  93, 100]),
 array([ 3,  7, 15, 21, 22, 39, 42, 47, 49, 51, 51, 55, 58, 59, 60, 62, 62,
        62, 64, 73, 76, 80, 89, 91, 92]))

In [24]:
# With shuffle=False the data is split in its existing order: the first 80%
# of the elements form the training set and the last 20% the test set, as the
# output below demonstrates.
# NOTE(review): random_state only controls shuffling, so it has no effect
# while shuffle=False.
X_train_t, X_test_t, y_train_t, y_test_t = train_test_split(x, y, test_size=0.2, random_state=42, shuffle=False)

In [25]:
X_train_t, X_test_t, y_train_t, y_test_t

(array([ 2,  2,  3, 15, 21, 21, 22, 24, 30, 33, 38, 52, 53, 60, 61, 64, 72,
        75, 75, 83]),
 array([ 87,  88,  88,  93, 100]),
 array([ 3,  7, 15, 21, 22, 39, 42, 47, 49, 51, 51, 55, 58, 59, 60, 62, 62,
        62, 64, 73]),
 array([76, 80, 89, 91, 92]))