In [27]:
import pandas as pd
import numpy as np

import plotly
import plotly.express as px

from collections import Counter

In [28]:
train_data = pd.read_csv('data/BPI_Challenge_2012-training.csv')
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214377 entries, 0 to 214376
Data columns (total 7 columns):
 #   Column                      Non-Null Count   Dtype 
---  ------                      --------------   ----- 
 0   eventID                     214377 non-null  int64 
 1   case concept:name           214377 non-null  int64 
 2   case REG_DATE               214377 non-null  object
 3   case AMOUNT_REQ             214377 non-null  int64 
 4   event concept:name          214377 non-null  object
 5   event lifecycle:transition  214377 non-null  object
 6   event time:timestamp        214377 non-null  object
dtypes: int64(3), object(4)
memory usage: 11.4+ MB


In [29]:
case_column = "case concept:name"
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
position_column = "eventID"

In [30]:
#train_data.head(10)

# for eventpad I need a dataset without spaces

new_df = train_data.rename(columns={'eventID ': 'eventID', 'case concept:name': 'caseName','event concept:name': 'eventName', 'event time:timestamp': 'timestamp'})
new_df = new_df.drop(columns=['case REG_DATE', 'case AMOUNT_REQ','event lifecycle:transition'])
new_df['eventName'] = new_df['eventName'].str.replace(' ', '_')

new_df.to_csv('train_data.csv')

In [31]:
pd.unique(train_data[case_column])

array([173688, 173691, 173694, ..., 206315, 206318, 206321], dtype=int64)

In [32]:
first_100 = pd.unique(train_data[case_column]).tolist()[0:100]

In [33]:
train_data[train_data[case_column].isin(first_100)]

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,01-10-2011 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,01-10-2011 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,01-10-2011 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,01-10-2011 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,01-10-2011 08:08:58.256
...,...,...,...,...,...,...,...
89575,382252089429,173955,2011-10-02T23:15:19.164+02:00,10000,A_APPROVED,COMPLETE,29-11-2011 15:05:21.679
89576,382252089430,173955,2011-10-02T23:15:19.164+02:00,10000,A_REGISTERED,COMPLETE,29-11-2011 15:05:21.679
89577,382252089431,173955,2011-10-02T23:15:19.164+02:00,10000,A_ACTIVATED,COMPLETE,29-11-2011 15:05:21.680
89578,382252089432,173955,2011-10-02T23:15:19.164+02:00,10000,W_Nabellen incomplete dossiers,COMPLETE,29-11-2011 15:05:31.239


In [34]:
train_data[timestamp_column] = pd.to_datetime(train_data[timestamp_column])
train_data

Unnamed: 0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp
0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,2011-01-10 00:38:44.546
1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,2011-01-10 00:38:44.880
2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,2011-01-10 00:39:37.906
3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,2011-01-10 00:39:38.875
4,4294967296,173691,2011-10-01T08:08:58.256+02:00,5000,A_SUBMITTED,COMPLETE,2011-01-10 08:08:58.256
...,...,...,...,...,...,...,...
214372,38835094290529,201854,2012-01-18T02:09:07.029+01:00,50000,O_CANCELLED,COMPLETE,2012-03-14 15:30:19.361
214373,38835094290528,201854,2012-01-18T02:09:07.029+01:00,50000,A_CANCELLED,COMPLETE,2012-03-14 15:30:19.361
214374,38835094290530,201854,2012-01-18T02:09:07.029+01:00,50000,W_Nabellen incomplete dossiers,COMPLETE,2012-03-14 15:30:23.187
214375,35858681954366,199678,2012-01-10T19:16:52.800+01:00,30000,W_Nabellen offertes,START,2012-03-14 15:36:15.299


In [35]:
px.scatter(train_data[train_data[case_column].isin(first_100)], x=timestamp_column, y=case_column, color=event_column)

In [36]:
start_event_list = []

for name in pd.unique(train_data[case_column]):
    
    this_timestamp = np.min(train_data[train_data[case_column] == name][timestamp_column])
    start_event_list.append(train_data[train_data[timestamp_column] == this_timestamp][event_column].values[0])

In [37]:
Counter(start_event_list)

Counter({'A_SUBMITTED': 8482,
         'W_Nabellen incomplete dossiers': 106,
         'W_Completeren aanvraag': 309,
         'W_Nabellen offertes': 1272,
         'W_Valideren aanvraag': 248,
         'W_Afhandelen leads': 36,
         'W_Beoordelen fraude': 4,
         'O_CANCELLED': 9,
         'A_CANCELLED': 3})

In [38]:
train_data[event_column].value_counts()

W_Nabellen offertes               43880
W_Completeren aanvraag            43480
W_Nabellen incomplete dossiers    21075
W_Valideren aanvraag              17089
W_Afhandelen leads                13662
A_SUBMITTED                       10469
A_PARTLYSUBMITTED                 10469
A_DECLINED                         6152
A_PREACCEPTED                      5884
O_SELECTED                         5686
O_CREATED                          5686
O_SENT                             5686
A_ACCEPTED                         4099
A_FINALIZED                        4024
O_CANCELLED                        3120
O_SENT_BACK                        2812
A_CANCELLED                        2419
A_APPROVED                         1871
A_ACTIVATED                        1871
A_REGISTERED                       1871
O_ACCEPTED                         1868
O_DECLINED                          668
W_Beoordelen fraude                 524
W_Wijzigen contractgegevens          12
Name: event concept:name, dtype: int64