In [1]:
#pip install pm4py

In [2]:
#import and preprocess data
import numpy as np
import pandas as pd
import pm4py

#Encode Prefix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

HelpDesk Log

1. Load data and keep necessary columns

In [3]:
#Parse the helpdesk XES event log (file is expected in the working directory)
log = pm4py.read_xes("helpdesk.xes") #read in log data
#NOTE(review): newer pm4py releases may already return a DataFrame from read_xes, making this conversion a no-op - confirm for the installed version
logdata = pm4py.convert_to_dataframe(log) #convert your log to dataframe
#Preview the first events
logdata.head()

parsing log, completed traces ::   0%|          | 0/4580 [00:00<?, ?it/s]

Unnamed: 0,concept:name,lifecycle:transition,org:resource,time:timestamp,Activity,Resource,case:concept:name,case:variant,case:variant-index,case:creator
0,Assign seriousness,complete,Value 1,2012-10-09 14:50:17+00:00,Assign seriousness,Value 1,Case1,Variant 12,12,Fluxicon Disco
1,Take in charge ticket,complete,Value 1,2012-10-09 14:51:01+00:00,Take in charge ticket,Value 1,Case1,Variant 12,12,Fluxicon Disco
2,Take in charge ticket,complete,Value 2,2012-10-12 15:02:56+00:00,Take in charge ticket,Value 2,Case1,Variant 12,12,Fluxicon Disco
3,Resolve ticket,complete,Value 1,2012-10-25 11:54:26+00:00,Resolve ticket,Value 1,Case1,Variant 12,12,Fluxicon Disco
4,Closed,complete,Value 3,2012-11-09 12:54:39+00:00,Closed,Value 3,Case1,Variant 12,12,Fluxicon Disco


In [4]:
#Summarise column dtypes and non-null counts of the raw helpdesk log
logdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21348 entries, 0 to 21347
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   concept:name          21348 non-null  object             
 1   lifecycle:transition  21348 non-null  object             
 2   org:resource          21348 non-null  object             
 3   time:timestamp        21348 non-null  datetime64[ns, UTC]
 4   Activity              21348 non-null  object             
 5   Resource              21348 non-null  object             
 6   case:concept:name     21348 non-null  object             
 7   case:variant          21348 non-null  object             
 8   case:variant-index    21348 non-null  int64              
 9   case:creator          21348 non-null  object             
dtypes: datetime64[ns, UTC](1), int64(1), object(8)
memory usage: 1.6+ MB


In [5]:
#Give the XES columns short names, order events by timestamp (ties broken by case_id),
#then keep only timestamp/activity/case_id with lower-cased activity labels
logdata.rename(
    columns={
        "concept:name": "activity",
        "time:timestamp": "timestamp",
        "case:concept:name": "case_id",
        "case:variant": "variant",
        "org:resource": "resource",
    },
    inplace=True,
)
logdata = logdata.sort_values(by=["timestamp", "case_id"])
df = logdata[["timestamp", "activity", "case_id"]].assign(
    activity=lambda events: events["activity"].str.lower()
)
df.head()

Unnamed: 0,timestamp,activity,case_id
16857,2010-01-13 08:40:25+00:00,assign seriousness,Case3608
12863,2010-01-13 12:26:04+00:00,assign seriousness,Case2748
19959,2010-01-13 12:30:37+00:00,assign seriousness,Case4284
7168,2010-01-13 13:09:31+00:00,assign seriousness,Case1534
1864,2010-01-13 17:25:25+00:00,assign seriousness,Case406


In [6]:
#Report whether any row is an exact repeat of an earlier one, then how many such rows exist
print(df.duplicated().any(), df.duplicated().sum(), sep="\n")

True
127


In [7]:
#Remove exact duplicate events (same timestamp, activity and case_id), keeping the first occurrence,
#then confirm that no duplicates remain
df = df.drop_duplicates()
df.duplicated().sum()

0

In [8]:
#Renumber the rows 0..n-1 and discard the old index left over from the raw log
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,timestamp,activity,case_id
0,2010-01-13 08:40:25+00:00,assign seriousness,Case3608
1,2010-01-13 12:26:04+00:00,assign seriousness,Case2748
2,2010-01-13 12:30:37+00:00,assign seriousness,Case4284
3,2010-01-13 13:09:31+00:00,assign seriousness,Case1534
4,2010-01-13 17:25:25+00:00,assign seriousness,Case406


In [9]:
#timestamps are not unique: True means at least one timestamp value occurs more than once,
#so a timestamp alone does not identify an event
df["timestamp"].duplicated().any()

True

In [10]:
#Save the cleaned helpdesk events to CSV without writing the row index
df.to_csv("helpdesk_df.csv", index = False)

BPI_Challenge 2012

1. Load data and keep necessary columns

In [12]:
#Parse the BPI Challenge 2012 XES event log (file is expected in the working directory)
log = pm4py.read_xes("BPI_Challenge_2012.xes") #read in log data
logdata = pm4py.convert_to_dataframe(log) #convert your log to dataframe
#Preview the first events
logdata.head()

parsing log, completed traces ::   0%|          | 0/13087 [00:00<?, ?it/s]

Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,case:REG_DATE,case:concept:name,case:AMOUNT_REQ
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00,2011-10-01 00:38:44.546000+00:00,173688,20000


In [13]:
#Summarise column dtypes and non-null counts of the raw BPI log
logdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 262200 entries, 0 to 262199
Data columns (total 7 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   org:resource          244190 non-null  object             
 1   lifecycle:transition  262200 non-null  object             
 2   concept:name          262200 non-null  object             
 3   time:timestamp        262200 non-null  datetime64[ns, UTC]
 4   case:REG_DATE         262200 non-null  datetime64[ns, UTC]
 5   case:concept:name     262200 non-null  object             
 6   case:AMOUNT_REQ       262200 non-null  object             
dtypes: datetime64[ns, UTC](2), object(5)
memory usage: 14.0+ MB


In [14]:
#List the distinct values of org:resource (missing entries show up as nan)
logdata["org:resource"].unique()

array(['112', nan, '10862', '10913', '11049', '10629', '11120', '10809',
       '10912', '11201', '11119', '10861', '11203', '11181', '11189',
       '10609', '11111', '10982', '11019', '11180', '10899', '10138',
       '11002', '11122', '10889', '10972', '11121', '10939', '11029',
       '11009', '11000', '10863', '11169', '11179', '11001', '10971',
       '10228', '11202', '10789', '10881', '10909', '10188', '10910',
       '10929', '10931', '11259', '11200', '10779', '10880', '10914',
       '10859', '11339', '10933', '11079', '10932', '10935', '11254',
       '11003', '10125', '11269', '10821', '11289', '10124', '11299',
       '11309', '11300', '11302', '11319', '11304'], dtype=object)

In [15]:
#List the distinct activity names (concept:name) in the log
logdata["concept:name"].unique()

array(['A_SUBMITTED', 'A_PARTLYSUBMITTED', 'A_PREACCEPTED',
       'W_Completeren aanvraag', 'A_ACCEPTED', 'O_SELECTED',
       'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes',
       'O_SENT_BACK', 'W_Valideren aanvraag', 'A_REGISTERED',
       'A_APPROVED', 'O_ACCEPTED', 'A_ACTIVATED', 'O_CANCELLED',
       'W_Wijzigen contractgegevens', 'A_DECLINED', 'A_CANCELLED',
       'W_Afhandelen leads', 'O_DECLINED',
       'W_Nabellen incomplete dossiers', 'W_Beoordelen fraude'],
      dtype=object)

In [16]:
#List the distinct lifecycle transitions recorded for the events
logdata["lifecycle:transition"].unique()

array(['COMPLETE', 'SCHEDULE', 'START'], dtype=object)

In [17]:
#case:concept:name is the case id: select the rows of case "173688" and show (rows, columns)
logdata[logdata["case:concept:name"] == "173688"].shape

(26, 7)

In [18]:
#Give the XES columns short names, order events by timestamp (ties broken by case_id),
#then keep only timestamp/activity/case_id with lower-cased activity labels
logdata.rename(
    columns={
        "concept:name": "activity",
        "time:timestamp": "timestamp",
        "case:concept:name": "case_id",
    },
    inplace=True,
)
logdata = logdata.sort_values(by=["timestamp", "case_id"])
df = logdata[["timestamp", "activity", "case_id"]].assign(
    activity=lambda events: events["activity"].str.lower()
)
df.head()

Unnamed: 0,timestamp,activity,case_id
0,2011-10-01 00:38:44.546000+00:00,a_submitted,173688
1,2011-10-01 00:38:44.880000+00:00,a_partlysubmitted,173688
2,2011-10-01 00:39:37.906000+00:00,a_preaccepted,173688
3,2011-10-01 00:39:38.875000+00:00,w_completeren aanvraag,173688
26,2011-10-01 08:08:58.256000+00:00,a_submitted,173691


In [19]:
#Drop the timezone information so the timestamps become timezone-naive datetimes
#NOTE(review): only this dataset is made timezone-naive; the helpdesk and road-traffic frames are saved with
#tz-aware (+00:00) timestamps - confirm that downstream code handles both formats
df["timestamp"] = df["timestamp"].dt.tz_localize(None)

In [20]:
#Summarise dtypes and non-null counts of the reduced frame (the timestamp dtype shows whether the timezone was removed)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 262200 entries, 0 to 246923
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   timestamp  262200 non-null  datetime64[ns]
 1   activity   262200 non-null  object        
 2   case_id    262200 non-null  object        
dtypes: datetime64[ns](1), object(2)
memory usage: 8.0+ MB


In [21]:
#Report whether any row is an exact repeat of an earlier one, then how many such rows exist
print(df.duplicated().any(), df.duplicated().sum(), sep="\n")

False
0


In [22]:
#Remove exact duplicate events (same timestamp, activity and case_id), keeping the first occurrence,
#then confirm that no duplicates remain
df = df.drop_duplicates()
df.duplicated().sum()

0

In [23]:
#Renumber the rows 0..n-1 and discard the old index left over from the raw log
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,timestamp,activity,case_id
0,2011-10-01 00:38:44.546,a_submitted,173688
1,2011-10-01 00:38:44.880,a_partlysubmitted,173688
2,2011-10-01 00:39:37.906,a_preaccepted,173688
3,2011-10-01 00:39:38.875,w_completeren aanvraag,173688
4,2011-10-01 08:08:58.256,a_submitted,173691


In [24]:
#timestamps are not unique: True means at least one timestamp value occurs more than once,
#so a timestamp alone does not identify an event
df["timestamp"].duplicated().any()

True

In [25]:
#Save the cleaned BPI events to CSV without writing the row index
df.to_csv("bpi_df.csv",index=False)

Road Traffic Fine Management Process

1. Load data and keep necessary columns

In [27]:
#Parse the Road Traffic Fine Management Process XES event log (file is expected in the working directory)
log = pm4py.read_xes("Road_Traffic_Fine_Management_Process.xes") #read in log data
logdata = pm4py.convert_to_dataframe(log) #convert your log to dataframe
#Preview the first events
logdata.head()

parsing log, completed traces ::   0%|          | 0/150370 [00:00<?, ?it/s]

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
0,35.0,561.0,NIL,Create Fine,A,0.0,complete,2006-07-24 00:00:00+00:00,157.0,0.0,A1,,,,,
1,,,,Send Fine,,,complete,2006-12-05 00:00:00+00:00,,,A1,11.0,,,,
2,35.0,561.0,NIL,Create Fine,A,0.0,complete,2006-08-02 00:00:00+00:00,157.0,0.0,A100,,,,,
3,,,,Send Fine,,,complete,2006-12-12 00:00:00+00:00,,,A100,11.0,,,,
4,,,,Insert Fine Notification,,,complete,2007-01-15 00:00:00+00:00,,,A100,,P,P,,


In [28]:
#Summarise column dtypes and non-null counts of the raw road-traffic log
logdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 561470 entries, 0 to 561469
Data columns (total 16 columns):
 #   Column                Non-Null Count   Dtype              
---  ------                --------------   -----              
 0   amount                230230 non-null  float64            
 1   org:resource          150925 non-null  object             
 2   dismissal             155066 non-null  object             
 3   concept:name          561470 non-null  object             
 4   vehicleClass          150370 non-null  object             
 5   totalPaymentAmount    227971 non-null  float64            
 6   lifecycle:transition  561470 non-null  object             
 7   time:timestamp        561470 non-null  datetime64[ns, UTC]
 8   article               150370 non-null  float64            
 9   points                150370 non-null  float64            
 10  case:concept:name     561470 non-null  object             
 11  expense               103987 non-null  float64      

In [29]:
#Preview the first rows again (same view as directly after loading)
logdata.head()

Unnamed: 0,amount,org:resource,dismissal,concept:name,vehicleClass,totalPaymentAmount,lifecycle:transition,time:timestamp,article,points,case:concept:name,expense,notificationType,lastSent,paymentAmount,matricola
0,35.0,561.0,NIL,Create Fine,A,0.0,complete,2006-07-24 00:00:00+00:00,157.0,0.0,A1,,,,,
1,,,,Send Fine,,,complete,2006-12-05 00:00:00+00:00,,,A1,11.0,,,,
2,35.0,561.0,NIL,Create Fine,A,0.0,complete,2006-08-02 00:00:00+00:00,157.0,0.0,A100,,,,,
3,,,,Send Fine,,,complete,2006-12-12 00:00:00+00:00,,,A100,11.0,,,,
4,,,,Insert Fine Notification,,,complete,2007-01-15 00:00:00+00:00,,,A100,,P,P,,


In [30]:
#List the distinct activity names (concept:name) in the log
logdata["concept:name"].unique()

array(['Create Fine', 'Send Fine', 'Insert Fine Notification',
       'Add penalty', 'Send for Credit Collection', 'Payment',
       'Insert Date Appeal to Prefecture', 'Send Appeal to Prefecture',
       'Receive Result Appeal from Prefecture',
       'Notify Result Appeal to Offender', 'Appeal to Judge'],
      dtype=object)

In [31]:
#List the distinct lifecycle transitions recorded for the events
logdata["lifecycle:transition"].unique()

array(['complete'], dtype=object)

In [32]:
#case:concept:name is the case id: list the distinct case ids
logdata["case:concept:name"].unique()

array(['A1', 'A100', 'A10000', ..., 'V9997', 'V9998', 'V9999'],
      dtype=object)

In [33]:
#Give the XES columns short names, order events by timestamp (ties broken by case_id),
#then keep only timestamp/activity/case_id with lower-cased activity labels
logdata.rename(
    columns={
        "concept:name": "activity",
        "time:timestamp": "timestamp",
        "case:concept:name": "case_id",
    },
    inplace=True,
)
logdata = logdata.sort_values(by=["timestamp", "case_id"])
df = logdata[["timestamp", "activity", "case_id"]].assign(
    activity=lambda events: events["activity"].str.lower()
)
df.head()

Unnamed: 0,timestamp,activity,case_id
429367,2000-01-01 00:00:00+00:00,create fine,S38735
431898,2000-01-02 00:00:00+00:00,create fine,S44306
436195,2000-01-02 00:00:00+00:00,create fine,S49055
436201,2000-01-02 00:00:00+00:00,create fine,S49056
436206,2000-01-02 00:00:00+00:00,create fine,S49057


In [34]:
#Summarise dtypes and non-null counts of the reduced frame
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 561470 entries, 429367 to 428806
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype              
---  ------     --------------   -----              
 0   timestamp  561470 non-null  datetime64[ns, UTC]
 1   activity   561470 non-null  object             
 2   case_id    561470 non-null  object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 17.1+ MB


In [35]:
#Report whether any row is an exact repeat of an earlier one, then how many such rows exist
print(df.duplicated().any(), df.duplicated().sum(), sep="\n")

True
30


In [36]:
#Remove exact duplicate events (same timestamp, activity and case_id), keeping the first occurrence,
#then confirm that no duplicates remain
df = df.drop_duplicates()
df.duplicated().sum()

0

In [37]:
#Renumber the rows 0..n-1 and discard the old index left over from the raw log
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,timestamp,activity,case_id
0,2000-01-01 00:00:00+00:00,create fine,S38735
1,2000-01-02 00:00:00+00:00,create fine,S44306
2,2000-01-02 00:00:00+00:00,create fine,S49055
3,2000-01-02 00:00:00+00:00,create fine,S49056
4,2000-01-02 00:00:00+00:00,create fine,S49057


In [38]:
#timestamps are not unique: True means at least one timestamp value occurs more than once,
#so a timestamp alone does not identify an event
df["timestamp"].duplicated().any()

True

In [39]:
#Save the cleaned road-traffic events to CSV without writing the row index
df.to_csv("rmp_df.csv",index=False)