In [1]:
#pip install pm4py

In [2]:
#import and preprocess data
import numpy as np
import pandas as pd
import pm4py
import joblib

#Encode Prefix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

1. Load data and keep necessary columns

In [3]:
# Event log processed in this run. The other logs this notebook has been used
# with are "bpi_df.csv" and "rmp_df.csv"; change the file name below to switch.
event_log_csv = 'helpdesk_df.csv'
df = pd.read_csv(event_log_csv)
df.head()

Unnamed: 0,timestamp,activity,case_id
0,2010-01-13 08:40:25+00:00,assign seriousness,Case3608
1,2010-01-13 12:26:04+00:00,assign seriousness,Case2748
2,2010-01-13 12:30:37+00:00,assign seriousness,Case4284
3,2010-01-13 13:09:31+00:00,assign seriousness,Case1534
4,2010-01-13 17:25:25+00:00,assign seriousness,Case406


In [4]:
# Print the frame summary (row count, columns, non-null counts, dtypes) to
# check what was loaded before any conversion is applied.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21221 entries, 0 to 21220
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   timestamp  21221 non-null  object
 1   activity   21221 non-null  object
 2   case_id    21221 non-null  object
dtypes: object(3)
memory usage: 497.5+ KB


In [5]:
# The timestamp column is read from the CSV as object dtype; convert it to
# pandas datetime values before it is passed to the split step.
parsed_timestamps = pd.to_datetime(df["timestamp"])
df["timestamp"] = parsed_timestamps

2. Split data into train-validation-test

In [6]:
from split_train_test import split_train_test_temporal

# Positional arguments shared by both split calls: case-id column, timestamp
# column, and the split option string.
split_columns = ("case_id", "timestamp", "preferred")

# First split: separate the test set from the full log.
# NOTE(review): 0.2 / 0.1 are presumably the held-out fractions - confirm
# against split_train_test_temporal.
train_val, test, fp_dict = split_train_test_temporal(df, 0.2, *split_columns)

# Second split: carve the validation set out of the remaining data.
# `fp_dict` from the first call is overwritten by this one.
train, val, fp_dict = split_train_test_temporal(train_val, 0.1, *split_columns)

3. Add BOS and EOS


add bos and eos at the end of every case for prefix
add eos at the end of every case for target

In [7]:
from bos_eos import add_bos_eos_target

# Every split yields two frames from the same helper: one built with the
# "prefix" option and one built with the helper's default option (used as the
# target). The calls run in the same order as before: prefix first, then target.

# train
train_prefix, train_target = add_bos_eos_target(train, "prefix"), add_bos_eos_target(train)

# validation
val_prefix, val_target = add_bos_eos_target(val, "prefix"), add_bos_eos_target(val)

# test
test_prefix, test_target = add_bos_eos_target(test, "prefix"), add_bos_eos_target(test)

4. Create Prefix Trace

In [8]:
from prefix_trace import prefix_trace

# Apply the same prefix-trace construction to the train, validation and test
# prefix frames, in that order.
train_prefix_trace, val_prefix_trace, test_prefix_trace = (
    prefix_trace(split_prefix)
    for split_prefix in (train_prefix, val_prefix, test_prefix)
)

5. Prefix Simple Index Encoding

In [9]:
from encoding import find_max_len, si_encoding

# Vocabulary of all labels to encode: every activity present in the full log
# (the assumption is that a business knows and defines all of its activities
# up front), plus the two marker tokens.
# NOTE(review): "zos" looks like it stands in for the EOS token - confirm it
# matches the token emitted by add_bos_eos_target.
cases = np.append(df["activity"].unique(), ["BOS", "zos"])

# Length of the longest prefix across all three splits, used for padding.
max_len = find_max_len(
    train_prefix_trace["prefix"],
    val_prefix_trace["prefix"],
    test_prefix_trace["prefix"],
)

############## train data transformation ########################
# The encoder returned with the train prefixes is the one reused by the later
# cells; the second return value of the target call is discarded into `a`.
train_prefix_trace_encoded, label_encoder = si_encoding(train_prefix_trace, cases, max_len)
train_target_encoded, a = si_encoding(train_target, cases, option="target")

In [10]:
############## validation data transformation ###################
# Encode the validation prefixes and targets with the same `cases` vocabulary
# and `max_len` used for the train split. The second return value of each call
# is not needed here; it is bound to the throwaway name `a` and overwritten by
# the next call.
val_prefix_trace_encoded, a = si_encoding(val_prefix_trace,cases,max_len)
val_target_encoded, a = si_encoding(val_target,cases,option="target")

In [11]:
############## test data transformation ########################
# Encode the test prefixes and targets with the same `cases` vocabulary and
# `max_len` used for the train split. The second return value of each call is
# not needed here; it is bound to the throwaway name `a` and overwritten by the
# next call.
test_prefix_trace_encoded, a = si_encoding(test_prefix_trace,cases,max_len)
test_target_encoded, a = si_encoding(test_target,cases,option="target")

6. Playout Probabilities

In [13]:
from dfg_probabilities import dfg_df

############ train #################
# Keep a copy with the original activity labels: `train_prefix["activity"]` is
# overwritten with encoded ids further down, and the labelled copy is needed
# again when the train+validation probabilities are computed.
train_prefix_copy = train_prefix.copy()

# Probability table computed from the train split only.
probability = dfg_df(train_prefix, cases)

# Relabel both axes with the encoder's ids so they line up with the encoded
# activity column used as the merge key.
probability.index = label_encoder.transform(probability.index)
probability.columns = label_encoder.transform(probability.columns)

# Move the row labels into a regular column named "activity".
probability = probability.reset_index().rename(columns={"index": "activity"})

# Encode the activity column in place (this cell is therefore not re-runnable
# without first re-running the cell that builds `train_prefix`).
train_prefix["activity"] = label_encoder.transform(train_prefix["activity"])

# Attach the probability row of each event's activity, then drop the columns
# that are not exported.
train_dfg_probability = (
    train_prefix
    .merge(probability, how="left", on="activity")
    .drop(columns=["timestamp", "case_id"])
)

In [14]:
############ validation ##################################
# Encode the activity column in place so it matches the encoded "activity"
# key of the probability table.
val_prefix["activity"] = label_encoder.transform(val_prefix["activity"])

# The probability table is the one computed from the train split; attach it to
# every validation event and drop the columns that are not exported.
val_dfg_probability = (
    val_prefix
    .merge(probability, how="left", on="activity")
    .drop(columns=["timestamp", "case_id"])
)

In [15]:
############ test ##################################
# The probabilities attached to the test split are computed from the train and
# validation splits together.
#
# `train_prefix_copy` still holds the original activity labels, but
# `val_prefix["activity"]` has already been replaced by encoded ids in the
# validation cell. Concatenating the two as-is would hand `dfg_df` a mix of
# labels and ids, so the validation activities are decoded back to labels
# first. The dtype check keeps this cell correct even if `val_prefix` has not
# been encoded yet.
val_prefix_labels = val_prefix
if pd.api.types.is_numeric_dtype(val_prefix["activity"]):
    val_prefix_labels = val_prefix.assign(
        activity=label_encoder.inverse_transform(val_prefix["activity"])
    )

train_val_prefix = pd.concat([train_prefix_copy, val_prefix_labels])
probability = dfg_df(train_val_prefix, cases)

# Encode the test activity column in place so it matches the merge key below.
test_prefix["activity"] = label_encoder.transform(test_prefix["activity"])

# Relabel both axes of the probability table with the encoder's ids.
probability.index = label_encoder.transform(probability.index)
probability.columns = label_encoder.transform(probability.columns)

# Move the row labels into a regular column named "activity".
probability = probability.reset_index().rename(columns={"index": "activity"})

# Attach the probability row of each event's activity, then drop the columns
# that are not exported.
test_dfg_probability = pd.merge(test_prefix, probability, how="left", on="activity")
test_dfg_probability = test_dfg_probability.drop(columns=["timestamp", "case_id"])

7. Save Data

In [16]:
########################################### Help Desk ###########################################################################
# Write every artifact of this run under the "helpdesk_" file prefix.
# NOTE(review): this assumes helpdesk_df.csv is the log loaded at the top of
# the notebook.

## prefix data (encoded prefix arrays)
helpdesk_prefix_files = [
    ("helpdesk_train_prefix.npy", train_prefix_trace_encoded),
    ("helpdesk_val_prefix.npy", val_prefix_trace_encoded),
    ("helpdesk_test_prefix.npy", test_prefix_trace_encoded),
]
for npy_path, encoded_array in helpdesk_prefix_files:
    np.save(npy_path, encoded_array)

## probability data (one CSV per split, without the index column)
helpdesk_probability_files = [
    ("helpdesk_train_dfg_probability.csv", train_dfg_probability),
    ("helpdesk_val_dfg_probability.csv", val_dfg_probability),
    ("helpdesk_test_dfg_probability.csv", test_dfg_probability),
]
for csv_path, frame in helpdesk_probability_files:
    frame.to_csv(csv_path, index=False)

## target (encoded target arrays)
helpdesk_target_files = [
    ("helpdesk_train_target.npy", train_target_encoded),
    ("helpdesk_val_target.npy", val_target_encoded),
    ("helpdesk_test_target.npy", test_target_encoded),
]
for npy_path, encoded_array in helpdesk_target_files:
    np.save(npy_path, encoded_array)

## original data (un-encoded target frames; only train and test are exported)
helpdesk_original_files = [
    ("helpdesk_train_target_org.csv", train_target),
    ("helpdesk_test_target_org.csv", test_target),
]
for csv_path, frame in helpdesk_original_files:
    frame.to_csv(csv_path, index=False)

In [17]:
######################################### BPI Data ###############################################################################
# These files carry the "bpi_" prefix, so they must only be written during a
# run in which bpi_df.csv is the event log loaded at the top of the notebook.
# Without a guard, executing every cell in order while another log is loaded
# stores that log's data under BPI file names (and overwrites real BPI output).
SAVE_BPI_OUTPUTS = False  # set to True only for a run that loaded bpi_df.csv

if SAVE_BPI_OUTPUTS:
    ## prefix data
    # NOTE(review): the validation prefix export was already disabled in this
    # cell; it is left disabled here - confirm whether that is intentional.
    np.save("bpi_train_prefix.npy", train_prefix_trace_encoded)
    #np.save("bpi_val_prefix.npy",val_prefix_trace_encoded)
    np.save("bpi_test_prefix.npy", test_prefix_trace_encoded)

    ## probability data
    train_dfg_probability.to_csv("bpi_train_dfg_probability.csv", index=False)
    val_dfg_probability.to_csv("bpi_val_dfg_probability.csv", index=False)
    test_dfg_probability.to_csv("bpi_test_dfg_probability.csv", index=False)

    ## target
    # NOTE(review): the validation target export was already disabled as well.
    np.save("bpi_train_target.npy", train_target_encoded)
    #np.save("bpi_val_target.npy",val_target_encoded)
    np.save("bpi_test_target.npy", test_target_encoded)

    ## original data
    train_target.to_csv("bpi_train_target_org.csv", index=False)
    test_target.to_csv("bpi_test_target_org.csv", index=False)
else:
    print("Skipping BPI export: SAVE_BPI_OUTPUTS is False.")

In [18]:
######################################### RMP Data ###############################################################################
# These files carry the "rmp_" prefix, so they must only be written during a
# run in which rmp_df.csv is the event log loaded at the top of the notebook.
# Without a guard, executing every cell in order while another log is loaded
# stores that log's data under RMP file names (and overwrites real RMP output).
SAVE_RMP_OUTPUTS = False  # set to True only for a run that loaded rmp_df.csv

if SAVE_RMP_OUTPUTS:
    ## prefix data
    np.save("rmp_train_prefix.npy", train_prefix_trace_encoded)
    np.save("rmp_val_prefix.npy", val_prefix_trace_encoded)
    np.save("rmp_test_prefix.npy", test_prefix_trace_encoded)

    ## probability data
    train_dfg_probability.to_csv("rmp_train_dfg_probability.csv", index=False)
    val_dfg_probability.to_csv("rmp_val_dfg_probability.csv", index=False)
    test_dfg_probability.to_csv("rmp_test_dfg_probability.csv", index=False)

    ## target
    np.save("rmp_train_target.npy", train_target_encoded)
    np.save("rmp_val_target.npy", val_target_encoded)
    np.save("rmp_test_target.npy", test_target_encoded)

    ## original data (only train and test are exported)
    train_target.to_csv("rmp_train_target_org.csv", index=False)
    test_target.to_csv("rmp_test_target_org.csv", index=False)
else:
    print("Skipping RMP export: SAVE_RMP_OUTPUTS is False.")

In [19]:
# Helpdesk
# Persist the encoder returned by si_encoding for the train prefixes, so the
# same label <-> id mapping can be reloaded outside this notebook.
# NOTE(review): the file name is fixed to the helpdesk log - this assumes
# helpdesk_df.csv is the log loaded at the top of the notebook.
joblib.dump(label_encoder, 'helpdesk_label_encoder.joblib')

['helpdesk_label_encoder.joblib']