# Preprocessing
This notebook filters the data and saves training and test data in the data folder

In [5]:
# import basic libraries
from pathlib import Path

import pandas as pd

# import machine learning library
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import OrdinalEncoder

# import pm4py library to work with XES logs and process mining
import pm4py

In [2]:
# Read the BPI Challenge 2017 event log and expose it as a pandas DataFrame
xes_path = "data/BPI_Challenge_2017.xes.gz"
log = pm4py.read_xes(xes_path)
log_df = pm4py.convert_to_dataframe(log)
log_df.head()

  from .autonotebook import tqdm as notebook_tqdm
parsing log, completed traces :: 100%|██████████| 31509/31509 [01:33<00:00, 337.77it/s]


Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,



## Ordinal encoding

This encodes all string inputs as integers, which is needed to run models on the data. This might not be the best encoding method, as categories do not imply any kind of order, while integers do.

For future implementations we also want to experiment with:

* One-hot encoding (using pm4py log_to_features) followed by PCA to reduce dimensionality
* Bi-Grams (also using pm4py log_to_features)
* Multisets



In [6]:
# Ordinal-encode every column of the log; values not seen during fit are
# mapped to -1 at transform time (handle_unknown / unknown_value).
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
encoded_log = encoder.fit_transform(log_df)

# Wrap the encoded matrix in a DataFrame and replace any remaining NaN with -1
encoded_df = pd.DataFrame(encoded_log).fillna(value=-1)
encoded_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,0.0,0.0,4.0,0.0,233979.0,1.0,0.0,5.0,1.0,25893.0,301.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,4.0,0.0,8.0,0.0,62695.0,1.0,1.0,5.0,1.0,25893.0,301.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,0.0,0.0,22.0,2.0,552510.0,3.0,2.0,5.0,1.0,25893.0,301.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,1.0,0.0,22.0,2.0,702398.0,6.0,3.0,5.0,1.0,25893.0,301.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,0.0,0.0,21.0,2.0,631062.0,3.0,4.0,5.0,1.0,25893.0,301.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


## Anomaly detection
We apply a method called IsolationForest to the encoded dataframe. This lets us add a column `scores` that is less than or equal to 0 when an event should be considered anomalous, and greater than 0 when it should not.

*Note: based on the results, we think it's better to not remove traces with high anomaly scores. After visual inspection of these traces, they don't seem to have anything weird going on*

In [7]:
# Keep the scores on a copy so the extra column does not leak into log_df,
# which is used for feature engineering further down.
scores_df = log_df.copy()

# IsolationForest builds randomised trees; fixing random_state makes the
# anomaly scores (and the rankings shown below) reproducible across runs.
model = IsolationForest(random_state=42)
model.fit(encoded_df)

# One score per row of encoded_df, i.e. per event in the log
scores_df["scores"] = model.decision_function(encoded_df)

To see which events are most anomalous, we sort the dataframe by score in ascending order, so that the events with the lowest (most anomalous) scores are shown first.

In [8]:
# ascending sort: the lowest (most anomalous) scores come first
scores_df.sort_values(by="scores", ascending=True)

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,scores
1135283,Created,User_98,O_Create Offer,Offer,Offer_1128240057,complete,2016-12-09 09:29:57.748000+00:00,"Other, see explanation",New credit,Application_1491072794,60000.0,60000.0,120.0,True,610.93,True,921.0,60000.0,,-0.255941
1094707,Created,User_10,O_Create Offer,Offer,Offer_1251878868,complete,2016-11-26 09:47:13.602000+00:00,"Other, see explanation",New credit,Application_1579221526,75000.0,35000.0,126.0,True,650.00,True,887.0,65000.0,,-0.251015
1191015,Created,User_77,O_Create Offer,Offer,Offer_961425477,complete,2016-12-28 15:50:00.230000+00:00,"Other, see explanation",New credit,Application_926354715,75000.0,75000.0,120.0,True,763.67,True,989.0,75000.0,,-0.249755
803169,Created,User_11,O_Create Offer,Offer,Offer_1007658926,complete,2016-09-08 09:30:41.005000+00:00,"Other, see explanation",New credit,Application_1331381983,47000.0,19786.9,88.0,True,619.77,True,979.0,47000.0,,-0.249723
834263,Created,User_52,O_Create Offer,Offer,Offer_1688623065,complete,2016-09-14 13:02:41.206000+00:00,"Other, see explanation",New credit,Application_1451333632,75000.0,75000.0,126.0,True,750.00,True,925.0,75000.0,,-0.249723
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
592139,Obtained,User_36,W_Call incomplete files,Workflow,Workitem_1362611761,resume,2016-07-19 12:42:41.334000+00:00,Home improvement,New credit,Application_306805974,15000.0,,,,,,,,,0.110387
587679,Obtained,User_39,W_Call incomplete files,Workflow,Workitem_1388193302,resume,2016-07-20 07:55:16.396000+00:00,Existing loan takeover,New credit,Application_238288424,15000.0,,,,,,,,,0.110754
637790,Obtained,User_15,W_Call incomplete files,Workflow,Workitem_1392720890,resume,2016-07-27 13:19:38.903000+00:00,Existing loan takeover,New credit,Application_287520348,15000.0,,,,,,,,,0.110798
538366,Obtained,User_33,W_Call incomplete files,Workflow,Workitem_1685220021,resume,2016-07-19 13:51:00.418000+00:00,Existing loan takeover,New credit,Application_265162816,17000.0,,,,,,,,,0.111716


In [9]:
# mean score of all events in each trace, lowest (most anomalous) averages first
trace_scores = scores_df.groupby("case:concept:name")[["scores"]].mean()
trace_scores.sort_values(by="scores")

Unnamed: 0_level_0,scores
case:concept:name,Unnamed: 1_level_1
Application_896441766,-0.094284
Application_1562291654,-0.076924
Application_918459127,-0.074000
Application_938297888,-0.072754
Application_83337214,-0.070765
...,...
Application_1728762018,0.072121
Application_1845792027,0.072847
Application_2089806999,0.073162
Application_2137378775,0.073582


## Remaining time
Here we calculate, for each event, the remaining time until the last event of its trace.

In [10]:
# Remaining time

# add column "event_index_in_trace"
# which indicates the 1st, 2nd ... event in the trace
# (0-based position of the row within its case, in the current row order)
log_df["event_index_in_trace"] = log_df.groupby("case:concept:name").cumcount()

# add column "remaining_time"
# which indicates time from that event until the last event in the trace
log_df["time:timestamp"] = pd.to_datetime(log_df["time:timestamp"], utc=True)

# Broadcast each trace's latest timestamp to all of its events with the
# built-in "max" aggregation; this avoids calling a Python lambda once per trace.
trace_end = log_df.groupby("case:concept:name")["time:timestamp"].transform("max")
seconds_per_day = 24 * 60 * 60
log_df["remaining_time"] = (trace_end - log_df["time:timestamp"]).dt.total_seconds() / seconds_per_day  # convert to float days

log_df.head()

Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,...,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID,event_index_in_trace,remaining_time
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,...,,,,,,,,,0,13.248566
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,...,,,,,,,,,1,13.248566
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,...,,,,,,,,,2,13.248561
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,...,,,,,,,,,3,13.247628
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,...,,,,,,,,,4,13.247628


## Event index in trace
We add to each event in the log its position within its trace.

In [11]:
# Order the log by case and then by time, so that the running count below
# numbers the events of every trace chronologically (0, 1, 2, ...).
sort_keys = ["case:concept:name", "time:timestamp"]
log_df = log_df.sort_values(by=sort_keys)

events_per_case = log_df.groupby("case:concept:name")
log_df["event_index_in_trace"] = events_per_case.cumcount()

## Time being executed
We also store, for each event, how long its trace has been running: the time elapsed from the first event in the trace until the current event.

In [12]:
log_df["time:timestamp"] = pd.to_datetime(log_df["time:timestamp"], utc=True)

# Time in execution: time that has been the trace in execution from the first event
# Broadcast each trace's earliest timestamp to all of its events with the
# built-in "min" aggregation instead of a per-trace Python lambda.
trace_start = log_df.groupby("case:concept:name")["time:timestamp"].transform("min")
seconds_per_day = 24 * 60 * 60
log_df["execution_time"] = (log_df["time:timestamp"] - trace_start).dt.total_seconds() / seconds_per_day  # float days

## Date formatting for training

In [13]:
# Parse the event timestamps once into a local Series; the original
# 'time:timestamp' column stays in log_df untouched.
event_ts = pd.to_datetime(log_df['time:timestamp'], format='ISO8601')

# Add one column per calendar / clock component of the timestamp
for part in ('year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond'):
    log_df[part] = getattr(event_ts.dt, part)

## Split train and test
Using the pm4py.split_train_test resulted in traces in train that ended after the start of traces in test unfortunately. This is not a good split, so we implement it manually by sorting traces on timestamp

In [14]:
case_key = "case:concept:name"
log_df = log_df.sort_values(by=[case_key, "time:timestamp"])

# First and last event timestamp of every trace, indexed by case id
case_timestamps = log_df.groupby(case_key)[["time:timestamp"]]
trace_start_df = case_timestamps.min()
trace_end_df = case_timestamps.max()

In [15]:
# The test set is the 10% of traces that start latest
n_traces = len(trace_start_df)
test_size = round(n_traces * 0.1)

traces_by_start = trace_start_df.sort_values(by="time:timestamp")
test_cases = traces_by_start.tail(test_size)

In [16]:
# A trace may only be used for training if it finished before the earliest
# test trace started.
first_test_start = test_cases["time:timestamp"].min()
ends_before_test = trace_end_df["time:timestamp"] < first_test_start
train_cases = trace_end_df[ends_before_test]

In [17]:
# Select the events that belong to the chosen train / test traces
event_case_ids = log_df["case:concept:name"]
in_train = event_case_ids.isin(train_cases.index)
in_test = event_case_ids.isin(test_cases.index)

train_df = log_df[in_train]
test_df = log_df[in_test]

In [18]:
# double check that the timestamps don't overlap
# all traces in train must end before the start of traces in test
last_train_event = train_df["time:timestamp"].max()
first_test_event = test_df["time:timestamp"].min()
print(last_train_event)
print(first_test_event)

# Enforce the invariant instead of relying on a visual check of the printout.
# An empty split is skipped: its min/max is NaT, and comparisons with NaT are
# always False.
assert train_df.empty or test_df.empty or last_train_event < first_test_event, (
    "train/test overlap: a training trace ends after the first test trace starts"
)

2016-11-22 09:21:30.939000+00:00
2016-11-22 09:22:17.274000+00:00


## Feature encoding
For now we use a basic one-hot encoding (pandas `get_dummies`), but we want to experiment with complex index encoding, where we encode the previous 10 activities (or add padding). Furthermore, we add the index of each event within its trace.


In [19]:
# select the features we are going to encode
columns_to_encode = ['Action', 'concept:name', 'case:LoanGoal']

# one-hot encode the data
train_df_encode = pd.get_dummies(train_df[columns_to_encode], dtype=int)
test_df_encode = pd.get_dummies(test_df[columns_to_encode], dtype=int)

# get_dummies only creates columns for the categories present in the frame it
# is given, so train and test can end up with different columns. Align test to
# the training columns: categories missing from test become all-zero columns,
# and categories that never occur in train are dropped.
test_df_encode = test_df_encode.reindex(columns=train_df_encode.columns, fill_value=0)
test_df_encode.head(3)

Unnamed: 0,Action_Created,Action_Deleted,Action_Obtained,Action_Released,Action_statechange,concept:name_A_Accepted,concept:name_A_Cancelled,concept:name_A_Complete,concept:name_A_Concept,concept:name_A_Create Application,...,case:LoanGoal_Debt restructuring,case:LoanGoal_Existing loan takeover,case:LoanGoal_Extra spending limit,case:LoanGoal_Home improvement,case:LoanGoal_Motorcycle,case:LoanGoal_Not speficied,"case:LoanGoal_Other, see explanation",case:LoanGoal_Remaining debt home,case:LoanGoal_Tax payments,case:LoanGoal_Unknown
1093289,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1093290,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1093291,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# inspect the column names available in the training frame
train_df.columns

Index(['Action', 'org:resource', 'concept:name', 'EventOrigin', 'EventID',
       'lifecycle:transition', 'time:timestamp', 'case:LoanGoal',
       'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'OfferID',
       'event_index_in_trace', 'remaining_time', 'execution_time', 'year',
       'month', 'day', 'hour', 'minute', 'second', 'microsecond'],
      dtype='object')

In [21]:
# Columns taken over unchanged from the log (the one-hot encoded columns
# are appended next to them below)
columns_to_keep = [
    'org:resource',
    'EventOrigin',
    'EventID',
    'lifecycle:transition',
    'time:timestamp',
    'case:ApplicationType',
    'case:concept:name',
    'case:RequestedAmount',
    'FirstWithdrawalAmount',
    'NumberOfTerms',
    'MonthlyCost',
    'Selected',
    'CreditScore',
    'OfferedAmount',
    'event_index_in_trace',
    'remaining_time',
    'execution_time',
]

# Put the kept columns and the one-hot columns side by side; rows are matched
# on the index
train_parts = [train_df[columns_to_keep], train_df_encode]
test_parts = [test_df[columns_to_keep], test_df_encode]
full_train_df = pd.concat(train_parts, axis="columns")
full_test_df = pd.concat(test_parts, axis="columns")

full_train_df.head(2)

Unnamed: 0,org:resource,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,...,case:LoanGoal_Debt restructuring,case:LoanGoal_Existing loan takeover,case:LoanGoal_Extra spending limit,case:LoanGoal_Home improvement,case:LoanGoal_Motorcycle,case:LoanGoal_Not speficied,"case:LoanGoal_Other, see explanation",case:LoanGoal_Remaining debt home,case:LoanGoal_Tax payments,case:LoanGoal_Unknown
686058,User_1,Application,Application_1000086665,complete,2016-08-03 15:57:21.673000+00:00,New credit,Application_1000086665,5000.0,,,...,0,0,0,0,0,0,1,0,0,0
686059,User_1,Application,ApplState_161925113,complete,2016-08-03 15:57:21.734000+00:00,New credit,Application_1000086665,5000.0,,,...,0,0,0,0,0,0,1,0,0,0


## Split and save features X and targets y

In [23]:
# Save the one-hot encoded dataframes
onehot_dir = Path("data/generated/onehot")
# to_csv does not create missing folders, so make sure the target exists
onehot_dir.mkdir(parents=True, exist_ok=True)

# Features: everything except the prediction target
X_train = full_train_df.drop(columns=["remaining_time"])
X_train.to_csv(onehot_dir / "X_train.csv")

X_test = full_test_df.drop(columns=["remaining_time"])
X_test.to_csv(onehot_dir / "X_test.csv")

# Target: remaining time of the trace (in days) at each event
y_train = full_train_df["remaining_time"]
y_train.to_csv(onehot_dir / "y_train.csv")

y_test = full_test_df["remaining_time"]
y_test.to_csv(onehot_dir / "y_test.csv")

### Frequency encoding and save train test files

In [29]:
# Create frequency encoding for the train df
# Select columns that start with "concept:" or "Action_"
relevant_columns = [c for c in train_df_encode.columns if c.startswith(("concept:", "Action_"))]

# Replace each one-hot indicator by its running count within the trace
# (cumulative sum over the events of a case, ordered by event_index_in_trace).
# A single grouped cumsum replaces a Python loop with one .loc assignment per trace.
# NOTE: this overwrites the one-hot columns of full_train_df in place, so
# running the cell twice accumulates the counts a second time.
running_counts = (
    full_train_df
    .sort_values(by="event_index_in_trace")
    .groupby("case:concept:name")[relevant_columns]
    .cumsum()
)
# Put the rows back in the frame's own order before writing the columns
full_train_df[relevant_columns] = running_counts.reindex(full_train_df.index)

In [30]:
# Select columns that start with "concept:" or "Action_"
relevant_columns = [c for c in test_df_encode.columns if c.startswith(("concept:", "Action_"))]

# Replace each one-hot indicator by its running count within the trace
# (cumulative sum over the events of a case, ordered by event_index_in_trace).
# A single grouped cumsum replaces a Python loop with one .loc assignment per trace.
# NOTE: this overwrites the one-hot columns of full_test_df in place, so
# running the cell twice accumulates the counts a second time.
running_counts = (
    full_test_df
    .sort_values(by="event_index_in_trace")
    .groupby("case:concept:name")[relevant_columns]
    .cumsum()
)
# Put the rows back in the frame's own order before writing the columns
full_test_df[relevant_columns] = running_counts.reindex(full_test_df.index)

In [31]:
# Save the frequency encoded dataframes
frequency_dir = Path("data/generated/frequency")
# to_csv does not create missing folders, so make sure the target exists
frequency_dir.mkdir(parents=True, exist_ok=True)

# Features: everything except the prediction target
X_train = full_train_df.drop(columns=["remaining_time"])
X_train.to_csv(frequency_dir / "X_train.csv")

X_test = full_test_df.drop(columns=["remaining_time"])
X_test.to_csv(frequency_dir / "X_test.csv")

# Target: remaining time of the trace (in days) at each event
y_train = full_train_df["remaining_time"]
y_train.to_csv(frequency_dir / "y_train.csv")

y_test = full_test_df["remaining_time"]
y_test.to_csv(frequency_dir / "y_test.csv")