# Preprocessing
This notebook filters the data and saves the training and test data in the `data` folder.

In [1]:
# import basic libraries
import pandas as pd

# import pm4py library to work with XES logs and process mining
import pm4py

In [None]:
# Read the XES event log from disk and convert it to a pandas DataFrame
xes_path = "data/BPI_Challenge_2017.xes.gz"
log = pm4py.read_xes(xes_path)
log_df = pm4py.convert_to_dataframe(log)

# Preview the first rows
log_df.head()

## Event index in trace

We add to each event in the log its (0-based) position within its trace.

In [None]:
# Order the events by case and timestamp, then number the events of every case
# 0, 1, 2, ... in that order; the number is stored in "event_index_in_trace".
log_df = (
    log_df
    .sort_values(["case:concept:name", "time:timestamp"])
    .assign(
        event_index_in_trace=lambda events: events.groupby("case:concept:name").cumcount()
    )
)

## Remaining time

Here we calculate, for each event, the remaining time (in days) until the last event of its trace.

In [None]:
# add column "remaining_time"
# which indicates time from that event until the last event in the trace
log_df["time:timestamp"] = pd.to_datetime(log_df["time:timestamp"], utc=True)

# transform("max") broadcasts each trace's latest timestamp to all of its events
# using pandas' built-in aggregation instead of calling a Python lambda per trace.
log_df["remaining_time"] = (
    log_df.groupby("case:concept:name")["time:timestamp"].transform("max")
    - log_df["time:timestamp"]
).dt.total_seconds() / (24 * 60 * 60)  # convert to float days

## Time being executed

We also store, for each event, the time (in days) that its trace has been executing, measured from the first event in the trace until the current event.

In [None]:
# Make sure the timestamps are timezone-aware (UTC) datetimes before subtracting
log_df["time:timestamp"] = pd.to_datetime(log_df["time:timestamp"], utc=True)

# Time in execution: time that the trace has been in execution since its first event.
# transform("min") broadcasts each trace's earliest timestamp to all of its events
# using pandas' built-in aggregation instead of calling a Python lambda per trace.
log_df["execution_time"] = (
    log_df["time:timestamp"]
    - log_df.groupby("case:concept:name")["time:timestamp"].transform("min")
).dt.total_seconds() / (24 * 60 * 60)  # convert to float days

## Construct the data with prefix length 5

The data is built from groups of 5 events of a trace that directly follow each other.

## Split train and test

Unfortunately, using `pm4py.split_train_test` resulted in traces in the training set that ended after the start of traces in the test set. This is not a good split, so we implement the split manually by sorting the traces on their timestamps.

In [None]:
# Keep the events ordered by case and time
log_df = log_df.sort_values(["case:concept:name", "time:timestamp"])

# Per case: timestamp of its first event (start) and of its last event (end).
# Both frames are indexed by the case id and hold a single "time:timestamp" column.
trace_start_df = log_df.groupby("case:concept:name")[["time:timestamp"]].min()
trace_end_df = log_df.groupby("case:concept:name")[["time:timestamp"]].max()

In [None]:
# The test set consists of the 10% of traces with the latest start timestamps
test_size = round(0.1 * len(trace_start_df))
traces_by_start = trace_start_df.sort_values(by="time:timestamp")
test_cases = traces_by_start.tail(test_size)

In [None]:
# A trace qualifies for training only if its last event happens
# strictly before the earliest start among the test traces
first_test_start = test_cases["time:timestamp"].min()
train_cases = trace_end_df.loc[trace_end_df["time:timestamp"] < first_test_start]

In [None]:
# Select the events of each split by matching the event's case id
# against the case ids (index) of the chosen train / test traces
in_train = log_df["case:concept:name"].isin(train_cases.index)
in_test = log_df["case:concept:name"].isin(test_cases.index)

train_df = log_df.loc[in_train]
test_df = log_df.loc[in_test]

In [None]:
# Sanity check that the splits do not overlap in time:
# the latest train timestamp (first line) should be earlier than
# the earliest test timestamp (second line)
latest_train_event = train_df["time:timestamp"].max()
earliest_test_event = test_df["time:timestamp"].min()
print(latest_train_event, earliest_test_event, sep="\n")

## Feature encoding

For now we use a basic one-hot feature encoding (via `pd.get_dummies`), but we want to experiment with complex index encoding, where we encode the previous 10 activities (or add padding). Furthermore, we add the index of the activity in its trace.

In [None]:
# select the features we are going to encode
columns_to_encode = ['Action', 'concept:name', 'case:LoanGoal']

# one-hot encode the training data; its columns define the feature space
train_df_encode = pd.get_dummies(train_df[columns_to_encode], dtype=int)

# one-hot encode the test data and align it to the training columns:
# encoding each split on its own yields different columns whenever a category
# occurs in only one of them. Categories missing from the test split become
# all-zero columns; categories never seen in training are dropped.
test_df_encode = (
    pd.get_dummies(test_df[columns_to_encode], dtype=int)
    .reindex(columns=train_df_encode.columns, fill_value=0)
)
test_df_encode.head(3)

In [None]:
# Display the column labels of the training split
train_df.columns

In [None]:
# Non-encoded columns that are carried over unchanged into the final tables
columns_to_keep = [
    'org:resource',
    'EventOrigin',
    'EventID',
    'lifecycle:transition',
    'time:timestamp',
    'case:ApplicationType',
    'case:concept:name',
    'case:RequestedAmount',
    'FirstWithdrawalAmount',
    'NumberOfTerms',
    'MonthlyCost',
    'Selected',
    'CreditScore',
    'OfferedAmount',
    'event_index_in_trace',
    'remaining_time',
    'execution_time',
]

# Place the kept columns and the one-hot columns side by side;
# rows are matched on the DataFrame index
full_train_df = pd.concat(
    [train_df.loc[:, columns_to_keep], train_df_encode], axis="columns"
)
full_test_df = pd.concat(
    [test_df.loc[:, columns_to_keep], test_df_encode], axis="columns"
)

full_train_df.head(2)

## Save features X and targets y of train and test

In [None]:
# Split the one-hot encoded tables into features (every column except the
# target) and the target column "remaining_time"
X_train = full_train_df.drop(columns=["remaining_time"])
X_test = full_test_df.drop(columns=["remaining_time"])
y_train = full_train_df["remaining_time"]
y_test = full_test_df["remaining_time"]

# Save the one-hot encoded dataframes
X_train.to_csv("data/generated/onehot/X_train.csv")
X_test.to_csv("data/generated/onehot/X_test.csv")
y_train.to_csv("data/generated/onehot/y_train.csv")
y_test.to_csv("data/generated/onehot/y_test.csv")

## Frequency encoding and save train test files


In [None]:
# Create frequency encoding for the train df: each selected indicator column
# becomes a running count of its occurrences so far within the trace.
# Select columns that start with "concept:" or "Action_"
relevant_columns = [c for c in train_df_encode.columns if c.startswith(("concept:", "Action_"))]

# Row labels ordered trace by trace and, inside a trace, by event position
train_event_order = full_train_df.sort_values(
    ["case:concept:name", "event_index_in_trace"]
).index
train_case_ids = full_train_df.loc[train_event_order, "case:concept:name"]

# Vectorized per-trace cumulative sum (replaces a Python loop over all traces).
# The counts are computed from the untouched one-hot frame, so re-running this
# cell gives the same result instead of accumulating counts on top of counts.
# The assignment aligns on the index, which is assumed to be unique per event.
full_train_df[relevant_columns] = (
    train_df_encode.loc[train_event_order, relevant_columns]
    .groupby(train_case_ids)
    .cumsum()
)

In [None]:
# Create frequency encoding for the test df: each selected indicator column
# becomes a running count of its occurrences so far within the trace.
# Select columns that start with "concept:" or "Action_"
relevant_columns = [c for c in test_df_encode.columns if c.startswith(("concept:", "Action_"))]

# Row labels ordered trace by trace and, inside a trace, by event position
test_event_order = full_test_df.sort_values(
    ["case:concept:name", "event_index_in_trace"]
).index
test_case_ids = full_test_df.loc[test_event_order, "case:concept:name"]

# Vectorized per-trace cumulative sum (replaces a Python loop over all traces).
# The counts are computed from the untouched one-hot frame, so re-running this
# cell gives the same result instead of accumulating counts on top of counts.
# The assignment aligns on the index, which is assumed to be unique per event.
full_test_df[relevant_columns] = (
    test_df_encode.loc[test_event_order, relevant_columns]
    .groupby(test_case_ids)
    .cumsum()
)

## Save features X and targets y of train and test

In [None]:
# Split the frequency encoded tables into features (every column except the
# target) and the target column "remaining_time"
X_train = full_train_df.drop(columns=["remaining_time"])
X_test = full_test_df.drop(columns=["remaining_time"])
y_train = full_train_df["remaining_time"]
y_test = full_test_df["remaining_time"]

# Save the frequency encoded dataframes
X_train.to_csv("data/generated/frequency/X_train.csv")
X_test.to_csv("data/generated/frequency/X_test.csv")
y_train.to_csv("data/generated/frequency/y_train.csv")
y_test.to_csv("data/generated/frequency/y_test.csv")