# Load Event Log Data

In [1]:
import numpy as np
import pandas as pd
import os
import sys
# Resolve the directory two levels above the current working directory and
# append it to sys.path (only if it is not already listed). Presumably this is
# what makes the `src.*` packages imported below importable from this notebook.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pickle as pickle
from src.Log import Reformat
from src.Utils import DataUtil
from src.Preprocessing import Extraction
from src.Preprocessing import Preprocess

from sklearn import preprocessing

In [2]:
# Load the simulated event logs. The result is used below like a mapping
# (it exposes .keys() and is indexed by key).
# NOTE(review): presumably one entry per ".json" file in the directory --
# confirm against DataUtil.read_files.
logs = DataUtil.read_files("../../data/SimulatedLogs", ".json")

In [3]:
# Use the first loaded log as a working example.
t1 = logs[list(logs.keys())[0]]
# Extract the relevant events and roll them into sequences keyed by "Case ID",
# using "endDate" as the time column.
# NOTE(review): the displayed frame further down suggests one row per case with
# list-valued columns -- confirm against Reformat.roll_sequence.
data_trace = Reformat.roll_sequence(Extraction.extract(t1), time_column="endDate", case_column="Case ID")
# Turn the index produced by roll_sequence back into ordinary columns.
data_trace = data_trace.reset_index()

In [4]:
# Derive two per-case time features from each case's "endDate" sequence using
# the project helpers in src.Preprocessing.Preprocess.
# NOTE(review): judging by the names and the displayed output these are the
# remaining time and the elapsed time per event -- confirm against the helpers.
data_trace["RemTime"] = data_trace["endDate"].apply(Preprocess.cal_remtime)
data_trace["LapseTime"] = data_trace["endDate"].apply(Preprocess.cal_lapse)

In [5]:
# Split the traces into train / validation / test sets, passing "Activity" as
# the column list to encode.
# NOTE(review): `le` presumably holds the fitted label encoder(s) -- confirm
# against Preprocess.split_encode.
train, val, test, le = Preprocess.split_encode(data_trace, ["Activity"])

In [6]:
# Display the training split returned by Preprocess.split_encode.
train

Unnamed: 0,Case ID,Activity,endDate,Start Time,RemTime,LapseTime
0,2251799814781332,"[0, 1, 2, 3, 4, 5]","[2026-07-07 23:29:47.287000+00:00, 2026-07-07 ...",2026-07-07 23:29:47.287000+00:00,"[6000, 4800, 3300, 1200, 600, 0]","[0, 1200, 2700, 4800, 5400, 6000]"
1,2251799814781359,"[6, 7, 8, 9]","[2026-07-07 23:29:47.287000+00:00, 2026-07-07 ...",2026-07-07 23:29:47.287000+00:00,"[5100, 3900, 0, 0]","[0, 1200, 5100, 5100]"
2,2251799814781350,"[6, 7, 8, 9]","[2026-07-07 23:29:47.287000+00:00, 2026-07-07 ...",2026-07-07 23:29:47.287000+00:00,"[2700, 1500, 0, 0]","[0, 1200, 2700, 2700]"
3,2251799814781341,"[0, 1, 2, 3, 4, 5]","[2026-07-07 23:29:47.287000+00:00, 2026-07-07 ...",2026-07-07 23:29:47.287000+00:00,"[6000, 4800, 3300, 1200, 600, 0]","[0, 1200, 2700, 4800, 5400, 6000]"
4,2251799814781323,"[6, 7, 8, 9]","[2026-07-07 23:29:47.287000+00:00, 2026-07-07 ...",2026-07-07 23:29:47.287000+00:00,"[2700, 1500, 0, 0]","[0, 1200, 2700, 2700]"
...,...,...,...,...,...,...
282,2251799814790112,"[6, 7, 8, 9]","[2026-07-09 17:57:47.287000+00:00, 2026-07-09 ...",2026-07-09 17:57:47.287000+00:00,"[1560, 600, 0, 0]","[0, 960, 1560, 1560]"
283,2251799814790121,"[6, 7, 8, 9]","[2026-07-09 17:57:47.287000+00:00, 2026-07-09 ...",2026-07-09 17:57:47.287000+00:00,"[1860, 600, 0, 0]","[0, 1260, 1860, 1860]"
284,2251799814790130,"[6, 7, 8, 9]","[2026-07-09 17:57:47.287000+00:00, 2026-07-09 ...",2026-07-09 17:57:47.287000+00:00,"[1860, 600, 60, 0]","[0, 1260, 1800, 1860]"
285,2251799814790157,"[6, 7, 8, 9]","[2026-07-09 17:57:47.287000+00:00, 2026-07-09 ...",2026-07-09 17:57:47.287000+00:00,"[1560, 600, 0, 0]","[0, 960, 1560, 1560]"


## Preprocessing

In [77]:
def label_encoding(column):
    """Integer-encode a column of labels with scikit-learn's LabelEncoder.

    Args:
        column: Array-like of labels to encode.

    Returns:
        A 3-tuple ``(encoded, name_to_code, code_to_name)``:
          * ``encoded`` -- the integer codes for every entry of ``column``;
          * ``name_to_code`` -- dict mapping each distinct label to its code;
          * ``code_to_name`` -- the inverse dict, mapping each code to its label.
    """
    encoder = preprocessing.LabelEncoder()
    encoder.fit(column)

    # classes_ holds the distinct labels seen during fit; transforming them
    # yields the code assigned to each one.
    labels = encoder.classes_
    codes = encoder.transform(labels)

    name_to_code = {label: code for label, code in zip(labels, codes)}
    code_to_name = {code: label for label, code in zip(labels, codes)}
    return encoder.transform(column), name_to_code, code_to_name

def cal_remtime(time_stamp_arr):
    """Return, for every timestamp, the whole seconds left until the last one.

    Args:
        time_stamp_arr: Array of timestamps for one case. It must support
            element-wise subtraction and ``.astype`` (e.g. a NumPy
            ``datetime64`` array).

    Returns:
        Integer array of the same length; the final entry is always 0.
    """
    final_stamp = time_stamp_arr[-1]
    remaining = final_stamp - time_stamp_arr
    # Cast to second resolution first, then to plain integers.
    return remaining.astype('timedelta64[s]').astype(int)

def cal_lapse(time_stamp_arr):
    """Return, for every timestamp, the whole seconds elapsed since the first one.

    Args:
        time_stamp_arr: Array of timestamps for one case. It must support
            element-wise subtraction and ``.astype`` (e.g. a NumPy
            ``datetime64`` array).

    Returns:
        Integer array of the same length; the first entry is always 0.
    """
    offset_from_first = time_stamp_arr[0] - time_stamp_arr
    offset_seconds = offset_from_first.astype('timedelta64[s]').astype(int)
    # first - t is non-positive for later events, so flip the sign at the end.
    return -offset_seconds

def preprocess(log, key, output_dir="../../data/Test/"):
    """Turn one raw event log into per-case traces with time features.

    Steps:
      1. Keep only rows whose ``nodeState`` and ``processState`` are
         "COMPLETED" and whose ``nodeType`` is "USER_TASK".
      2. Parse ``startDate`` and ``endDate`` with ``pd.to_datetime``.
      3. Rename ``processInstanceKey`` -> "Case ID" and ``flowNodeId`` ->
         "Activity", and keep "Case ID", "Activity" and "endDate".
      4. Label-encode every kept column (other than "Case ID") whose first
         value is a string, and pickle the resulting mappings to
         ``<output_dir><key>_label_encode.pkl``.
      5. Roll the events into sequences with ``Reformat.roll_sequence`` and add
         the "RemTime" and "LapseTime" columns.

    Args:
        log: DataFrame holding the raw events; it must contain the columns
            named in the steps above.
        key: String used as the filename prefix of the label-encoding pickle.
        output_dir: Directory the pickle is written to. Defaults to the
            location the notebook has always used.

    Returns:
        The frame returned by ``Reformat.roll_sequence`` (index reset) with the
        added "RemTime" and "LapseTime" columns.
    """
    data = log[log["nodeState"] == "COMPLETED"].drop(columns=["nodeState"])
    data = data[data["processState"] == "COMPLETED"].drop(columns=["processState"])
    # Copy explicitly: columns are assigned below, and writing into the result
    # of a boolean filter is the chained-assignment pattern pandas warns about.
    data = data[data["nodeType"] == "USER_TASK"].copy()
    # Parsed value by value so each timestamp string is interpreted on its own.
    data["startDate"] = data["startDate"].apply(pd.to_datetime)
    data["endDate"] = data["endDate"].apply(pd.to_datetime)
    data = data.rename(columns={"processInstanceKey": "Case ID", "flowNodeId": "Activity"})
    # Same reason as above: the column subset is modified by the encoder loop.
    data_core = data[["Case ID", "Activity", "endDate"]].copy()

    le_map = {}
    # With no rows left there is no first value to inspect, so skip encoding
    # instead of failing on .iloc[0].
    if not data_core.empty:
        for column in data_core.columns:
            if column == "Case ID":
                continue
            if isinstance(data_core[column].iloc[0], str):
                encoded, name_mapping, class_mapping = label_encoding(data_core[column])
                data_core[column] = encoded
                le_map[column] = {"name_mapping": name_mapping, "class_mapping": class_mapping}

    le_map_pd = pd.DataFrame(data=le_map)
    le_map_pd.to_pickle(os.path.join(output_dir, key + "_label_encode.pkl"))

    data_trace = Reformat.roll_sequence(data_core, time_column="endDate", case_column="Case ID")
    data_trace = data_trace.reset_index()

    data_trace["RemTime"] = data_trace["endDate"].apply(cal_remtime)
    data_trace["LapseTime"] = data_trace["endDate"].apply(cal_lapse)

    return data_trace

In [25]:
# Run the notebook-local preprocess() on the example log; "1" becomes the
# filename prefix of the label-encoding pickle that preprocess() writes.
t1_p = preprocess(t1, "1")

## Split

In [26]:
# Number of rows in the preprocessed example log (the size split_data works with).
t1_p.shape[0]

In [27]:
def split_data(log, train=0.64, val=0.16):
    """Cut a log into contiguous train / validation / test partitions.

    Rows keep their existing order: the first ``train`` fraction becomes the
    training set, the following ``val`` fraction the validation set, and all
    remaining rows the test set. Partition sizes are truncated to integers.

    Args:
        log: Sliceable object with a ``shape`` attribute (e.g. a DataFrame).
        train: Fraction of rows assigned to the training set.
        val: Fraction of rows assigned to the validation set.

    Returns:
        Tuple ``(training_set, validation_set, test_set)``.
    """
    n_rows = log.shape[0]
    train_end = int(train * n_rows)
    val_end = train_end + int(val * n_rows)

    return log[:train_end], log[train_end:val_end], log[val_end:]


In [37]:
# Split the example log with the default fractions (64% train, 16% validation,
# remaining rows test).
t1_train, t1_val, t1_test = split_data(t1_p)

In [38]:
# Largest remaining-time value over all training traces. Series.explode() takes
# no column name (its only parameter is ignore_index), so it is called bare.
t1_train["RemTime"].explode().max()

## Normalize Time Feature

In [36]:
def norm_remtime(train, val, test, columns=("RemTime", "LapseTime")):
    """Scale sequence-valued time columns by the training-set maximum.

    For each column, the largest single value found anywhere in the training
    sequences is used as the divisor for the training, validation and test
    frames, so no information from validation/test leaks into the scale.

    Args:
        train: Training DataFrame; it determines the scaling factor.
        val: Validation DataFrame.
        test: Test DataFrame.
        columns: Names of the columns to scale. Each cell is expected to hold
            a list-like of numbers that supports division by a scalar
            (e.g. a NumPy array).

    Returns:
        Tuple ``(train, val, test)`` of scaled copies. The frames passed in
        are left unmodified, so re-running a cell does not rescale data that
        was already scaled and no write goes into a slice of another frame.
    """
    train, val, test = train.copy(), val.copy(), test.copy()

    for column in columns:
        # Flatten the per-row sequences to find the single largest value.
        max_value = train[column].explode().max()
        # An empty training set (NaN) or all-zero sequences (0) would turn
        # every value into NaN/inf; fall back to a neutral divisor instead.
        if pd.isna(max_value) or max_value == 0:
            max_value = 1
        train[column] = train[column] / max_value
        val[column] = val[column] / max_value
        test[column] = test[column] / max_value

    return train, val, test

In [39]:
# Scale the time columns of the example splits by the training-set maximum.
t1_train_n, t1_val_n, t1_test_n = norm_remtime(t1_train, t1_val, t1_test)

## Pipeline

In [78]:
# Run the whole pipeline for every loaded log: preprocess, split, normalise,
# then pickle each split under ../../data/Test/ using the log's key as the
# filename prefix.
# NOTE(review): this rebinds the notebook-level `train`, `val` and `test`
# (first assigned from Preprocess.split_encode near the top); once the loop
# finishes they hold the splits of the last log processed.
for data_set_name in logs.keys():
    log_p = preprocess(logs[data_set_name], data_set_name)
    train, val, test = split_data(log_p)
    train, val, test = norm_remtime(train, val, test)
    train.to_pickle("../../data/Test/" + data_set_name + "_train.pkl")
    val.to_pickle("../../data/Test/" + data_set_name + "_val.pkl")
    test.to_pickle("../../data/Test/" + data_set_name + "_test.pkl")

In [85]:
# Inspect the "Activity" column of `test`, i.e. the test split left over from
# the last iteration of the pipeline loop.
test["Activity"]