# Read and preprocess data

In [1]:
import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pickle as pickle
from src.Log import Reformat

from sklearn import preprocessing

## Read data

In [2]:
data = pd.read_json("../../data/2024-06_04_camunda_test_export.json")
data["startDate"] = data["startDate"].apply(pd.to_datetime)
data["endDate"] = data["endDate"].apply(pd.to_datetime)

FileNotFoundError: File ../../data/2024-06_04_camunda_test_export.json does not exist

## Filter unfinished

In [5]:
data_fin = data[data["nodeState"] == "COMPLETED"].drop(columns=["nodeState"])
data_fin = data_fin[data_fin["processState"] == "COMPLETED"].drop(columns=["processState"])
data_fin = data_fin.rename(columns={"processInstanceKey": "Case ID", "flowNodeId": "Activity"})

In [6]:
data_fin

Unnamed: 0,key,Case ID,Activity,nodeType,nodeIncident,nextFlowNodeId,startDate,endDate,processVersion,bpmnProcessId,processDefinitionKey,variablesString,variablesDouble,variablesBool,executionFlow,nodeDuration,processDuration
0,2251799814059556,2251799814059549,Activity_0kwkee3,USER_TASK,False,Activity_10j4ubx,2024-06-03 08:35:03.943000+00:00,2024-06-03 08:35:15.705000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': False, 'fittingRoom': True, '...",{'StartEvent_1': 1},11762,47174
1,2251799814059616,2251799814059549,Activity_10j4ubx,USER_TASK,False,Activity_1qen0ov,2024-06-03 08:35:15.705000+00:00,2024-06-03 08:35:29.939000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': False, 'fittingRoom': True, '...","{'Gateway_1iifgjl': 1, 'Activity_0kwkee3': 1, ...",14234,47174
2,2251799814059716,2251799814059549,Activity_1qen0ov,USER_TASK,False,Event_1m6kx19,2024-06-03 08:35:29.939000+00:00,2024-06-03 08:35:51.117000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': False, 'fittingRoom': True, '...","{'Gateway_1iifgjl': 1, 'Gateway_097vsc3': 1, '...",21178,47174
3,2251799814059565,2251799814059558,Activity_0kwkee3,USER_TASK,False,Activity_10j4ubx,2024-06-03 08:35:04.466000+00:00,2024-06-03 08:35:08.788000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': False, 'fittingRoom': True, '...",{'StartEvent_1': 1},4322,13780
4,2251799814059583,2251799814059558,Activity_10j4ubx,USER_TASK,False,Activity_1qen0ov,2024-06-03 08:35:08.788000+00:00,2024-06-03 08:35:13.785000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': False, 'fittingRoom': True, '...","{'Gateway_1iifgjl': 1, 'Activity_0kwkee3': 1, ...",4997,13780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4447,2251799814113254,2251799814112976,Activity_0btrsy1,USER_TASK,False,Activity_0systjc,2024-06-03 11:09:49.276000+00:00,2024-06-03 11:10:13.344000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': True, 'fittingRoom': True, 'c...","{'Gateway_1iifgjl': 1, 'Gateway_097vsc3': 1, '...",24068,139091
4448,2251799814113371,2251799814112976,Activity_0systjc,USER_TASK,False,Activity_0systjc,2024-06-03 11:10:13.344000+00:00,2024-06-03 11:10:22.384000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': True, 'fittingRoom': True, 'c...","{'Gateway_1iifgjl': 1, 'Gateway_097vsc3': 1, '...",9040,139091
4449,2251799814113412,2251799814112976,Activity_0systjc,USER_TASK,False,Activity_0rdg80t,2024-06-03 11:10:22.384000+00:00,2024-06-03 11:10:44.463000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': True, 'fittingRoom': True, 'c...","{'Gateway_1iifgjl': 1, 'Gateway_097vsc3': 1, '...",22079,139091
4450,2251799814113530,2251799814112976,Activity_0rdg80t,USER_TASK,False,Activity_1qen0ov,2024-06-03 11:10:44.463000+00:00,2024-06-03 11:10:58.621000+00:00,6,Sim_Test_1,2251799814059278,{},{},"{'guestArrival': True, 'fittingRoom': True, 'c...","{'Gateway_1iifgjl': 1, 'Gateway_097vsc3': 1, '...",14158,139091


## Encode Cat feature

In [7]:
def label_encoding(column):
    le = preprocessing.LabelEncoder()
    le.fit(column)
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    le_class_mapping = dict(zip(le.transform(le.classes_), le.classes_))
    return le.transform(column), le_name_mapping, le_class_mapping

le_map = {}
for column in data_fin.columns:
    if column != "processInstanceKey" and type(data_fin[column][0]) == str:
        data_fin[column], name_mapping, class_mapping = label_encoding(data_fin[column])
        le_map[column] = {"name_mapping": name_mapping, "class_mapping": class_mapping}

In [8]:
le_map_pd = pd.DataFrame(data=le_map)
le_map_pd.to_pickle("../../data/t1_label_encode.pkl")

In [9]:
data_core = data_fin[["Case ID", "Activity", "endDate"]]

In [10]:
data_core

Unnamed: 0,Case ID,Activity,endDate
0,2251799814059549,1,2024-06-03 08:35:15.705000+00:00
1,2251799814059549,5,2024-06-03 08:35:29.939000+00:00
2,2251799814059549,6,2024-06-03 08:35:51.117000+00:00
3,2251799814059558,1,2024-06-03 08:35:08.788000+00:00
4,2251799814059558,5,2024-06-03 08:35:13.785000+00:00
...,...,...,...
4447,2251799814112976,0,2024-06-03 11:10:13.344000+00:00
4448,2251799814112976,4,2024-06-03 11:10:22.384000+00:00
4449,2251799814112976,4,2024-06-03 11:10:44.463000+00:00
4450,2251799814112976,3,2024-06-03 11:10:58.621000+00:00


In [11]:
data_trace = Reformat.roll_sequence(data_core, time_column="endDate", case_column="Case ID")

In [12]:
data_trace = data_trace.reset_index()

In [13]:
def cal_remtime(time_stamp_arr):
    return (time_stamp_arr[-1] - time_stamp_arr).astype('timedelta64[s]').astype(int)

def cal_lapse(time_stamp_arr):
    return -(time_stamp_arr[0] - time_stamp_arr).astype('timedelta64[s]').astype(int)

data_trace["RemTime"] = data_trace["endDate"].apply(cal_remtime)
data_trace["LapseTime"] = data_trace["endDate"].apply(cal_lapse)

In [17]:
data_trace["RemTime"] = data_trace["RemTime"]/295
data_trace["LapseTime"] = data_trace["LapseTime"]/295

In [20]:
data_trace

Unnamed: 0,Case ID,Activity,endDate,Start Time,RemTime,LapseTime
0,2251799814059558,"[1, 5, 6]","[2024-06-03 08:35:08.788000+00:00, 2024-06-03 ...",2024-06-03 08:35:08.788000+00:00,"[0.030508474576271188, 0.013559322033898305, 0.0]","[0.0, 0.01694915254237288, 0.03389830508474576]"
1,2251799814059549,"[1, 5, 6]","[2024-06-03 08:35:15.705000+00:00, 2024-06-03 ...",2024-06-03 08:35:15.705000+00:00,"[0.11864406779661017, 0.0711864406779661, 0.0]","[0.0, 0.05084745762711865, 0.12203389830508475]"
2,2251799814059567,"[1, 5, 6]","[2024-06-03 08:35:20.021000+00:00, 2024-06-03 ...",2024-06-03 08:35:20.021000+00:00,"[0.0847457627118644, 0.061016949152542375, 0.0]","[0.0, 0.023728813559322035, 0.08813559322033898]"
3,2251799814059618,"[1, 5, 0, 4, 3, 6]","[2024-06-03 08:35:28.738000+00:00, 2024-06-03 ...",2024-06-03 08:35:28.738000+00:00,"[0.17627118644067796, 0.1152542372881356, 0.08...","[0.0, 0.06440677966101695, 0.09152542372881356..."
4,2251799814059631,"[1, 2]","[2024-06-03 08:35:31.857000+00:00, 2024-06-03 ...",2024-06-03 08:35:31.857000+00:00,"[0.030508474576271188, 0.0]","[0.0, 0.03389830508474576]"
...,...,...,...,...,...,...
995,2251799814112619,"[1, 2, 5, 0, 4, 3, 6]","[2024-06-03 11:08:24.761000+00:00, 2024-06-03 ...",2024-06-03 11:08:24.761000+00:00,"[0.4135593220338983, 0.34576271186440677, 0.29...","[0.0, 0.0711864406779661, 0.12203389830508475,..."
996,2251799814112667,"[1, 5, 6]","[2024-06-03 11:08:32.824000+00:00, 2024-06-03 ...",2024-06-03 11:08:32.824000+00:00,"[0.1016949152542373, 0.09152542372881356, 0.0]","[0.0, 0.013559322033898305, 0.10508474576271186]"
997,2251799814112774,"[1, 5, 0, 4, 4, 3, 6]","[2024-06-03 11:08:34.839000+00:00, 2024-06-03 ...",2024-06-03 11:08:34.839000+00:00,"[0.46779661016949153, 0.3898305084745763, 0.28...","[0.0, 0.07796610169491526, 0.1864406779661017,..."
998,2251799814112976,"[1, 5, 0, 4, 4, 3, 6]","[2024-06-03 11:09:24.179000+00:00, 2024-06-03 ...",2024-06-03 11:09:24.179000+00:00,"[0.3728813559322034, 0.288135593220339, 0.2067...","[0.0, 0.08813559322033898, 0.1694915254237288,..."


In [19]:
data_trace[:640].to_pickle("../../data/t1_train.pkl")
data_trace[640:800].to_pickle("../../data/t1_val.pkl")
data_trace[800:].to_pickle("../../data/t1_test.pkl")

In [30]:
data_trace.to_pickle("../../data/test1.pkl")

In [25]:
data_read = pd.read_pickle("../data/test1.pkl")

In [28]:
data_read.columns.values

array(['Case ID', 'Activity', 'endDate', 'Start Time', 'RemTime',
       'LapseTime'], dtype=object)

In [16]:
len(le_map["Activity"]['name_mapping'])

6