# LSTM

In [1]:
import xgboost as xgb
import numpy as np
import copy
import pickle
from torch.nn import functional as F
from sklearn import preprocessing
from joblib import dump, load
import torch
from torch import nn
import importlib

import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.Trainer import CaseDataSet
from src.Model import DLModels
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from src.Trainer import Regressor
from src.Trainer import Classifier

In [2]:
def split_data(log, train=0.64, val=0.16):
    """Cut ``log`` into consecutive training, validation and test slices.

    The rows are taken in their existing order (no shuffling): the first
    ``train`` fraction becomes the training set, the following ``val``
    fraction the validation set, and whatever is left the test set.

    Args:
        log: Any object exposing ``shape[0]`` and positional slicing
            (e.g. a DataFrame or ndarray).
        train: Fraction of rows assigned to the training slice.
        val: Fraction of rows assigned to the validation slice.

    Returns:
        Tuple ``(training_set, validation_set, test_set)`` of slices of ``log``.
    """
    n_rows = log.shape[0]
    # Both fractions are truncated independently, so the test slice absorbs
    # any rows lost to rounding.
    train_end = int(train * n_rows)
    val_end = train_end + int(val * n_rows)
    return log[:train_end], log[train_end:val_end], log[val_end:]


def norm_remtime(train, val, test, columns=["RemTime", "LapseTime"]):
    """Scale the given columns of all three splits by the training-set maximum.

    For every column the maximum is taken over the training split only
    (after flattening list-like cells with ``Series.explode``), and the same
    divisor is then applied to the training, validation and test splits so
    that no statistic of the held-out data influences the scaling.

    The input frames are left untouched; normalised copies are returned.

    Args:
        train: Training DataFrame; source of the per-column maximum.
        val: Validation DataFrame.
        test: Test DataFrame.
        columns: Names of the columns to normalise. Cells may be scalars or
            array-likes that support division by a scalar.

    Returns:
        Tuple ``(train, val, test)`` of normalised DataFrames.

    Note:
        A training maximum of zero is not special-cased; the division then
        follows the usual NumPy/pandas rules (inf/nan).
    """
    # Work on copies: the splits are usually slices of one log, and writing
    # into them in place is what triggers pandas' chained-assignment warning
    # and makes re-running the cell normalise twice.
    train, val, test = train.copy(), val.copy(), test.copy()
    for column in columns:
        # Series.explode takes no column name (that is DataFrame.explode);
        # passing one was silently interpreted as ``ignore_index``.
        max_value = train[column].explode().max()
        for frame in (train, val, test):
            frame[column] = frame[column] / max_value
    return train, val, test

# Read the pickled preset. pickle.load can execute arbitrary code, so only
# point this at preset files from a trusted source.
with open("../../presets/cs1.pkl", 'rb') as f:
    data = pickle.load(f)

# Pick one trace log, split it in row order and scale the time columns by the
# training-set maximum.
d1 = data["traces_dict"]["BPMN_cfc5_0_trace"]
train, val, test = norm_remtime(*split_data(d1))

# Task 1: flowNodeId + LapseTime features, RemTime label.
train_t1, val_t1, test_t1 = (
    CaseDataSet.CaseDataset(split, feature_list=["flowNodeId", "LapseTime"], label="RemTime", encoding="all")
    for split in (train, val, test)
)

# Task 2: same features, Next_flowNodeId label.
train_t2, val_t2, test_t2 = (
    CaseDataSet.CaseDataset(split, feature_list=["flowNodeId", "LapseTime"], label="Next_flowNodeId", encoding="all")
    for split in (train, val, test)
)

# Task 3: LapseTime as the only feature, RemTime label.
train_t3, val_t3, test_t3 = (
    CaseDataSet.CaseDataset(split, feature_list=["LapseTime"], label="RemTime", encoding="all")
    for split in (train, val, test)
)

In [35]:
# Train an LSTM regressor on the task-1 datasets (flowNodeId + LapseTime -> RemTime).
# The optimizer is handed over as a class, not an instance; the loss is L1 (mean absolute error).
# NOTE(review): the meaning of the positional arguments 256, 2, 10, 40, 10 is defined by
# Regressor.LstmRegressor, which is not visible in this notebook — confirm there and
# consider passing them by keyword.
optimizer = torch.optim.NAdam
loss = nn.L1Loss()
r1 = Regressor.LstmRegressor(train_t1, val_t1, 256, 2, optimizer, loss, 10, 40, 10)
r1.train()
# Validation scores recorded during training.
print(r1.val_score)
# Run the trained model on the held-out test dataset, then display what it collected.
r1.predict(test_t1)
r1.evaluation_list

[0.12450831 0.11135403 0.10062728 0.08927501 0.08432638 0.08398965
 0.07227605 0.07219254 0.0678773  0.07050477 0.06774218 0.06912424
 0.06965837 0.06776809 0.07040636 0.06695155 0.06758011 0.06733677
 0.06712783 0.06773037 0.06761889 0.06664499 0.06643182 0.06718372
 0.06689198 0.06652468 0.06646566 0.06660349 0.06723823 0.06724643
 0.06651557 0.06638288 0.06638517 0.06643062]


[[array([[0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874]], dtype=float32),
  array([[0.24752475],
         [0.26308346],
         [0.24045262],
         [0.27015558],
         [0.14144272],
         [0.24752475],
         [0.13437058],
         [0.14851485],
         [0.281471  ],
         [0.19943424]], dtype=float32)],
 [array([[0.1474337 ],
         [0.14681104],
         [0.14681104],
         [0.14681104],
         [0.1471217 ],
         [0.14681104],
         [0.1474337 ],
         [0.14681104],
         [0.14650175],
         [0.14650175]], dtype=float32),
  array([[0.23338048],
         [0.23479491],
         [0.21216407],
         [0.24186705],
         [0.12022631],
         [0.21923621],
         [0.12022631],
         [0.12022631],
         [0.24611032],
         [0.16407356]], dtype=float32)],
 [array([[0.11858921],
       

In [7]:
# Same regression setup as r1, but on the task-3 datasets (LapseTime as the only feature).
# NOTE(review): the saved run of this cell ended in KeyboardInterrupt, so the notebook
# holds no results for r2; re-run before relying on it.
# NOTE(review): the positional arguments 256, 2, 10, 40, 10 are interpreted by
# Regressor.LstmRegressor, which is not visible here — confirm their meaning.
optimizer = torch.optim.NAdam
loss = nn.L1Loss()
r2 = Regressor.LstmRegressor(train_t3, val_t3, 256, 2, optimizer, loss, 10, 40, 10)
r2.train()
# Validation scores recorded during training.
print(r2.val_score)
# Run the trained model on the held-out test dataset, then display what it collected.
r2.predict(test_t3)
r2.evaluation_list

KeyboardInterrupt: 

In [3]:
# Train an LSTM classifier on the task-2 datasets (label: Next_flowNodeId) with cross-entropy loss.
# The optimizer is handed over as a class, not an instance.
optimizer = torch.optim.NAdam
loss = nn.CrossEntropyLoss()
# Number of classes = size of the last feature dimension minus one.
# NOTE(review): presumably the feature vector is the flowNodeId encoding plus a single
# LapseTime column, so subtracting 1 leaves the number of flow nodes — confirm against CaseDataset.
num_class = train_t2[:][0].shape[-1]-1
# NOTE(review): the positional arguments 256, 2, 10, 40, 10 are interpreted by
# Classifier.LstmClassifier, which is not visible here — confirm their meaning.
c1 = Classifier.LstmClassifier(train_t2, val_t2, 256, 2, num_class, optimizer, loss, 10, 40, 10)
c1.train()
# Validation scores recorded during training.
c1.val_score

array([0.43692731, 0.18493647, 0.16269908, 0.15132849, 0.14558674,
       0.18359129, 0.18846746, 0.15515916, 0.15858121, 0.16469393,
       0.15647641, 0.14367554, 0.14895851, 0.15724686, 0.15263314,
       0.14575758, 0.14443267, 0.14309181, 0.17925664, 0.15606715,
       0.14401011, 0.16560391, 0.15992853, 0.1431518 , 0.14297299,
       0.15214764, 0.14506433, 0.18105437, 0.14808999, 0.15533164,
       0.1462348 , 0.1483689 , 0.16836962, 0.14643556, 0.15438226,
       0.14644883])

In [4]:
# Run the trained classifier on the held-out test dataset and display what it collected.
# NOTE(review): the saved run of this cell raised ValueError while concatenating arrays
# of size 10 and size 2 — presumably a final, smaller batch inside LstmClassifier.predict;
# confirm in src.Trainer.Classifier.
c1.predict(test_t2)
c1.evaluation_list

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 10 and the array at index 6 has size 2

In [30]:
# Inspect the first ten test cases: argmax over the last axis of each flowNodeId cell
# gives one index per event (presumably decoding a one-hot encoding — confirm).
test.iloc[:10]["flowNodeId"].apply(lambda x: np.argmax(x, axis=-1)).values

array([array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64)], dtype=object)

In [56]:

# Select the device used for inference and the module whose ``empty_cache``
# releases that device's cached memory. CPU is the default so the two names
# are always bound; MPS is used when available and CUDA takes precedence.
class _NoOpDevicePackage:
    """CPU stand-in exposing the ``empty_cache`` hook called during evaluation."""

    @staticmethod
    def empty_cache():
        """There is no device cache to release on CPU."""
        return None


torch_device = torch.device("cpu")
device_package = _NoOpDevicePackage

# Older torch builds have no MPS backend at all, hence the getattr probe.
mps_backend = getattr(torch.backends, "mps", None)
if mps_backend is not None and mps_backend.is_available():
    torch_device = torch.device("mps")
    # torch.mps (with empty_cache) only exists in newer releases; fall back to the no-op.
    device_package = getattr(torch, "mps", _NoOpDevicePackage)
if torch.cuda.is_available():
    torch_device = torch.device("cuda")
    device_package = torch.cuda
            
def evaluate_model(model, test_set, torch_device, device_package, batch_size=100):
    """Predict class indices for every prefix length of ``test_set``.

    For prefix lengths 1..``test_set.max_case_len`` the dataset is switched to
    that prefix length, read in full, and pushed through ``model`` in batches.
    Predictions and labels are both reduced with ``argmax`` over the last
    dimension and concatenated across batches and prefix lengths.

    Args:
        model: Callable model exposing ``flatten()``; called as ``model(x)``.
        test_set: Dataset exposing ``max_case_len``, ``set_prefix_length(k)``
            and ``test_set[:]`` returning ``(inputs, labels)`` tensors, or
            ``None`` once no case is long enough for the prefix length.
        torch_device: Device the input batches are moved to.
        device_package: Object exposing ``empty_cache()`` (e.g. ``torch.cuda``).
        batch_size: Maximum number of samples per forward pass.

    Returns:
        Tuple ``(predictions, references)`` of NumPy arrays. Two empty arrays
        are returned when no prefix length yields any sample.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Note:
        ``test_set`` is left at the last prefix length that was tried.
        NOTE(review): labels keep the original ``.T`` + ``hstack`` layout, so
        when the per-sample label carries an extra singleton dimension the
        references come back 2-D while the predictions are 1-D.
    """
    batch_size = int(batch_size)
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    res_list = []
    ref_list = []
    model.flatten()
    device_package.empty_cache()

    # Inference only: switch autograd off instead of building a graph per
    # batch and discarding it afterwards.
    with torch.no_grad():
        for prefix_len in range(1, test_set.max_case_len + 1):
            test_set.set_prefix_length(prefix_len)
            input_data = test_set[:]
            if input_data is None:
                # No data left for this (and any longer) prefix length.
                break
            features, labels = input_data[0], input_data[1]
            sample_num = features.shape[0]
            if sample_num == 0:
                # Nothing to stack for this prefix length.
                continue

            output_list = []
            label_list = []
            # A single stepped range covers the full batches and the shorter tail.
            for start in range(0, sample_num, batch_size):
                stop = start + batch_size
                x = features[start:stop].float().to(torch_device)
                y = labels[start:stop].float().argmax(dim=-1)
                outputs = model(x).detach().argmax(dim=-1)
                output_list.append(outputs.cpu().numpy())
                label_list.append(y.cpu().numpy().T)

                device_package.empty_cache()

            res_list.append(np.hstack(output_list))
            ref_list.append(np.hstack(label_list))

    if not res_list:
        empty = np.array([], dtype=np.int64)
        return empty, empty.copy()
    return np.hstack(res_list), np.hstack(ref_list)


In [57]:
# Evaluate the trained classifier on the test dataset with the notebook-local
# evaluate_model helper, using batches of 10 samples.
res, ref = evaluate_model(c1.model, test_t2, torch_device, device_package, batch_size=10)

In [58]:
# Predicted class indices returned by evaluate_model (full array is displayed).
res

array([8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
       8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,

In [59]:
# Reference class indices (argmax of the labels) returned by evaluate_model.
ref

array([[8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 9, 3, 9, 3, 9, 3, 5, 3, 9, 5, 9, 9, 9, 9, 9, 9, 5, 3, 9, 5,
        9, 9, 9, 5, 9, 9, 9, 9, 9, 9, 5, 5, 9, 9, 9, 9, 3, 3, 5, 3, 9, 9,
        3, 3, 9, 5, 3, 9, 5, 9, 5, 9, 