# LSTM

In [2]:
import xgboost as xgb
import numpy as np
import copy
import pickle
from torch.nn import functional as F
from sklearn import preprocessing
from joblib import dump, load
import torch
from torch import nn
import importlib

import os
import sys
# Put the directory two levels above the notebook's working directory on sys.path
# (only once), so the `src.*` imports below can be resolved.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from src.Trainer import CaseDataSet
from src.Model import DLModels
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

from src.Trainer import Regressor
from src.Trainer import Classifier

In [32]:
def split_data(log, train=0.64, val=0.16):
    """Cut ``log`` into consecutive training / validation / test parts.

    The split is purely positional (no shuffling): the first ``train`` share
    of the rows forms the training set, the following ``val`` share forms the
    validation set, and whatever is left over becomes the test set.

    Args:
        log: Any sliceable object exposing ``shape[0]`` (e.g. a DataFrame or
            an ndarray).
        train: Fraction of rows assigned to the training part.
        val: Fraction of rows assigned to the validation part.

    Returns:
        Tuple ``(training_set, validation_set, test_set)`` of slices of ``log``.
    """
    n_rows = log.shape[0]
    # Both boundaries are truncated independently, exactly as fractions of n_rows.
    train_end = int(train * n_rows)
    val_end = train_end + int(val * n_rows)
    return log[:train_end], log[train_end:val_end], log[val_end:]


def norm_remtime(train, val, test, columns=("RemTime", "LapseTime")):
    """Scale the given columns of all three splits by the training-set maximum.

    For every column the maximum is taken over the training split only (cells
    holding sequences are flattened first), and that single value is used to
    divide the column in the training, validation and test splits alike.

    The inputs are not modified: scaled copies are returned. This keeps the
    frames the caller passed in (typically slices of one shared log) intact
    and makes the call safe to repeat.

    Args:
        train: Training DataFrame; the scaling factor is computed from it.
        val: Validation DataFrame.
        test: Test DataFrame.
        columns: Names of the columns to scale.

    Returns:
        Tuple ``(train, val, test)`` of new DataFrames with scaled columns.
    """
    # Copy first so that no write ever reaches the caller's (possibly sliced) frames.
    train, val, test = train.copy(), val.copy(), test.copy()
    for column in columns:
        # explode() flattens list-like cells so max() sees every individual value.
        max_value = train[column].explode().max()
        for frame in (train, val, test):
            frame[column] = frame[column] / max_value
    return train, val, test

# Read the pickled preset and select the "bpmn1-log_trace" log from it.
# NOTE: pickle.load can execute arbitrary code; only open preset files you trust.
with open("../../presets/t2.pkl", 'rb') as f:
    data = pickle.load(f)

d1 = data["traces_dict"]["bpmn1-log_trace"]

# Positional split, then scaling of the time columns by the training maximum.
train, val, test = norm_remtime(*split_data(d1))


def _case_datasets(feature_list, label):
    """Build CaseDatasets for train, val and the first 10 test rows with one shared configuration."""
    return [
        CaseDataSet.CaseDataset(frame, feature_list=list(feature_list), label=label, encoding="all")
        for frame in (train, val, test.iloc[:10])
    ]


# t1: flowNodeId + LapseTime -> RemTime
train_t1, val_t1, test_t1 = _case_datasets(["flowNodeId", "LapseTime"], "RemTime")

# t2: flowNodeId + LapseTime -> Next_flowNodeId
train_t2, val_t2, test_t2 = _case_datasets(["flowNodeId", "LapseTime"], "Next_flowNodeId")

# t3: LapseTime only -> RemTime
train_t3, val_t3, test_t3 = _case_datasets(["LapseTime"], "RemTime")

In [35]:
# Regression run on the t1 datasets (features flowNodeId + LapseTime, label RemTime).
# The optimizer is handed over as a class (not an instance); the loss is an L1Loss instance.
optimizer = torch.optim.NAdam
loss = nn.L1Loss()
# NOTE(review): the positional values 256, 2 and 10, 40, 10 are not explained in this
# notebook — presumably model size / layer count and training-loop settings; confirm
# against Regressor.LstmRegressor before changing them.
r1 = Regressor.LstmRegressor(train_t1, val_t1, 256, 2, optimizer, loss, 10, 40, 10)
r1.train()
# Print the validation scores collected by the regressor.
print(r1.val_score)
# Run prediction on the 10-row test dataset, then display the stored evaluation list.
r1.predict(test_t1)
r1.evaluation_list

[0.12450831 0.11135403 0.10062728 0.08927501 0.08432638 0.08398965
 0.07227605 0.07219254 0.0678773  0.07050477 0.06774218 0.06912424
 0.06965837 0.06776809 0.07040636 0.06695155 0.06758011 0.06733677
 0.06712783 0.06773037 0.06761889 0.06664499 0.06643182 0.06718372
 0.06689198 0.06652468 0.06646566 0.06660349 0.06723823 0.06724643
 0.06651557 0.06638288 0.06638517 0.06643062]


[[array([[0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874],
         [0.209874]], dtype=float32),
  array([[0.24752475],
         [0.26308346],
         [0.24045262],
         [0.27015558],
         [0.14144272],
         [0.24752475],
         [0.13437058],
         [0.14851485],
         [0.281471  ],
         [0.19943424]], dtype=float32)],
 [array([[0.1474337 ],
         [0.14681104],
         [0.14681104],
         [0.14681104],
         [0.1471217 ],
         [0.14681104],
         [0.1474337 ],
         [0.14681104],
         [0.14650175],
         [0.14650175]], dtype=float32),
  array([[0.23338048],
         [0.23479491],
         [0.21216407],
         [0.24186705],
         [0.12022631],
         [0.21923621],
         [0.12022631],
         [0.12022631],
         [0.24611032],
         [0.16407356]], dtype=float32)],
 [array([[0.11858921],
       

In [33]:
# Regression run on the t3 datasets (feature LapseTime only, label RemTime),
# otherwise configured exactly like the r1 run.
optimizer = torch.optim.NAdam
loss = nn.L1Loss()
r2 = Regressor.LstmRegressor(train_t3, val_t3, 256, 2, optimizer, loss, 10, 40, 10)
r2.train()
# Print the validation scores collected by the regressor.
print(r2.val_score)
# NOTE(review): the recorded output of this cell ends with
# "RuntimeError: input.size(-1) must be equal to input_size. Expected 1, got 11".
# Which call raised it is not visible here; the cell execution counts are out of
# order, so the datasets in the kernel may not have matched the code shown above.
# Re-run from a fresh kernel to verify — TODO confirm.
r2.predict(test_t3)
r2.evaluation_list

[0.12735227 0.12551778 0.12334696 0.12197108 0.11922642 0.11844719
 0.11018989 0.10475467 0.09869641 0.09850224 0.09305568 0.089002
 0.08406051 0.08234837 0.08179742 0.07886125 0.08124647 0.07715702
 0.07876909 0.07556665 0.07451895 0.07530215 0.07727634 0.07661998
 0.07481021 0.07503108 0.07286779 0.07262547 0.07235212 0.07208129
 0.07412994 0.07307745 0.07208864 0.0738491  0.07350873 0.0730037
 0.07124095 0.07196375 0.07119023 0.07450675]


RuntimeError: input.size(-1) must be equal to input_size. Expected 1, got 11

In [11]:
# Classification run on the t2 datasets (features flowNodeId + LapseTime,
# label Next_flowNodeId) with cross-entropy loss and the NAdam optimizer class.
optimizer = torch.optim.NAdam
loss = nn.CrossEntropyLoss()
# Size of the last axis of the first element of train_t2[:], minus one.
# NOTE(review): presumably element 0 is the feature tensor and the subtracted 1 is the
# single LapseTime column, leaving the number of flowNodeId classes — confirm against
# CaseDataset.
num_class = train_t2[:][0].shape[-1]-1
c1 = Classifier.LstmClassifier(train_t2, val_t2, 256, 2, num_class, optimizer, loss, 10, 40, 10)
c1.train()
# Display the validation scores collected by the classifier.
c1.val_score

array([7.54436871e-01, 1.88913205e-03, 9.69038412e-04, 6.15036423e-04,
       4.30429294e-04, 3.18632625e-04, 2.45185731e-04, 1.93974939e-04,
       1.56518674e-04, 1.28160336e-04, 1.06118688e-04, 8.86355526e-05,
       7.45818825e-05, 6.31474356e-05, 5.37730299e-05, 4.60373633e-05,
       3.95470127e-05, 3.40473456e-05, 2.94032762e-05, 2.54316904e-05,
       2.20207973e-05, 1.90784212e-05, 1.66236383e-05])

In [22]:
# Run the trained classifier on the 10-row t2 test dataset, then display the
# evaluation list stored on the classifier.
c1.predict(test_t2)
c1.evaluation_list

[[array([[6, 6, 6, 6, 6, 6, 6, 6, 6, 6]], dtype=int64),
  array([[6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [6]], dtype=int64)],
 [array([[8, 8, 8, 8, 8, 8, 8, 8, 8, 8]], dtype=int64),
  array([[8],
         [8],
         [8],
         [8],
         [8],
         [8],
         [8],
         [8],
         [8],
         [8]], dtype=int64)],
 [array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=int64),
  array([[1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1]], dtype=int64)],
 [array([[2, 2, 2, 2, 2, 2, 2, 2, 2, 2]], dtype=int64),
  array([[2],
         [2],
         [2],
         [2],
         [2],
         [2],
         [2],
         [2],
         [2],
         [2]], dtype=int64)],
 [array([[9, 9, 9, 9, 9, 9, 9, 9, 9, 9]], dtype=int64),
  array([[9],
         [9],
         [9],
         [9],
         [9],
         [9],
         [9]

In [30]:
# Index of the largest entry along the last axis of each flowNodeId cell,
# for the first 10 test rows (presumably decoding one-hot activity rows).
test["flowNodeId"].iloc[:10].apply(lambda encoded: np.argmax(encoded, axis=-1)).values

array([array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64),
       array([7, 6, 8, 1, 2, 9, 5, 3, 0, 4], dtype=int64)], dtype=object)

In [31]:
# Scaled LapseTime sequences of the first 10 test rows.
test["LapseTime"].iloc[:10].values

array([array([0.        , 0.01414427, 0.02828854, 0.04243281, 0.04243281,
              0.05657709, 0.09193777, 0.12729844, 0.23338048, 0.24752475]),
       array([0.        , 0.02828854, 0.05657709, 0.08486563, 0.08486563,
              0.12022631, 0.22630835, 0.24045262, 0.25459689, 0.26308345]),
       array([0.        , 0.02828854, 0.04950495, 0.06364922, 0.06364922,
              0.07779349, 0.10608204, 0.21216407, 0.22630835, 0.24045262]),
       array([0.        , 0.02828854, 0.05657709, 0.09193777, 0.09193777,
              0.1980198 , 0.21216407, 0.22630835, 0.23479491, 0.27015559]),
       array([0.        , 0.02121641, 0.03536068, 0.04950495, 0.04950495,
              0.07072136, 0.08486563, 0.10042433, 0.12871287, 0.14144272]),
       array([0.        , 0.02828854, 0.06364922, 0.20509194, 0.20509194,
              0.2135785 , 0.22065064, 0.22772277, 0.23479491, 0.24752475]),
       array([0.        , 0.01414427, 0.02828854, 0.04950495, 0.04950495,
              0.06364922, 