# Pipeline for next activity prediction

In [1]:
import xgboost as xgb
import os
import sys
import pickle
import pandas as pd
import numpy as np
from joblib import dump, load
import copy

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import functional as F

from sklearn.metrics import mean_absolute_error
import importlib.util

# Make the directory two levels above the current working directory
# importable, so the `from src...` imports below can be resolved.
module_path = os.path.abspath("../..")
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.Trainer import CaseDataSet
from src.Trainer import NextActPredictor
from src.Model import DLModels

from sklearn import preprocessing
from sklearn.metrics import f1_score
    


# Select the compute backend. Later checks override earlier ones, so the
# effective priority is CUDA > MPS > CPU. `device_package` is the matching
# torch sub-module (torch.cpu / torch.mps / torch.cuda) for the chosen device.
torch_device, device_package = "cpu", torch.cpu

# Only query MPS when this torch build ships the `torch.backends.mps` module.
mps_importable = importlib.util.find_spec("torch.backends.mps") is not None
if mps_importable and torch.backends.mps.is_available():
    torch_device, device_package = torch.device("mps"), torch.mps

if torch.cuda.is_available():
    torch_device, device_package = torch.device("cuda"), torch.cuda

# Trailing bare expression so the notebook displays the selected device.
torch_device

In [6]:
# XGBoost run on the "bpmn19-log" data using the "Agg_Mean" encoding.
encoding = "Agg_Mean"

# The three splits share every setting except `data_version`; they are built
# in train -> val -> test order, each with its own fresh `feature_list`.
train, val, test = [
    CaseDataSet.CaseDataset(
        project_data_path="../../data/Test/",
        input_data="bpmn19-log",
        data_version=split_suffix,
        feature_list=["Activity", "LapseTime"],
        encoding=encoding,
        label="Next_Activity",
    )
    for split_suffix in ("_train", "_val", "_test")
]

# Train on the train/val pair and print what the trainer's score() reports.
# `test` is loaded above but is not used within this cell.
t1 = NextActPredictor.XGBTrainer(
    train,
    val,
    model_path="../../models/bpmn19-log",
    num_class=9,
)
t1.train()
print(t1.score())

In [7]:
# XGBoost run on the "bpmn1-log" data using the "Last" encoding and only the
# "Activity" feature.
encoding = "Last"

# The three splits share every setting except `data_version`; they are built
# in train -> val -> test order, each with its own fresh `feature_list`.
train, val, test = [
    CaseDataSet.CaseDataset(
        project_data_path="../../data/Test/",
        input_data="bpmn1-log",
        data_version=split_suffix,
        feature_list=["Activity"],
        encoding=encoding,
        label="Next_Activity",
    )
    for split_suffix in ("_train", "_val", "_test")
]

# Train on the train/val pair and print what the trainer's score() reports.
# `test` is loaded above but is not used within this cell.
t1 = NextActPredictor.XGBTrainer(
    train,
    val,
    model_path="../../models/bpmn1-log",
    num_class=9,
)
t1.train()
print(t1.score())

In [5]:
# Ad-hoc inspection: display item 20 of element [1][1] of the trainer's
# `data_list`.
# NOTE(review): the layout of `data_list` is defined in NextActPredictor and is
# not visible here -- presumably [1] selects the validation split and the inner
# [1] its labels; confirm before relying on this.
t1.data_list[1][1][20]

In [39]:
# Ad-hoc inspection: slice the whole `test` dataset and display the shape of
# the first item of the result.
# NOTE(review): presumably the first item is the feature tensor -- confirm
# against CaseDataset.__getitem__.
test[:][0].shape

In [3]:
# Train an XGBoost trainer on whatever `train`/`val` are currently bound to,
# print its score, and save the model.
# NOTE(review): in file order the most recently built `train`/`val` come from
# "bpmn1-log" (encoding "Last"), yet the model path here is
# "../../models/bpmn19-log". The execution counter on this cell (In [3])
# indicates it was run at a different point than its position suggests --
# confirm the loaded data matches the model path before re-running top to bottom.
t1 = NextActPredictor.XGBTrainer(train, val, model_path="../../models/bpmn19-log", num_class=9)
t1.train()
print(t1.score())
# presumably writes the trained model under `model_path` -- confirm in XGBTrainer.
t1.save_model()

In [41]:
# Build a predictor over the `test` split, pointing at the same model path the
# training cell uses, and print the value returned by predict().
# NOTE(review): presumably XGBPredictor loads the model previously stored by
# save_model() from `model_path` -- confirm; `num_class` must match training.
t2 = NextActPredictor.XGBPredictor(test, model_path="../../models/bpmn19-log", num_class=9)
print(t2.predict())


In [8]:
# Load the "bpmn1-log" splits with the "All" encoding.
encoding = "All"

# The three splits share every setting except `data_version`; they are built
# in train -> val -> test order, each with its own fresh `feature_list`.
train, val, test = [
    CaseDataSet.CaseDataset(
        project_data_path="../../data/Test/",
        input_data="bpmn1-log",
        data_version=split_suffix,
        feature_list=["Activity", "LapseTime"],
        encoding=encoding,
        label="Next_Activity",
    )
    for split_suffix in ("_train", "_val", "_test")
]

In [9]:
# --- Hyperparameters for the LSTM model -----------------------------------
learning_rate = 0.002
# NOTE(review): the XGBTrainer cell for the same "bpmn1-log" data passes
# num_class=9, while 10 is used here -- confirm whether the extra class is
# intentional.
num_classes = 10
num_layers = 1     # how many recurrent layers are stacked
hidden_size = 512  # feature count of the hidden state h
input_size = 11    # feature count expected in each input x

# --- Loss, model and optimizer --------------------------------------------
loss = nn.CrossEntropyLoss()

model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes)
model = model.to(torch_device)  # place the parameters on the selected device
optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate)

In [10]:
# Train the model with the project's train_model() helper and keep the three
# values it returns.
# - `loss` is passed twice (4th and 5th positional arguments).
# - The positional `2` is presumably the number of epochs -- confirm against
#   the train_model signature.
# - The second dataset passed is `val`, although the third returned value is
#   bound to `test_score`; NOTE(review): confirm what that value measures.
trained_model, train_score, test_score = NextActPredictor.train_model(model, optimizer, loss, loss, train, val, 2, torch_device, device_package, print_iter=True)

In [8]:
# Evaluate the trained model on the `test` split with batch_size=100; as the
# last expression of the cell, the returned value is displayed by the notebook.
# NOTE(review): the metric(s) returned are defined in NextActPredictor and are
# not visible here.
NextActPredictor.evaluate_model(trained_model, test, torch_device, device_package, batch_size=100)

In [31]:
# Ad-hoc inspection: set the prefix length of the `test` dataset to 1, then
# display the result of slicing the whole dataset.
# NOTE(review): set_prefix_length() is called on the shared `test` object, so
# presumably the setting persists for any later cell that uses `test` -- confirm
# and reset if needed.
test.set_prefix_length(1)
test[:]