# Remaining time predictor

In [3]:
import xgboost as xgb
import os
import sys
import pickle
import pandas as pd
import numpy as np
from joblib import dump, load
import copy

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader
from torch.nn import functional as F

from sklearn.metrics import mean_absolute_error
import importlib.util

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.Trainer import CaseDataSet
from src.Trainer import NextActPredictor
from src.Model import DLModels

from sklearn import preprocessing
from sklearn.metrics import f1_score
    


torch_device = "cpu"
device_package = torch.cpu
if importlib.util.find_spec("torch.backends.mps") is not None:
    if torch.backends.mps.is_available():
        torch_device = torch.device("mps")
        device_package = torch.mps
if torch.cuda.is_available():
    torch_device = torch.device("cuda")
    device_package = torch.cuda
    
torch_device

In [30]:
class XGBTrainer():
    """Train an ``xgb.XGBClassifier`` on prefix samples of two case data sets.

    The training and validation data sets must expose ``max_case_len``,
    ``set_prefix_length(n)`` and slicing (``ds[:]``) that yields a
    ``(features, labels)`` pair whose elements offer ``.numpy()``; a slice may
    also be ``None`` when no sample exists for the current prefix length.
    Labels are reduced with ``argmax`` over the last axis (presumably they are
    one-hot encoded -- TODO confirm) and re-encoded with a ``LabelEncoder``
    fitted on the training labels.

    Args:
        training_set: Data set used to fit the classifier and label encoder.
        validation_set: Data set used as ``eval_set`` and by ``score()``.
        num_class: Forwarded to ``xgb.XGBClassifier``.
        model_path: Path prefix for the saved model/encoder files.
        tree_method: Forwarded to ``xgb.XGBClassifier``.
        early_stopping_rounds: Forwarded to ``xgb.XGBClassifier``.
    """

    def __init__(self, training_set, validation_set, num_class, model_path, tree_method="hist", early_stopping_rounds=2):
        self.training_set = training_set
        self.validation_set = validation_set
        self.model_path = model_path
        self.dataset_list = [self.training_set, self.validation_set]
        # data_list[0] -> [train features, train labels],
        # data_list[1] -> [validation features, validation labels]
        self.data_list = []
        self.clf = xgb.XGBClassifier(objective='multi:softprob', tree_method=tree_method, early_stopping_rounds=early_stopping_rounds, num_class=num_class)
        self.le = preprocessing.LabelEncoder()
        self.generate_data_set()

    def generate_data_set(self):
        """Stack the samples of every prefix length into ``self.data_list``.

        Raises:
            ValueError: If a data set yields no samples at all.
        """
        # Start from scratch so a repeated call cannot append to (or
        # re-encode) entries left over from a previous call.
        self.data_list = []
        for i, dataset in enumerate(self.dataset_list):
            feature_list = []
            label_list = []
            for prefix_len in range(1, dataset.max_case_len - 1):
                dataset.set_prefix_length(prefix_len)
                # Slice once so features and labels come from the same read.
                samples = dataset[:]
                if samples is None:
                    # Same convention as the LSTM loops in this file: a None
                    # slice means there is nothing left for longer prefixes.
                    break
                feature_list.append(samples[0].numpy())
                label_list.append(samples[1].numpy())

            if not feature_list:
                raise ValueError("data set %d produced no samples for any prefix length" % i)

            features = np.vstack(feature_list)
            labels = np.argmax(np.vstack(label_list), axis=-1)
            if i == 0:
                # The encoder only ever learns the classes seen in training.
                self.le.fit(labels)

            self.data_list.append([features, self.le.transform(labels)])

    def train(self):
        """Fit the classifier on the training data, evaluating on validation."""
        self.clf.fit(self.data_list[0][0], self.data_list[0][1], eval_set=[(self.data_list[1][0], self.data_list[1][1])])

    def score(self):
        """Return ``clf.score`` computed on the validation data."""
        return self.clf.score(self.data_list[1][0], self.data_list[1][1])

    def save_model(self):
        """Write the classifier and the label encoder next to ``model_path``."""
        self.clf.save_model(self.model_path + "_XGB.json")
        dump(self.le, self.model_path + "_le.joblib")

    def load_model(self):
        """Restore the classifier and the label encoder saved by ``save_model``."""
        self.clf.load_model(self.model_path + "_XGB.json")
        self.le = load(self.model_path + "_le.joblib")
        

class XGBPredictor():
    """Load a saved XGBoost classifier and run it on prefix samples of a data set.

    The constructor reads ``<model_path>_XGB.json`` and
    ``<model_path>_le.joblib`` and then prepares the test data.  The test set
    must expose ``max_case_len``, ``set_prefix_length(n)`` and slicing
    (``ds[:]``) that yields a ``(features, labels)`` pair whose elements offer
    ``.numpy()``; a slice may also be ``None`` when no sample exists for the
    current prefix length.

    Args:
        test_set: Data set to predict on.
        num_class: Forwarded to ``xgb.XGBClassifier``.
        model_path: Path prefix of the saved model/encoder files.
        tree_method: Forwarded to ``xgb.XGBClassifier``.
        early_stopping_rounds: Forwarded to ``xgb.XGBClassifier``.
    """

    def __init__(self, test_set, num_class, model_path, tree_method="hist", early_stopping_rounds=2):
        self.test_set = test_set
        self.model_path = model_path
        # [features, encoded labels] once generate_data_set() has run.
        self.data_list = []
        self.clf = xgb.XGBClassifier(objective='multi:softprob', tree_method=tree_method, early_stopping_rounds=early_stopping_rounds, num_class=num_class)
        self.clf.load_model(self.model_path + "_XGB.json")
        self.le = load(self.model_path + "_le.joblib")
        self.generate_data_set()

    def generate_data_set(self):
        """Stack the samples of every prefix length into ``self.data_list``.

        Raises:
            ValueError: If the test set yields no samples at all.
        """
        feature_list = []
        label_list = []
        for prefix_len in range(1, self.test_set.max_case_len - 1):
            self.test_set.set_prefix_length(prefix_len)
            # Slice once so features and labels come from the same read.
            samples = self.test_set[:]
            if samples is None:
                # Same convention as the LSTM loops in this file: a None slice
                # means there is nothing left for longer prefixes.
                break
            feature_list.append(samples[0].numpy())
            label_list.append(samples[1].numpy())

        if not feature_list:
            raise ValueError("test set produced no samples for any prefix length")

        # argmax over the last axis turns the label rows into class indices
        # (presumably the labels are one-hot encoded -- TODO confirm).
        labels = np.argmax(np.vstack(label_list), axis=-1)
        self.data_list = [np.vstack(feature_list), self.le.transform(labels)]

    def predict(self):
        """Return ``(predicted classes, encoded true labels)`` for the test data."""
        return self.clf.predict(self.data_list[0]), self.data_list[1]
        
        
        

In [5]:
# Build the three data sets for the XGBoost cells.  They share every setting
# and differ only in the `data_version` suffix of the "dummy" input data.
encoding = "Agg_Mean"
train, val, test = (
    CaseDataSet.CaseDataset(
        project_data_path="../../data/",
        input_data="dummy",
        data_version=version_suffix,
        feature_list=["Activity", "LapseTime"],
        encoding=encoding,
        label="Next_Activity",
    )
    for version_suffix in ("_train", "_val", "_test")
)


In [8]:
# Fit the XGBoost classifier on the train/val data sets, report the score and
# persist the model artifacts.
t1 = XGBTrainer(train, val, model_path="../../models/test", num_class=6)
t1.train()
# score() is computed on the validation data held by the trainer.
print(t1.score())
# Writes "<model_path>_XGB.json" and "<model_path>_le.joblib".
t1.save_model()

In [31]:
# Reload the saved classifier and label encoder, then predict on the test set.
t2 = XGBPredictor(test, model_path="../../models/test", num_class=6)
# Returns (predicted classes, encoded true labels); shown as the cell output.
t2.predict()

In [32]:
# Encoded ground-truth labels of the test samples (second entry of data_list).
t2.data_list[1]

## LSTM

In [25]:
def train_model_epoch(model, training_set, optimizer, criterion, torch_device,
                      batch_size=50, training=True):
    """Run one pass over every prefix length of a data set.

    For prefix lengths ``1 .. max_case_len - 2`` the data set is switched to
    that prefix length, shuffled and processed in mini-batches.  The loop stops
    early when the data set returns ``None`` for a prefix length.

    Args:
        model: Callable mapping a float feature batch to class scores.
        training_set: Data set exposing ``max_case_len``,
            ``set_prefix_length``, ``shuffle_data`` and slicing that yields a
            ``(features, labels)`` pair of tensors.
        optimizer: Optimizer stepped after every batch; only used when
            ``training`` is true.
        criterion: Loss called as ``criterion(outputs, class_indices)``.
        torch_device: Device the batches are moved to.
        batch_size: Number of samples per mini-batch (positive integer).
        training: When false the pass is evaluation-only: no optimizer steps
            are taken and autograd is disabled.

    Returns:
        Tuple ``(losses, sample_counts)`` of NumPy arrays with one entry per
        processed prefix length.  Each loss entry is the sum of the
        per-batch criterion values, so its scale depends on the criterion's
        reduction and on ``batch_size``.
    """
    loss_prefix_list = []
    sample_num_list = []
    # Autograd bookkeeping is only needed when the weights are being updated;
    # disabling it for evaluation passes avoids building graphs for nothing.
    with torch.set_grad_enabled(training):
        for prefix_len in range(1, training_set.max_case_len - 1):
            training_set.set_prefix_length(prefix_len)
            training_set.shuffle_data()
            input_data = training_set[:]
            if input_data is None:
                break
            features, labels = input_data[0], input_data[1]
            sample_num = features.shape[0]
            sample_num_list.append(sample_num)

            loss_prefix = 0
            # Stepping by batch_size covers the full batches and the shorter
            # trailing batch with a single loop.
            for start in range(0, sample_num, batch_size):
                stop = start + batch_size
                x = features[start:stop].float().to(torch_device)
                # Label rows are reduced to class indices for the criterion.
                y = labels[start:stop].float().to(torch_device).argmax(dim=1)
                outputs = model(x)
                loss = criterion(outputs, y)
                if training:
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()
                loss_prefix = loss_prefix + loss.item()

            loss_prefix_list.append(loss_prefix)
    return np.array(loss_prefix_list), np.array(sample_num_list)


def train_model(model, optimizer, criterion, criterion_eval, training_set,
                test_set, batch_size, torch_device, device_package,
                max_epoch=100, max_ob_iter=20, score_margin=1e-4, print_iter=False):
    """Train ``model`` with early stopping on the evaluation score.

    Every epoch runs one training pass over ``training_set`` and one
    evaluation-only pass over ``test_set`` (both via ``train_model_epoch``).
    The per-epoch score is the summed loss divided by the summed sample count.
    A deep copy of the model is kept whenever the evaluation score improves by
    more than ``score_margin``; training stops once more than ``max_ob_iter``
    epochs have passed since the last improvement.

    Args:
        model: Model to train (updated in place).
        optimizer: Optimizer used for the training pass.
        criterion: Loss used for the training pass.
        criterion_eval: Loss used for the evaluation pass.
        training_set: Data set for the training pass.
        test_set: Data set for the evaluation pass.
        batch_size: Mini-batch size for both passes.
        torch_device: Device the batches are moved to.
        device_package: Backend module (e.g. ``torch.cuda``); its
            ``empty_cache`` function is called when it has one.
        max_epoch: Maximum number of epochs.
        max_ob_iter: Number of epochs without improvement that is tolerated.
        score_margin: Minimum improvement that counts as a new best score.
        print_iter: Print the evaluation score after every epoch.

    Returns:
        Tuple ``(best_model, train_scores, eval_scores)``.  ``best_model`` is
        ``None`` when no epoch produced a comparable (non-NaN) score, e.g.
        when ``max_epoch`` is 0.
    """
    # Not every backend module is guaranteed to offer empty_cache (torch.cpu,
    # the notebook's CPU fallback, may not); treat a missing one as "nothing
    # to release" instead of crashing with AttributeError.
    empty_cache = getattr(device_package, "empty_cache", None)
    if not callable(empty_cache):
        def empty_cache():
            """No-op stand-in for backends without a cache to release."""

    train_score_list = []
    test_score_list = []
    # Start from +inf so the first finite evaluation score always becomes the
    # best one, however large the loss is.
    score = float("inf")
    best_iter = 0
    best_model = None
    for iter_epoch in range(max_epoch):
        empty_cache()
        loss_train, sample_num_train = train_model_epoch(model, training_set, batch_size=batch_size,
                                                         optimizer=optimizer,
                                                         criterion=criterion,
                                                         torch_device=torch_device)
        empty_cache()
        loss_test, sample_num_test = train_model_epoch(model, test_set, batch_size=batch_size,
                                                       optimizer=optimizer,
                                                       criterion=criterion_eval,
                                                       torch_device=torch_device,
                                                       training=False)

        score_train = np.sum(loss_train) / np.sum(sample_num_train)
        score_test = np.sum(loss_test) / np.sum(sample_num_test)
        train_score_list.append(score_train)
        test_score_list.append(score_test)

        if score_test < (score - score_margin):
            score = score_test
            # Snapshot the weights; `model` itself keeps training afterwards.
            best_model = copy.deepcopy(model)
            best_iter = iter_epoch

        if iter_epoch > best_iter + max_ob_iter:
            break
        if print_iter:
            print("Finished training iteration: ", iter_epoch, " with val loss: ", score_test)
    empty_cache()
    return best_model, np.array(train_score_list), np.array(test_score_list)


def evaluate_model(model, test_set, torch_device, device_package, batch_size=100):
    """Collect class probabilities and labels for every prefix length.

    For prefix lengths ``1 .. max_case_len - 1`` the data set is switched to
    that prefix length, shuffled and pushed through ``model`` in mini-batches.
    The loop stops early when the data set returns ``None``.

    Args:
        model: Model to evaluate; must provide ``flatten()`` and be callable
            on a float feature batch.
        test_set: Data set exposing ``max_case_len``, ``set_prefix_length``,
            ``shuffle_data`` and slicing that yields a ``(features, labels)``
            pair of tensors.
        torch_device: Device the feature batches are moved to.
        device_package: Backend module (e.g. ``torch.cuda``); its
            ``empty_cache`` function is called when it has one.
        batch_size: Number of samples per mini-batch (positive integer).

    Returns:
        Tuple ``(evaluation_list, sample_counts)``.  ``evaluation_list`` holds
        one ``[probabilities, labels]`` pair of NumPy arrays per processed
        prefix length (rows in the shuffled order of that prefix length);
        ``sample_counts`` is a NumPy array with the matching sample numbers.
    """
    # Not every backend module is guaranteed to offer empty_cache (torch.cpu,
    # the notebook's CPU fallback, may not); treat a missing one as "nothing
    # to release" instead of crashing with AttributeError.
    empty_cache = getattr(device_package, "empty_cache", None)
    if not callable(empty_cache):
        def empty_cache():
            """No-op stand-in for backends without a cache to release."""

    evaluation_list = []
    sample_num_list = []
    # Project-specific hook on the model; presumably it compacts the RNN
    # parameters -- TODO confirm against DLModels.
    model.flatten()
    empty_cache()
    # Inference only: skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for prefix_len in range(1, test_set.max_case_len):
            test_set.set_prefix_length(prefix_len)
            test_set.shuffle_data()
            input_data = test_set[:]
            if input_data is None:
                break
            features, labels = input_data[0], input_data[1]
            sample_num = features.shape[0]
            sample_num_list.append(sample_num)

            output_list = []
            label_list = []
            # Stepping by batch_size covers the full batches and the shorter
            # trailing batch with a single loop.
            for start in range(0, sample_num, batch_size):
                stop = start + batch_size
                x = features[start:stop].float().to(torch_device)
                y = labels[start:stop].float()
                outputs = model(x).detach()
                prob = F.softmax(outputs, dim=-1)
                output_list.append(prob.cpu().numpy())
                label_list.append(y.cpu().numpy())

                empty_cache()

            evaluation_list.append([np.vstack(output_list),
                                    np.vstack(label_list)])

    return evaluation_list, np.array(sample_num_list)

In [11]:
# Build the data sets for the LSTM cells.
# NOTE(review): `train` and `test` are created from identical arguments, so the
# evaluation below runs on the same data version as training -- confirm this
# is intended.
encoding = "All"
train, test = (
    CaseDataSet.CaseDataset(
        project_data_path="../../data/",
        input_data="test1",
        data_version="",
        feature_list=["Activity", "LapseTime"],
        encoding=encoding,
        label="Next_Activity",
    )
    for _ in range(2)
)

In [29]:
# Hyperparameters
input_size = 7  # The number of expected features in the input x
hidden_size = 512  # The number of features in the hidden state h
num_layers = 1  # Number of recurrent layers
num_classes = 6  # Number of output classes (same value as num_class in the XGBoost cells)
learning_rate = 0.002  # Learning rate passed to the NAdam optimizer


# Build the LSTM classifier on the selected device and the optimizer over its parameters.
model = DLModels.SimpleLSTM(input_size, hidden_size, num_layers, num_classes).to(torch_device)
optimizer = torch.optim.NAdam(model.parameters(), lr=learning_rate)
# The training cell passes this object as both the training and the evaluation criterion.
loss = nn.CrossEntropyLoss()

In [30]:
# Train with early stopping; `loss` serves as training and evaluation
# criterion and the positional 100 is train_model's `batch_size`.
trained_model, train_score, test_score = train_model(model, optimizer, loss, loss, train, test,
            100, torch_device, device_package, print_iter=True)

In [31]:
# Per-prefix-length class probabilities and labels on `test`; shown as the cell output.
evaluate_model(trained_model, test, torch_device, device_package, batch_size=100)