# Regressor Prototype

In [3]:
import xgboost as xgb
import copy

import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pickle as pickle
from src.Utils import DataUtil
from src.Trainer import CaseDataSet
pd.options.mode.chained_assignment = None  # default='warn'

In [44]:
def split_data(log, train=0.64, val=0.16):
    """Cut ``log`` into consecutive train / validation / test slices.

    The split is purely positional (no shuffling): the first ``train``
    fraction of rows goes to training, the next ``val`` fraction to
    validation, and whatever remains to the test slice.

    Args:
        log: Any object exposing ``shape[0]`` and supporting slice indexing.
        train: Fraction of rows assigned to the training slice.
        val: Fraction of rows assigned to the validation slice.

    Returns:
        A ``(training_set, validation_set, test_set)`` tuple of slices.
    """
    n_rows = log.shape[0]
    # Each fraction is truncated to a whole row count on its own, so any
    # rounding remainder ends up in the test slice.
    train_end = int(train * n_rows)
    val_end = train_end + int(val * n_rows)
    return log[:train_end], log[train_end:val_end], log[val_end:]


def norm_remtime(train, val, test, columns=("RemTime", "LapseTime")):
    """Scale the given columns of all three splits by the training maximum.

    For every column the maximum is taken from ``train`` only and then used
    as the divisor for ``train``, ``val`` and ``test``, so validation and
    test values can exceed 1.0. The frames are modified in place and also
    returned.

    Args:
        train: Training frame; supplies the per-column maximum.
        val: Validation frame, scaled with the training maximum.
        test: Test frame, scaled with the training maximum.
        columns: Names of the columns to scale.

    Returns:
        The same ``(train, val, test)`` objects after scaling.
    """
    for column in columns:
        # explode() flattens list-like cells to one value per row so the
        # maximum is taken over individual values; scalar cells pass through
        # unchanged. It takes no column argument on a Series (its only
        # parameter is ``ignore_index``), so none is passed.
        max_value = train[column].explode().max()
        for frame in (train, val, test):
            frame.loc[:, column] = frame[column] / max_value
    return train, val, test

# Load the pickled preset bundle.
# NOTE(review): pickle.load can execute arbitrary code from the file, so this
# path must only ever point at a trusted, locally produced artifact.
with open("../../presets/t2.pkl", 'rb') as f:
    data = pickle.load(f)

# Pick one trace log out of the bundle, split it by row position into
# train/val/test, then scale the time columns by the training-set maximum.
d1 = data["traces_dict"]["bpmn1-log_trace"]
train, val, test = split_data(d1)
train, val, test = norm_remtime(train, val, test)
# Wrap each split in a CaseDataset using the same features, label and encoding.
# NOTE(review): the meaning of encoding="Last" is defined in
# src.Trainer.CaseDataSet and is not visible here - confirm there.
train_t1 = CaseDataSet.CaseDataset(train, feature_list=["flowNodeId", "LapseTime"], label="RemTime", encoding="Last")
val_t1 = CaseDataSet.CaseDataset(val, feature_list=["flowNodeId", "LapseTime"], label="RemTime", encoding="Last")
test_t1 = CaseDataSet.CaseDataset(test, feature_list=["flowNodeId", "LapseTime"], label="RemTime", encoding="Last")

In [72]:
class XgbRegressor:
    """XGBoost regressor fitted on every prefix length of a case dataset.

    The training and validation datasets are expanded once, at construction
    time, into stacked feature/label matrices covering prefix lengths
    ``1 .. max_case_len``. The validation matrices serve as the evaluation
    set for early stopping during :meth:`train`.

    Attributes:
        training_set: Dataset used for fitting.
        validation_set: Dataset used for early stopping and :meth:`score`.
        dataset_list: ``[training_set, validation_set]``.
        data_list: Empty list; not used by this class.
        reg: The underlying ``xgb.XGBRegressor`` (absolute-error objective).
        train_input: ``[features, labels]`` arrays built from ``training_set``.
        val_input: ``[features, labels]`` arrays built from ``validation_set``.
        eval_result: Per-round evaluation history, filled in by :meth:`train`.
    """

    def __init__(self, training_set, validation_set, tree_method="hist", early_stopping_rounds=2):
        """Build the regressor and materialize the train/validation matrices.

        Args:
            training_set: Dataset exposing ``max_case_len``,
                ``set_prefix_length`` and slice indexing (see
                :meth:`generate_data_set`).
            validation_set: Dataset with the same interface.
            tree_method: Forwarded to ``xgb.XGBRegressor``.
            early_stopping_rounds: Forwarded to ``xgb.XGBRegressor``.
        """
        self.training_set = training_set
        self.validation_set = validation_set
        self.dataset_list = [self.training_set, self.validation_set]
        self.data_list = []
        self.reg = xgb.XGBRegressor(objective="reg:absoluteerror", tree_method=tree_method,
                                    early_stopping_rounds=early_stopping_rounds)
        self.train_input = self.generate_data_set(self.training_set)
        self.val_input = self.generate_data_set(self.validation_set)
        self.eval_result = {}

    def generate_data_set(self, dataset):
        """Stack the dataset's samples for every prefix length into two arrays.

        Side effect: the dataset's prefix length is left at ``max_case_len``.

        Args:
            dataset: Object exposing ``max_case_len``,
                ``set_prefix_length(k)`` and ``dataset[:]`` returning a
                ``(features, labels)`` pair whose items have ``.numpy()``.

        Returns:
            ``[features, labels]`` where each entry is the row-wise stack of
            the per-prefix-length arrays, in increasing prefix-length order.

        Raises:
            ValueError: If ``dataset.max_case_len`` is below 1, i.e. there is
                nothing to stack.
        """
        feature_list = []
        label_list = []
        for prefix_len in range(1, dataset.max_case_len + 1):
            dataset.set_prefix_length(prefix_len)
            # Materialize the slice once so features and labels come from the
            # same call and the dataset is not encoded twice per prefix.
            batch = dataset[:]
            feature_list.append(batch[0].numpy())
            label_list.append(batch[1].numpy())
        if not feature_list:
            raise ValueError("dataset.max_case_len must be at least 1 to build a data set")
        return [np.vstack(feature_list), np.vstack(label_list)]

    def train(self):
        """Fit the regressor with the validation matrices as the eval set.

        After fitting, the evaluation history reported by the regressor is
        stored in ``self.eval_result``.
        """
        self.reg.fit(self.train_input[0], self.train_input[1],
                     eval_set=[(self.val_input[0], self.val_input[1])], verbose=False)
        self.eval_result = self.reg.evals_result()

    def score(self):
        """Return the regressor's ``score`` on the validation matrices.

        For scikit-learn style regressors this is the R^2 coefficient.
        """
        return self.reg.score(self.val_input[0], self.val_input[1])

    def predict(self, test_set):
        """Predict on every prefix length of ``test_set``.

        Args:
            test_set: Dataset with the interface described in
                :meth:`generate_data_set`.

        Returns:
            ``(predictions, labels)``: the model output for the stacked
            features and the matching stacked ground-truth labels.
        """
        test_input = self.generate_data_set(test_set)
        return self.reg.predict(test_input[0]), test_input[1]

In [73]:
# Build the regressor on the train/validation datasets; early_stopping_rounds=5
# overrides the class default of 2.
r1 = XgbRegressor(train_t1, val_t1, tree_method="hist", early_stopping_rounds=5)

In [74]:
# Fit the model, using the validation matrices as the evaluation set.
r1.train()

In [76]:
# Predict over every prefix length of the test dataset; the result is a
# (predictions, labels) pair.
r1.predict(test_t1)

(array([2.1165352e-01, 2.1165352e-01, 2.1165352e-01, ..., 2.5715224e-05,
        2.5715224e-05, 2.5715224e-05], dtype=float32),
 array([[0.24752475],
        [0.26308345],
        [0.24045262],
        ...,
        [0.        ],
        [0.        ],
        [0.        ]]))