# Classifier Prototype

In [1]:
import xgboost as xgb
import copy

import numpy as np
import pandas as pd
import os
import sys
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from sklearn import preprocessing
    
import pickle as pickle
from src.Utils import DataUtil
from src.Trainer import CaseDataSet
pd.options.mode.chained_assignment = None  # default='warn'

In [7]:
def split_data(log, train=0.64, val=0.16):
    """Cut ``log`` into consecutive training, validation and test slices.

    The split is positional (no shuffling): the first ``train`` fraction of
    rows becomes the training set, the following ``val`` fraction the
    validation set, and whatever remains the test set.

    Args:
        log: Sliceable object exposing ``shape[0]`` as its row count.
        train: Fraction of rows assigned to the training set.
        val: Fraction of rows assigned to the validation set.

    Returns:
        A ``(training_set, validation_set, test_set)`` tuple of slices.
    """
    n_rows = log.shape[0]
    # Both boundaries are truncated independently, so rounding leftovers
    # always land in the test slice.
    train_end = int(train * n_rows)
    val_end = train_end + int(val * n_rows)
    return log[:train_end], log[train_end:val_end], log[val_end:]


def norm_remtime(train, val, test, columns=("RemTime", "LapseTime")):
    """Scale the given columns of all three splits by the training maximum.

    For each column the maximum is taken from ``train`` only and then used
    as the divisor for ``train``, ``val`` and ``test``, so the validation and
    test splits can contain values greater than 1.

    The frames are modified in place and also returned.

    Args:
        train: Training frame; the source of the scaling maximum.
        val: Validation frame, scaled with the training maximum.
        test: Test frame, scaled with the training maximum.
        columns: Names of the columns to normalise.

    Returns:
        The ``(train, val, test)`` frames that were passed in.
    """
    for column in columns:
        # explode() flattens list-like cells so the maximum is taken over
        # every individual value; scalar cells pass through unchanged.
        # NOTE(review): cells are presumably numpy arrays (a plain Python
        # list would not support the division below) - confirm.
        max_value = train[column].explode().max()
        train.loc[:, column] = train[column] / max_value
        val.loc[:, column] = val[column] / max_value
        test.loc[:, column] = test[column] / max_value
    return train, val, test

# Load the pickled preset. Unpickling can execute arbitrary code, so this
# file is expected to come from a trusted source.
with open("../../presets/t2.pkl", 'rb') as f:
    data = pickle.load(f)

d1 = data["traces_dict"]["bpmn1-log_trace"]

# Positional train/val/test split, then scale the time columns by the
# training-set maximum.
train, val, test = norm_remtime(*split_data(d1))

# Wrap every split in a CaseDataset with the same configuration; the keyword
# arguments are evaluated afresh for each split, so no list is shared.
train_t1, val_t1, test_t1 = (
    CaseDataSet.CaseDataset(
        split,
        feature_list=["flowNodeId", "LapseTime"],
        label="Next_flowNodeId",
        encoding="Last",
    )
    for split in (train, val, test)
)

In [8]:
class XgbClassifier:
    """XGBoost multi-class classifier trained on prefix-encoded case datasets.

    The datasets handed to this class must expose ``max_case_len``,
    ``set_prefix_length(n)``, truthiness (empty vs. non-empty for the current
    prefix length) and slicing via ``dataset[:]`` that yields a
    ``(features, labels)`` pair whose items have a ``.numpy()`` method.
    NOTE(review): presumably these are ``CaseDataSet.CaseDataset`` objects
    returning torch tensors - confirm.
    """

    def __init__(self, training_set, validation_set, tree_method="hist", early_stopping_rounds=2):
        """Prepare the training/validation matrices and build the estimator.

        Args:
            training_set: Dataset used to fit the label encoder and the model.
            validation_set: Dataset used as evaluation set during fitting.
            tree_method: Forwarded to ``xgb.XGBClassifier``.
            early_stopping_rounds: Forwarded to ``xgb.XGBClassifier``.

        Raises:
            ValueError: If either dataset yields no samples, or (from the
                label encoder) if the validation set contains a label that
                does not occur in the training set.
        """
        self.training_set = training_set
        self.validation_set = validation_set
        # The next two attributes are not read anywhere in this class; they
        # are kept so that external code relying on them keeps working.
        self.dataset_list = [self.training_set, self.validation_set]
        self.data_list = []
        self.le = preprocessing.LabelEncoder()
        # The training data must be generated first: it fits the encoder that
        # the validation data (and later test data) is transformed with.
        self.train_input = self.generate_data_set(self.training_set, training_set=True)
        self.val_input = self.generate_data_set(self.validation_set)
        self.clf = xgb.XGBClassifier(objective='multi:softprob', tree_method=tree_method,
                                     early_stopping_rounds=early_stopping_rounds,
                                     num_class=self.le.classes_.shape[0])
        self.eval_result = {}

    def generate_data_set(self, dataset, training_set=False):
        """Stack the samples of every prefix length into one feature/label pair.

        Prefix lengths ``1 .. dataset.max_case_len`` are visited in order and
        iteration stops at the first prefix length for which the dataset is
        empty. As a side effect the dataset is left at the last prefix length
        that was set.

        Args:
            dataset: Dataset to materialise (see the class docstring).
            training_set: When true, the label encoder is (re)fitted on the
                labels of this dataset before they are transformed.

        Returns:
            ``[features, encoded_labels]`` where ``features`` is the vertical
            stack of all per-prefix feature arrays and ``encoded_labels`` is
            the 1-D array of label-encoder indices.

        Raises:
            ValueError: If no prefix length produced any sample, or (from the
                label encoder) if a label was not seen when fitting.
        """
        feature_list = []
        label_list = []
        for prefix_len in range(1, dataset.max_case_len + 1):
            dataset.set_prefix_length(prefix_len)
            if not dataset:
                break
            # Materialise the slice once and reuse it for features and labels
            # instead of indexing the whole dataset twice.
            batch = dataset[:]
            feature_list.append(batch[0].numpy())
            label_list.append(batch[1].numpy())

        if not feature_list:
            raise ValueError("dataset produced no samples for any prefix length")

        features = np.vstack(feature_list)
        # Each label row is reduced to the index of its largest entry.
        # NOTE(review): assumes label rows are one-hot / score vectors - confirm.
        labels = np.argmax(np.vstack(label_list), axis=-1).ravel()
        if training_set:
            self.le.fit(labels)

        return [features, self.le.transform(labels)]

    def train(self):
        """Fit the model on the training data, evaluating on the validation data."""
        self.clf.fit(self.train_input[0], self.train_input[1],
                     eval_set=[(self.val_input[0], self.val_input[1])], verbose=False)

    def score(self):
        """Return the estimator's ``score`` on the validation data."""
        return self.clf.score(self.val_input[0], self.val_input[1])

    def predict(self, test_set):
        """Predict on ``test_set``.

        Returns:
            A ``(predictions, encoded_true_labels)`` tuple; the true labels
            are expressed as label-encoder indices.

        Raises:
            ValueError: If ``test_set`` yields no samples, or (from the label
                encoder) if it contains a label unseen during training.
        """
        test_input = self.generate_data_set(test_set)
        return self.clf.predict(test_input[0]), test_input[1]

In [10]:
# Build the classifier from the prepared training/validation datasets with
# tree_method="hist" and early_stopping_rounds=5 (overriding the class
# default of 2). Construction already materialises both datasets.
c1 = XgbClassifier(train_t1, val_t1, tree_method="hist", early_stopping_rounds=5)

In [11]:
# Fit the model; the validation data is passed as the evaluation set and
# per-round output is suppressed (verbose=False inside train()).
c1.train()

In [20]:
# Per-round 'mlogloss' history recorded during fitting; 'validation_0' is the
# first (and here only) entry of the eval_set passed in train().
c1.clf.evals_result_['validation_0']['mlogloss']

[0.9869080781936646,
 0.6740124821662903,
 0.4785560369491577,
 0.34587395191192627,
 0.2524484097957611,
 0.18539001047611237,
 0.13672105967998505,
 0.1011570617556572,
 0.07505466789007187,
 0.05584023147821426,
 0.04166677221655846,
 0.03119615092873573,
 0.02345160767436028,
 0.01771728508174419,
 0.01346680615097284,
 0.01031165756285191,
 0.00796549580991268,
 0.00621637655422091,
 0.00490759033709764,
 0.00392359308898449,
 0.003179234219715,
 0.00261170673184097,
 0.00217489316128194,
 0.00183542026206851,
 0.00156865303870291,
 0.00156865303870291,
 0.00156865303870291,
 0.00156865303870291,
 0.00156865303870291,
 0.00156865303870291]