# Case Data Set

In [1]:
import pandas as pd
import numpy as np

import torch
from torch.nn import functional as F
from torch.utils.data import Dataset

In [12]:
class CaseDataset(Dataset):
    """Prefix-based dataset over a DataFrame holding one row per case.

    Each row is expected to carry, per column, a sequence with one entry per
    event of the case. For a given prefix length ``k`` the dataset yields the
    first ``k`` events of the feature columns as input and one entry of the
    label column as target (event ``k + 1`` for next-event prediction,
    event ``k`` otherwise). Cases that are too short for the current prefix
    length are skipped.

    Args:
        data: DataFrame with the feature columns, the label column and, if
            ``filter_caseID`` is used, a ``"CaseID"`` column. The frame is not
            modified; the dataset works on its own copy.
        input_data: Accepted for interface compatibility; not used here.
        data_version: Accepted for interface compatibility; not used here.
        features: Names of the feature columns. Defaults to no features.
        label: Name of the label column. A ``"Next_"`` / ``"next_"`` prefix is
            stripped and switches the dataset to next-event prediction, e.g.
            ``"Next_Activity"`` reads the ``"Activity"`` column.
        encoding: ``"Last"`` keeps only the last event of the prefix,
            ``"Agg_Mean"`` averages the prefix over its first axis, any other
            value keeps the whole prefix.

    NOTE(review): label cells must support ``len()`` and integer indexing and
    presumably hold torch tensors per event (batches are built with
    ``torch.stack``) — confirm against the data preparation code.
    """

    def __init__(self, data, input_data="helpdesk", data_version="_train", features=None,
                 label="Next_Activity", encoding="all"):
        # Shallow copy: adding "Case_Length" below must not leak into the
        # caller's frame (which may back several datasets at once).
        self.data_all = data.copy(deep=False)
        # None instead of a shared mutable default; copy so later changes to
        # the caller's list do not alter this dataset.
        self.feature_list = [] if features is None else list(features)
        self.encoding = encoding
        self.label = label
        self.next_event_prediction = False
        if label.startswith(("Next_", "next_")):
            self.label = label[5:]
            self.next_event_prediction = True

        # Measure case length on the column that is actually read later
        # (the stripped name), not on the raw ``label`` argument.
        self.data_all["Case_Length"] = self.data_all[self.label].apply(len)
        self.prefix_length = 1
        self.data_pool = self.data_all.copy()
        self.max_case_len = self._pool_max_case_len()

    def _pool_max_case_len(self):
        """Return the longest case length in the current pool, 0 if it is empty."""
        if self.data_pool.shape[0] == 0:
            return 0
        return self.data_pool["Case_Length"].max()

    def filter_caseID(self, ID=None):
        """Restrict the pool to the case with ``CaseID == ID``.

        Passing ``None`` resets the pool to all cases. ``max_case_len`` is
        refreshed for the new pool and is 0 when no row matches.
        """
        if ID is None:
            self.data_pool = self.data_all.copy()
        else:
            # Boolean selection first, then copy: avoids duplicating the whole frame.
            self.data_pool = self.data_all[self.data_all["CaseID"] == ID].copy()
        self.max_case_len = self._pool_max_case_len()

    def set_prefix_length(self, prefix_len):
        """Set the number of leading events used as model input."""
        self.prefix_length = prefix_len

    def shuffle_data(self):
        """Randomly permute the rows of the current pool."""
        self.data_pool = self.data_pool.sample(frac=1)

    def update_data_pool(self):
        """Return the pool rows that are long enough for the current prefix.

        Next-event prediction needs one event beyond the prefix, so the
        required case length is ``prefix_length + 1`` in that mode. The pool
        itself is left untouched; a filtered frame is returned.
        """
        required_len = self.prefix_length
        if self.next_event_prediction:
            required_len += 1
        return self.data_pool[self.data_pool["Case_Length"] >= required_len]

    def convert_feature_vec(self, data):
        """Encode the feature cells of one case for the current prefix length.

        Args:
            data: One row restricted to the feature columns; its cell values
                are combined with ``np.hstack``.

        Returns:
            The combined array indexed according to ``self.encoding``: entry
            ``prefix_length - 1`` for ``"Last"``, the mean over the first
            ``prefix_length`` entries for ``"Agg_Mean"``, otherwise the first
            ``prefix_length`` entries.

        NOTE(review): this assumes each cell is 2-D (events x feature dims) so
        that the first axis of the stacked array is the event axis — confirm.
        """
        data_com = np.hstack(data.values)
        if self.encoding == "Last":
            return data_com[self.prefix_length - 1]
        if self.encoding == "Agg_Mean":
            return np.mean(data_com[:self.prefix_length], axis=0)
        return data_com[:self.prefix_length]

    def convert_label_vec(self, label):
        """Pick the target entry from one case's label sequence.

        Returns the entry right after the prefix for next-event prediction,
        otherwise the entry of the last prefix event.
        """
        if self.next_event_prediction:
            return label[self.prefix_length]
        return label[self.prefix_length - 1]

    def __len__(self):
        """Return the number of cases usable with the current prefix length."""
        return self.update_data_pool().shape[0]

    def __getitem__(self, idx):
        """Return encoded features and targets for the selected case(s).

        Args:
            idx: Positional index into the eligible cases: an integer, a
                slice, a list of integers, or a tensor of either kind.

        Returns:
            For an integer index, ``(features, target)`` of that single case.
            For a slice or list, ``(features, targets)`` stacked along a new
            leading batch axis, or ``None`` when the selection is empty.

        Raises:
            IndexError: If an integer index is out of range.
        """
        pool = self.update_data_pool()
        if torch.is_tensor(idx):
            idx = idx.tolist()

        single = isinstance(idx, (int, np.integer))
        # Select rows before encoding so only the requested cases are
        # converted, not the whole pool on every access.
        rows = pool.iloc[[idx]] if single else pool.iloc[idx]
        if len(rows) == 0:
            return None

        x = [self.convert_feature_vec(row)
             for _, row in rows[self.feature_list].iterrows()]
        y = [self.convert_label_vec(cell) for cell in rows[self.label]]

        if single:
            # One sample, no batch axis: the shape DataLoader-style integer
            # access expects.
            return torch.tensor(np.asarray(x[0])), y[0]
        return torch.tensor(np.stack(x)), torch.stack(y)

In [18]:
# NOTE(review): this call does not match CaseDataset.__init__ as defined in this
# file: `project_data_path`, `act_column` and `feature_list` are not parameters
# there, and the required `data` argument is missing, so the call raises
# TypeError against that definition. Presumably written for an earlier
# constructor signature — TODO confirm and update.
ds1 = CaseDataset(project_data_path="../../data/", input_data="test1", data_version="",
            act_column="Activity", feature_list=["Activity", "LapseTime"], label="Next_Activity", encoding="all")

In [19]:
# Display the dataset's full underlying frame as the cell output.
ds1.data_all

In [20]:
# Use the first 3 events of each case as input, then fetch every eligible
# case at once through a full slice.
ds1.set_prefix_length(3)
ds1[:]