# Retrieve Data

In [63]:
import pm4py
import pandas as pd
from IPython.display import JSON
from datetime import datetime, timedelta
import itertools as it
import numpy as np

log = pm4py.read_xes('../Data/event_logs/BPI_Challenge_2012.xes')

parsing log, completed traces :: 100%|██████████| 13087/13087 [00:15<00:00, 820.30it/s] 


In [65]:
type(log)

pm4py.objects.log.obj.EventLog

## Get default parameters needed for loading data.

In [2]:
parameters ={}

column_names = {'Case ID': 'caseid',
                    'Activity': 'task',
                    'lifecycle:transition': 'event_type',
                    'Resource': 'user'}
    
parameters['one_timestamp'] = False  # Only one timestamp in the log
parameters['read_options'] = {
    'timeformat': '%Y-%m-%dT%H:%M:%S.%f',
    'column_names': column_names,
    'one_timestamp': parameters['one_timestamp']}

parameters['model_family'] = 'gru_cx'
parameters['opt_method'] = 'rand_hpc' # 'rand_hpc', 'bayesian'
parameters['max_eval'] = 1

In [3]:
def reorder_xes(temp_data):
    """
    this method match the duplicated events on the .xes log
    """
    temp_data = pd.DataFrame(temp_data)
    ordered_event_log = list()
    if parameters['one_timestamp']:
        column_names['Complete Timestamp'] = 'end_timestamp'
        temp_data = temp_data[temp_data.event_type == 'complete']
        ordered_event_log = temp_data.rename(
            columns={'timestamp': 'end_timestamp'})
        ordered_event_log = ordered_event_log.drop(columns='event_type')
        ordered_event_log = ordered_event_log.to_dict('records')
    else:
        column_names['Start Timestamp'] = 'start_timestamp'
        column_names['Complete Timestamp'] = 'end_timestamp'
        for caseid, group in  temp_data.groupby(by=['caseid']):
            trace = group.to_dict('records')
            temp_trace = list()
            for i in range(0, len(trace)-1):
                incomplete = False
                if trace[i]['event_type'] == 'start':
                    c_task_name = trace[i]['task']
                    remaining = trace[i+1:]
                    complete_event = next((event for event in remaining if (event['task'] == c_task_name and event['event_type'] == 'complete')), None)
                    if complete_event:
                        temp_trace.append(
                            {'caseid': caseid,
                              'task': trace[i]['task'],
                              'user': trace[i]['user'],
                              'start_timestamp': trace[i]['timestamp'],
                              'end_timestamp': complete_event['timestamp']})
                    else:
                        incomplete = True
                        break
            if not incomplete:
                ordered_event_log.extend(temp_trace)
    return ordered_event_log

In [4]:
def append_csv_start_end(data):
    '''
    The paper use this function to add start and end tasks to every trace.
    '''
    end_start_times = dict()
    for case, group in pd.DataFrame(data).groupby('caseid'):
        end_start_times[(case, 'Start')] = (
            group.start_timestamp.min()-timedelta(microseconds=1))
        end_start_times[(case, 'End')] = (
            group.end_timestamp.max()+timedelta(microseconds=1))
    new_data = list()
    data = sorted(data, key=lambda x: x['caseid'])
    for key, group in it.groupby(data, key=lambda x: x['caseid']):
        trace = list(group)
        for new_event in ['Start', 'End']:
            idx = 0 if new_event == 'Start' else -1
            temp_event = dict()
            temp_event['caseid'] = trace[idx]['caseid']
            temp_event['task'] = new_event
            temp_event['user'] = new_event
            temp_event['end_timestamp'] = end_start_times[(key, new_event)]
            if not parameters['one_timestamp']:
                temp_event['start_timestamp'] = end_start_times[(key, new_event)]
            if new_event == 'Start':
                trace.insert(0, temp_event)
            else:
                trace.append(temp_event)
        new_data.extend(trace)
    return new_data

In [5]:
try:
    source = log.attributes['source']
except:
    source = ''
    
print(source)




In [6]:
# Flat the event with caseid
flattern_log = ([{**event, 
                    'caseid': trace.attributes['concept:name']} 
                   for trace in log for event in trace])

JSON(flattern_log[:3])

<IPython.core.display.JSON object>

In [7]:
## Construct log to dataframe.
temp_data = pd.DataFrame(flattern_log)

print(len(temp_data))
display(temp_data.head(5))

262200


Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,caseid
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546000+02:00,173688
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+02:00,173688
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906000+02:00,173688
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875000+02:00,173688
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437000+02:00,173688


In [8]:
# formate time
temp_data['time:timestamp'] = temp_data.apply(
    lambda x: x['time:timestamp'].strftime(parameters["read_options"]["timeformat"]), axis=1)

temp_data['time:timestamp'] = pd.to_datetime(temp_data['time:timestamp'], 
                                        format=parameters["read_options"]["timeformat"])

print(len(temp_data))
temp_data.head(5)

262200


Unnamed: 0,org:resource,lifecycle:transition,concept:name,time:timestamp,caseid
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546,173688
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,173688
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906,173688
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875,173688
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437,173688


In [9]:
## Replacde column name if exist
temp_data.rename(columns={
    'concept:name': 'task',
    'lifecycle:transition': 'event_type',
    'org:resource': 'user',
    'time:timestamp': 'timestamp'}, inplace=True)

print(len(temp_data))
temp_data.head(5)

262200


Unnamed: 0,user,event_type,task,timestamp,caseid
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546,173688
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,173688
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906,173688
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875,173688
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437,173688


In [10]:
cond = ~temp_data.task.apply(lambda v: v.lower() in ['start', 'end'])
print('Keeping rate: ' + str(cond.sum()/len(cond)))

temp_data = (temp_data[cond].reset_index(drop=True))

print(len(temp_data))
temp_data.head(5)

Keeping rate: 1.0
262200


Unnamed: 0,user,event_type,task,timestamp,caseid
0,112.0,COMPLETE,A_SUBMITTED,2011-10-01 00:38:44.546,173688
1,112.0,COMPLETE,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,173688
2,112.0,COMPLETE,A_PREACCEPTED,2011-10-01 00:39:37.906,173688
3,112.0,SCHEDULE,W_Completeren aanvraag,2011-10-01 00:39:38.875,173688
4,,START,W_Completeren aanvraag,2011-10-01 11:36:46.437,173688


In [11]:
# lower event_type
temp_data.event_type = temp_data.event_type.str.lower()
temp_data.head(5)

Unnamed: 0,user,event_type,task,timestamp,caseid
0,112.0,complete,A_SUBMITTED,2011-10-01 00:38:44.546,173688
1,112.0,complete,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,173688
2,112.0,complete,A_PREACCEPTED,2011-10-01 00:39:37.906,173688
3,112.0,schedule,W_Completeren aanvraag,2011-10-01 00:39:38.875,173688
4,,start,W_Completeren aanvraag,2011-10-01 11:36:46.437,173688


In [12]:
temp_data['event_type'].str.lower()

cond = temp_data.event_type.isin(['start','complete'])
print('Keeping rate: ' + str(cond.sum()/len(cond)))
temp_data = temp_data[cond].reset_index(drop=True)

print(len(temp_data))
temp_data.head(5)

Keeping rate: 0.8996262395118231
235882


Unnamed: 0,user,event_type,task,timestamp,caseid
0,112.0,complete,A_SUBMITTED,2011-10-01 00:38:44.546,173688
1,112.0,complete,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,173688
2,112.0,complete,A_PREACCEPTED,2011-10-01 00:39:37.906,173688
3,,start,W_Completeren aanvraag,2011-10-01 11:36:46.437,173688
4,10862.0,complete,A_ACCEPTED,2011-10-01 11:42:43.308,173688


In [13]:
if source == 'com.qbpsimulator':
    if len(temp_data.iloc[0].elementId.split('_'))>1: 
        temp_data['etype'] = temp_data.apply(
            lambda x: x.elementId.split('_')[0], axis=1)
        temp_data = (
            temp_data[temp_data.etype=='Task'].reset_index(drop=True))
        


In [14]:
# trnasform to dict for applying functions on it.
raw_data = temp_data.to_dict('records')

# Show some example
JSON(raw_data[:5])

<IPython.core.display.JSON object>

In [15]:
orderedXes = reorder_xes(temp_data)

In [16]:
print("Event count before appending \"Start & End\": {}".format(len(orderedXes)))
data = append_csv_start_end(orderedXes)
print("Event count After appending \"Start & End\": {}".format(len(data)))

JSON(data[:5])

Event count before appending "Start & End": 71373
Event count After appending "Start & End": 90687


<IPython.core.display.JSON object>

In [17]:
# Get the 
log_df = pd.DataFrame(data)
display(log_df.head(5))

Unnamed: 0,caseid,task,user,end_timestamp,start_timestamp
0,173688,Start,Start,2011-10-01 11:36:46.436999,2011-10-01 11:36:46.436999
1,173688,W_Completeren aanvraag,,2011-10-01 11:45:13.917000,2011-10-01 11:36:46.437000
2,173688,W_Nabellen offertes,,2011-10-01 12:17:08.924000,2011-10-01 12:15:41.290000
3,173688,W_Nabellen offertes,10913,2011-10-08 16:32:00.886000,2011-10-08 16:26:57.720000
4,173688,W_Nabellen offertes,11049,2011-10-10 11:33:05.791000,2011-10-10 11:32:22.495000


In [18]:
if set(['Unnamed: 0', 'role']).issubset(set(log_df.columns)):
    log_df.drop(columns=['Unnamed: 0', 'role'], inplace=True)
    
# Drop the task with "Start" and "End"
# Q: why dropping the events that we just added to the traces?
# A: Since the implementation is from different authors, this part can be deleted
originalLength = len(log_df)    
log_df = log_df[~log_df.task.isin(['Start', 'End'])]
lenAfterRemoving = len(log_df)

print("Keeping rate: {}, original length: {}, length after removing: {}".format(lenAfterRemoving/ originalLength, originalLength, lenAfterRemoving))

Keeping rate: 0.7870257037943696, original length: 90687, length after removing: 71373


In [19]:
log_df.head(5)

Unnamed: 0,caseid,task,user,end_timestamp,start_timestamp
1,173688,W_Completeren aanvraag,,2011-10-01 11:45:13.917,2011-10-01 11:36:46.437
2,173688,W_Nabellen offertes,,2011-10-01 12:17:08.924,2011-10-01 12:15:41.290
3,173688,W_Nabellen offertes,10913.0,2011-10-08 16:32:00.886,2011-10-08 16:26:57.720
4,173688,W_Nabellen offertes,11049.0,2011-10-10 11:33:05.791,2011-10-10 11:32:22.495
5,173688,W_Valideren aanvraag,10629.0,2011-10-13 10:37:37.026,2011-10-13 10:05:26.925


In [20]:
def create_index(log_df, column):
    """Creates an idx for a categorical attribute.
    parms:
        log_df: dataframe.
        column: column name.
    Returns:
        index of a categorical attribute pairs.
    """
    temp_list = log_df[[column]].values.tolist()
    subsec_set = {(x[0]) for x in temp_list}
    subsec_set = sorted(list(subsec_set))
    alias = dict()
    for i, _ in enumerate(subsec_set):
        alias[subsec_set[i]] = i + 1
    return alias


In [21]:
def indexing(log):
    # Activities index creation
    ac_index = create_index(log, 'task')
    ac_index['start'] = 0
    ac_index['end'] = len(ac_index)
    index_ac = {v: k for k, v in ac_index.items()}
    # Roles index creation
#     rl_index = create_index(log, 'role')
#     rl_index['start'] = 0
#     rl_index['end'] = len(rl_index)
#     index_rl = {v: k for k, v in rl_index.items()}
    # Add index to the event log
    ac_idx = lambda x: ac_index[x['task']]
    log['ac_index'] = log.apply(ac_idx, axis=1)
#     rl_idx = lambda x: rl_index[x['role']]
#     log['rl_index'] = log.apply(rl_idx, axis=1)
    return log

In [22]:
log_df = indexing(log_df)

In [23]:
log_df.head(5)

Unnamed: 0,caseid,task,user,end_timestamp,start_timestamp,ac_index
1,173688,W_Completeren aanvraag,,2011-10-01 11:45:13.917,2011-10-01 11:36:46.437,3
2,173688,W_Nabellen offertes,,2011-10-01 12:17:08.924,2011-10-01 12:15:41.290,5
3,173688,W_Nabellen offertes,10913.0,2011-10-08 16:32:00.886,2011-10-08 16:26:57.720,5
4,173688,W_Nabellen offertes,11049.0,2011-10-10 11:33:05.791,2011-10-10 11:32:22.495,5
5,173688,W_Valideren aanvraag,10629.0,2011-10-13 10:37:37.026,2011-10-13 10:05:26.925,6


In [54]:
import copy
from operator import itemgetter

def _sort_log(log):
    log = copy.deepcopy(log)
    log = sorted(log.to_dict('records'), key=lambda x: x['caseid'])
    for key, group in it.groupby(log, key=lambda x: x['caseid']):
        events = list(group)
        events = sorted(events, key=itemgetter('end_timestamp'))
        length = len(events)
        for i in range(0, len(events)):
            events[i]['pos_trace'] = i + 1
            events[i]['trace_len'] = length
    log = pd.DataFrame.from_dict(log)
    log.sort_values(by='end_timestamp', ascending=False, inplace=True)
    return log

In [55]:
def timeline_contained(log: pd.DataFrame, size: float, one_timestamp: bool) -> None:
    # log = self.log.data.to_dict('records')
    num_events = int(np.round(len(log)*(1 - size)))

    df_train = log.iloc[num_events:]
    df_test = log.iloc[:num_events]

    # Incomplete final traces
    df_train = df_train.sort_values(by=['caseid'],
                                    ascending=True)
    inc_traces = pd.DataFrame(df_train.groupby('caseid')
                              .last()
                              .reset_index())

    inc_traces = inc_traces[inc_traces.pos_trace != inc_traces.trace_len]
    inc_traces = inc_traces['caseid'].to_list()

    print("Imcomplete rate: {}".format(len(inc_traces)/ len(df_train)))

    # Drop incomplete traces
    df_test = df_test[~df_test.caseid.isin(inc_traces)]
    df_test = df_test.drop(columns=['trace_len', 'pos_trace'],errors='ignore')
    df_train = df_train[~df_train.caseid.isin(inc_traces)]

    print("train set size after dropping: {}".format(len(df_train)))

    df_train = df_train.drop(columns=['trace_len', 'pos_trace'],errors='ignore')
    key = 'end_timestamp' if one_timestamp else 'start_timestamp'
    df_test = (df_test
               .sort_values(key, ascending=True)
               .reset_index(drop=True).to_dict('records'))
    df_train = (df_train
                .sort_values(key, ascending=True)
                .reset_index(drop=True).to_dict('records'))
    return df_train, df_test

def timeline_trace(log: pd.DataFrame, size: float, one_timestamp: bool) -> None:
    # log = self.log.data.to_dict('records')
#     cases = log[log.pos_trace == 1]
    cases = log
    key = 'end_timestamp' if one_timestamp else 'start_timestamp'
    cases = cases.sort_values(key, ascending=False)
    cases = cases.caseid.to_list()
    num_test_cases = int(np.round(len(cases)*(1 - size)))
    test_cases = cases[:num_test_cases]
    train_cases = cases[num_test_cases:]
    df_train = log[log.caseid.isin(train_cases)]
    df_test = log[log.caseid.isin(test_cases)]
    df_train = df_train.drop(columns=['trace_len', 'pos_trace'],errors='ignore')
    df_test = df_test.drop(columns=['trace_len', 'pos_trace'],errors='ignore')
    return df_train, df_test

def random(log: pd.DataFrame, size: float, one_timestamp: bool) -> None:
    cases = list(log.caseid.unique())
    sample_sz = int(np.ceil(len(cases)*size))
    scases = random.sample(cases, sample_sz)
    df_train = log[log.caseid.isin(scases)]
    df_test = log[~log.caseid.isin(scases)]
    return df_train, df_test

In [56]:
def _get_splitter(method):
        if method == 'timeline_contained':
            return timeline_contained
        elif method == 'timeline_trace':
            return timeline_trace
        elif method == 'random':
            return random
        else:
            raise ValueError(method)

In [57]:
def split_log(log: pd.DataFrame, method: str, size: float, one_timestamp: bool):
    splitter = _get_splitter(method)
    return splitter(log, size, one_timestamp)

In [58]:
def split_timeline(size: float, one_ts: bool, log: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame) :
        """
        Split an event log dataframe by time to peform split-validation.
        prefered method time splitting removing incomplete traces.
        If the testing set is smaller than the 10% of the log size
        the second method is sort by traces start and split taking the whole
        traces no matter if they are contained in the timeframe or not

        Parameters
        ----------
        size : float, validation percentage.
        one_ts : bool, Support only one timestamp.
        """
        # Split log data
        sortedLog = _sort_log(log)
        print("sorted log size {}".format(len(sortedLog)))
        # display(sortedLog.head(5))
        print("soted log end -----")

        ## problem is here
        train, test = split_log(sortedLog,'timeline_contained', size, one_ts)
        print("1. train set size: {}".format(len(train)))
        total_events = len(sortedLog)
        # Check size and change time splitting method if necesary
        if len(test) < int(total_events*0.1):
            train, test = split_log(sortedLog, 'timeline_trace', size, one_ts)
        # Set splits
        key = 'end_timestamp' if one_ts else 'start_timestamp'
        test = pd.DataFrame(test)
        train = pd.DataFrame(train)
        print(train)
        print('training set')
        # display(train.head(5))
        
#         print("display set")
#         display(test.head(5))
        
        log_test = (test.sort_values(key, ascending=True)
                         .reset_index(drop=True))
        log_train = (train.sort_values(key, ascending=True)
                          .reset_index(drop=True))
        return log_train, log_test

In [60]:
log_train, log_test = split_timeline(0.8, parameters['one_timestamp'], log_df)

sorted log size 71373
soted log end -----
Imcomplete rate: 0.09129916984833095
train set size after dropping: 7971
1. train set size: 7971
      caseid                    task   user           end_timestamp  \
0     173706      W_Afhandelen leads  10912 2011-10-01 10:16:49.843   
1     173688  W_Completeren aanvraag    NaN 2011-10-01 11:45:13.917   
2     173691  W_Completeren aanvraag    NaN 2011-10-01 11:43:13.178   
3     173706  W_Completeren aanvraag  10912 2011-10-01 11:50:14.483   
4     173706  W_Completeren aanvraag  10912 2011-10-01 11:53:55.769   
...      ...                     ...    ...                     ...   
7966  208133      W_Afhandelen leads  10629 2012-02-10 17:05:23.731   
7967  208268      W_Afhandelen leads  11201 2012-02-10 20:06:30.178   
7968  208208  W_Completeren aanvraag  10913 2012-02-10 20:05:21.713   
7969  208256      W_Afhandelen leads    NaN 2012-02-10 20:04:56.142   
7970  208271      W_Afhandelen leads    NaN 2012-02-10 20:19:31.576   

        

In [62]:
log_train.head(5)

Unnamed: 0,caseid,task,user,end_timestamp,start_timestamp,ac_index
0,173706,W_Afhandelen leads,10912.0,2011-10-01 10:16:49.843,2011-10-01 10:15:43.883,1
1,173688,W_Completeren aanvraag,,2011-10-01 11:45:13.917,2011-10-01 11:36:46.437,3
2,173691,W_Completeren aanvraag,,2011-10-01 11:43:13.178,2011-10-01 11:37:32.393,3
3,173706,W_Completeren aanvraag,10912.0,2011-10-01 11:50:14.483,2011-10-01 11:49:28.183,3
4,173706,W_Completeren aanvraag,10912.0,2011-10-01 11:53:55.769,2011-10-01 11:50:21.451,3


In [None]:
### We need to add resource to apply the same algo

In [None]:
# split validation