# Explainable Outcome Prediction - Preprocessing and EDA

- Author: David Steiner
- Last updated: December 2021

In [28]:
from prep_custom import get_dataset_settings, xes_converter, add_timestap_features, add_inter_case_features, impute_missing_values
from prep_custom import cut_trace_before_activity, create_trace_bucket,remove_features,remove_events
from prep_custom import split_data_temporal,replace_missing_cols,prepare_ml_train_test, aggregate_data, group_infrequent_features
from prep_custom import define_binary_outcome_label

import pandas as pd
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 100)

import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
%matplotlib inline

# Data Preprocessing

### Available XES Eventlogs :

- BPI_Challenge_2013_incidents.xes.gz
- BPI Challenge 2017.xes.gz
- BPI_Challenge_2019.xes
- Hospital Billing - Event Log.xes.gz

In [None]:
%%time
data = xes_converter('BPI Challenge 2017.xes.gz')
print('Shape:', data.shape)
data.head(5)

In [None]:
case_id_col = "case:concept:name"   # Case ID
activity_col = "concept:name"       # Event Name
timestamp_col = 'time:timestamp'    
label_col = 'label'                 # Output Label

#Display shape and one full case
data[data[case_id_col]==data[case_id_col].unique()[0]]

In [None]:
%%time
data = add_timestap_features(data)
data = add_inter_case_features(data)
data = impute_missing_values(data)

#Display shape and one full case
print('Shape:', data.shape)
data[data[case_id_col]==data[case_id_col].unique()[0]]

In [None]:
#Export preprocessed dataset
data.to_csv("BPIC_17.csv", index = False)

## EDA 

### Available Datasets:

- BPIC13
- BPIC17
- BPIC19
- BPICHospital

In [25]:
data, case_id_col, activity_col, timestamp_col, label_col, resource_col, event_categorical_attributes, event_numeric_attributes, case_categorical_attributes, case_numeric_attributes, static_cols, dynamic_cols, cat_cols = get_dataset_settings('BPIC13')
attributes = [event_categorical_attributes, case_categorical_attributes, event_numeric_attributes, case_numeric_attributes]

Categoric Event Attributes: 7 ['org:group', 'resource country', 'org:resource', 'organization involved', 'org:role', 'concept:name', 'lifecycle:transition'] 

Numeric Event Attributes: 8 ['timesincemidnight', 'month', 'weekday', 'hour', 'timesincelastevent', 'timesincecasestart', 'event_nr', 'open_cases'] 

Categoric Case Attributes: 3 ['organization country', 'impact', 'product'] 

Numeric Case Attributes: 0 [] 

Dataset Shape (65533, 21)


In [None]:
data.aggregate(pd.Series.nunique).sort_values(ascending=False)

In [None]:
print("--- event_numeric_attributes ---")
for col in event_numeric_attributes:
    print(col, '=', len(data[col].unique()), 'unique values', sep=' ')
    print(data[col].describe())
    print()
    
print("--- case_numeric_attributes ---")
for col in case_numeric_attributes:
    print(col, '=', len(data[col].unique()), 'unique values', sep=' ')
    print(data[col].describe())
    print()

In [None]:
print("--- Dynamic Cat Columns ---")
for col in event_categorical_attributes:
    print(col, '=', len(data[col].unique()), 'unique values', sep=' ')
    print(data[col].unique())
    print()
    
print("--- case_categorical_attributes ---")
for col in case_categorical_attributes:
    print(col, '=', len(data[col].unique()), 'unique values', sep=' ')
    print(data[col].unique())
    print()

In [None]:
print(data.groupby(activity_col)[case_id_col].count().sort_values(ascending=False))

In [None]:
data.groupby(case_id_col)[activity_col].count().sort_values()

In [None]:
#Display shape and one full case
data[data[case_id_col]==data[case_id_col].unique()[107]]

In [None]:
#data.groupby('case:caseStatus').count()

In [15]:
data

Unnamed: 0,isCancelled,diagnosis,time:timestamp,caseType,speciality,org:resource,concept:name,blocked,isClosed,flagD,flagB,flagA,state,lifecycle:transition,case:concept:name,closeCode,actRed,actOrange,flagC,msgCount,version,msgType,msgCode,timesincemidnight,month,weekday,hour,timesincelastevent,timesincecasestart,event_nr,open_cases,label
0,False,A,2012-12-16 18:33:10+00:00,A,A,ResA,NEW,False,True,True,False,False,In progress,complete,A,missing,missing,missing,missing,0.0,missing,missing,missing,1113,12,6,18,0.000000,0.000000,1,25,empty
1,False,A,2013-12-15 18:00:37+00:00,A,A,ResA,FIN,False,True,True,False,False,Closed,complete,A,A,missing,missing,missing,0.0,missing,missing,missing,1080,12,6,18,524127.450000,524127.450000,2,12761,empty
2,False,A,2013-12-16 02:53:38+00:00,A,A,ResA,RELEASE,False,True,True,False,False,Released,complete,A,A,missing,missing,missing,0.0,missing,missing,missing,173,12,0,2,533.016667,524660.466667,3,12775,empty
3,False,A,2013-12-17 11:56:29+00:00,A,A,ResA,CODE OK,False,True,True,False,False,Released,complete,A,A,False,False,False,0.0,A,missing,missing,716,12,1,11,1982.850000,526643.316667,4,12737,empty
4,False,A,2013-12-19 02:44:31+00:00,A,A,ResB,BILLED,False,True,True,False,False,Billed,complete,A,A,False,False,False,0.0,A,missing,missing,164,12,3,2,2328.033333,528971.350000,5,12729,empty
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451354,False,UE,2015-06-27 00:01:17+00:00,A,H,ResA,NEW,False,True,True,False,False,In progress,complete,ZZZD,missing,missing,missing,missing,0.0,missing,missing,missing,1,6,5,0,0.000000,0.000000,1,14555,empty
451355,False,UE,2015-07-06 21:57:45+00:00,A,H,ResA,FIN,False,True,True,False,False,Closed,complete,ZZZD,A,missing,missing,missing,0.0,missing,missing,missing,1317,7,0,21,14276.466667,14276.466667,2,14947,empty
451356,False,UE,2015-07-15 03:39:38+00:00,A,H,ResA,RELEASE,False,True,True,False,False,Released,complete,ZZZD,A,missing,missing,missing,0.0,missing,missing,missing,219,7,2,3,11861.883333,26138.350000,3,15175,empty
451357,False,UE,2015-07-15 04:33:01+00:00,A,H,ResA,CODE OK,False,True,True,False,False,Released,complete,ZZZD,A,False,False,False,0.0,E,missing,missing,273,7,2,4,53.383333,26191.733333,4,15175,empty


# Tables for Paper

## Dataset Statistics

In [29]:
def get_dataset_stat(dataset):
    data, case_id_col, activity_col, timestamp_col, label_col, resource_col, event_categorical_attributes, event_numeric_attributes, case_categorical_attributes, case_numeric_attributes, static_cols, dynamic_cols, cat_cols = get_dataset_settings(dataset)
    n_event_categorical_attributes = len(event_categorical_attributes)
    n_event_numeric_attributes = len(event_numeric_attributes) -8
    n_case_categorical_attributes = len(case_categorical_attributes)
    n_case_numeric_attributes = len(case_numeric_attributes)

    n_events = len(data[activity_col].unique())
    tmp = data[[case_id_col, 'event_nr']].groupby([case_id_col]).max()
    n_event_length_max = tmp['event_nr'].max()
    n_event_length_min = tmp['event_nr'].min()
    n_event_length_mean = int(tmp['event_nr'].mean())

    n_cases = len(data[case_id_col].unique())
    
    tmp = data[[case_id_col, 'timesincecasestart']].groupby([case_id_col]).max()
    n_mean_case_duration = round((tmp['timesincecasestart'].mean()/360),1)
    
    dataset_stats = pd.DataFrame()

    dataset_stats = dataset_stats.append(pd.DataFrame({
      'Dataset':[dataset],
      'Cases':[n_cases],
      'Min Case Length': [n_event_length_min], 
      'Max Case Length':[n_event_length_max], 
      'Mean Case Length':[n_event_length_mean], 
      'Mean Duration (hours)':[n_mean_case_duration],
      'Events':[n_events],
      'Cat. Event Attr.':[n_event_categorical_attributes],
      'Num. Event Attr.':[n_event_numeric_attributes],
      'Cat. Case Attr.':[n_case_categorical_attributes],
      'Num. Case Attr. ':[n_case_numeric_attributes]}))
    
    return dataset_stats

In [30]:
datasets = ['BPIC13', 'BPIC17' ,'BPIC19', 'BPICHospital']

dataset_stats = pd.DataFrame()

for dataset in datasets:
    print(dataset)
    dataset_stat = get_dataset_stat(dataset)
    dataset_stats = dataset_stats.append(dataset_stat)
    
dataset_stats.to_csv("Dataset_Stats.csv", sep=";")
dataset_stats

BPIC13
Categoric Event Attributes: 7 ['org:group', 'resource country', 'org:resource', 'organization involved', 'org:role', 'concept:name', 'lifecycle:transition'] 

Numeric Event Attributes: 8 ['timesincemidnight', 'month', 'weekday', 'hour', 'timesincelastevent', 'timesincecasestart', 'event_nr', 'open_cases'] 

Categoric Case Attributes: 3 ['organization country', 'impact', 'product'] 

Numeric Case Attributes: 0 [] 

Dataset Shape (65533, 21)
BPIC17
Categoric Event Attributes: 6 ['concept:name', 'org:resource', 'Action', 'lifecycle:transition', 'Accepted', 'Selected'] 

Numeric Event Attributes: 13 ['CreditScore', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'timesincelastevent', 'timesincecasestart', 'timesincemidnight', 'event_nr', 'month', 'weekday', 'hour', 'open_cases'] 

Categoric Case Attributes: 3 ['case:ApplicationType', 'case:LoanGoal', 'EventOrigin'] 

Numeric Case Attributes: 1 ['case:RequestedAmount'] 

Dataset Shape (1202267, 26)
BPIC19
Ca

Unnamed: 0,Dataset,Cases,Min Case Length,Max Case Length,Mean Case Length,Mean Duration (hours),Events,Cat. Event Attr.,Num. Event Attr.,Cat. Case Attr.,Num. Case Attr.
0,BPIC13,7554,1,123,8,48.3,4,7,0,3,0
0,BPIC17,31509,10,180,38,87.6,26,6,5,3,1
0,BPIC19,251734,1,990,6,286.1,42,5,1,8,2
0,BPICHospital,77526,2,217,5,656.5,18,12,1,2,0


## Outcome Statistics

In [5]:
outcome_target_list = [
'BPIC17-LoanAccepted',
'BPIC17-PotentialFraud',
'BPIC17-LongRunningCases',
'BPIC13-SupportLevel-1',
'BPIC19-DeletedPO',
'BPICHospital-BillingClosed',
'BPICHospital-CaseReopened',
]

outcome_stats = pd.DataFrame()

for outcome in outcome_target_list:
    print(outcome)
    eventlog = outcome.split('-')[0]

    data, case_id_col, activity_col, timestamp_col, label_col, resource_col, event_categorical_attributes, event_numeric_attributes, case_categorical_attributes, case_numeric_attributes, static_cols, dynamic_cols, cat_cols = get_dataset_settings(eventlog)
    attributes = [event_categorical_attributes, case_categorical_attributes, event_numeric_attributes, case_numeric_attributes]
    data_labeled, drop_events_list, dl_attributes = define_binary_outcome_label(data, attributes, outcome_label=outcome)

    tmp = data_labeled[[case_id_col, label_col]].groupby(case_id_col).max()
    class_1 = tmp[label_col].sum()
    class_0 = len(tmp) - class_1
    class_ratio = round(class_1  / class_0,4)


    outcome_stats = outcome_stats.append(pd.DataFrame({
      'Outcome':[outcome],
      'Class 1':[class_1],
      'Class 0 ': [class_0], 
      'Class Ratio':[class_ratio]
    }))

        
outcome_stats.to_csv("Outcome_Stats.csv", sep=";")

outcome_stats

BPIC17-LoanAccepted
Categoric Event Attributes: 6 ['concept:name', 'org:resource', 'Action', 'lifecycle:transition', 'Accepted', 'Selected'] 

Numeric Event Attributes: 13 ['CreditScore', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'timesincelastevent', 'timesincecasestart', 'timesincemidnight', 'event_nr', 'month', 'weekday', 'hour', 'open_cases'] 

Categoric Case Attributes: 3 ['case:ApplicationType', 'case:LoanGoal', 'EventOrigin'] 

Numeric Case Attributes: 1 ['case:RequestedAmount'] 

Dataset Shape (1202267, 26)
Assigning class labels...
Set labels to 1 for Outcome: O_Accepted
label
0    18621
1    12792
Name: case:concept:name, dtype: int64
BPIC17-PotentialFraud
Categoric Event Attributes: 6 ['concept:name', 'org:resource', 'Action', 'lifecycle:transition', 'Accepted', 'Selected'] 

Numeric Event Attributes: 13 ['CreditScore', 'FirstWithdrawalAmount', 'MonthlyCost', 'NumberOfTerms', 'OfferedAmount', 'timesincelastevent', 'timesincecasestart', 'timesi