# Feature Selection

In [None]:
#import libraries

In [None]:
import pandas as pd
import numpy as np
import random
import pickle
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import FeatureUnion
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

#user specified packages from 
import EncoderFactory
from DatasetManager import DatasetManager

#IBM packages 
from aix360.algorithms.rbm import FeatureBinarizer
from aix360.algorithms.rbm import LogisticRuleRegression

In [None]:
# TERMINOLOGY
# Encoding name used for bucketing.
bucket_encoding = "agg"

# Expands a dataset family reference into the concrete dataset names it covers
# (BPIC2015 has one log per municipality, numbered 1 to 5).
dataset_ref_to_datasets = {
    "bpic2015": [f"bpic2015_{number}_f2" for number in (1, 2, 3, 4, 5)],
}

# Sequence encoders that are combined for each classifier encoding name.
encoding_dict = {"agg": ["static", "agg"]}

# Ratio handed to the train/test split of every dataset section.
train_ratio = 0.8
# Seed value; NOTE(review): not referenced by the cells visible in this notebook.
random_state = 22

## Dataset BPIC2015_2_f2

In [None]:
# Configuration for the BPIC2015 (municipality 2) section.
dataset_ref = 'bpic2015_2_f2'
params_dir = './params_dir'
results_dir = './results'
cls_encoding = 'agg'
# Classifier identifier. It has to exist before method_name is built, otherwise
# this cell fails with NameError; 'llm' is the value the notebook assigns to
# cls_method further down in the BPIC2017 section.
cls_method = 'llm'
gap = 1
n_iter = 1

# Name used in result file names: "<classifier>_<encoding>".
method_name = "%s_%s" % (cls_method, cls_encoding)

# A family reference (e.g. "bpic2015") expands to all of its datasets;
# a concrete dataset name is wrapped in a single-element list.
datasets = dataset_ref_to_datasets.get(dataset_ref, [dataset_ref])
methods = encoding_dict[cls_encoding]

In [None]:
# print dataset name

In [None]:
for dataset_name in datasets:
    print('Dataset:', dataset_name)

In [None]:
# read the data

In [None]:
# Load the event log of the selected dataset. dataset_name is the variable left
# behind by the preceding print loop, i.e. the last entry of `datasets`.
dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
# Keyword arguments forwarded to every encoder created through
# EncoderFactory.get_encoder: the case id column plus the static/dynamic,
# categorical/numeric column lists exposed by the dataset manager.
cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                        'static_cat_cols': dataset_manager.static_cat_cols,
                        'static_num_cols': dataset_manager.static_num_cols, 
                        'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                        'dynamic_num_cols': dataset_manager.dynamic_num_cols, 
                        'fillna': True}

In [None]:
#dimensions of the data

In [None]:
print('#columns:',len(data.columns))
print('#rows:',data.shape[0])

In [None]:
#data types

In [None]:
data.dtypes

In [None]:
#data description

The BPIC2015 event log used here (bpic2015_2_f2) contains the building permit application process of the second of five Dutch municipalities. After trace prefixing, trace cutting (maximum prefix length of 40) and sequence encoding, the size of the training data is reduced from 41,202 to 22,221, with a final total of 391 derived columns.

### original dataset

In [None]:
#original dataset
#this is useful for the original activity names

In [None]:
import pm4py
log = pm4py.read_xes('BPIC15_2.xes')

In [None]:
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
dataframe = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

In [None]:
pd.set_option("display.max_columns", dataframe.columns.shape[0])
pd.set_option("display.max_rows", dataframe.index.shape[0])

In [None]:
#show the original activities names next to the new activity names

In [None]:
dataframe.groupby(['concept:name','activityNameEN']).size()

In [None]:
#there is only one activity for that one

In [None]:
len(set(dataframe[dataframe['concept:name']=='08_AWB45_010']['activityNameEN']))

In [None]:
dataframe[dataframe['concept:name']=='08_AWB45_010']['activityNameEN'].head(1)

### correlation

#### dataset

In [None]:
#check the correlation with Pearson's correlation for the dataset

In [None]:
data2 = data.copy()

In [None]:
data2.head()

In [None]:
data2.label.unique()

In [None]:
# Encode the outcome label numerically: regular -> 0, deviant -> 1.
# The result is assigned back to the column instead of calling
# replace(inplace=True) on data2["label"]: the in-place call acts on a selected
# column and does not update data2 when pandas Copy-on-Write is active.
data2["label"] = data2["label"].replace({"regular": 0, "deviant": 1})

In [None]:
data2.label.unique()

In [None]:
#Using Pearson Correlation
plt.figure(figsize=(12,10))
# The raw event log also holds non-numeric columns (activity names, resources,
# ...). DataFrame.corr() raises on those with pandas >= 2.0, so the correlation
# matrix is computed on the numeric and boolean columns only. The label column
# stays in because it was encoded as 0/1 above.
numeric_data = data2.select_dtypes(include=["number", "bool"])
cor = numeric_data.corr()
sns.heatmap(cor, annot=True, cmap=plt.cm.Reds)
plt.show()

In [None]:
#correlation with label

In [None]:
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.4]
relevant_features

In [None]:
data.shape[0]

In [None]:
data.shape[1]

#### transformed dataset

In [None]:
# determine min and max (truncated) prefix lengths

In [None]:
# Shortest prefix is always a single event.
min_prefix_length = 1
if "traffic_fines" in dataset_name:
    # Traffic-fines logs use a fixed maximum prefix length.
    max_prefix_length = 10
else:
    # Other logs are capped at the 0.90 quantile returned by the dataset manager,
    # but never above 20 events for bpic2017 logs or 40 for everything else.
    length_cap = 20 if "bpic2017" in dataset_name else 40
    quantile_length = dataset_manager.get_pos_case_length_quantile(data, 0.90)
    max_prefix_length = min(length_cap, quantile_length)

In [None]:
max_prefix_length

In [None]:
# split into training and test
# os is needed for os.path.join below and is not imported by the import cell.
import os

train, test = dataset_manager.split_data_strict(data, train_ratio, split="temporal")

# Path of the performance-results CSV inside results_dir. The gap value is part
# of the file name only when gap > 1. cls_method and method_name must already
# have been set by the configuration cell of this section.
if gap > 1:
    outfile = os.path.join(results_dir, "performance_results_%s_%s_%s_gap%s.csv" % (cls_method, dataset_name, method_name, gap))
else:
    outfile = os.path.join(results_dir, "performance_results_%s_%s_%s.csv" % (cls_method, dataset_name, method_name))

In [None]:
#prefix generation of training data

In [None]:
dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length) 
#get the label of the train set
train_y = dataset_manager.get_label_numeric(dt_train_prefixes)   

In [None]:
len(dt_train_prefixes)

In [None]:
# Build one encoder per entry in `methods`, fit their union on the training
# prefixes, encode those prefixes and label the resulting columns.
feature_combiner = FeatureUnion(
    [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]
)
feature_combiner.fit(dt_train_prefixes, train_y)
dt_train_named = pd.DataFrame(feature_combiner.transform(dt_train_prefixes))
# NOTE(review): FeatureUnion.get_feature_names() only exists in older
# scikit-learn releases - confirm the pinned version before upgrading.
names = feature_combiner.get_feature_names()
dt_train_named.columns = names

In [None]:
#concat them back to correlate correlations

In [None]:
train_y = pd.DataFrame(train_y)
train_y  = train_y.rename(columns={train_y.columns[0]:'label'})
dt_train_named2 = pd.concat([dt_train_named,pd.DataFrame(train_y)], axis=1)

In [None]:
pd.set_option('display.max_rows', None)
dt_train_named2.dtypes

##### correlations (>0.5 with label)

In [None]:
#check the correlations

In [None]:
#Using Pearson Correlation
cor = dt_train_named2.corr()

In [None]:
#correlation with label

In [None]:
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>.5].sort_values(ascending=False)
relevant_features

In [None]:
#this is the dataframe with the features you want to check if they are correlated with eachother

In [None]:
activity_correlations = dt_train_named2[['agg__Activity_08_AWB45_010',
'agg__Activity_08_AWB45_020_1',   
'agg__Activity_08_AWB45_020_2',    
'agg__Activity_08_AWB45_030',      
'agg__Activity_08_AWB45_040','label']]

In [None]:
corr = activity_correlations.corr()

In [None]:
corr

In [None]:
#these features need to be uncorrelated with eachother, so drop all but one

In [None]:
dt_train_named3 = dt_train_named2.drop(columns=['agg__Activity_08_AWB45_010','agg__Activity_08_AWB45_020_1','agg__Activity_08_AWB45_030','agg__Activity_08_AWB45_040'])

##### correlations (>0.3 with label)

In [None]:
cor = dt_train_named3.corr()

In [None]:
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.3].sort_values(ascending=False)
relevant_features

In [None]:
correlations2 = dt_train_named3[['static__Milieu (vergunning)',
'static__Responsible_actor_4634935',   
'agg__Activity_08_AWB45_020_2',    
'agg__Activity_08_AWB45_025',   
'agg__monitoringResource_4634935',
'agg__question_28' ]]

In [None]:
cor = correlations2.corr()

In [None]:
cor[cor>0.6]

In [None]:
#correlates with static_Milieu (vergunning)
dt_train_named4 = dt_train_named3.drop(columns=['static__Responsible_actor_4634935'])

##### correlations (>0.25 with label)

In [None]:
cor = dt_train_named4.corr()

In [None]:
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.25].sort_values(ascending=False)
relevant_features

In [None]:
correlations3 = dt_train_named4[[
'static__Milieu (vergunning)',
'agg__Activity_01_HOOFD_090',                                
'agg__Activity_08_AWB45_020_2',    
'agg__Activity_08_AWB45_025',   
'agg__monitoringResource_4634935',
'agg__question_28',
'agg__question_Uitgebreid']]

In [None]:
cor = correlations3.corr()

In [None]:
cor[cor>0.6]

In [None]:
dt_train_named5 = dt_train_named4.drop(columns=['agg__Activity_01_HOOFD_090'])

##### correlations (>0.20 with label)

In [None]:
cor = dt_train_named5.corr()

In [None]:
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.20].sort_values(ascending=False)
relevant_features.sort_values(ascending=False)

In [None]:
#A lot of relevant features, so you better put these in a list

In [None]:
columns = list(relevant_features.index)

In [None]:
correlations4 = dt_train_named5[columns]

In [None]:
cor = correlations4.corr()

In [None]:
cor[cor>0.6]

In [None]:
# agg__Activity_01_HOOFD_330 is kept; the six columns listed below are dropped
# (presumably because the matrix above shows them correlating > 0.6 with it - TODO confirm).
dt_train_named6 = dt_train_named5.drop(columns=['agg__Activity_01_HOOFD_370','agg__Activity_01_HOOFD_195','agg__Activity_01_HOOFD_375','agg__Activity_01_HOOFD_380','agg__Activity_01_HOOFD_430','agg__Activity_09_AH_I_010'])

In [None]:
#correlations with agg__Activity_08_AWB45_025
dt_train_named6 = dt_train_named6.drop(columns=['agg__Activity_08_AWB45_045'])
#correlation with agg__question_Uitgebreid 
dt_train_named6 = dt_train_named6.drop(columns=['agg__Activity_04_BPT_010','agg__Activity_04_BPT_020'])

In [None]:
# Remove from correlations4 the same columns that were dropped from the full
# training frame, in a single call.
redundant_columns = [
    # agg__Activity_01_HOOFD_330 is the one that stays out of this group
    'agg__Activity_01_HOOFD_370',
    'agg__Activity_01_HOOFD_195',
    'agg__Activity_01_HOOFD_375',
    'agg__Activity_01_HOOFD_380',
    'agg__Activity_01_HOOFD_430',
    'agg__Activity_09_AH_I_010',
    # noted earlier in the notebook as correlating with agg__Activity_08_AWB45_025
    'agg__Activity_08_AWB45_045',
    # correlation with agg__question_Uitgebreid
    'agg__Activity_04_BPT_010',
    'agg__Activity_04_BPT_020',
]
correlations4 = correlations4.drop(columns=redundant_columns)

In [None]:
columns = list(correlations4.columns)

In [None]:
correlations4_1 = dt_train_named6[columns]

cor = correlations4_1.corr()

cor[cor>0.6]

In [None]:
#print the remaining columns

In [None]:
columns

In [None]:
len(columns)

In [None]:
columns = ['agg__Activity_08_AWB45_020_2',
 'static__Milieu (vergunning)',
 'agg__question_28',
 'agg__monitoringResource_4634935',
 'agg__Activity_08_AWB45_025',
 'agg__question_Uitgebreid',
 'agg__Activity_08_AWB45_170',
 'agg__Activity_01_HOOFD_330',
 'agg__org:resource_560530',
 'agg__org:resource_4634935',
 'agg__Activity_01_HOOFD_193',
 'agg__question_other']

In [None]:
for i in columns:
    print(i)

In [None]:
dt_train_named[columns].nunique()

In [None]:
for i in columns:
    print(i)

##### correlations (>0.15 with label)

In [None]:
cor = dt_train_named6.corr()

cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.15].sort_values(ascending=False)
relevant_features.sort_values(ascending=False)

In [None]:
#A lot of relevant features, so you better put these in a list
columns = list(relevant_features.index)
correlations5 = dt_train_named6[columns]

cor = correlations5.corr()

cor[cor>0.6]

In [None]:
#correlations with question uitgebreid
dt_train_named7 = dt_train_named6.drop(columns=['agg__Activity_04_BPT_030'])

In [None]:
#correlations with agg__Activity_01_HOOFD_330
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_01_HOOFD_480'])
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_01_HOOFD_200'])

In [None]:
#correlates with 060
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_08_AWB45_070_1'])
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_08_AWB45_070_2'])

In [None]:
#490_1
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_01_HOOFD_490_2'])

In [None]:
#correlates with 191
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_01_HOOFD_192'])

In [None]:
#250
dt_train_named7 = dt_train_named7.drop(columns=['agg__Activity_01_HOOFD_260'])

In [None]:
#timesincecasestartsum
dt_train_named7 = dt_train_named7.drop(columns=['agg__timesincecasestart_max'])

##### correlations (>0.10 with label)

In [None]:
cor = dt_train_named7.corr()

In [None]:
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.10].sort_values(ascending=False)
relevant_features = relevant_features.sort_values(ascending=False)
relevant_features

In [None]:
#A lot of relevant features, so you better put these in a list
columns = list(relevant_features.index)
correlations5 = dt_train_named7[columns]

In [None]:
cor = correlations5.corr()
cor[cor>0.6]

In [None]:
# Print every distinct feature pair (upper triangle of the matrix, diagonal
# excluded) whose correlation exceeds 0.6, together with the value.
feature_names = correlations5.columns
n_features = len(cor)
for row in range(n_features):
    for col in range(row + 1, n_features):
        value = cor.iloc[row, col]
        if value > 0.6:
            print(feature_names[row], feature_names[col], value)

In [None]:
relevant_features

In [None]:
dt_train_named8 = dt_train_named7.drop(columns=['agg__timesincecasestart_sum'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__timesincecasestart_mean'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__timesincecasestart_std'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__Activity_08_AWB45_070_3'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__Activity_08_AWB45_090_1'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__timesincelastevent_mean'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__timesincelastevent_std'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__timesincelastevent_max'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__Activity_01_HOOFD_250_2'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__Activity_01_HOOFD_190_2'])
dt_train_named8 = dt_train_named8.drop(columns=['agg__weekday_max'])

In [None]:
correlations5 = correlations5.drop(columns=['agg__timesincecasestart_sum'])
correlations5 = correlations5.drop(columns=['agg__timesincecasestart_mean'])
correlations5 = correlations5.drop(columns=['agg__timesincecasestart_std'])
correlations5 = correlations5.drop(columns=['agg__timesincelastevent_mean'])
correlations5 = correlations5.drop(columns=['agg__timesincelastevent_std'])
correlations5 = correlations5.drop(columns=['agg__timesincelastevent_max']) 
correlations5 = correlations5.drop(columns=['agg__Activity_08_AWB45_070_3']) 
correlations5 = correlations5.drop(columns=['agg__Activity_08_AWB45_090_1']) 
correlations5 = correlations5.drop(columns=['agg__Activity_01_HOOFD_250_2']) 
correlations5 = correlations5.drop(columns=['agg__Activity_01_HOOFD_190_2']) 
correlations5 = correlations5.drop(columns=['agg__weekday_max']) 

In [None]:
 relevant_features.index

In [None]:
columns = ['agg__Activity_08_AWB45_020_2', 'static__Milieu (vergunning)',
       'agg__question_28', 'agg__monitoringResource_4634935',
       'agg__Activity_08_AWB45_025', 'agg__question_Uitgebreid',
       'agg__Activity_08_AWB45_170', 'agg__Activity_01_HOOFD_330',
       'agg__org:resource_560530', 'agg__org:resource_4634935',
       'agg__Activity_01_HOOFD_193', 'agg__question_other',
       'agg__Activity_08_AWB45_060', 'agg__Activity_01_HOOFD_191',
       'agg__Activity_08_AWB45_051_0', 'agg__timesincelastevent_sum',
       'agg__Activity_01_HOOFD_490_1', 'agg__Activity_01_HOOFD_250',
       'agg__Activity_08_AWB45_090_2', 'static__Kap',
       'static__Milieu (neutraal wijziging)', 'agg__Activity_08_AWB45_070_3',
       'static__Gebiedsbescherming', 'agg__month_std',
       'agg__timesincecasestart_std', 'agg__Activity_01_HOOFD_250_1',
       'agg__question_42', 'agg__Activity_01_HOOFD_130',
       'agg__Activity_01_HOOFD_250_2', 'agg__weekday_std',
       'agg__Activity_08_OLO_100', 'agg__Activity_01_HOOFD_190_1',
       'static__Responsible_actor_560530', 'agg__Activity_11_AH_II_010',
       'agg__Activity_08_AWB45_050', 'agg__timesincecasestart_mean',
       'agg__weekday_max', 'agg__timesincelastevent_max',
       'agg__Activity_08_AWB45_090_1', 'agg__timesincelastevent_std',
       'agg__Activity_08_AWB45_020_0', 'agg__timesincecasestart_sum',
       'agg__open_cases_std', 'agg__timesincelastevent_mean',
       'agg__Activity_01_HOOFD_190_2', 'static__Inrit/Uitweg',
       'agg__Activity_01_HOOFD_470', 'agg__Activity_01_HOOFD_510_2',
       'agg__month_min', 'agg__Activity_01_HOOFD_495']

## Dataset traffic_fines_1

In [None]:
dataset_ref = 'traffic_fines_1'
params_dir = './params_dir'
results_dir = './results' 
cls_encoding = 'agg'

gap = 1
n_iter = 1

datasets = [dataset_ref] if dataset_ref not in dataset_ref_to_datasets else dataset_ref_to_datasets[dataset_ref]
methods = encoding_dict[cls_encoding]

In [None]:
# print dataset name
for dataset_name in datasets:
    print('Dataset:', dataset_name)

In [None]:
# read the data
dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                        'static_cat_cols': dataset_manager.static_cat_cols,
                        'static_num_cols': dataset_manager.static_num_cols, 
                        'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                        'dynamic_num_cols': dataset_manager.dynamic_num_cols, 
                        'fillna': True}

In [None]:
#dimensions of the data
print('#columns:',len(data.columns))
print('#rows:',data.shape[0])

In [None]:
#data types
data.dtypes

In [None]:
#data description

This event log originates from an Italian local police force and contains events related to the notifications sent about a fine.

### correlations

#### original dataset

##### load xes file

In [None]:
#original dataset
#this is useful for the original activity names

import pm4py
log = pm4py.read_xes('Road_Traffic_Fine_Management_Process.xes.gz')

import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
dataframe = log_converter.apply(log, variant=log_converter.Variants.TO_DATA_FRAME)

pd.set_option("display.max_columns", dataframe.columns.shape[0])
pd.set_option("display.max_rows", dataframe.index.shape[0])



In [None]:
dataframe.dtypes

In [None]:
dataframe.head()

In [None]:
#show the original activities names next to the new activity names

dataframe.groupby(['concept:name']).size()

##### correlations

In [None]:
#check the correlation with Pearson's correlation for the dataset
data2 = data.copy()
data2.head()

In [None]:
data2.label.head()

In [None]:
data2.label.unique()
# Encode the outcome label numerically: regular -> 0, deviant -> 1.
# Assigning the result back replaces replace(inplace=True) on data2["label"],
# which acts on a selected column and does not update data2 when pandas
# Copy-on-Write is active.
data2["label"] = data2["label"].replace({"regular": 0, "deviant": 1})
data2.label.unique()

In [None]:
data2.dtypes

#### Transformed dataset

In [None]:
# determine min and max (truncated) prefix lengths

min_prefix_length = 1
if "traffic_fines" in dataset_name:
    max_prefix_length = 10
elif "bpic2017" in dataset_name:
    max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
else:
    max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

# split into training and test
train, test = dataset_manager.split_data_strict(data, train_ratio, split="temporal")

In [None]:
#prefix generation of training data
dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length) 
#get the label of the train set
train_y = dataset_manager.get_label_numeric(dt_train_prefixes)   

In [None]:
#transform train dataset and add the column names back to the dataframe
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
feature_combiner.fit(dt_train_prefixes, train_y)
dt_train_prefixes = feature_combiner.transform(dt_train_prefixes)
dt_train_prefixes = pd.DataFrame(dt_train_prefixes)
dt_train_named = dt_train_prefixes.copy()
names= feature_combiner.get_feature_names()
dt_train_named.columns = names

#concat them back to correlate correlations

train_y = pd.DataFrame(train_y)
train_y  = train_y.rename(columns={train_y.columns[0]:'label'})
dt_train_named2 = pd.concat([dt_train_named,pd.DataFrame(train_y)], axis=1)

##### correlations (>0.15 with label)

In [None]:
#check the correlations

#Using Pearson Correlation
cor = dt_train_named2.corr()

In [None]:
#correlation with label
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>.15]
relevant_features = relevant_features.sort_values(ascending=False)
relevant_features

In [None]:
columns = list(relevant_features.index)

In [None]:
columns

In [None]:
#this is the dataframe with the features you want to check if they are correlated with eachother
correlations = dt_train_named2[columns]

In [None]:
corr = correlations.corr()

corr[corr>0.6]

In [None]:
#correlates with agg_Activity_send_fine

In [None]:
correlations = correlations.drop(columns=['agg__expense_max','agg__expense_std','agg__expense_mean','agg__Activity_Insert Fine Notification','agg__timesincelastevent_mean'])

In [None]:
#correlates with agg_Activity_send_fine

In [None]:
correlations = correlations.drop(columns=['agg__month_std'])

In [None]:
dt_train_named3 = dt_train_named2.drop(columns=['agg__expense_max','agg__expense_std','agg__expense_mean','agg__Activity_Insert Fine Notification','agg__timesincelastevent_mean','agg__month_std'])

In [None]:
list(correlations.columns)

##### correlations (>0.10 with label)

In [None]:
#check the correlations

#Using Pearson Correlation
cor = dt_train_named3.corr()

In [None]:
#correlation with label
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>.10]
relevant_features = relevant_features.sort_values(ascending=False)
relevant_features

In [None]:
columns = list(relevant_features.index)

In [None]:
#this is the dataframe with the features you want to check if they are correlated with eachother
correlations2 = dt_train_named3[columns]

In [None]:
corr = correlations2.corr()

corr[corr>0.6]

In [None]:
relevant_features

In [None]:
correlations2 = correlations2.drop(columns=['agg__open_cases_std','agg__timesincelastevent_std','agg__timesincecasestart_std'])

In [None]:
list(correlations2.columns)

## Dataset BPIC17_0_Accepted

In [None]:
dataset_ref = 'bpic2017_accepted'
params_dir = './params_dir'
results_dir = './results' 
cls_encoding = 'agg'

gap = 1
n_iter = 1

datasets = [dataset_ref] if dataset_ref not in dataset_ref_to_datasets else dataset_ref_to_datasets[dataset_ref]
methods = encoding_dict[cls_encoding]

In [None]:
# print dataset name

for dataset_name in datasets:
    print('Dataset:', dataset_name)

# read the data

dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
cls_encoder_args = {'case_id_col': dataset_manager.case_id_col, 
                        'static_cat_cols': dataset_manager.static_cat_cols,
                        'static_num_cols': dataset_manager.static_num_cols, 
                        'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                        'dynamic_num_cols': dataset_manager.dynamic_num_cols, 
                        'fillna': True}

In [None]:
#dimensions of the data

print('#columns:',len(data.columns))
print('#rows:',data.shape[0])

In [None]:
#data types
data.dtypes

In [None]:
#data description

This event log originates from the same financial institution as the BPIC2012 one.
However, the data collection has been improved, resulting in a richer and cleaner dataset. As in the previous case, the event log records execution traces of a loan application process. Similarly to BPIC2012, we define three separate labelings based on the outcome of the application; this notebook uses the one referred to as bpic2017_accepted.

### correlations

#### original dataset

In [None]:
#check the correlation with Pearson's correlation for the dataset
# Work on a copy so the raw event log in `data` keeps its original labels.
data2 = data.copy()
data2.head()

data2.label.head()

data2.label.unique()
# Encode the outcome label numerically: regular -> 0, deviant -> 1.
# Assigning the result back replaces replace(inplace=True) on data2["label"],
# which acts on a selected column and does not update data2 when pandas
# Copy-on-Write is active.
data2["label"] = data2["label"].replace({"regular": 0, "deviant": 1})
data2.label.unique()

#### transformed dataset

In [None]:
cls_method ='llm'

In [None]:
# determine min and max (truncated) prefix lengths

min_prefix_length = 1
if "traffic_fines" in dataset_name:
    max_prefix_length = 10
elif "bpic2017" in dataset_name:
    max_prefix_length = min(20, dataset_manager.get_pos_case_length_quantile(data, 0.90))
else:
    max_prefix_length = min(40, dataset_manager.get_pos_case_length_quantile(data, 0.90))

# split into training and test
train, test = dataset_manager.split_data_strict(data, train_ratio, split="temporal")

#prefix generation of training data

dt_train_prefixes = dataset_manager.generate_prefix_data(train, min_prefix_length, max_prefix_length) 
#get the label of the train set
train_y = dataset_manager.get_label_numeric(dt_train_prefixes)   

In [None]:
#transform train dataset and add the column names back to the dataframe
feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
feature_combiner.fit(dt_train_prefixes, train_y)
# Encode the prefixes of THIS dataset. The cell previously transformed the
# dt_train_named frame left over from an earlier section and then overwrote the
# result with a copy of the raw prefix data, so the encoded features were lost
# and the feature names no longer matched the columns.
dt_train_named = pd.DataFrame(feature_combiner.transform(dt_train_prefixes))
names = feature_combiner.get_feature_names()
dt_train_named.columns = names

#append the label column so correlations with the label can be computed
train_y = pd.DataFrame(train_y)
train_y = train_y.rename(columns={train_y.columns[0]: 'label'})
dt_train_named2 = pd.concat([dt_train_named, train_y], axis=1)

##### correlations (>0.2 with label)

In [None]:
#Using Pearson Correlation
cor = dt_train_named2.corr()

In [None]:
#correlation with label

cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>0.2]
relevant_features = relevant_features.sort_values(ascending=False)
relevant_features

In [None]:
columns = list(relevant_features.index)
#this is the dataframe with the features you want to check if they are correlated with eachother
correlations = dt_train_named2[columns]

In [None]:
corr = correlations.corr()
corr[corr>0.6]

In [None]:
#correlated with agg__CreditScore_std

In [None]:
correlations2 = correlations.drop(columns=['agg__Selected_True','agg__CreditScore_max','agg__CreditScore_mean','agg__CreditScore_sum'])

In [None]:
dt_train_named3 = dt_train_named2.drop(columns=['agg__Selected_True','agg__CreditScore_max','agg__CreditScore_mean','agg__CreditScore_sum'])

In [None]:
list(correlations2.columns)

##### correlations (>0.15 with label)

In [None]:
#Using Pearson Correlation
cor = dt_train_named3.corr()

In [None]:
#correlation with label
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>.15]
relevant_features = relevant_features.sort_values(ascending=False)
relevant_features

In [None]:
columns = list(relevant_features.index)

In [None]:
#this is the dataframe with the features you want to check if they are correlated with eachother
correlations = dt_train_named3[columns]

In [None]:
corr = correlations.corr()
corr[corr>0.6]

In [None]:
list(correlations.columns)

##### correlations (>0.10 with label)

In [None]:
#correlation with label
cor_target = abs(cor["label"])
#Selecting highly correlated features
relevant_features = cor_target[cor_target>.10]
relevant_features = relevant_features.sort_values(ascending=False)
relevant_features

In [None]:
columns = list(relevant_features.index)

In [None]:
correlations2 = dt_train_named3[columns]

In [None]:
cor = correlations2.corr()

In [None]:
cor[cor > 0.6]

In [None]:
#correlates with activity submitted

In [None]:
correlations2 = correlations2.drop(columns=['agg__Activity_W_Handle leads','agg__org:resource_User_1'])

In [None]:
dt_train_named4 = dt_train_named3.drop(columns=['agg__Activity_W_Handle leads','agg__org:resource_User_1'])

In [None]:
list(correlations2.columns)

##### even more

In [None]:
#correlation with label
# Recompute the correlation matrix on the full, current feature frame. At this
# point `cor` only holds the features that already passed the 0.10 threshold,
# so lowering the threshold to 0.08 on it could not reveal any new feature.
cor = dt_train_named4.corr()
cor_target = abs(cor["label"])
#Selecting highly correlated features
# Keep the sorted result (the bare sort_values call used to discard it), in
# line with the other threshold sections of this notebook.
relevant_features = cor_target[cor_target>.08].sort_values(ascending=False)
relevant_features

In [None]:
columns = list(relevant_features.index)

In [None]:
# Select from dt_train_named4, the latest BPIC2017 feature frame of this
# section. dt_train_named5 is only ever assigned in the BPIC2015 section, so it
# would hold the features of a different dataset.
correlations5 = dt_train_named4[columns]

In [None]:
cor = correlations5.corr()

In [None]:
cor[cor>.6]

In [None]:
# Drop from dt_train_named4, the latest BPIC2017 feature frame of this section.
# dt_train_named5 is only ever assigned in the BPIC2015 section and does not
# belong to this dataset.
dt_train_named6 = dt_train_named4.drop(columns=['agg__Activity_A_Validating','agg__NumberOfTerms_mean','agg__Activity_W_Call after offers'])

In [None]:
correlations5 = correlations5.drop(columns=['agg__Activity_W_Call after offers'])

In [None]:
columns = list(correlations5.columns)

In [None]:
columns

In [None]:
columns = ['static__ApplicationType_Limit raise',
 'static__ApplicationType_New credit',
 'agg__Activity_A_Cancelled',
 'agg__Activity_A_Submitted',
 'agg__Activity_W_Validate application',
 'agg__lifecycle:transition_ate_abort',
 'agg__Accepted_False',
 'agg__Accepted_True',
 'agg__Selected_False',
 'agg__CreditScore_std']