In [1]:
### NOTES ###

# 2012 works with both feature spaces
# 2012 WORKS ON FEATURE SPACE 1: TEST ACCURACY = 0.5483
# 2012 WORKS ON FEATURE SPACE 2: TEST ACCURACY = 0.7370

# created test features for 2017
# created train features for 2017
# 2017 WORKS ON FEATURE SPACE 1: TEST ACCURACY = 0.4813
# 2017 WORKS ON FEATURE SPACE 2: TEST ACCURACY = 0.3642

# 2018 - when you read the csv, it requires encoding = latin-1
# 2018 WORKS ON FEATURE SPACE 1: TEST ACCURACY = 0.2792
# 2018 WORKS ON FEATURE SPACE 2: TEST ACCURACY = 0.1995
# created train features for 2018
# created test features for 2018

# ROAD FINES WORKS ON FEATURE SPACE 1: TEST ACCURACY = 0.8622
# ROAD FINES WORKS ON FEATURE SPACE 2: TEST ACCURACY = 0.8593
# created test features for road fines
# created train features for road fines

# !!!!! IF YOU WANT TO RUN THE NOTEBOOK, ADD TO YOUR data FOLDER added_features_event_log_update_train.csv FROM GOOGLE DRIVE

In [2]:
### IMPORT PACKAGES ###

import pandas as pd
import numpy as np
from datetime import datetime

# process mining for python
import pm4py

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

# nice plots & tables
import plotly.express as px
import plotly.graph_objects as go

# to make a counter object
from collections import Counter

In [3]:
# Defining database-specific variables

case_column = "case concept:name"
lifecycle_column = 'event lifecycle:transition'
event_column = "event concept:name"
timestamp_column = "event time:timestamp"
position_column = "position"
req_amount_column = 'case AMOUNT_REQ'
previous_event_column = 'previous_event'
previous_lifecycle_column = 'previous lifecycle'
previous_position_column = 'previous_position'
day_of_year_column = "day_of_year"
started_today_column = "started_today"
prev_started_today_column = 'previous_started_today'
timeformat_timestamp = '%Y-%m-%d %H:%M:%S.%f'
most_common_column = 'most_common_event_name'
previous_most_common_column = 'previous_most_common_event_name'

In [4]:
# ### IMPORT CSV FUNCTION ###

def import_csv(file_path):
    try:
        event_log = pd.read_csv(file_path, sep=',')
    except:
        event_log = pd.read_csv(file_path, sep=',', encoding = 'latin-1')
    # make a timestamp out of the timefield
    try:
        event_log[timestamp_column] = [datetime.strptime(date, timeformat_timestamp) for date in event_log[timestamp_column]]
    except:
        try:
            event_log[timestamp_column] = [datetime.strptime(date, "%d-%m-%Y %H:%M:%S.%f") for date in event_log[timestamp_column]]
        except:
            event_log[timestamp_column] = [datetime.strptime(date, "%Y-%m-%d") for date in event_log[timestamp_column]]
    event_log = event_log.sort_values(by=[timestamp_column])  # sort values by user and time of event


    # replace nan with 0:
    event_log = event_log.fillna(0)

    # change format into pm4py event_log
    event_log = pm4py.format_dataframe(event_log, case_id = case_column, activity_key = event_column,
                                       timestamp_key = timestamp_column, timest_format = '%Y-%m-%d %H:%M:%Sz')

    #2011-10-01T00:38:44.546+02:00
    # print what the start and end activities are:
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print("Start activities: {}\n\nEnd activities: {}".format(start_activities, end_activities))

    # print how many events and how many instances (cases) there are
    num_events = len(event_log)
    num_cases = len(event_log[case_column].unique())
    print(' ')
    print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))


    return event_log

# MAKE SURE THAT YOU CHANGE THIS DEPENDING ON THE USED DATASET

In [5]:
data_2012 = False
added_features = True

In [6]:
### IMPORT TRAIN DATA ###

file = "data/added_features_event_log_update_train.csv"
event_log = import_csv(file)
event_names_list = list(event_log[event_column].unique())

Start activities: {'A_SUBMITTED': 10469}

End activities: {'A_DECLINED': 2690, 'W_Valideren aanvraag': 2293, 'W_Afhandelen leads': 1846, 'W_Completeren aanvraag': 1473, 'W_Nabellen offertes': 905, 'A_CANCELLED': 602, 'W_Nabellen incomplete dossiers': 338, 'O_CANCELLED': 268, 'W_Beoordelen fraude': 49, 'W_Wijzigen contractgegevens': 4, 'A_APPROVED': 1}
 
Number of events: 214377
Number of cases: 10469


In [7]:
### IMPORT TEST DATA ###

file = "data/added_features_event_log_update_test.csv"
test_event_log = import_csv(file)
test_event_names_list = list(test_event_log[event_column].unique())

Start activities: {'A_SUBMITTED': 2618}

End activities: {'A_DECLINED': 739, 'W_Completeren aanvraag': 468, 'W_Valideren aanvraag': 454, 'W_Nabellen offertes': 389, 'W_Afhandelen leads': 388, 'W_Nabellen incomplete dossiers': 114, 'A_CANCELLED': 43, 'O_CANCELLED': 15, 'W_Beoordelen fraude': 8}
 
Number of events: 47823
Number of cases: 2618


In [8]:
# LOG FEATURE -> number of new cases in that day

# first we add a column with the day number(day_of_year)

test_event_log[day_of_year_column] = [date.day_of_year for date in test_event_log[timestamp_column]]

event_log[day_of_year_column] = [date.day_of_year for date in event_log[timestamp_column]]

# now we want to see how many new cases were submitted that day until the given even/row

test_event_log[position_column] =  test_event_log.groupby(case_column).cumcount() + 1  # create the position column
test_event_log[started_today_column] = test_event_log[test_event_log[position_column]==1].groupby(day_of_year_column).cumcount() + 1  # cummulative sum per day for all events that have the position 1 
test_event_log[started_today_column] = test_event_log.groupby(day_of_year_column)[started_today_column].fillna(method='ffill')  # now we fill the NaN values after the first event that started that day
test_event_log[started_today_column] = test_event_log[started_today_column].fillna(0)  # finally we fill in all the remaining NaN values with 0; these are the events that happen before the first event where position == 1

event_log[position_column] =  event_log.groupby(case_column).cumcount() + 1  # create the position column
event_log[started_today_column] = event_log[event_log[position_column]==1].groupby(day_of_year_column).cumcount() + 1  # cummulative sum per day for all events that have the position 1 
event_log[started_today_column] = event_log.groupby(day_of_year_column)[started_today_column].fillna(method='ffill')  # now we fill the NaN values after the first event that started that day
event_log[started_today_column] = event_log[started_today_column].fillna(0)  # finally we fill in all the remaining NaN values with 0; these are the events that happen before the first event where position == 1

In [9]:
# Make column with previous day of year since that's what we use for RF

test_event_log[prev_started_today_column] = np.nan
test_event_log[prev_started_today_column] = test_event_log.groupby(case_column)[started_today_column].shift(1)
test_event_log[prev_started_today_column] = test_event_log[prev_started_today_column].fillna(-1) # it's not relevant what we fill it in with because we'll never have to predict the first event of a trace

event_log[prev_started_today_column] = np.nan
event_log[prev_started_today_column] = event_log.groupby(case_column)[started_today_column].shift(1)
event_log[prev_started_today_column] = event_log[prev_started_today_column].fillna(-1)

In [10]:
# Previous position column
test_event_log[previous_position_column] = np.nan
test_event_log[previous_position_column] = test_event_log.groupby(case_column)[position_column].shift(1)
test_event_log[previous_position_column] = test_event_log[previous_position_column].fillna(-1)

event_log[previous_position_column] = np.nan
event_log[previous_position_column] = event_log.groupby(case_column)[position_column].shift(1)
event_log[previous_position_column] = event_log[previous_position_column].fillna(-1)

In [11]:
# LOG FEATURE -> most common event name for that position (until the occurence of the current event - we don't count the future events)
dict_pos_events = {}
def most_common_event(event_name, position):
    if (position in dict_pos_events):
        if (event_name in dict_pos_events[position]):
            dict_pos_events[position][event_name] = dict_pos_events[position][event_name] + 1
        else:
            dict_pos_events[position][event_name] = 1
    else:
        dict_pos_events[position] = {}
        dict_pos_events[position][event_name] = 1
    return max(dict_pos_events[position], key=dict_pos_events[position].get)

event_log[most_common_column] = [most_common_event(event_name, position) for event_name, position in zip(event_log[event_column], event_log[position_column])]

dict_pos_events = {}
def most_common_event(event_name, position):
    if (position in dict_pos_events):
        if (event_name in dict_pos_events[position]):
            dict_pos_events[position][event_name] = dict_pos_events[position][event_name] + 1
        else:
            dict_pos_events[position][event_name] = 1
    else:
        dict_pos_events[position] = {}
        dict_pos_events[position][event_name] = 1
    return max(dict_pos_events[position], key=dict_pos_events[position].get)

test_event_log[most_common_column] = [most_common_event(event_name, position) for event_name, position in zip(test_event_log[event_column], test_event_log[position_column])]

In [12]:
test_event_log[previous_most_common_column] = np.nan
test_event_log[previous_most_common_column] = test_event_log.groupby(case_column)[most_common_column].shift(1)
test_event_log[previous_most_common_column] = test_event_log[previous_most_common_column].fillna(-1)

event_log[previous_most_common_column] = np.nan
event_log[previous_most_common_column] = event_log.groupby(case_column)[most_common_column].shift(1)
event_log[previous_most_common_column] = event_log[previous_most_common_column].fillna(-1)

In [13]:
# test_event_log[test_event_log[case_column]==206366][:]  # see this for an example
test_event_log

Unnamed: 0.1,Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,event time:timestamp,@@index,...,case:concept:name,concept:name,time:timestamp,day_of_year,position,started_today,previous_started_today,previous_position,most_common_event_name,previous_most_common_event_name
0,0,0,44964012621824,206324,2012-02-03T17:17:11.047+01:00,2500,A_SUBMITTED,COMPLETE,2012-02-03 17:17:11.047,0,...,206324,A_SUBMITTED,2012-02-03 17:17:11.047,34,1,1.0,-1.0,-1.0,A_SUBMITTED,-1
1,1,1,44964012621825,206324,2012-02-03T17:17:11.047+01:00,2500,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:17:11.323,1,...,206324,A_PARTLYSUBMITTED,2012-02-03 17:17:11.323,34,2,1.0,1.0,1.0,A_PARTLYSUBMITTED,A_SUBMITTED
2,2,2,44964012621826,206324,2012-02-03T17:17:11.047+01:00,2500,A_DECLINED,COMPLETE,2012-02-03 17:17:42.964,2,...,206324,A_DECLINED,2012-02-03 17:17:42.964,34,3,1.0,1.0,2.0,A_DECLINED,A_PARTLYSUBMITTED
3,0,3,44968307589120,206327,2012-02-03T17:23:41.949+01:00,6000,A_SUBMITTED,COMPLETE,2012-02-03 17:23:41.949,3,...,206327,A_SUBMITTED,2012-02-03 17:23:41.949,34,1,2.0,-1.0,-1.0,A_SUBMITTED,-1
4,1,4,44968307589121,206327,2012-02-03T17:23:41.949+01:00,6000,A_PARTLYSUBMITTED,COMPLETE,2012-02-03 17:23:42.504,4,...,206327,A_PARTLYSUBMITTED,2012-02-03 17:23:42.504,34,2,2.0,2.0,1.0,A_PARTLYSUBMITTED,A_SUBMITTED
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47818,1,38039,56203942035457,214376,2012-02-29T23:51:16.799+01:00,15000,A_PARTLYSUBMITTED,COMPLETE,2012-02-29 23:51:17.423,47818,...,214376,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423,60,2,138.0,138.0,1.0,A_PARTLYSUBMITTED,A_SUBMITTED
47819,2,38040,56203942035458,214376,2012-02-29T23:51:16.799+01:00,15000,W_Afhandelen leads,SCHEDULE,2012-02-29 23:52:01.287,47819,...,214376,W_Afhandelen leads,2012-02-29 23:52:01.287,60,3,138.0,138.0,2.0,A_PREACCEPTED,A_PARTLYSUBMITTED
47820,3,38096,56203942035459,214376,2012-02-29T23:51:16.799+01:00,15000,W_Afhandelen leads,START,2012-03-01 09:26:46.736,47820,...,214376,W_Afhandelen leads,2012-03-01 09:26:46.736,61,4,0.0,138.0,3.0,W_Completeren aanvraag,A_PREACCEPTED
47821,4,38099,56203942035460,214376,2012-02-29T23:51:16.799+01:00,15000,A_DECLINED,COMPLETE,2012-03-01 09:27:37.118,47821,...,214376,A_DECLINED,2012-03-01 09:27:37.118,61,5,0.0,0.0,4.0,W_Completeren aanvraag,W_Completeren aanvraag


In [14]:
### FEATURESPACES ###
if not added_features:
    test_event_log[previous_event_column] = test_event_log.groupby(case_column)[event_column].shift(1)
    event_log[previous_event_column] = event_log.groupby(case_column)[event_column].shift(1)

    test_event_log[previous_lifecycle_column] = test_event_log.groupby(case_column)[lifecycle_column].shift(1)
    event_log[previous_lifecycle_column] = event_log.groupby(case_column)[lifecycle_column].shift(1)



# FeatureSpace 1
X_1 = event_log[[previous_position_column, prev_started_today_column, previous_event_column, previous_lifecycle_column]]

X_1 = pd.get_dummies(X_1)

# Labels
y = event_log[event_column]

In [15]:
### FEATURESPACES - test ###

# FeatureSpace 1
X_1_test = test_event_log[[previous_position_column, prev_started_today_column, previous_event_column, previous_lifecycle_column]]

X_1_test = pd.get_dummies(X_1_test)

# Labels
y_test = test_event_log[event_column]

In [16]:
# Make sure the two sets have the same columns, otherwise we get errors when trying to run the RF model

for column in list(set(X_1_test).difference(X_1)):
    X_1[column] = 0

for column in list(set(X_1).difference(X_1_test)):
    X_1_test[column] = 0

In [17]:
if added_features:
    if data_2012:
        # FeatureSpace 2
        X_2 = event_log[[prev_started_today_column, previous_position_column, req_amount_column, previous_most_common_column, previous_event_column, previous_lifecycle_column] + test_event_names_list + list(set(event_names_list).difference(test_event_names_list))]
        X_2 = pd.get_dummies(X_2)

        X_2_test = test_event_log[[prev_started_today_column, previous_position_column, req_amount_column, previous_most_common_column, previous_event_column, previous_lifecycle_column] + test_event_names_list]
        X_2_test = pd.get_dummies(X_2_test)

        for column in list(set(X_2_test).difference(X_2)):
            X_2[column] = 0

        for column in list(set(X_2).difference(X_2_test)):
            X_2_test[column] = 0

    else:
        # FeatureSpace 2
        X_2 = event_log[[prev_started_today_column, previous_position_column, previous_most_common_column, previous_event_column, previous_lifecycle_column] + test_event_names_list + list(set(event_names_list).difference(test_event_names_list))]
        X_2 = pd.get_dummies(X_2)

        X_2_test = test_event_log[[prev_started_today_column, previous_position_column, previous_most_common_column, previous_event_column, previous_lifecycle_column] + test_event_names_list]
        X_2_test = pd.get_dummies(X_2_test)

        for column in list(set(X_2_test).difference(X_2)):
            X_2[column] = 0

        for column in list(set(X_2).difference(X_2_test)):
            X_2_test[column] = 0

In [18]:
### RandomForest ###

# FeatureSpace 1
rf_1 = RandomForestClassifier(max_depth=15, random_state=0)
X_1_cv_rf = np.mean(cross_val_score(rf_1, X_1, y, cv=5))
print('The average 5 fold cross validation accuracy for RandomForest on FeatureSpace 1 is: ', X_1_cv_rf)
rf_1.fit(X_1, y)
print('The test accuracy for RandomForest on FeatureSpace 1 is: ', rf_1.score(X_1_test, y_test))

if added_features:
    # FeatureSpace 2
    rf_2 = RandomForestClassifier(max_depth=15, random_state=0)
    X_2_cv_rf = np.mean(cross_val_score(rf_2, X_2, y, cv=5))
    print('The average 5 fold cross validation accuracy for RandomForest on FeatureSpace 2 is: ', X_2_cv_rf)
    rf_2.fit(X_2, y)
    print('The test accuracy for RandomForest on FeatureSpace 2 is: ', rf_2.score(X_2_test, y_test))

The average 5 fold cross validation accuracy for RandomForest on FeatureSpace 1 is:  0.7646622372686631
The test accuracy for RandomForest on FeatureSpace 1 is:  0.5483344834075654
The average 5 fold cross validation accuracy for RandomForest on FeatureSpace 2 is:  0.8368015079079492
The test accuracy for RandomForest on FeatureSpace 2 is:  0.7370093887878217


In [19]:
# ### RANDOM FOREST ### => this is optimization, so you don't have to run this anymore


# # Since the data is flattened, we can use cross validation.
# # To optimize a random forest, one can play around with the depth of the trees.
# depths = [1,2,3,4,5,6,7,8,9,10,15,20]
# X_1_rf_scores = pd.DataFrame()
# X_2_rf_scores = pd.DataFrame()
# for depth in depths:
#     clf = RandomForestClassifier(max_depth=depth, random_state=0)
#     X_1_rf_scores[depth] = cross_val_score(clf, X_1, y, cv=5)
#     X_2_rf_scores[depth] = cross_val_score(clf, X_2, y, cv=5)



# # plot all the traces for featureSpace 1
# fig = go.Figure()

# for depth in depths:
#     fig.add_trace(go.Scatter(y= X_1_rf_scores[depth], name=depth))
# fig.update_yaxes(range=[0,1], title='Accuracy')
# fig.update_xaxes(title='5 fold cross validation')
# fig.update_layout(title= '5 fold cross validation on random forest on featureSpace X_1, for different tree depths.',
#                   legend_title_text='tree depths')
# fig.show()



# # plot all the traces for featureSpace 2
# fig = go.Figure()

# for depth in depths:
#     fig.add_trace(go.Scatter(y= X_2_rf_scores[depth], name=depth))
# fig.update_yaxes(range=[0,1], title='Accuracy')
# fig.update_xaxes(title='5 fold cross validation')
# fig.update_layout(title= '5 fold cross validation on random forest on featureSpace X_2, for different tree depths.',
#                   legend_title_text='tree depths')
# fig.show()

In [20]:
# ### OPTIMIZED MODELS ### => this is optimization, so you don't have to run this anymore

# optimal_depth_X_1 = 15
# optimal_depth_X_2 = 15

# # optimal_k_X_1 = 50
# # optimal_k_X_2 = 50

# # average accuracy
# X_1_rf = np.mean(X_1_rf_scores[optimal_depth_X_1])
# X_2_rf = np.mean(X_2_rf_scores[optimal_depth_X_2])
# # X_1_knn = np.mean(X_1_knn_scores[optimal_k_X_1])
# # X_2_knn = np.mean(X_2_knn_scores[optimal_k_X_2])
# baseline = round(((my_counter.most_common(1)[0][1])/len(event_log)), 2)

# # make a plot with all values
# fig = go.Figure()

# fig.add_trace(go.Scatter(x=['X_1', 'X_2'], y= [baseline, baseline], name='Baseline'))
# fig.add_trace(go.Scatter(x=['X_1', 'X_2'], y= [X_1_rf, X_2_rf], name='RandomForest'))
# # fig.add_trace(go.Scatter(x=['X_1', 'X_2'], y= [X_1_knn, X_2_knn], name='KNN'))

# fig.update_yaxes(range=[0,1], title='Accuracy')
# fig.update_xaxes(title='mean 5 fold cross validation per featureSpace')
# fig.update_layout(title= '5 fold cross validation on randomForest and knn on featureSpaces X_1 and X_2.',
#                   legend_title_text='algorithm')
# fig.show()

In [20]:
# Results with previous_started_today and previous_position:
# F1: CV: 0.7670 and test: 0.5085
# F2: CV: 0.8394 and test: 0.6932

# Results with previous_started_today and previous_position, but WITHOUT requested amount:
# F1: CV: 0.7656 and test: 0.4977
# F2: CV: 0.8369 and test: 0.6930

# Results with previous_position:
# F1: CV: 0.7624 and test: 0.4173
# F2: CV: 0.8398 and test: 0.7092

# Results with previous_started_today:
# F1: CV: 0.7223 and test: 0.4190
# F2: CV: 0.8378 and test: 0.6655