In [1]:
import pandas as pd
import numpy as np

# process mining for python
import pm4py

# machine learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

# nice plots & tables
import plotly
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display

In [2]:
def import_csv(file_path):
    event_log = pd.read_csv(file_path, sep=',')
    # make a timestamp out of the timefield
    event_log["event time:timestamp"] = pd.to_datetime(event_log["event time:timestamp"])

    # replace nan with 0:
    event_log = event_log.fillna(0)

    # change format into pm4py event_log
    event_log = pm4py.format_dataframe(event_log, case_id = 'case concept:name', activity_key = 'event concept:name',
                                       timestamp_key = "event time:timestamp", timest_format = '%Y-%m-%d %H:%M:%Sz')

    #2011-10-01T00:38:44.546+02:00
    # print what the start and end activities are:
    start_activities = pm4py.get_start_activities(event_log)
    end_activities = pm4py.get_end_activities(event_log)
    print("Start activities: {}\n\nEnd activities: {}".format(start_activities, end_activities))

    # print how many events and how many instances (cases) there are
    num_events = len(event_log)
    num_cases = len(event_log['case concept:name'].unique())
    print(' ')
    print("Number of events: {}\nNumber of cases: {}".format(num_events, num_cases))


    return event_log

In [3]:
file = "data/added_features_event_log.csv"

event_log = import_csv(file)

Start activities: {'A_SUBMITTED': 10469}

End activities: {'A_DECLINED': 2690, 'W_Valideren aanvraag': 1953, 'W_Afhandelen leads': 1864, 'W_Completeren aanvraag': 1705, 'W_Nabellen offertes': 1288, 'A_CANCELLED': 425, 'W_Nabellen incomplete dossiers': 318, 'O_CANCELLED': 174, 'W_Beoordelen fraude': 49, 'W_Wijzigen contractgegevens': 2, 'A_ACTIVATED': 1}
 
Number of events: 199668
Number of cases: 10469


In [4]:
event_log.head()

Unnamed: 0.2,Unnamed: 0,level_0,Unnamed: 0.1,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event concept:name,event lifecycle:transition,...,A_CANCELLED,W_Afhandelen leads,A_DECLINED,O_DECLINED,W_Nabellen incomplete dossiers,W_Beoordelen fraude,W_Wijzigen contractgegevens,case:concept:name,concept:name,time:timestamp
0,0,0,0,0,0,173688,2011-10-01T00:38:44.546+02:00,20000,A_SUBMITTED,COMPLETE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173688,A_SUBMITTED,2011-10-01 00:38:44.546
1,1,1,1,1,1,173688,2011-10-01T00:38:44.546+02:00,20000,A_PARTLYSUBMITTED,COMPLETE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173688,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880
2,2,2,2,2,2,173688,2011-10-01T00:38:44.546+02:00,20000,A_PREACCEPTED,COMPLETE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173688,A_PREACCEPTED,2011-10-01 00:39:37.906
3,3,3,3,3,3,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,SCHEDULE,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173688,W_Completeren aanvraag,2011-10-01 00:39:38.875
4,4,89,89,89,4,173688,2011-10-01T00:38:44.546+02:00,20000,W_Completeren aanvraag,START,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,173688,W_Completeren aanvraag,2011-10-01 11:36:46.437


In [5]:
event_log.columns

Index(['Unnamed: 0', 'level_0', 'Unnamed: 0.1', 'index', 'eventID ',
       'case concept:name', 'case REG_DATE', 'case AMOUNT_REQ',
       'event concept:name', 'event lifecycle:transition',
       'event time:timestamp', 'check', '@@index', 'previous_events',
       'previous_event', 'start', 'A_SUBMITTED', 'A_PARTLYSUBMITTED',
       'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_ACCEPTED', 'O_SELECTED',
       'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes',
       'O_SENT_BACK', 'W_Valideren aanvraag', 'A_ACTIVATED', 'A_REGISTERED',
       'A_APPROVED', 'O_ACCEPTED', 'O_CANCELLED', 'A_CANCELLED',
       'W_Afhandelen leads', 'A_DECLINED', 'O_DECLINED',
       'W_Nabellen incomplete dossiers', 'W_Beoordelen fraude',
       'W_Wijzigen contractgegevens', 'case:concept:name', 'concept:name',
       'time:timestamp'],
      dtype='object')

In [34]:
# Assumption: it doesn't matter in what order things happened before, just that they happened before.
# So I'm building a model that predicts the next event (categorical), based on:
# - the amount of money they want to borrow
# - the previous event (categorical) ->'previous_event' -> sklearn does not accept categorical for trees
# (which could be solved with hot-one-encoding, but that would add a lot more features, so let's first try without)
# - and on with events have already happened (booleans).

X = event_log[['case AMOUNT_REQ', 'A_SUBMITTED', 'A_PARTLYSUBMITTED',
               'A_PREACCEPTED', 'W_Completeren aanvraag', 'A_ACCEPTED', 'O_SELECTED',
               'A_FINALIZED', 'O_CREATED', 'O_SENT', 'W_Nabellen offertes',
               'O_SENT_BACK', 'W_Valideren aanvraag', 'A_ACTIVATED', 'A_REGISTERED',
               'A_APPROVED', 'O_ACCEPTED', 'O_CANCELLED', 'A_CANCELLED',
               'W_Afhandelen leads', 'A_DECLINED', 'O_DECLINED',
               'W_Nabellen incomplete dossiers', 'W_Beoordelen fraude',
               'W_Wijzigen contractgegevens']]

y = event_log['event concept:name']

In [7]:
print(X.head())

   case AMOUNT_REQ  A_SUBMITTED  A_PARTLYSUBMITTED  A_PREACCEPTED  \
0            20000          0.0                0.0            0.0   
1            20000          1.0                0.0            0.0   
2            20000          1.0                1.0            0.0   
3            20000          1.0                1.0            1.0   
4            20000          1.0                1.0            1.0   

   W_Completeren aanvraag  A_ACCEPTED  O_SELECTED  A_FINALIZED  O_CREATED  \
0                     0.0         0.0         0.0          0.0        0.0   
1                     0.0         0.0         0.0          0.0        0.0   
2                     0.0         0.0         0.0          0.0        0.0   
3                     0.0         0.0         0.0          0.0        0.0   
4                     1.0         0.0         0.0          0.0        0.0   

   O_SENT  ...  A_APPROVED  O_ACCEPTED  O_CANCELLED  A_CANCELLED  \
0     0.0  ...         0.0         0.0          0.0   

In [8]:
# Because we now have flattened our complex data to simple tabular data,
# a simple random forest will probably outperform a neural network
# so let's build a simple one, so that I see how well my neural network performs
# and it also means that I can use n-fold cross validation :-)

# only needed without cross validation
#X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0)

# we want to see what the best max_depth is:
depths = [1,2,3,4,5,6,7,8,9,10,15,20]
scores_df = pd.DataFrame()
for depth in depths:
    clf = RandomForestClassifier(max_depth=depth, random_state=0)
    scores_df[depth] = cross_val_score(clf, X, y, cv=5)

# without cross validation
#clf.fit(X_train, y_train)
#print(clf.score(X_test, y_test))

# with cross validation

print(scores_df)

         1         2         3         4         5         10        15  \
0  0.391170  0.580934  0.667251  0.714329  0.735689  0.762959  0.762283   
1  0.392422  0.580683  0.658937  0.718561  0.739320  0.767817  0.766314   
2  0.393249  0.582862  0.660014  0.720414  0.740897  0.767191  0.766415   
3  0.392082  0.581173  0.656199  0.715173  0.734881  0.757970  0.755566   
4  0.391005  0.578369  0.654797  0.717201  0.739439  0.762026  0.759422   

         20  
0  0.759954  
1  0.763209  
2  0.760555  
3  0.746150  
4  0.752360  


In [10]:
fig = go.Figure()

for depth in depths:
    fig.add_trace(go.Scatter(y= scores_df[depth], name=depth))
fig.update_yaxes(range=[0,1])
fig.show()

# a depth of more than 5 doesn't increase the performance that much anymore, but it could lead it overfitting
# based on the cross validation, it looks quite stable

In [35]:
#### confusion matrix ####
# to see where the most mistakes are made

# model
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2, random_state=0)
clf = RandomForestClassifier(max_depth=10, random_state=0)
clf.fit(X_train, y_train)
print('The accuracy of this model is: ', clf.score(X_train, y_train))

# list of predictions
y_pred_list = list(clf.predict(X_test))
y_list = list(y_test)

print('Confusion Matrix')
my_matrix=pd.DataFrame(confusion_matrix(y_list, y_pred_list, labels=pd.unique(y_list)), index=pd.unique(y_list))
my_matrix.columns = pd.unique(y_list)
display(my_matrix)

The accuracy of this model is:  0.7664554822392227
Confusion Matrix


Unnamed: 0,A_CANCELLED,W_Afhandelen leads,W_Completeren aanvraag,A_SUBMITTED,A_REGISTERED,W_Valideren aanvraag,W_Nabellen offertes,O_SENT_BACK,A_DECLINED,A_PARTLYSUBMITTED,...,A_ACCEPTED,A_FINALIZED,O_SELECTED,W_Nabellen incomplete dossiers,A_PREACCEPTED,A_ACTIVATED,A_APPROVED,O_DECLINED,O_ACCEPTED,W_Beoordelen fraude
A_CANCELLED,0,0,147,0,0,5,222,0,0,0,...,0,0,10,20,0,0,0,0,0,0
W_Afhandelen leads,0,1589,405,0,0,0,0,0,73,0,...,0,0,0,0,547,0,0,0,0,10
W_Completeren aanvraag,0,0,7620,0,0,0,797,0,0,0,...,0,1,42,0,0,0,0,0,0,0
A_SUBMITTED,0,0,0,2107,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A_REGISTERED,0,0,0,0,175,19,0,0,0,0,...,0,0,0,17,0,11,59,0,27,0
W_Valideren aanvraag,0,0,0,0,0,1865,0,0,0,0,...,0,0,0,1009,0,0,0,0,0,0
W_Nabellen offertes,0,0,0,0,0,500,7546,0,0,0,...,0,0,0,0,0,0,0,0,0,0
O_SENT_BACK,0,0,0,0,0,0,468,0,0,0,...,0,0,0,21,0,0,0,0,0,0
A_DECLINED,0,489,180,0,0,35,2,0,134,0,...,0,0,7,16,301,0,0,0,0,13
A_PARTLYSUBMITTED,0,0,0,0,0,0,0,0,0,2156,...,0,0,0,0,0,0,0,0,0,0


In [None]:
from sklearn.neighbors import KNeighborsClassifier

nr_of_neighbors = [1,3,5,7,11]
scores_knn_df = pd.DataFrame()

for nr in nr_of_neighbors:
    neigh = KNeighborsClassifier(n_neighbors=nr)
    scores_knn_df[nr] = cross_val_score(neigh, X, y, cv=5)


print(scores_df)

fig = go.Figure()

for nr in nr_of_neighbors:
    fig.add_trace(go.Scatter(y= scores_knn_df[nr], name=nr))
fig.update_yaxes(range=[0,1])
fig.show()