In [168]:
#IMPORTS
import pm4py
import sklearn
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error
from pm4py.algo.transformation.log_to_features import algorithm as log_to_features
from itertools import product

# Prepare the Data
The code below is used to generate the CSV file. You can also skip running it and instead load the CSV directly a few cells below.

First, make sure the permit log (`PermitLog.xes.gz`) is in the current directory, then run the code below.

In [3]:
# Parse the gzipped XES event log; the relative path means the file must sit in
# the notebook's working directory.
travel_permits = pm4py.read_xes('PermitLog.xes.gz')

parsing log, completed traces ::   0%|          | 0/7065 [00:00<?, ?it/s]

First, we extract the prefixes we need: for each case we keep only the events leading up to (but not including) 'Start trip'.

In [62]:
# Cut every trace at 'Start trip' so that only the events before it remain
# (the check in the next cell prints one trace before and after filtering).
travel_prefixes = pm4py.filtering.filter_prefixes(travel_permits, 'Start trip')

In [66]:
# Sanity check on a single case: list the activities of the full trace, then a
# blank line, then the activities of its prefix.
trace_index = 39
full_activities = [event['concept:name'] for event in travel_permits[trace_index]]
prefix_activities = [event['concept:name'] for event in travel_prefixes[trace_index]]
# The empty string in the middle produces the blank separator line.
print(*(full_activities + [''] + prefix_activities), sep='\n')

Permit SUBMITTED by EMPLOYEE
Permit FINAL_APPROVED by SUPERVISOR
Start trip
End trip
Declaration SUBMITTED by EMPLOYEE
Declaration FINAL_APPROVED by SUPERVISOR
Request Payment
Payment Handled

Permit SUBMITTED by EMPLOYEE
Permit FINAL_APPROVED by SUPERVISOR


Now extract the features we are interested in from the prefixes:

In [87]:
# Features taken from each prefix: the event attribute "concept:name" (string)
# and the trace attributes "RequestedBudget" / "OverspentAmount" (numeric).
feature_parameters = {
    "str_ev_attr": ["concept:name"],
    'num_tr_attr': ["RequestedBudget", "OverspentAmount"],
}
# The second return value is discarded here.
# NOTE(review): later cells treat the last column as OverspentAmount - confirm
# against the discarded feature names.
data, _ = log_to_features.apply(travel_prefixes, parameters=feature_parameters)

# Convert to a 2-D NumPy array (one row per trace) for the steps below.
data = np.array(data)
data.shape

(7065, 46)

However, we still need the trip duration, which we have to compute ourselves from the 'Start trip' and 'End trip' timestamps. We express the duration as a number of whole days.

In [93]:
# Trip duration per case, in whole days, computed on the full (unfiltered) log:
# from the last "Start trip" seen before the first "End trip" up to that
# "End trip".
trip_durations = []
for trace_idx, trace in enumerate(travel_permits):
    # Reset for every trace so that a case lacking one of the two events cannot
    # silently reuse the timestamps left over from the previous case.
    start_time = end_time = None
    for e in trace:
        if e['concept:name'] == "Start trip":
            start_time = e['time:timestamp']
        elif e['concept:name'] == "End trip":
            end_time = e['time:timestamp']
            break
    if start_time is None or end_time is None:
        # Fail loudly instead of writing a meaningless duration into the data.
        raise ValueError(
            "Trace {} has no 'Start trip' followed by an 'End trip'; "
            "cannot compute its trip duration".format(trace_idx))
    # .days is the whole-day component of the timedelta.
    trip_durations.append((end_time - start_time).days)

# Column vector, so it can be concatenated with the feature matrix along axis=1.
trip_durations = np.array(trip_durations).reshape((-1,1))
trip_durations.shape

(7065, 1)

Now we merge the trip duration feature with the data we just extracted

In [100]:
# Insert the trip duration just before the last feature column, so that the
# last column stays last; it is used as the prediction target further down.
# NOTE(review): assumes the last column of `data` is OverspentAmount - confirm.
leading_features = data[:, :-1]
target_column = data[:, -1:]
final_data = np.concatenate((leading_features, trip_durations, target_column), axis=1)
final_data.shape

(7065, 47)

We can now save the data to CSV:

In [105]:
# Persist the assembled matrix as comma-separated text so that the preparation
# cells above do not have to be re-run.
np.savetxt(fname='ML_data.csv', X=final_data, delimiter=',')

# Load the CSV
Or skip the below cell if you already ran the above cells

In [106]:
# Read the matrix written by the save cell back into memory.
final_data = np.genfromtxt(fname='ML_data.csv', delimiter=',')
final_data.shape

(7065, 47)

# Train the ML model
First, we split into train, test and validation sets. We do not shuffle because for process mining it is important that the validation and test sets are further in the future than the training set. The training set is 50%, validation 25%, and test set 25%.

In [154]:
# Order-preserving 50% / 25% / 25% split into train / validation / test.
# Every column but the last is a feature; the last column is the target.
# NOTE(review): shuffle=False only keeps the existing row order - confirm that
# the log itself is ordered in time.
split_in_order = sklearn.model_selection.train_test_split
features = final_data[:, :-1]
target = final_data[:, -1]

# First cut: first half for training, second half held out.
X_train, X_valtest, y_train, y_valtest = split_in_order(
    features, target, train_size=0.5, shuffle=False)
# Second cut: halve the held-out part into validation and test.
X_val, X_test, y_val, y_test = split_in_order(
    X_valtest, y_valtest, train_size=0.5, shuffle=False)

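Next, we do a parameter search and choose the best parameters based on performance on the validation set. The score returned by `model.score` is the coefficient of determination $R^2$, which has a maximum value of 1.0 and no minimum value; a score around 0 means the model does no better than always predicting the mean of the observed targets.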

In [164]:
#no gridsearchcv because it doesn't allow for a validation set
best_score = -10000
for n_est, min_split in product([25,50,100,200], [1,20,50]):
    model = RandomForestRegressor(n_estimators = n_est, min_samples_split = min_split)
    model.fit(X_train, y_train)
    score = model.score(X_val, y_val)
    print("n_estimators: {}, min_samples_split: {}, score: {}".format(n_est, min_split, score))
    if score > best_score:
        best_score = score
        best_param = n_est, min_split

n_estimators: 25, min_samples_split: 1, score: -0.00018689364638735384
n_estimators: 25, min_samples_split: 20, score: -0.0003260712066353033
n_estimators: 25, min_samples_split: 50, score: -0.00035390510172073064
n_estimators: 50, min_samples_split: 1, score: 4.247169866888889e-05
n_estimators: 50, min_samples_split: 20, score: -0.00014095695085547533
n_estimators: 50, min_samples_split: 50, score: -0.0004382467879173735
n_estimators: 100, min_samples_split: 1, score: -0.00024208112698587136
n_estimators: 100, min_samples_split: 20, score: -0.0002975917960832408
n_estimators: 100, min_samples_split: 50, score: -0.00028339729521809076
n_estimators: 200, min_samples_split: 1, score: -0.000109498449644585
n_estimators: 200, min_samples_split: 20, score: -0.00039018104932475595
n_estimators: 200, min_samples_split: 50, score: -0.000320108781380668


Now we take the model with the best performing parameters and test its performance on the test set

In [167]:
# Refit a forest with the hyper-parameters selected on the validation set and
# report its R^2 on the held-out test set.
best_n_estimators, best_min_samples_split = best_param
# Fixed random_state so the reported test score is reproducible across runs.
best_model = RandomForestRegressor(n_estimators = best_n_estimators,
                                   min_samples_split = best_min_samples_split,
                                   random_state = 0)
best_model.fit(X_train, y_train)
best_model.score(X_test, y_test)

0.06066773139999637

This score metric isn't the most intuitive, so let's compute mean absolute error on the test set as well.

In [169]:
# Mean absolute error on the test set, in the same unit as the target.
mean_absolute_error(y_true=y_test, y_pred=best_model.predict(X_test))

516.9346679294575

There is clearly still room for improvement