In [124]:
import pm4py
import pandas as pd
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import pickle
import numpy as np
import tqdm
import seaborn as sns
from project_functions import *
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score


In [2]:
log_df = None
with open("data/log_df_filtered.pkl", "rb") as f:
    log_df = pickle.load(f)

print("We have",len(log_df['case:concept:name'].unique()),"cases")

We have 130883 cases


# Dataset creation
First of all we filter all the c

# Create the datase

There will be four predicted classes:
* (1) short: < 7 days
* (2) medium-short: < 3 month
* (3) medium-long: <= 1 year
* (4) long: > 1 year

we include 4 column (max activities within 3 days is 4) where we will have a lot of missing values. We will expect a lot of missing values for act2, act3 and act4, but this is not a problem since we want to leverage the missing information treating those data as MNAR (missing not at random).

We want to use the information from the fact that more activities happened in a short time span, and look if there is a correlation with the duration of the case.

In [125]:
delta_time = 3 * 24 * 60 * 60

column_names = ['amount', 'article', 'act1', 'act2', 'act3','act4', 'org:resource', 'day','month','vehicleClass','case_duration']

#Structure to create the dataframe
temp_list=[]
temp_row = [len(column_names)]

#iter on all subdf of the same case
for case_id, group in log_df.groupby('case:concept:name'):
    temp_row = ['missing' for i in range(len(column_names))]
    start_ts = group['time:timestamp'].min()
    
    #take amount where case:concept:name = Create Fine
    temp_row[0] = group[group['concept:name'] == 'Create Fine']['amount'].values[0]
    #take article
    temp_row[1] = str(int(group[group['concept:name'] == 'Create Fine']['article'].values[0]))
    #take resource
    temp_row[6] = group[group['concept:name'] == 'Create Fine']['org:resource'].values[0]
    #vehicle class
    temp_row[9] = group[group['concept:name'] == 'Create Fine']['vehicleClass'].values[0]
    #print(amount, article, res, vc)
    
    #get day and month
    temp_row[7] = group['time:timestamp'].min().day
    temp_row[8] = group['time:timestamp'].min().month
    
    #get duration (expressed in days)
    case_duration = (group['time:timestamp'].max() - group['time:timestamp'].min()).total_seconds()/ (24 * 60 * 60)
    if case_duration < 7:
        temp_row[len(column_names)-1]= 'short'
    elif case_duration < 3*30: #3 months
        temp_row[len(column_names)-1]= 'medium-short'
    elif case_duration <= 365: #1 year
        temp_row[len(column_names)-1]= 'medium-long'
    else:
        temp_row[len(column_names)-1]= 'long'
        



    for count, (index, row) in enumerate(group.iterrows()):

        delta_activity = row['time:timestamp'] - start_ts
        if delta_activity <= pd.Timedelta(delta_time, unit='s'):
            #print(row['concept:name'],delta_activity)
            temp_row[2+count] = row['concept:name']
        else:
            break
    temp_list.append(temp_row)
    
dataset = pd.DataFrame(temp_list,columns=column_names)
dataset
    

Unnamed: 0,amount,article,act1,act2,act3,act4,org:resource,day,month,vehicleClass,case_duration
0,35.0,157,Create Fine,missing,missing,missing,561,2,8,A,long
1,36.0,157,Create Fine,missing,missing,missing,561,9,3,A,long
2,36.0,157,Create Fine,missing,missing,missing,537,19,3,A,medium-long
3,36.0,157,Create Fine,missing,missing,missing,537,20,3,A,long
4,36.0,157,Create Fine,Payment,missing,missing,537,20,3,A,short
...,...,...,...,...,...,...,...,...,...,...,...
130878,131.0,142,Create Fine,missing,missing,missing,25,7,9,A,long
130879,131.0,142,Create Fine,missing,missing,missing,25,7,9,A,medium-short
130880,131.0,142,Create Fine,missing,missing,missing,25,7,9,M,long
130881,131.0,142,Create Fine,missing,missing,missing,25,7,9,A,long


# Train the model

In [127]:
X = dataset.iloc[:, :-1]  # All columns except the last one
y = dataset.iloc[:, -1]   # The last column

# One-hot encode the categorical columns
X = pd.get_dummies(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize the Decision Tree classifier
clf = DecisionTreeClassifier(criterion='entropy',
                             splitter='best',
                             max_depth=10,
)

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions on the test set
y_pred = clf.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
#precision
precision = precision_score(y_test, y_pred, average='weighted')
#recall
recall = recall_score(y_test, y_pred, average='weighted')
#f1-score
f1= f1_score(y_test, y_pred, average='weighted')

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1-score: {f1 * 100:.2f}%')

Accuracy: 61.89%
Precision: 65.81%
Recall: 61.89%
F1-score: 52.02%


In [112]:
from sklearn import tree
plt.figure(figsize=(50,50))
tree.plot_tree(clf,fontsize=10)
plt.show()


KeyboardInterrupt: 