# Importing Libraries

In [28]:
# loading the packages
import pandas as pd
import numpy as np
import pm4py
#import matplotlib.pyplot as plt
#from sklearn.model_selection import train_test_split
#from sklearn.tree import DecisionTreeClassifier
#from sklearn.tree import plot_tree
#from sklearn.metrics import accuracy_score
#from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import mean_squared_error

# Loading Data

In [2]:
log = pm4py.read_xes('BPI_Challenge_2012.xes')
df = pm4py.convert_to_dataframe(log)

In [3]:
# translating the Dutch phrases in the 'concept:name' column to English
translation_dict = {
    'W_Completeren aanvraag': 'W_Complete request',
    'W_Nabellen offertes': 'W_Follow up quotes',
    'W_Nabellen incomplete dossiers': 'W_Follow up incomplete files',
    'W_Valideren aanvraag': 'W_Validate request',
    'W_Afhandelen leads': 'W_Handle leads',
    'A_SUBMITTED': 'A_SUBMITTED',
    'A_PARTLYSUBMITTED': 'A_PARTLYSUBMITTED',
    'A_DECLINED': 'A_DECLINED',
    'A_PREACCEPTED': 'A_PREACCEPTED',
    'O_SENT': 'O_SENT',
    'O_CREATED': 'O_CREATED',
    'O_SELECTED': 'O_SELECTED',
    'A_ACCEPTED': 'A_ACCEPTED',
    'A_FINALIZED': 'A_FINALIZED',
    'O_CANCELLED': 'O_CANCELLED',
    'O_SENT_BACK': 'O_SENT_BACK',
    'A_CANCELLED': 'O_CANCELLED',
    'A_REGISTERED': 'A_REGISTERED',
    'A_ACTIVATED': 'A_ACTIVATED',
    'A_APPROVED': 'A_APPROVED',
    'O_ACCEPTED': 'O_ACCEPTED',
    'O_DECLINED': 'O_DECLINED',
    'W_Beoordelen fraude': 'W_Evaluate fraud',
    'W_Wijzigen contractgegevens': 'W_Modify contract details'
    }

In [4]:
df['concept:name_eng'] = df['concept:name'].map(translation_dict)

In [5]:
# adding position to the dataframe
df['position'] = df.groupby('case:concept:name').cumcount() + 1

In [6]:
# Adding the next activity(concept:name) to the dataframe and if the next activity is not available, then it will be fill in with No_Activity
df['next_activity'] = df.groupby('case:concept:name')['concept:name'].shift(-1).fillna('No_Activity')

In [7]:
df.drop(['case:REG_DATE', 'case:AMOUNT_REQ', 'lifecycle:transition'], axis=1, inplace = True)

In [8]:
date1 = pd.to_datetime(df['time:timestamp'], errors='coerce', format='%Y-%m-%d %H:%M:%S.%f%z')
date2 = pd.to_datetime(df['time:timestamp'], errors='coerce', format='%Y-%m-%d %H:%M:%S%z')
df['date'] = date1.fillna(date2)

In [9]:
df["date_milliseconds"] = (df['date'] - pd.Timestamp("1970-01-01", tz = "UTC")) // pd.Timedelta('1ms')

In [10]:
df["duration"] = df.groupby("case:concept:name")["date_milliseconds"].shift(-1) - df["date_milliseconds"]

In [11]:
df.head(10)

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name,concept:name_eng,position,next_activity,date,date_milliseconds,duration
0,112.0,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,173688,A_SUBMITTED,1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.546000+00:00,1317429524546,334.0
1,112.0,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,173688,A_PARTLYSUBMITTED,2,A_PREACCEPTED,2011-10-01 00:38:44.880000+00:00,1317429524880,53026.0
2,112.0,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,173688,A_PREACCEPTED,3,W_Completeren aanvraag,2011-10-01 00:39:37.906000+00:00,1317429577906,969.0
3,112.0,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,173688,W_Complete request,4,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,1317429578875,39427562.0
4,,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00,173688,W_Complete request,5,A_ACCEPTED,2011-10-01 11:36:46.437000+00:00,1317469006437,356871.0
5,10862.0,A_ACCEPTED,2011-10-01 11:42:43.308000+00:00,173688,A_ACCEPTED,6,O_SELECTED,2011-10-01 11:42:43.308000+00:00,1317469363308,145935.0
6,10862.0,O_SELECTED,2011-10-01 11:45:09.243000+00:00,173688,O_SELECTED,7,A_FINALIZED,2011-10-01 11:45:09.243000+00:00,1317469509243,0.0
7,10862.0,A_FINALIZED,2011-10-01 11:45:09.243000+00:00,173688,A_FINALIZED,8,O_CREATED,2011-10-01 11:45:09.243000+00:00,1317469509243,1954.0
8,10862.0,O_CREATED,2011-10-01 11:45:11.197000+00:00,173688,O_CREATED,9,O_SENT,2011-10-01 11:45:11.197000+00:00,1317469511197,183.0
9,10862.0,O_SENT,2011-10-01 11:45:11.380000+00:00,173688,O_SENT,10,W_Nabellen offertes,2011-10-01 11:45:11.380000+00:00,1317469511380,174.0


# Assigning the most frequent next event in each position

In [12]:
# Finidng the most frequent next activity for each position
position_dict = {}
for j in range(1, max(df["position"]+1)):
    dic = {}
    for i in (df[df['position'] == j]).index:
        if df['next_activity'][i] in dic:
            dic[df['next_activity'][i]] += 1
        else:
            dic[df['next_activity'][i]] = 1
    
    position_dict[j] = max(dic, key=dic.get)

In [13]:
# Iterating through the dataframe and map the values of position_dict to the column predicted_next_activity with lambda
df['predicted_next_activity'] = df['position'].map(lambda x: position_dict[x])

In [14]:
# printing the accuracy, precision, recall and f1-score of the model separately without report
accuracy = accuracy_score(df["next_activity"], df["predicted_next_activity"])
precision = precision_score(df["next_activity"], df["predicted_next_activity"], average='micro')
recall = recall_score(df["next_activity"], df["predicted_next_activity"], average='micro')
f1 = f1_score(df["next_activity"], df["predicted_next_activity"], average='micro')

f'Accuracy: {accuracy} Precision: {precision} Recall: {recall} F1: {f1}'

'Accuracy: 0.4316933638443936 Precision: 0.4316933638443936 Recall: 0.4316933638443936 F1: 0.4316933638443936'

# Assigning the timestamp for each position

In [15]:
# finding the average duration for each position
position_dict_time = {}
for j in range(1, 176):
    time = []
    for i in (df[df['position'] == j]).index:
        time.append(df["duration"].iloc[i])
    if len(time) > 0:
        position_dict_time[j] = sum(time)/len(time)

In [16]:
# adding the average duration to each position
df['predicted_next_timestamp_milliseconds'] = df['date_milliseconds'] + df['position'].map(lambda x: position_dict_time[x]).fillna(0)

In [17]:
#df

In [18]:
# convert the predicted_timestamp to miliseconds without e
df['predicted_next_timestamp'] = df['predicted_next_timestamp_milliseconds'].apply(lambda x: '%d' % x)

In [19]:
#df

In [20]:
# convert the predicted_timestamp to datetime
df['predicted_next_timestamp'] = pd.to_datetime(df['predicted_next_timestamp'], unit='ms')

  df['predicted_next_timestamp'] = pd.to_datetime(df['predicted_next_timestamp'], unit='ms')


In [21]:
df

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name,concept:name_eng,position,next_activity,date,date_milliseconds,duration,predicted_next_activity,predicted_next_timestamp_milliseconds,predicted_next_timestamp
0,112.0,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,173688,A_SUBMITTED,1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.546000+00:00,1317429524546,334.0,A_PARTLYSUBMITTED,1.317430e+12,2011-10-01 00:39:35.680
1,112.0,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,173688,A_PARTLYSUBMITTED,2,A_PREACCEPTED,2011-10-01 00:38:44.880000+00:00,1317429524880,53026.0,A_PREACCEPTED,1.317430e+12,2011-10-01 00:39:35.680
2,112.0,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,173688,A_PREACCEPTED,3,W_Completeren aanvraag,2011-10-01 00:39:37.906000+00:00,1317429577906,969.0,W_Completeren aanvraag,1.317430e+12,2011-10-01 00:39:35.680
3,112.0,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,173688,W_Complete request,4,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,1317429578875,39427562.0,W_Completeren aanvraag,1.317443e+12,2011-10-01 04:15:51.808
4,,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00,173688,W_Complete request,5,A_ACCEPTED,2011-10-01 11:36:46.437000+00:00,1317469006437,356871.0,W_Completeren aanvraag,1.317470e+12,2011-10-01 11:54:36.928
...,...,...,...,...,...,...,...,...,...,...,...,...,...
262195,112.0,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+00:00,214376,A_PARTLYSUBMITTED,2,W_Afhandelen leads,2012-02-29 23:51:17.423000+00:00,1330559477423,43864.0,A_PREACCEPTED,1.330560e+12,2012-02-29 23:50:51.136
262196,112.0,W_Afhandelen leads,2012-02-29 23:52:01.287000+00:00,214376,W_Handle leads,3,W_Afhandelen leads,2012-02-29 23:52:01.287000+00:00,1330559521287,34485449.0,W_Completeren aanvraag,1.330560e+12,2012-02-29 23:53:02.208
262197,11169.0,W_Afhandelen leads,2012-03-01 09:26:46.736000+00:00,214376,W_Handle leads,4,A_DECLINED,2012-03-01 09:26:46.736000+00:00,1330594006736,50382.0,W_Completeren aanvraag,1.330607e+12,2012-03-01 13:03:50.272
262198,11169.0,A_DECLINED,2012-03-01 09:27:37.118000+00:00,214376,A_DECLINED,5,W_Afhandelen leads,2012-03-01 09:27:37.118000+00:00,1330594057118,4207.0,W_Completeren aanvraag,1.330595e+12,2012-03-01 09:45:02.720


In [22]:
df['next_timestamp'] = df.groupby('case:concept:name')['time:timestamp'].shift(-1).fillna(0)
df['next_timestamp_milliseconds'] = df.groupby('case:concept:name')['date_milliseconds'].shift(-1).fillna(0)

In [23]:
df['next_timestamp_milliseconds'] = df['next_timestamp_milliseconds'].apply(lambda x: '%d' % x)
df['predicted_next_timestamp_milliseconds'] = df['predicted_next_timestamp_milliseconds'].apply(lambda x: '%d' % x)

In [24]:
df

Unnamed: 0,org:resource,concept:name,time:timestamp,case:concept:name,concept:name_eng,position,next_activity,date,date_milliseconds,duration,predicted_next_activity,predicted_next_timestamp_milliseconds,predicted_next_timestamp,next_timestamp,next_timestamp_milliseconds
0,112.0,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,173688,A_SUBMITTED,1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.546000+00:00,1317429524546,334.0,A_PARTLYSUBMITTED,1317429525127,2011-10-01 00:39:35.680,2011-10-01 00:38:44.880000+00:00,1317429524880
1,112.0,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,173688,A_PARTLYSUBMITTED,2,A_PREACCEPTED,2011-10-01 00:38:44.880000+00:00,1317429524880,53026.0,A_PREACCEPTED,1317429559891,2011-10-01 00:39:35.680,2011-10-01 00:39:37.906000+00:00,1317429577906
2,112.0,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,173688,A_PREACCEPTED,3,W_Completeren aanvraag,2011-10-01 00:39:37.906000+00:00,1317429577906,969.0,W_Completeren aanvraag,1317429577906,2011-10-01 00:39:35.680,2011-10-01 00:39:38.875000+00:00,1317429578875
3,112.0,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,173688,W_Complete request,4,W_Completeren aanvraag,2011-10-01 00:39:38.875000+00:00,1317429578875,39427562.0,W_Completeren aanvraag,1317442585858,2011-10-01 04:15:51.808,2011-10-01 11:36:46.437000+00:00,1317469006437
4,,W_Completeren aanvraag,2011-10-01 11:36:46.437000+00:00,173688,W_Complete request,5,A_ACCEPTED,2011-10-01 11:36:46.437000+00:00,1317469006437,356871.0,W_Completeren aanvraag,1317470103140,2011-10-01 11:54:36.928,2011-10-01 11:42:43.308000+00:00,1317469363308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262195,112.0,A_PARTLYSUBMITTED,2012-02-29 23:51:17.423000+00:00,214376,A_PARTLYSUBMITTED,2,W_Afhandelen leads,2012-02-29 23:51:17.423000+00:00,1330559477423,43864.0,A_PREACCEPTED,1330559512434,2012-02-29 23:50:51.136,2012-02-29 23:52:01.287000+00:00,1330559521287
262196,112.0,W_Afhandelen leads,2012-02-29 23:52:01.287000+00:00,214376,W_Handle leads,3,W_Afhandelen leads,2012-02-29 23:52:01.287000+00:00,1330559521287,34485449.0,W_Completeren aanvraag,1330559521287,2012-02-29 23:53:02.208,2012-03-01 09:26:46.736000+00:00,1330594006736
262197,11169.0,W_Afhandelen leads,2012-03-01 09:26:46.736000+00:00,214376,W_Handle leads,4,A_DECLINED,2012-03-01 09:26:46.736000+00:00,1330594006736,50382.0,W_Completeren aanvraag,1330607013719,2012-03-01 13:03:50.272,2012-03-01 09:27:37.118000+00:00,1330594057118
262198,11169.0,A_DECLINED,2012-03-01 09:27:37.118000+00:00,214376,A_DECLINED,5,W_Afhandelen leads,2012-03-01 09:27:37.118000+00:00,1330594057118,4207.0,W_Completeren aanvraag,1330595153821,2012-03-01 09:45:02.720,2012-03-01 09:27:41.325000+00:00,1330594061325


In [25]:
mse = mean_squared_error(df["next_timestamp_milliseconds"], df["predicted_next_timestamp_milliseconds"])

print(mse)

8.762091732717506e+22
