## ***Duration Adjustment and Evaluation***
#### *This code uses the synthetic logs generated in Step_2 of the experiment. As an example, we use the synthetic 'Purchasing' event log with Multitasking Ratio (MR) = 0.9.*
#### *It computes overlap intervals and distributes them using two approaches: Our proposed method and the method by Estrada-Torres et al.*
#### *The results are compared against the ground-truth durations, and evaluation metrics are then calculated.*
#### *In the code, estimates from our method are labeled `ADJ` and stored in the dataframe column `final_adjusted_duration`, while estimates from equal splitting are labeled `EQU` and stored in the dataframe column `final_adjusted_duration_equal`.*

In [None]:
# Import the libraries:
import numpy as np
import matplotlib.pyplot as plt
import time
import pm4py
from pm4py.objects.log.importer.xes import importer as xes_importer
import pytz
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
from pm4py.objects.log.util import sorting
from pm4py.objects.log.exporter.xes import exporter as xes_exporter

In [None]:
# Load the synthetic event log from the XES file 'MR_0.9.xes' (the path is
# relative to the current working directory). The bare `log` expression on the
# last line makes the notebook display the imported log.
log = xes_importer.apply('MR_0.9.xes')
log

parsing log, completed traces ::   0%|          | 0/608 [00:00<?, ?it/s]

[{'attributes': {'concept:name': '2'}, 'events': [{'concept:name': 'Create Request for Quotation Requester', 'org:resource': 'Alberto Duport', 'start:timestamp': datetime.datetime(2011, 1, 1, 8, 16, tzinfo=datetime.timezone.utc), 'time:timestamp': datetime.datetime(2011, 1, 1, 8, 31, 15, tzinfo=datetime.timezone.utc), 'duration_minutes': 15.25, 'old_start:timestamp': datetime.datetime(2011, 1, 1, 8, 16, tzinfo=datetime.timezone.utc), 'old_end:timestamp': datetime.datetime(2011, 1, 1, 8, 26, tzinfo=datetime.timezone.utc), 'old_duration_minutes': 10.0, 'overlap_flag': True}, '..', {'concept:name': "Authorize Supplier's Invoice payment", 'org:resource': 'Pedro Alvares', 'start:timestamp': datetime.datetime(2011, 1, 6, 5, 58, tzinfo=datetime.timezone.utc), 'time:timestamp': datetime.datetime(2011, 1, 6, 5, 58, tzinfo=datetime.timezone.utc), 'duration_minutes': 0.0, 'old_start:timestamp': datetime.datetime(2011, 1, 6, 5, 58, tzinfo=datetime.timezone.utc), 'old_end:timestamp': datetime.datet

In [None]:
# Convert the imported event log into a pandas DataFrame via pm4py and preview
# its first 60 rows.
log_df = pm4py.convert_to_dataframe(log)
log_df.head(60)

Unnamed: 0,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap_flag,case:concept:name
0,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:16:00+00:00,2011-01-01 08:31:15+00:00,15.25,2011-01-01 08:16:00+00:00,2011-01-01 08:26:00+00:00,10.0,True,2
1,Amend Request for Quotation Requester,Christian Francois,2011-01-01 10:16:00+00:00,2011-01-01 10:27:36+00:00,11.6,2011-01-01 10:16:00+00:00,2011-01-01 10:21:00+00:00,5.0,True,2
2,Release Purchase Order,Elvira Lores,2011-01-04 23:01:00+00:00,2011-01-04 23:11:50+00:00,10.833333,2011-01-04 23:01:00+00:00,2011-01-04 23:02:00+00:00,1.0,True,2
3,Amend Request for Quotation Requester,Esmana Liubiata,2011-01-01 12:33:00+00:00,2011-01-01 13:00:39+00:00,27.65,2011-01-01 12:33:00+00:00,2011-01-01 12:39:00+00:00,6.0,True,2
4,Analyze Quotation comparison Map,Esmana Liubiata,2011-01-01 23:33:00+00:00,2011-01-02 00:06:04+00:00,33.066667,2011-01-01 23:33:00+00:00,2011-01-01 23:44:00+00:00,11.0,True,2
5,Confirm Purchase Order,Esmeralda Clay,2011-01-02 20:23:00+00:00,2011-01-03 13:53:27+00:00,1050.45,2011-01-02 20:23:00+00:00,2011-01-02 20:29:00+00:00,6.0,True,2
6,Deliver Goods Services,Esmeralda Clay,2011-01-02 20:25:55+00:00,2011-01-04 08:07:00+00:00,2141.083333,2011-01-03 03:15:00+00:00,2011-01-04 14:53:00+00:00,2138.0,True,2
7,Send invoice,Esmeralda Clay,2011-01-05 14:50:00+00:00,2011-01-05 14:50:00+00:00,0.0,2011-01-05 14:50:00+00:00,2011-01-05 14:50:00+00:00,0.0,False,2
8,Choose best option,Fjodor Kowalski,2011-01-01 23:44:00+00:00,2011-01-01 23:44:00+00:00,0.0,2011-01-01 23:44:00+00:00,2011-01-01 23:44:00+00:00,0.0,False,2
9,Send Request for Quotation to Supplier,Francois de Perrier,2011-01-01 15:18:00+00:00,2011-01-01 15:40:00+00:00,22.0,2011-01-01 15:18:00+00:00,2011-01-01 15:40:00+00:00,22.0,False,2


##### ***Marking the workitems for overlapping execution and assigning an overlapping group identifier:***
*In this step, we identify whether each workitem overlaps with another workitem of the same resource or executes individually. This is recorded in the column `overlap` as a binary indicator (True or False). Furthermore, each group of overlapping workitems (those sharing a common overlapping interval) is assigned a unique identifier, where '0' denotes workitems that do not overlap. We call this column `overlap_section`.*

In [None]:
def mark_overlaps(
    log_df: pd.DataFrame,
    start_col: str = 'start:timestamp',
    end_col: str = 'time:timestamp',
    resource_col: str = 'org:resource',
    duration_sec_col: str = 'duration_seconds'
) -> pd.DataFrame:
    """Flag work items that overlap in time with another item of the same resource.

    For every resource the work items are sorted by start time and every pair
    whose intervals strictly intersect (``start_a < end_b`` and
    ``start_b < end_a``) is merged into the same group with a union-find
    structure. Groups with more than one member receive a section id that is
    unique across all resources.

    Args:
        log_df: Event log with one row per work item.
        start_col: Column holding the start timestamp.
        end_col: Column holding the end timestamp.
        resource_col: Column identifying the resource that executed the item.
        duration_sec_col: Not used by this function; kept only so existing
            calls that pass it keep working.

    Returns:
        A copy of ``log_df`` with both timestamp columns parsed to datetimes
        and two added columns: ``overlap`` (bool) and ``overlap_section``
        (int, ``0`` meaning "does not overlap").
    """
    df = log_df.copy()
    # Parse BOTH interval bounds; comparing parsed starts against unparsed
    # (e.g. string) ends would not be a meaningful time comparison.
    df[start_col] = pd.to_datetime(df[start_col])
    df[end_col] = pd.to_datetime(df[end_col])

    # Defaults: nothing overlaps until proven otherwise.
    df['overlap'] = False
    df['overlap_section'] = 0
    next_section_id = 1  # unique id for overlap sections across all resources

    for _, group in df.groupby(resource_col, sort=False):
        g = group.sort_values(start_col)
        indices = g.index.to_list()
        starts = g[start_col].values
        ends = g[end_col].values
        n = len(indices)
        if n <= 1:
            continue  # a single item cannot overlap with anything

        # Union-find (disjoint set) over positions in the sorted group.
        parent = list(range(n))

        def find(i):
            # Path halving keeps the trees shallow.
            while parent[i] != i:
                parent[i] = parent[parent[i]]
                i = parent[i]
            return i

        def union(i, j):
            ri, rj = find(i), find(j)
            if ri != rj:
                parent[rj] = ri

        # Starts are sorted, so once an item j starts at or after the end of
        # item i, neither j nor any later item can overlap i.
        for i in range(n):
            for j in range(i + 1, n):
                if starts[j] >= ends[i]:
                    break
                # Strict intersection: intervals that merely touch, and
                # zero-length items sitting on a boundary, do not overlap.
                if (starts[i] < ends[j]) and (starts[j] < ends[i]):
                    union(i, j)

        # Collect connected components keyed by their root.
        components = {}
        for k in range(n):
            components.setdefault(find(k), []).append(k)

        # Only components with at least two members are real overlap sections;
        # singletons keep overlap=False and overlap_section=0.
        for comp in components.values():
            if len(comp) > 1:
                sid = next_section_id
                next_section_id += 1
                for k in comp:
                    df.at[indices[k], 'overlap'] = True
                    df.at[indices[k], 'overlap_section'] = sid

    # All original columns plus 'overlap' and 'overlap_section'.
    marked_df = df
    return marked_df


marked_df = mark_overlaps(log_df)
marked_df.head(20)

Unnamed: 0,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap_flag,case:concept:name,overlap,overlap_section
0,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:16:00+00:00,2011-01-01 08:31:15+00:00,15.25,2011-01-01 08:16:00+00:00,2011-01-01 08:26:00+00:00,10.0,True,2,True,1
1,Amend Request for Quotation Requester,Christian Francois,2011-01-01 10:16:00+00:00,2011-01-01 10:27:36+00:00,11.6,2011-01-01 10:16:00+00:00,2011-01-01 10:21:00+00:00,5.0,True,2,True,35
2,Release Purchase Order,Elvira Lores,2011-01-04 23:01:00+00:00,2011-01-04 23:11:50+00:00,10.833333,2011-01-04 23:01:00+00:00,2011-01-04 23:02:00+00:00,1.0,True,2,True,74
3,Amend Request for Quotation Requester,Esmana Liubiata,2011-01-01 12:33:00+00:00,2011-01-01 13:00:39+00:00,27.65,2011-01-01 12:33:00+00:00,2011-01-01 12:39:00+00:00,6.0,True,2,True,96
4,Analyze Quotation comparison Map,Esmana Liubiata,2011-01-01 23:33:00+00:00,2011-01-02 00:06:04+00:00,33.066667,2011-01-01 23:33:00+00:00,2011-01-01 23:44:00+00:00,11.0,True,2,True,97
5,Confirm Purchase Order,Esmeralda Clay,2011-01-02 20:23:00+00:00,2011-01-03 13:53:27+00:00,1050.45,2011-01-02 20:23:00+00:00,2011-01-02 20:29:00+00:00,6.0,True,2,True,130
6,Deliver Goods Services,Esmeralda Clay,2011-01-02 20:25:55+00:00,2011-01-04 08:07:00+00:00,2141.083333,2011-01-03 03:15:00+00:00,2011-01-04 14:53:00+00:00,2138.0,True,2,True,130
7,Send invoice,Esmeralda Clay,2011-01-05 14:50:00+00:00,2011-01-05 14:50:00+00:00,0.0,2011-01-05 14:50:00+00:00,2011-01-05 14:50:00+00:00,0.0,False,2,False,0
8,Choose best option,Fjodor Kowalski,2011-01-01 23:44:00+00:00,2011-01-01 23:44:00+00:00,0.0,2011-01-01 23:44:00+00:00,2011-01-01 23:44:00+00:00,0.0,False,2,False,0
9,Send Request for Quotation to Supplier,Francois de Perrier,2011-01-01 15:18:00+00:00,2011-01-01 15:40:00+00:00,22.0,2011-01-01 15:18:00+00:00,2011-01-01 15:40:00+00:00,22.0,False,2,False,0


***In the experimental setup only: if a zero-duration workitem falls within the temporal boundary of an overlapping interval, we treat it as a non-overlapping workitem because it does not actively take part in multitasking. (This is also a precautionary measure so that such activity instances do not affect the `marking` column, which we add later.)***

In [None]:
# Zero-duration work items do not actively take part in multitasking, so they
# are demoted to "no overlap" (section id 0).
zero_duration = marked_df["duration_minutes"] == 0
marked_df.loc[zero_duration, ["overlap", "overlap_section"]] = [False, 0]

# Demoting them can leave an overlap section with a single remaining member
# (a work item whose only overlap partner had zero duration). Such an item no
# longer overlaps with anything, so it is demoted as well; otherwise it would
# still be counted and marked as a multitasked work item.
section_sizes = marked_df.groupby("overlap_section")["overlap_section"].transform("size")
orphaned = (marked_df["overlap_section"] != 0) & (section_sizes < 2)
marked_df.loc[orphaned, ["overlap", "overlap_section"]] = [False, 0]

In [None]:
# Drop the 'overlap_flag' column that came with the log and keep only the
# columns used from here on, with the case id first. The bare `marked_df`
# expression displays the result in the notebook.
marked_df = marked_df.drop('overlap_flag', axis=1)

marked_df = marked_df[['case:concept:name', 'concept:name', 'org:resource', 'start:timestamp', 'time:timestamp', 'duration_minutes', 
                       'old_start:timestamp', 'old_end:timestamp', 'old_duration_minutes', 'overlap', 'overlap_section']]
marked_df

Unnamed: 0,case:concept:name,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap,overlap_section
0,2,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:16:00+00:00,2011-01-01 08:31:15+00:00,15.250000,2011-01-01 08:16:00+00:00,2011-01-01 08:26:00+00:00,10.0,True,1
1,2,Amend Request for Quotation Requester,Christian Francois,2011-01-01 10:16:00+00:00,2011-01-01 10:27:36+00:00,11.600000,2011-01-01 10:16:00+00:00,2011-01-01 10:21:00+00:00,5.0,True,35
2,2,Release Purchase Order,Elvira Lores,2011-01-04 23:01:00+00:00,2011-01-04 23:11:50+00:00,10.833333,2011-01-04 23:01:00+00:00,2011-01-04 23:02:00+00:00,1.0,True,74
3,2,Amend Request for Quotation Requester,Esmana Liubiata,2011-01-01 12:33:00+00:00,2011-01-01 13:00:39+00:00,27.650000,2011-01-01 12:33:00+00:00,2011-01-01 12:39:00+00:00,6.0,True,96
4,2,Analyze Quotation comparison Map,Esmana Liubiata,2011-01-01 23:33:00+00:00,2011-01-02 00:06:04+00:00,33.066667,2011-01-01 23:33:00+00:00,2011-01-01 23:44:00+00:00,11.0,True,97
...,...,...,...,...,...,...,...,...,...,...,...
9114,1368,Create Purchase Requisition,Miu Hanwan,2011-07-16 05:22:00+00:00,2011-07-16 05:45:01+00:00,23.016667,2011-07-16 05:22:00+00:00,2011-07-16 05:37:00+00:00,15.0,True,2246
9115,1387,Analyze Purchase Requisition,Maris Freeman,2011-07-20 09:17:00+00:00,2011-07-20 09:23:00+00:00,6.000000,2011-07-20 09:17:00+00:00,2011-07-20 09:23:00+00:00,6.0,False,0
9116,1387,Create Purchase Requisition,Tesca Lobes,2011-07-19 11:08:00+00:00,2011-07-19 11:24:00+00:00,16.000000,2011-07-19 11:08:00+00:00,2011-07-19 11:24:00+00:00,16.0,False,0
9117,1917,Analyze Purchase Requisition,Maris Freeman,2011-10-09 15:11:00+00:00,2011-10-09 15:17:00+00:00,6.000000,2011-10-09 15:11:00+00:00,2011-10-09 15:17:00+00:00,6.0,False,0


In [None]:
marked_df_sorted = marked_df.sort_values(['org:resource', 'start:timestamp']).reset_index(drop=True)

# Summary of workitem executions (overlapping or not).
n_overlapping = int((marked_df_sorted['overlap'] == True).sum())
n_not_overlapping = int((marked_df_sorted['overlap'] == False).sum())
# Section id 0 is the "does not overlap" sentinel, not a real overlap group,
# so it must be excluded when counting the groups.
n_overlap_groups = marked_df_sorted.loc[
    marked_df_sorted['overlap_section'] != 0, 'overlap_section'
].nunique()

print("No. of workitems that are taking part in multitasking: ", n_overlapping)
print("No. of workitems that are NOT taking part in multitasking: ", n_not_overlapping)
print("No. of unique overlapping group of workitems belongs to same overlap interval: ", n_overlap_groups)

No. of workitems that are taking part in multitasking:  4608
No. of workitems that are NOT taking part in multitasking:  4511
No. of unique overlapping group of workitems belongs to same overlap interval:  2305


#### *We add a column named `marking`, which will classify the workitem into three categories:*
- **no_OL:**  *the workitem does not take part in multitasking execution.*
- **same_act_OL:** *the workitem takes part in multitasking execution, and all workitems in its overlap group execute the same activity.*
- **diff_act_OL:** *the workitem takes part in multitasking execution, and the workitems in its overlap group execute different activities.*

In [None]:
# Classify every work item as 'no_OL', 'same_act_OL' or 'diff_act_OL' in a new
# 'marking' column.

section_keys = ["org:resource", "overlap_section"]

# Number of distinct activities inside each overlap section (overlapping rows only).
overlapping_rows = marked_df.loc[marked_df["overlap"].eq(True)]
section_activity_counts = (
    overlapping_rows
    .groupby(section_keys)["concept:name"]
    .nunique()
    .reset_index(name="unique_activities")
)

# One distinct activity -> all members run the same activity; otherwise they differ.
single_activity = section_activity_counts["unique_activities"].eq(1)
section_activity_counts["marking"] = np.where(single_activity, "same_act_OL", "diff_act_OL")

# Attach the section label to every row of that section.
marked_df = marked_df.merge(
    section_activity_counts[section_keys + ["marking"]],
    on=section_keys,
    how="left",
)

# Rows that do not overlap get their own label.
not_overlapping = marked_df["overlap"].eq(False)
marked_df["marking"] = np.where(not_overlapping, "no_OL", marked_df["marking"])

marked_df.head(20)

Unnamed: 0,case:concept:name,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap,overlap_section,marking
0,2,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:16:00+00:00,2011-01-01 08:31:15+00:00,15.25,2011-01-01 08:16:00+00:00,2011-01-01 08:26:00+00:00,10.0,True,1,same_act_OL
1,2,Amend Request for Quotation Requester,Christian Francois,2011-01-01 10:16:00+00:00,2011-01-01 10:27:36+00:00,11.6,2011-01-01 10:16:00+00:00,2011-01-01 10:21:00+00:00,5.0,True,35,diff_act_OL
2,2,Release Purchase Order,Elvira Lores,2011-01-04 23:01:00+00:00,2011-01-04 23:11:50+00:00,10.833333,2011-01-04 23:01:00+00:00,2011-01-04 23:02:00+00:00,1.0,True,74,diff_act_OL
3,2,Amend Request for Quotation Requester,Esmana Liubiata,2011-01-01 12:33:00+00:00,2011-01-01 13:00:39+00:00,27.65,2011-01-01 12:33:00+00:00,2011-01-01 12:39:00+00:00,6.0,True,96,diff_act_OL
4,2,Analyze Quotation comparison Map,Esmana Liubiata,2011-01-01 23:33:00+00:00,2011-01-02 00:06:04+00:00,33.066667,2011-01-01 23:33:00+00:00,2011-01-01 23:44:00+00:00,11.0,True,97,diff_act_OL
5,2,Confirm Purchase Order,Esmeralda Clay,2011-01-02 20:23:00+00:00,2011-01-03 13:53:27+00:00,1050.45,2011-01-02 20:23:00+00:00,2011-01-02 20:29:00+00:00,6.0,True,130,diff_act_OL
6,2,Deliver Goods Services,Esmeralda Clay,2011-01-02 20:25:55+00:00,2011-01-04 08:07:00+00:00,2141.083333,2011-01-03 03:15:00+00:00,2011-01-04 14:53:00+00:00,2138.0,True,130,diff_act_OL
7,2,Send invoice,Esmeralda Clay,2011-01-05 14:50:00+00:00,2011-01-05 14:50:00+00:00,0.0,2011-01-05 14:50:00+00:00,2011-01-05 14:50:00+00:00,0.0,False,0,no_OL
8,2,Choose best option,Fjodor Kowalski,2011-01-01 23:44:00+00:00,2011-01-01 23:44:00+00:00,0.0,2011-01-01 23:44:00+00:00,2011-01-01 23:44:00+00:00,0.0,False,0,no_OL
9,2,Send Request for Quotation to Supplier,Francois de Perrier,2011-01-01 15:18:00+00:00,2011-01-01 15:40:00+00:00,22.0,2011-01-01 15:18:00+00:00,2011-01-01 15:40:00+00:00,22.0,False,0,no_OL


In [None]:
# Report how many work items fall into each 'marking' category.
marking_labels = (
    ("no_OL", "No. of workitems that are Not Overlapping: "),
    ("same_act_OL", "No. of overlapping workitems, whose pairs execute same activity: "),
    ("diff_act_OL", "No. of overlapping workitems, whose pairs execute different activity: "),
)
for category, caption in marking_labels:
    print(caption, (marked_df["marking"] == category).sum())

No. of workitems that are Not Overlapping:  4511
No. of overlapping workitems, whose pairs execute same activity:  1374
No. of overlapping workitems, whose pairs execute different activity:  3234


In [None]:
# Recompute the duration in minutes from the (parsed) start and end timestamps;
# this overwrites the 'duration_minutes' values that came with the log.
marked_df['time:timestamp'] = pd.to_datetime(marked_df['time:timestamp'])
marked_df['start:timestamp'] = pd.to_datetime(marked_df['start:timestamp'])
marked_df['duration_minutes'] = (marked_df['time:timestamp'] - marked_df['start:timestamp']).dt.total_seconds() / 60

# Per-activity mean durations, shown as a table with the columns:
# Activity | Ground-truth duration (no multitasking) | Duration in the multitasking log

average_duration = (marked_df.groupby('concept:name')[['old_duration_minutes', 'duration_minutes']]
    .mean()
    .reset_index()
    .rename(columns={'old_duration_minutes': 'Ground-Truth log Duration (No-Multitasking)',
           'duration_minutes': 'Multitasking log Duration'}))
average_duration

Unnamed: 0,concept:name,Ground-Truth log Duration (No-Multitasking),Multitasking log Duration
0,Amend Purchase Requisition,27.363636,30.313636
1,Amend Request for Quotation Requester,9.850195,12.617834
2,Amend Request for Quotation Requester Manager,19.122449,20.744898
3,Analyze Purchase Requisition,6.581152,7.844066
4,Analyze Quotation comparison Map,20.154964,22.887127
5,Analyze Request for Quotation,23.033424,54.609455
6,Approve Purchase Order for payment,1.0,38.395803
7,Authorize Supplier's Invoice payment,0.0,0.0
8,Choose best option,0.0,0.0
9,Confirm Purchase Order,19.808717,140.755892


##### ***Computing the non-overlapping interval duration of multitasked workitems:***
*In this step, for each multitasked workitem, we identify the intervals during which it executes individually. The total duration of these intervals is stored in the column `non_overlap_part`.*

In [None]:
# Computing Non-Overlap part with the overlapping work item:

def compute_non_overlap(df, start_col='start:timestamp', end_col='time:timestamp'):
    """Compute, per work item, the minutes during which it runs on its own.

    The result is stored in a new column ``non_overlap_part``.

    * Rows with ``overlap_section == 0`` are not part of any overlap group, so
      their whole duration (end - start) counts as non-overlapping. They are
      handled directly instead of being swept as one big group: section 0 mixes
      unrelated items of all resources, so sweeping it would treat those items
      as overlapping each other and under-report their individual time.
    * Every other section is cut into elementary time slices at all start/end
      points of its members; a slice is credited to a work item only when that
      item is the single one active in the slice.

    Args:
        df: Work items with ``overlap_section`` plus the two timestamp columns.
        start_col: Column holding the start timestamp.
        end_col: Column holding the end timestamp.

    Returns:
        A copy of ``df`` with both timestamp columns parsed as UTC datetimes
        and the added float column ``non_overlap_part`` (minutes).
    """
    df = df.copy()

    df[start_col] = pd.to_datetime(df[start_col], utc=True)
    df[end_col] = pd.to_datetime(df[end_col], utc=True)

    df['non_overlap_part'] = 0.0

    # Non-overlapping items: the full duration is executed individually.
    solo = df['overlap_section'] == 0
    solo_minutes = (df.loc[solo, end_col] - df.loc[solo, start_col]).dt.total_seconds() / 60.0
    # Negative or undefined durations contribute nothing.
    df.loc[solo, 'non_overlap_part'] = solo_minutes.clip(lower=0).fillna(0.0)

    for _, group in df[~solo].groupby('overlap_section'):
        group = group.sort_values(start_col)

        # All distinct interval boundaries of the section, in time order.
        times = (
            pd.concat([group[start_col], group[end_col]])
            .drop_duplicates()
            .sort_values()
            .tolist()
        )

        for t0, t1 in zip(times, times[1:]):
            slice_len = (t1 - t0).total_seconds() / 60.0  # in minutes
            if slice_len <= 0:
                continue

            # Work items that are running during this slice.
            active = group[(group[start_col] < t1) & (group[end_col] > t0)]

            if len(active) == 1:
                # The slice belongs entirely to this single work item.
                df.at[active.index[0], 'non_overlap_part'] += slice_len

    return df
marked_df = compute_non_overlap(marked_df)
marked_df[marked_df['overlap_section']==6].head()

Unnamed: 0,case:concept:name,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap,overlap_section,marking,non_overlap_part
384,81,Create Purchase Requisition,Alberto Duport,2011-01-10 17:53:00+00:00,2011-01-10 18:22:50+00:00,29.833333,2011-01-10 17:53:00+00:00,2011-01-10 18:12:00+00:00,19.0,True,6,same_act_OL,6.05
410,82,Create Purchase Requisition,Alberto Duport,2011-01-10 17:59:03+00:00,2011-01-10 18:46:00+00:00,46.95,2011-01-10 20:09:00+00:00,2011-01-10 20:43:00+00:00,34.0,True,6,same_act_OL,23.166667


##### ***Computing the activity average duration for use in the weighting factor:*** 
*We compute the average activity durations (weights) using KDE over the non-overlapped workitems. If no workitem of an activity is executed individually, we instead use the **non_overlap_part** durations of its multitasked workitems to compute the KDE; if that is not possible either (worst case), we fall back to the overall average duration of the activity.*

In [None]:
# Computing Weights by KDE (Average duration of activities when occur independently 'with No-overlapping') :

from scipy.stats import gaussian_kde

def kde_mean(samples, n_samples=5000, seed=None):
    """Estimate the mean of ``samples`` from a Gaussian kernel density estimate.

    A ``gaussian_kde`` is fitted to the non-NaN samples, ``n_samples`` values
    are drawn from it, and their mean is returned. Because the values are
    drawn at random, repeated calls give slightly different results unless a
    ``seed`` is supplied.

    Args:
        samples: Iterable of numbers; NaN entries are ignored.
        n_samples: Number of values drawn from the fitted KDE.
        seed: Optional seed passed to ``gaussian_kde.resample`` to make the
            result reproducible. ``None`` (default) keeps the unseeded
            behaviour.

    Returns:
        ``nan`` if there is no usable sample; the plain mean if there is a
        single sample or all samples are (numerically) equal, since a KDE
        cannot be fitted to such data; otherwise the mean of the resampled
        values.
    """
    samples = np.asarray(samples, dtype=float)
    samples = samples[~np.isnan(samples)]  # drop NaN if any

    # No data at all.
    if samples.size == 0:
        return np.nan

    # A single value, or no spread: skip the KDE and return the value itself.
    if samples.size < 2 or np.allclose(samples, samples[0]):
        return float(np.mean(samples))

    kde = gaussian_kde(samples)
    sampled = kde.resample(n_samples, seed=seed).flatten()
    return float(np.mean(sampled))

# 1. Keep only the rows that are not flagged as overlapping; their durations
#    are the samples from which the activity weights are estimated.
no_overlap_df = marked_df[marked_df['overlap'] == False].copy()

# 2. Per activity, feed the durations to kde_mean; the result has one row per
#    activity with its weight in 'kde_avg_duration'.
result = (no_overlap_df
    .groupby('concept:name')['duration_minutes']
    .apply(lambda x: kde_mean(x.values))
    .reset_index(name='kde_avg_duration')
)

print(result)

                                      concept:name  kde_avg_duration
0                       Amend Purchase Requisition         26.172190
1            Amend Request for Quotation Requester          9.763730
2    Amend Request for Quotation Requester Manager         18.588653
3                     Analyze Purchase Requisition          6.272740
4                 Analyze Quotation comparison Map         20.515584
5                    Analyze Request for Quotation         22.070975
6               Approve Purchase Order for payment          1.000000
7             Authorize Supplier's Invoice payment          0.000000
8                               Choose best option          0.000000
9                           Confirm Purchase Order         20.476703
10                           Create Purchase Order          9.355184
11                     Create Purchase Requisition         30.156403
12                 Create Quotation comparison Map        203.808370
13          Create Request for Quo

In [None]:
# Activities that occur in the log but have no row in `result`, i.e. no weight
# could be derived for them from non-overlapping work items. An empty set means
# every activity has a weight.
all_activities = marked_df['concept:name'].unique()
missing_acts = set(all_activities) - set(result['concept:name'])
missing_acts

set()

In [None]:
# Fallback weights for activities without any non-overlapping work item.
# The body only runs when `missing_acts` is non-empty.
if missing_acts:
    # Use the individually-executed fragments ('non_overlap_part' > 0) of the
    # work items as samples instead.

    fragments_df = (marked_df.groupby('concept:name')['non_overlap_part']
        .apply(lambda x: [v for v in x if v > 0])  # collect all fragments > 0
        .reset_index(name='fragments')
    )

    fragments_df['kde_avg_duration'] = fragments_df['fragments'].apply(kde_mean)
    fragments_df = fragments_df[['concept:name', 'kde_avg_duration']]

    # Keep only the activities that are actually missing from `result`.
    fragments_missing = fragments_df[fragments_df['concept:name'].isin(missing_acts)]

    # Append the fallback weights to the existing ones.
    result = pd.concat([result, fragments_missing], ignore_index=True)

    # Last resort: a weight that is still NaN (kde_mean returns NaN when it is
    # given no samples) is replaced by the activity's mean 'duration_minutes'
    # over all of its work items.
    for idx, row in result.iterrows():
        if pd.isna(row['kde_avg_duration']):
            act = row['concept:name']
            mean_val = marked_df.loc[marked_df['concept:name'] == act, 'duration_minutes'].mean()
            result.at[idx, 'kde_avg_duration'] = mean_val
result

#### ***Applying our technique (distributing overlap_duration proportionally with weighting factor):***
*After computing the non-overlapped average activity durations by KDE, we identify the overlapping intervals within the multitasked workitems and distribute them proportionally according to the weighting factor. The resulting portion is recorded in the column `allocated_overlap_duration`.*

In [None]:
## Computing Overlap Part and allocate it to multitasked work items as per the computed weights:

def allocate_overlap_durations(marked_df, kde_result,
                               start_col='start:timestamp',
                               end_col='time:timestamp'):
    """Split every overlapping time slice among its work items by activity weight.

    Each overlap section (``overlap_section != 0``) is cut into elementary
    slices at all start/end points of its members. A slice in which at least
    two work items are active is divided among them proportionally to their
    ``kde_avg_duration`` weight.

    Args:
        marked_df: Work items with ``concept:name``, ``overlap_section``,
            ``non_overlap_part`` and the two timestamp columns.
        kde_result: Table with ``concept:name`` and ``kde_avg_duration``; it is
            left-merged onto the work items.
        start_col: Column holding the start timestamp.
        end_col: Column holding the end timestamp.

    Returns:
        A copy of ``marked_df`` with the merged weight column and three added
        columns (all in minutes): ``total_overlap_part`` (time shared with
        other items), ``allocated_overlap_duration`` (the item's share of that
        time) and ``final_adjusted_duration`` (share + ``non_overlap_part``).

    Raises:
        KeyError: If ``marked_df`` has no ``non_overlap_part`` column.
    """
    # Fail before the expensive sweep rather than on the very last line.
    if 'non_overlap_part' not in marked_df.columns:
        raise KeyError('non_overlap_part')

    df = marked_df.copy()

    df[start_col] = pd.to_datetime(df[start_col], utc=True)
    df[end_col] = pd.to_datetime(df[end_col], utc=True)

    # When the function is re-run on its own output, the weight column already
    # exists; merging again would produce 'kde_avg_duration_x'/'_y' and break
    # the lookup below, so the stale column is replaced.
    if 'kde_avg_duration' in kde_result.columns:
        df = df.drop(columns=['kde_avg_duration'], errors='ignore')
    df = df.merge(kde_result, on='concept:name', how='left')

    # Start from zero so a re-run does not accumulate on old values.
    df['allocated_overlap_duration'] = 0.0
    df['total_overlap_part'] = 0.0

    for section_id, group in df.groupby('overlap_section'):
        if section_id == 0:
            continue  # 0 marks work items that do not overlap

        group = group.sort_values(start_col)

        # All distinct interval boundaries of the section, in time order.
        times = (
            pd.concat([group[start_col], group[end_col]])
            .drop_duplicates()
            .sort_values()
            .tolist()
        )

        for t0, t1 in zip(times, times[1:]):
            slice_len = (t1 - t0).total_seconds() / 60.0  # in minutes
            if slice_len <= 0:
                continue

            active = group[(group[start_col] < t1) & (group[end_col] > t0)]
            if len(active) < 2:
                continue  # not an overlapping slice

            # The slice counts as shared time for every active work item.
            for idx in active.index:
                df.at[idx, 'total_overlap_part'] += slice_len

            weights = active['kde_avg_duration'].values.astype(float)
            total_w = weights.sum()
            if total_w <= 0:
                # No positive weight to divide by: the slice stays unallocated.
                continue

            # Proportional share of the slice for each active work item.
            for idx, w in zip(active.index, weights):
                df.at[idx, 'allocated_overlap_duration'] += slice_len * (w / total_w)

    # Estimated duration = share of the overlapping time + individually executed time.
    df['final_adjusted_duration'] = df['allocated_overlap_duration'] + df['non_overlap_part']
    return df

marked_df = allocate_overlap_durations(marked_df, result)

In [None]:
marked_df

Unnamed: 0,case:concept:name,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap,overlap_section,marking,non_overlap_part,kde_avg_duration,allocated_overlap_duration,total_overlap_part,final_adjusted_duration
0,2,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:16:00+00:00,2011-01-01 08:31:15+00:00,15.250000,2011-01-01 08:16:00+00:00,2011-01-01 08:26:00+00:00,10.0,True,1,same_act_OL,4.033333,9.913702,5.608333,11.216667,9.641667
1,2,Amend Request for Quotation Requester,Christian Francois,2011-01-01 10:16:00+00:00,2011-01-01 10:27:36+00:00,11.600000,2011-01-01 10:16:00+00:00,2011-01-01 10:21:00+00:00,5.0,True,35,diff_act_OL,1.933333,9.763730,2.364289,9.666667,4.297622
2,2,Release Purchase Order,Elvira Lores,2011-01-04 23:01:00+00:00,2011-01-04 23:11:50+00:00,10.833333,2011-01-04 23:01:00+00:00,2011-01-04 23:02:00+00:00,1.0,True,74,diff_act_OL,0.433333,1.000000,0.483371,10.400000,0.916704
3,2,Amend Request for Quotation Requester,Esmana Liubiata,2011-01-01 12:33:00+00:00,2011-01-01 13:00:39+00:00,27.650000,2011-01-01 12:33:00+00:00,2011-01-01 12:39:00+00:00,6.0,True,96,diff_act_OL,2.350000,9.763730,6.187915,25.300000,8.537915
4,2,Analyze Quotation comparison Map,Esmana Liubiata,2011-01-01 23:33:00+00:00,2011-01-02 00:06:04+00:00,33.066667,2011-01-01 23:33:00+00:00,2011-01-01 23:44:00+00:00,11.0,True,97,diff_act_OL,4.750000,20.515584,11.464578,28.316667,16.214578
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9114,1368,Create Purchase Requisition,Miu Hanwan,2011-07-16 05:22:00+00:00,2011-07-16 05:45:01+00:00,23.016667,2011-07-16 05:22:00+00:00,2011-07-16 05:37:00+00:00,15.0,True,2246,diff_act_OL,8.583333,30.156403,10.903205,14.433333,19.486539
9115,1387,Analyze Purchase Requisition,Maris Freeman,2011-07-20 09:17:00+00:00,2011-07-20 09:23:00+00:00,6.000000,2011-07-20 09:17:00+00:00,2011-07-20 09:23:00+00:00,6.0,False,0,no_OL,0.000000,6.272740,0.000000,0.000000,0.000000
9116,1387,Create Purchase Requisition,Tesca Lobes,2011-07-19 11:08:00+00:00,2011-07-19 11:24:00+00:00,16.000000,2011-07-19 11:08:00+00:00,2011-07-19 11:24:00+00:00,16.0,False,0,no_OL,16.000000,30.156403,0.000000,0.000000,16.000000
9117,1917,Analyze Purchase Requisition,Maris Freeman,2011-10-09 15:11:00+00:00,2011-10-09 15:17:00+00:00,6.000000,2011-10-09 15:11:00+00:00,2011-10-09 15:17:00+00:00,6.0,False,0,no_OL,0.000000,6.272740,0.000000,0.000000,0.000000


#### ***Applying Estrada-Torres et al. Approach:***
*Distributing the overlap interval duration **Equally** among the involved workitems. The resulting portion will be recorded in the column `allocated_overlap_duration_equal`*

In [None]:
## Equally distributing the overlap_duration

def allocate_equal_overlap(df, start_col='start:timestamp', end_col='time:timestamp'):
    """Split every overlapping time slice equally among its active work items.

    Each overlap section (``overlap_section != 0``) is cut into elementary
    slices at all start/end points of its members; a slice with at least two
    active work items is divided into equal parts, one per active item.

    Returns a copy of ``df`` with UTC-parsed timestamp columns and two added
    columns in minutes: ``allocated_overlap_duration_equal`` and
    ``final_adjusted_duration_equal`` (equal share + ``non_overlap_part``).
    """
    out = df.copy()

    # Accumulator for the equal shares.
    out['allocated_overlap_duration_equal'] = 0.0

    for col in (start_col, end_col):
        out[col] = pd.to_datetime(out[col], utc=True)

    for section_id, members in out.groupby('overlap_section'):
        # Section 0 holds the work items that do not overlap at all.
        if section_id == 0:
            continue

        members = members.sort_values(start_col)
        boundaries = (
            pd.concat([members[start_col], members[end_col]])
            .drop_duplicates()
            .sort_values()
            .tolist()
        )

        for left, right in zip(boundaries, boundaries[1:]):
            minutes = (right - left).total_seconds() / 60.0
            if minutes <= 0:
                continue

            is_running = (members[start_col] < right) & (members[end_col] > left)
            running = members.loc[is_running].index
            if len(running) < 2:
                continue

            # Every running work item receives the same part of the slice.
            per_item = minutes / len(running)
            for row_label in running:
                out.at[row_label, 'allocated_overlap_duration_equal'] += per_item

    # Estimated duration = equal share + individually executed time.
    out['final_adjusted_duration_equal'] = (
        out['allocated_overlap_duration_equal'] + out['non_overlap_part']
    )

    return out

marked_df = allocate_equal_overlap(marked_df)

In [None]:
# Work items that do not overlap keep their recorded duration under both
# approaches, so both estimate columns are set to 'duration_minutes' for them.
mask_no_overlap = marked_df['overlap'] == False
for estimate_col in ('final_adjusted_duration', 'final_adjusted_duration_equal'):
    marked_df.loc[mask_no_overlap, estimate_col] = marked_df.loc[mask_no_overlap, 'duration_minutes']

***The final dataframe contains the following columns:***
- `case:concept:name` : case_id.
- `concept:name` : activity.
- `org:resource` : resource.
- `start:timestamp` : start time of the workitem with overlapping execution.
- `time:timestamp` : end time of the workitem with overlapping execution.
- `duration_minutes` : duration of the workitem (time:timestamp - start:timestamp).
- `old_start:timestamp` : start time of the workitems when there were no multitasking executions. 
- `old_end:timestamp` : end time of the workitems when there were no multitasking executions.
- `old_duration_minutes` : duration of the workitems when there were no multitasking executions. We treat it as ***Ground-truth (GT)*** in our subsequent experiment steps.
- `overlap` : indicates whether workitem is overlapping or not *(True or False)*.
- `overlap_section` : identifier number for parallel executed workitems sharing their execution interval.
- `marking` : classify the workitem as no_OL, same_act_OL, diff_act_OL. 
- `non_overlap_part` : duration portion of workitem when it is executing individually. 
- `total_overlap_part` : duration portion of workitem when it is executing in parallel with other workitems.
- `kde_avg_duration` : non-overlapped average duration of activities, which is used in the weighting factor calculation.
- `allocated_overlap_duration` : indicates the portion that a workitem gets after distributing the overlap interval duration by 'OUR' approach (Proportional-split).
- **`final_adjusted_duration` : indicates the final adjusted estimated duration from *'OUR'* proposed technique.**
- `allocated_overlap_duration_equal` : indicates the portion that a workitem gets after distributing the overlap interval duration by Estrada-Torres et al. approach (Equal-split).
- **`final_adjusted_duration_equal` : indicates the final adjusted estimated duration from *'Equal-Split'*  technique.**

In [None]:
# Final table with the columns of both approaches, shown in full width and
# with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.2f}'.format)

# Column order of the final table.
final_column_order = [
    'case:concept:name', 'concept:name', 'org:resource',
    'start:timestamp', 'time:timestamp', 'duration_minutes',
    'old_start:timestamp', 'old_end:timestamp', 'old_duration_minutes',
    'overlap', 'overlap_section', 'marking',
    'non_overlap_part', 'total_overlap_part', 'kde_avg_duration',
    'allocated_overlap_duration', 'final_adjusted_duration',
    'allocated_overlap_duration_equal', 'final_adjusted_duration_equal',
]
marked_df = marked_df[final_column_order]

# Order by resource and start time, then inspect one overlap section as an example.
marked_df_final_sorted = (
    marked_df
    .sort_values(by=['org:resource', 'start:timestamp'])
    .reset_index(drop=True)
)
marked_df_final_sorted[marked_df_final_sorted['overlap_section'] == 56]

Unnamed: 0,case:concept:name,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap,overlap_section,marking,non_overlap_part,total_overlap_part,kde_avg_duration,allocated_overlap_duration,final_adjusted_duration,allocated_overlap_duration_equal,final_adjusted_duration_equal
877,263,Create Purchase Requisition,Christian Francois,2011-02-06 01:07:00+00:00,2011-02-06 01:43:32+00:00,36.53,2011-02-06 01:07:00+00:00,2011-02-06 01:43:00+00:00,36.0,True,56,diff_act_OL,19.68,16.85,30.16,16.31,35.99,8.43,28.11
878,182,Release Purchase Order,Christian Francois,2011-02-06 01:26:41+00:00,2011-02-06 01:44:00+00:00,17.32,2011-02-06 11:02:00+00:00,2011-02-06 11:03:00+00:00,1.0,True,56,diff_act_OL,0.47,16.85,1.0,0.54,1.01,8.43,8.89


### ***Metrics Calculation : MAE, MRE, MAPE***

We add columns to calculate **MAE**, **MRE**, and **MAPE** for *MT*, *EQU*, and *ADJ* with respect to the *GT (Ground-truth)* duration.

$$ MAE=|GT-x|     \qquad     MRE=\frac{|GT-x|}{GT}    \qquad    MAPE=\frac{|GT-x|}{GT}\cdot100 $$

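*Here,* `GT` is the ground-truth duration of a work item, i.e. its duration in the event log when no multitasking executions were present, and ***x*** is one of:
- `MT`: the original event log duration with multitasking executions (no algorithm is applied to adjust the execution time).
- `ADJ`: the adjusted estimated duration resulting from our proposed technique.
- `EQU`: the adjusted estimated duration resulting from the equal-splitting technique.

The formulas above give the error of a single work item; the reported MAE, MRE, and MAPE are the means of these values over the selected work items. Work items with `GT = 0` have no defined relative error and are therefore left out of MRE and MAPE.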



In [None]:
# Duration column behind each label that is compared with the ground truth (GT):
#   MT  -> observed duration with multitasking (no adjustment),
#   ADJ -> proportional split (our approach),
#   EQU -> equal split.
estimate_columns = {
    'MT': 'duration_minutes',
    'ADJ': 'final_adjusted_duration',
    'EQU': 'final_adjusted_duration_equal',
}
ground_truth = marked_df_final_sorted['old_duration_minutes']

# Absolute error |GT - x| per work item.
for label, column in estimate_columns.items():
    marked_df_final_sorted[f'GT-{label}'] = (ground_truth - marked_df_final_sorted[column]).abs()

# Relative error |GT - x| / GT per work item. A zero GT produces inf (or NaN
# for 0/0); inf is mapped to NaN so it will not affect further analysis.
for label in estimate_columns:
    relative_error = marked_df_final_sorted[f'GT-{label}'] / ground_truth
    marked_df_final_sorted[f'(GT-{label})/GT'] = relative_error.replace([np.inf, -np.inf], np.nan)

marked_df_final_sorted.head(60)

Unnamed: 0,case:concept:name,concept:name,org:resource,start:timestamp,time:timestamp,duration_minutes,old_start:timestamp,old_end:timestamp,old_duration_minutes,overlap,overlap_section,marking,non_overlap_part,total_overlap_part,kde_avg_duration,allocated_overlap_duration,final_adjusted_duration,allocated_overlap_duration_equal,final_adjusted_duration_equal,GT-MT,GT-ADJ,GT-EQU,(GT-MT)/GT,(GT-ADJ)/GT,(GT-EQU)/GT
0,2,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:16:00+00:00,2011-01-01 08:31:15+00:00,15.25,2011-01-01 08:16:00+00:00,2011-01-01 08:26:00+00:00,10.0,True,1,same_act_OL,4.03,11.22,9.91,5.61,9.64,5.61,9.64,5.25,0.36,0.36,0.53,0.04,0.04
1,6,Create Request for Quotation Requester,Alberto Duport,2011-01-01 08:20:02+00:00,2011-01-01 08:39:00+00:00,18.97,2011-01-01 17:32:00+00:00,2011-01-01 17:45:00+00:00,13.0,True,1,same_act_OL,7.75,11.22,9.91,5.61,13.36,5.61,13.36,5.97,0.36,0.36,0.46,0.03,0.03
2,12,Create Purchase Requisition,Alberto Duport,2011-01-02 05:31:00+00:00,2011-01-02 05:41:00+00:00,10.0,2011-01-02 05:31:00+00:00,2011-01-02 05:41:00+00:00,10.0,False,0,no_OL,0.0,0.0,30.16,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
3,17,Create Request for Quotation Requester,Alberto Duport,2011-01-03 04:02:00+00:00,2011-01-03 04:12:00+00:00,10.0,2011-01-03 04:02:00+00:00,2011-01-03 04:12:00+00:00,10.0,False,0,no_OL,0.0,0.0,9.91,0.0,10.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0
4,27,Create Purchase Requisition,Alberto Duport,2011-01-03 17:22:00+00:00,2011-01-03 18:16:00+00:00,54.0,2011-01-03 17:22:00+00:00,2011-01-03 18:16:00+00:00,54.0,False,0,no_OL,0.0,0.0,30.16,0.0,54.0,0.0,54.0,0.0,0.0,0.0,0.0,0.0,0.0
5,30,Create Purchase Requisition,Alberto Duport,2011-01-04 12:33:00+00:00,2011-01-04 13:17:00+00:00,44.0,2011-01-04 12:33:00+00:00,2011-01-04 13:17:00+00:00,44.0,False,0,no_OL,0.0,0.0,30.16,0.0,44.0,0.0,44.0,0.0,0.0,0.0,0.0,0.0,0.0
6,41,Create Purchase Requisition,Alberto Duport,2011-01-05 10:01:00+00:00,2011-01-05 10:44:52+00:00,43.87,2011-01-05 10:01:00+00:00,2011-01-05 10:33:00+00:00,32.0,True,2,diff_act_OL,22.35,21.52,30.16,12.81,35.16,10.76,33.11,11.87,3.16,1.11,0.37,0.1,0.03
7,27,Analyze Quotation comparison Map,Alberto Duport,2011-01-05 10:23:21+00:00,2011-01-05 10:50:00+00:00,26.65,2011-01-05 12:08:00+00:00,2011-01-05 12:25:00+00:00,17.0,True,2,diff_act_OL,5.13,21.52,20.52,8.71,13.84,10.76,15.89,9.65,3.16,1.11,0.57,0.19,0.07
8,27,Choose best option,Alberto Duport,2011-01-05 12:25:00+00:00,2011-01-05 12:25:00+00:00,0.0,2011-01-05 12:25:00+00:00,2011-01-05 12:25:00+00:00,0.0,False,0,no_OL,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,,
9,30,Amend Request for Quotation Requester,Alberto Duport,2011-01-05 19:14:00+00:00,2011-01-05 19:28:00+00:00,14.0,2011-01-05 19:14:00+00:00,2011-01-05 19:28:00+00:00,14.0,False,0,no_OL,0.0,0.0,9.76,0.0,14.0,0.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Report, for each column, whether it contains at least one missing value.
# NOTE(review): NaN is expected only in the relative-error columns, where
# GT = 0 makes the ratio undefined — confirm against the output below.
marked_df_final_sorted.isna().any()

case:concept:name                   False
concept:name                        False
org:resource                        False
start:timestamp                     False
time:timestamp                      False
duration_minutes                    False
old_start:timestamp                 False
old_end:timestamp                   False
old_duration_minutes                False
overlap                             False
overlap_section                     False
marking                             False
non_overlap_part                    False
total_overlap_part                  False
kde_avg_duration                    False
allocated_overlap_duration          False
final_adjusted_duration             False
allocated_overlap_duration_equal    False
final_adjusted_duration_equal       False
GT-MT                               False
GT-ADJ                              False
GT-EQU                              False
(GT-MT)/GT                           True
(GT-ADJ)/GT                       

#### ***Overall_log_stats: including all work items***

In [None]:
# Overall_log_stats: including all work items

all_items = marked_df_final_sorted
variants = ("MT", "ADJ", "EQU")  # suffixes used by the error columns

# Mean duration: ground truth, observed with multitasking, and both estimates.
avg_duration_GT, avg_duration_MT, avg_duration_ADJ, avg_duration_EQU = (
    all_items[column].mean()
    for column in (
        'old_duration_minutes',
        'duration_minutes',
        'final_adjusted_duration',
        'final_adjusted_duration_equal',
    )
)

# Mean absolute error per variant.
mean_MAE_MT, mean_MAE_ADJ, mean_MAE_EQU = (
    all_items[f'GT-{variant}'].mean() for variant in variants
)

# Mean relative error per variant; NaN rows (GT = 0) are skipped by mean().
mean_MRE_MT, mean_MRE_ADJ, mean_MRE_EQU = (
    all_items[f'(GT-{variant})/GT'].mean() for variant in variants
)

# MAPE is the mean relative error expressed in percent.
mean_MRE_MT_perc = mean_MRE_MT * 100
mean_MRE_ADJ_perc = mean_MRE_ADJ * 100
mean_MRE_EQU_perc = mean_MRE_EQU * 100

# Sample standard deviation (ddof=1) of the relative error, plain and in percent.
std_MRE_MT, std_MRE_ADJ, std_MRE_EQU = (
    all_items[f'(GT-{variant})/GT'].std(ddof=1) for variant in variants
)
std_MAPE_MT = std_MRE_MT * 100
std_MAPE_ADJ = std_MRE_ADJ * 100
std_MAPE_EQU = std_MRE_EQU * 100


# Summary: section title -> list of (label, value) rows, in print order.
sections = {
    "AVERAGE DURATIONS": list(zip(
        ("avg_duration_GT", "avg_duration_MT", "avg_duration_ADJ", "avg_duration_EQU"),
        (avg_duration_GT, avg_duration_MT, avg_duration_ADJ, avg_duration_EQU),
    )),
    "MAE (Mean Absolute Error)": list(zip(
        ("MAE_MT", "MAE_ADJ", "MAE_EQU"),
        (mean_MAE_MT, mean_MAE_ADJ, mean_MAE_EQU),
    )),
    "MRE (Mean Relative Error)": list(zip(
        ("MRE_MT", "MRE_ADJ", "MRE_EQU"),
        (mean_MRE_MT, mean_MRE_ADJ, mean_MRE_EQU),
    )),
    "MAPE (%)": list(zip(
        ("MAPE_MT(%)", "MAPE_ADJ(%)", "MAPE_EQU(%)"),
        (mean_MRE_MT_perc, mean_MRE_ADJ_perc, mean_MRE_EQU_perc),
    )),
    "STD (MAPE)": list(zip(
        ("STD_ADJ", "STD_EQU"),
        (std_MAPE_ADJ, std_MAPE_EQU),
    )),
}

print("Overall_log_stats: including all work items")

for section, items in sections.items():
    print(f"\n===== {section} =====")
    print("\n".join(f"{name}: {value:.3f}" for name, value in items))

Overall_log_stats: including all work items

===== AVERAGE DURATIONS =====
avg_duration_GT: 114.315
avg_duration_MT: 143.307
avg_duration_ADJ: 114.315
avg_duration_EQU: 114.315

===== MAE (Mean Absolute Error) =====
MAE_MT: 28.992
MAE_ADJ: 4.303
MAE_EQU: 16.636

===== MRE (Mean Relative Error) =====
MRE_MT: 3.303
MRE_ADJ: 0.081
MRE_EQU: 1.575

===== MAPE (%) =====
MAPE_MT(%): 330.324
MAPE_ADJ(%): 8.098
MAPE_EQU(%): 157.548

===== STD (MAPE) =====
STD_ADJ: 24.834
STD_EQU: 987.273


#### ***Log_stats: considering only multitasking work items where the pair executes different activities***

In [None]:
# Considering only multitasking work items where the pair executes different activities

is_diff_activity = marked_df_final_sorted["marking"].isin(["diff_act_OL"])
df_use1 = marked_df_final_sorted[is_diff_activity].copy()

variants = ("MT", "ADJ", "EQU")  # suffixes used by the error columns

# Mean duration: ground truth, observed with multitasking, and both estimates.
avg_duration_GT, avg_duration_MT, avg_duration_ADJ, avg_duration_EQU = (
    df_use1[column].mean()
    for column in (
        'old_duration_minutes',
        'duration_minutes',
        'final_adjusted_duration',
        'final_adjusted_duration_equal',
    )
)

# Mean absolute error per variant.
mean_MAE_MT, mean_MAE_ADJ, mean_MAE_EQU = (
    df_use1[f'GT-{variant}'].mean() for variant in variants
)

# Mean relative error per variant; NaN rows (GT = 0) are skipped by mean().
mean_MRE_MT, mean_MRE_ADJ, mean_MRE_EQU = (
    df_use1[f'(GT-{variant})/GT'].mean() for variant in variants
)

# MAPE is the mean relative error expressed in percent.
mean_MRE_MT_perc, mean_MRE_ADJ_perc, mean_MRE_EQU_perc = (
    mean_relative_error * 100
    for mean_relative_error in (mean_MRE_MT, mean_MRE_ADJ, mean_MRE_EQU)
)

# Sample standard deviation (ddof=1) of the relative error, plain and in percent.
std_MRE_MT, std_MRE_ADJ, std_MRE_EQU = (
    df_use1[f'(GT-{variant})/GT'].std(ddof=1) for variant in variants
)
std_MAPE_MT, std_MAPE_ADJ, std_MAPE_EQU = (
    std_relative_error * 100
    for std_relative_error in (std_MRE_MT, std_MRE_ADJ, std_MRE_EQU)
)

# Summary: section title -> list of (label, value) rows, in print order.
sections = {
    "AVERAGE DURATIONS": list(zip(
        ("avg_duration_GT", "avg_duration_MT", "avg_duration_ADJ", "avg_duration_EQU"),
        (avg_duration_GT, avg_duration_MT, avg_duration_ADJ, avg_duration_EQU),
    )),
    "MAE (Mean Absolute Error)": list(zip(
        ("MAE_MT", "MAE_ADJ", "MAE_EQU"),
        (mean_MAE_MT, mean_MAE_ADJ, mean_MAE_EQU),
    )),
    "MRE (Mean Relative Error)": list(zip(
        ("MRE_MT", "MRE_ADJ", "MRE_EQU"),
        (mean_MRE_MT, mean_MRE_ADJ, mean_MRE_EQU),
    )),
    "MAPE (%)": list(zip(
        ("MAPE_MT(%)", "MAPE_ADJ(%)", "MAPE_EQU(%)"),
        (mean_MRE_MT_perc, mean_MRE_ADJ_perc, mean_MRE_EQU_perc),
    )),
    "STD (MAPE)": list(zip(
        ("STD_ADJ", "STD_EQU"),
        (std_MAPE_ADJ, std_MAPE_EQU),
    )),
}

print("Stats: considering only mutitasking work items where pair executes different activities")

for section, items in sections.items():
    print(f"\n===== {section} =====")
    print("\n".join(f"{name}: {value:.3f}" for name, value in items))


Stats: considering only mutitasking work items where pair executes different activities

===== AVERAGE DURATIONS =====
avg_duration_GT: 115.016
avg_duration_MT: 162.451
avg_duration_ADJ: 115.016
avg_duration_EQU: 115.016

===== MAE (Mean Absolute Error) =====
MAE_MT: 47.435
MAE_ADJ: 3.523
MAE_EQU: 38.299

===== MRE (Mean Relative Error) =====
MRE_MT: 7.794
MRE_ADJ: 0.146
MRE_EQU: 3.785

===== MAPE (%) =====
MAPE_MT(%): 779.434
MAPE_ADJ(%): 14.613
MAPE_EQU(%): 378.533

===== STD (MAPE) =====
STD_ADJ: 34.710
STD_EQU: 1513.538


#### ***Other Analysis:***

In [None]:
### Stats For Activities: Overall Stats for all work items <both Overlap = True & False>

# Source column -> display name; the key order fixes the column order of the table.
metric_labels = {
    'old_duration_minutes': 'Avg_Duration-GT (No MT)',
    'duration_minutes': 'Avg_Duration-MT',
    'final_adjusted_duration': 'Avg_Duration-ADJ',
    'final_adjusted_duration_equal': 'Avg_Duration-EQU',
    '(GT-MT)/GT': 'MRE_MT',
    '(GT-ADJ)/GT': 'MRE_ADJ',
    '(GT-EQU)/GT': 'MRE_EQU',
}

by_activity = marked_df_final_sorted.groupby("concept:name")

# Per-activity mean of every duration and relative-error column.
checker = by_activity[list(metric_labels)].mean()
checker = checker.reset_index().rename(columns=metric_labels)

# Per-activity number of work items, and how many of them are overlapping.
activity_counts = by_activity.agg(
    total_occurrences=('concept:name', 'count'),
    overlap_count=('overlap', lambda flags: flags.eq(True).sum()),
).reset_index()

# Attach the counts to the averages.
checker = checker.merge(activity_counts, on='concept:name', how='left')
checker.head(60)

Unnamed: 0,concept:name,Avg_Duration-GT (No MT),Avg_Duration-MT,Avg_Duration-ADJ,Avg_Duration-EQU,MRE_MT,MRE_ADJ,MRE_EQU,total_occurrences,overlap_count
0,Amend Purchase Requisition,27.36,30.31,27.34,26.95,0.1,0.01,0.03,11,4
1,Amend Request for Quotation Requester,9.85,12.62,9.68,10.17,0.31,0.04,0.08,514,213
2,Amend Request for Quotation Requester Manager,19.12,20.74,19.22,18.14,0.09,0.02,0.05,49,20
3,Analyze Purchase Requisition,6.58,7.84,6.64,6.38,0.21,0.04,0.06,382,211
4,Analyze Quotation comparison Map,20.15,22.89,20.25,19.91,0.18,0.04,0.06,413,151
5,Analyze Request for Quotation,23.03,54.61,22.32,34.51,2.04,0.12,0.9,1107,845
6,Approve Purchase Order for payment,1.0,38.4,0.88,19.48,37.4,0.15,18.49,413,304
7,Authorize Supplier's Invoice payment,0.0,0.0,0.0,0.0,,,,413,0
8,Choose best option,0.0,0.0,0.0,0.0,,,,413,0
9,Confirm Purchase Order,19.81,140.76,18.57,77.52,8.57,0.14,4.16,413,226


In [None]:
### PER-RESOURCE SUMMARY

# Work items flagged as overlapping.
is_overlapping = marked_df_final_sorted["overlap"] == True
df_ov = marked_df_final_sorted[is_overlapping].copy()

# 1. Number of distinct overlap sections per resource.
sections_per_resource = df_ov.groupby("org:resource")["overlap_section"].nunique()
total_overlap_sections = sections_per_resource.reset_index(name="total_overlap_sections")

# 2. Distinct activities inside each (resource, overlap section) pair; a section
#    with exactly one distinct activity counts as a same-activity overlap.
activities_per_pair = df_ov.groupby(["org:resource", "overlap_section"])["concept:name"].nunique()
same_activity = activities_per_pair.reset_index(name="unique_activities")
same_activity["same_activity_overlap"] = same_activity["unique_activities"].eq(1)

same_activity_counts = (
    same_activity.groupby("org:resource")["same_activity_overlap"]
    .sum()
    .reset_index(name="same_activity_overlap_sections")
)

# 3. Combine both counts; resources without a match on the right side get 0.
summary = total_overlap_sections.merge(same_activity_counts, on="org:resource", how="left")
summary = summary.fillna(0)

same_activity_share = summary["same_activity_overlap_sections"] / summary["total_overlap_sections"]
summary["same_activity_percentage"] = (same_activity_share * 100).round(2)


# GLOBAL SUMMARY: the same two counts over all resources together.
global_total = df_ov["overlap_section"].nunique()

activities_per_section = df_ov.groupby("overlap_section")["concept:name"].nunique()
global_same = (activities_per_section == 1).sum()

# Share of same-activity sections, in percent.
global_percentage = round(global_same / global_total * 100, 2)

print("\n=== GLOBAL SUMMARY ===")
global_line = (
    f"Total overlap sections = {global_total}, "
    f"Same-activity overlap sections = {global_same}, "
    f"Percentage = {global_percentage}%"
)
print(global_line)
print("=== PER RESOURCE SUMMARY ===")
summary



=== GLOBAL SUMMARY ===
Total overlap sections = 2304, Same-activity overlap sections = 687, Percentage = 29.82%
=== PER RESOURCE SUMMARY ===


Unnamed: 0,org:resource,total_overlap_sections,same_activity_overlap_sections,same_activity_percentage
0,Alberto Duport,34,10,29.41
1,Anna Kaufmann,23,3,13.04
2,Anne Olwada,43,14,32.56
3,Carmen Finacse,45,27,60.0
4,Christian Francois,37,8,21.62
5,Clement Duchot,30,8,26.67
6,Elvira Lores,24,5,20.83
7,Esmana Liubiata,34,10,29.41
8,Esmeralda Clay,41,24,58.54
9,Fjodor Kowalski,27,10,37.04
