# Experiment: Understanding Tasks (Localization, Characterization, and Explanation) on Business Process Drift - (Maaradji - Fast)

## Lib Imports and configurations

In [1]:
# %matplotlib notebook
# %matplotlib inline 
%load_ext autoreload
%autoreload 2


import sys
import os
import glob
import io

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import pm4py
import ruptures as rpt
from ruptures.metrics import precision_recall, meantime
import scipy.stats as ss
from sklearn.model_selection import ParameterGrid

from tqdm.notebook import tqdm_notebook
import time
from matplotlib.backends.backend_pdf import PdfPages
from joblib import Parallel, delayed

sys.path.append("../Codes/")
import TMPD_utils
import TMPD_class

import warnings
warnings.filterwarnings("ignore")

import gc
gc.enable()

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_colwidth', 150)
# pd.set_option('display.float_format', lambda x: f'{x:,.3f}')
pd.options.display.float_format = '{:.4f}'.format
np.set_printoptions(threshold=sys.maxsize)


## Function to run the class TMPD pipeline in parallel

In [2]:
def run_pipeline_TMPD(kwargs, display=False, return_bool=False):

    try:

        ### Loading event log
        # Extracting event log
        event_log = TMPD_utils.parse_mxml(kwargs['log_path'])

        # This is a specifc step for this event log that separates in different rows the start and end of an activity. As we will only use one timestamp, we are filtering only the row representing the end of an activity.  
        event_log = event_log[event_log.EventType == "complete"]

        # Save original event index
        event_log = event_log.reset_index(names='original_index') 

        # Create an id based on the order of the event in the raw event log
        event_log["Event_order_id"] = event_log.index

        # This is a specifc step for this event log, because it have duplicated case ids. So we create a new case id.
        event_log["Trace_order"] = TMPD_utils.cumulative_counting(event_log["CaseId"])

        # # Add a Start and End actiivities case it doesn't have
        # event_log = TMPD_utils.add_start_end_activities(event_log=event_log, case_id_col="Trace_order", activity_col="Activity", timestamp_col="Timestamp")


        ### Step 1 - Instantiating class and setting event log
        print('step 1')
        # Iniatializing the TMPD_class
        TMPD_instance = TMPD_class.TMPD(scenario='offline')

        # Setting the transition log
        TMPD_instance.set_transition_log(event_log, case_id = kwargs['case_id'], activity_key = kwargs['activity_key']
                                         , timestamp_key = kwargs['timestamp_key'], timestamp_format=kwargs['timestamp_format'], other_columns_keys=kwargs['other_columns_keys'])

        # Executing the transition log
        TMPD_instance.run_transition_log()

        # Showing the transition log created
        # TMPD_instance.get_transition_log().head(15)

        
        ### Step 2 - Window Strategy
        print('step 2')
        # Setting the window strategy parameters
        TMPD_instance.set_windowing_strategy(window_size_mode = kwargs['window_size_mode'], window_size = kwargs['window_size'], window_ref_mode = kwargs['window_ref_mode']
                                                , overlap = kwargs['overlap'], sliding_step = kwargs['sliding_step'])

        # Executing the window strategy indexation
        TMPD_instance.run_windowing_strategy()

        # Showing the windows indexes
        # dict(list(TMPD_instance.get_windowing_strategy().items())[:15])


        ### Step 3 - Process Representation (using Transition Matrix)
        print('step 3')
        # Setting the Transition Matrix (TM) process representation
        TMPD_instance.set_process_representation(threshold_anomaly = kwargs['threshold_anomaly']
                                            , control_flow_features = kwargs['control_flow_features']
                                            , time_features = kwargs['time_features']
                                            , resource_features = kwargs['resource_features']
                                            , data_features = kwargs['data_features'])

        # Executing the process_representation using all dataset just for an example
        # TMPD_instance.run_process_representation(TMPD_instance.transition_log)

        # Showing the process representation created
        # TMPD_instance.get_process_representation().head(15)


        ### Step 4 - Change Representation
        print('step 4')
        # Setting Change Representation
        TMPD_instance.set_change_representation(kwargs['change_features_strategy_dict'])

        # Executing the Change Representation using the window strategy
        TMPD_instance.run_change_representation()

        # Showing the Change Representation created
        # TMPD_instance.get_change_representation().head(15)


        ### Step 5 - Detection Task
        # print('step 5')
        # # Setting Detection Task
        # TMPD_instance.set_detection_task(kwargs['detection_task_strategy_dict'])

        # # Executing the Detection Task
        # TMPD_instance.run_detection_task()

        # # Getting Detection Task Results
        # detection_task_results = TMPD_instance.get_detection_task()


        ### Step 6a - Localization Task
        print('step 6a')
        # Setting Localization Task
        TMPD_instance.set_localization_task(reference_window_index=kwargs['reference_window_index'], detection_window_index=kwargs['detection_window_index']
                                            , pvalue_threshold=kwargs['pvalue_threshold'], effect_prop_threshold=kwargs['effect_prop_threshold'], effect_count_threshold=kwargs['effect_count_threshold'], pseudo_count=kwargs['pseudo_count'])

        # Executing Localization Task
        TMPD_instance.run_localization_task()

        # Showing Localization Task Results
        changed_transitions, change_informations, reference_bpmn_text, detection_bpmn_text = TMPD_instance.get_localization_task(show_localization_dfg=False, show_original_dfg=False, show_original_bpmn=False)

        # Getting the distinct activities in the Localization Result
        localization_distinct_activities = set()
        for key, value in change_informations.items():
            if isinstance(value, list): # Check if the value is a list
                # Skip the list if it contains only "None"
                if len(value) == 1 and value[0] == "None":
                    continue
                for item in value:
                    if isinstance(item, tuple): # If the items are tuples, extend the set with the tuple items
                        localization_distinct_activities.update(item)
                    else: # If the items are not tuples (i.e., strings), add them directly to the set
                        localization_distinct_activities.add(item)
        localization_result = list(localization_distinct_activities)


        ### Step 6b - Characterization Task
        print('step 6b')
        # Setting Characterization Task
        TMPD_instance.set_characterization_task(llm_company = kwargs['llm_company'], llm_model=kwargs['llm_model'], api_key_path=kwargs['api_key_path'], llm_instructions_path=kwargs['llm_instructions_path'])

        # Executing Characterization Task
        TMPD_instance.run_characterization_task()

        # Showing Characterization Task Results
        # change_patterns_llm_classification, change_patterns_llm_response = TMPD_instance.get_characterization_task()

        ### Defining Grounding truth
        change_pattern_ground_truth = kwargs['log_path'].split("\\")[-2]

        change_activities_ground_truth = {
            'cb': {
                'change_pattern_name': 'CB',
                'characterization_activities': ['Check_if_home_insurance_quote_is_requested', 'Prepare_acceptance_pack'],
                'localization_activities': ['Send_acceptance_pack', 'Check_if_home_insurance_quote_is_requested', 'Prepare_acceptance_pack', 'Assess_eligibility', 'Send_home_insurance_quote']
            },
            'cd': {
                'change_pattern_name': 'CD',
                'characterization_activities': ['Appraise_property', 'Check_credit_history', 'Assess_loan_risk'],
                'localization_activities': ['Assess_loan_risk', 'Appraise_property', 'Assess_eligibility', 'Check_credit_history']
            },
            'cf': {
                'change_pattern_name': 'CF',
                'characterization_activities': ['Send_home_insurance_quote', 'Send_acceptance_pack'],
                'localization_activities': ['Check_if_home_insurance_quote_is_requested', 'Send_home_insurance_quote', 'Send_acceptance_pack', 'Verify_repayment_agreement']
            },
            'cm': {
                'change_pattern_name': 'CM',
                'characterization_activities': ['Prepare_acceptance_pack'],
                'localization_activities': ['Send_acceptance_pack', 'Check_if_home_insurance_quote_is_requested', 'Prepare_acceptance_pack', 'Assess_eligibility', 'Send_home_insurance_quote', 'Verify_repayment_agreement']
            },
            'cp': {
                'change_pattern_name': 'CP',
                'characterization_activities': ['Assess_loan_risk', 'Check_credit_history'],
                'localization_activities': ['Assess_loan_risk', 'Check_credit_history', 'Verify_repayment_agreement', 'Approve_application', 'Cancel_application']
            },
            'fr': {
                'change_pattern_name': 'FR',
                'characterization_activities': ['Check_if_home_insurance_quote_is_requested', 'Send_acceptance_pack', 'Send_home_insurance_quote'],
                'localization_activities': ['Check_if_home_insurance_quote_is_requested', 'Send_home_insurance_quote', 'Send_acceptance_pack', 'Verify_repayment_agreement']
            },
            'lp': {
                'change_pattern_name': 'LP',
                'characterization_activities': ['Assess_loan_risk', 'Appraise_property', 'Assess_eligibility', 'Check_credit_history'],
                'localization_activities': ['Assess_loan_risk', 'Appraise_property', 'Assess_eligibility', 'Check_credit_history', 'Reject_application', 'Prepare_acceptance_pack']
            },
            'pl': {
                'change_pattern_name': 'PL',
                'characterization_activities': ['Appraise_property', 'Check_credit_history', 'Assess_loan_risk'],
                'localization_activities': ['Assess_loan_risk', 'Check__application__form_completeness', 'Appraise_property', 'Assess_eligibility', 'Check_credit_history']
            },
            'pm': {
                'change_pattern_name': 'PM',
                'characterization_activities': ['Prepare_acceptance_pack', 'Send_home_insurance_quote'],
                'localization_activities': ['Send_acceptance_pack', 'Check_if_home_insurance_quote_is_requested', 'Prepare_acceptance_pack', 'Assess_eligibility', 'Send_home_insurance_quote', 'Verify_repayment_agreement']
            },
            're': {
                'change_pattern_name': 'SRE',
                'characterization_activities': ['Assess_eligibility'],
                'localization_activities': ['Assess_eligibility', 'Reject_application', 'Prepare_acceptance_pack', 'Assess_loan_risk', 'Appraise_property']
            },
            'rp': {
                'change_pattern_name': 'RP',
                'characterization_activities': ['Verify_repayment_agreement', 'Replaced_Activity'],
                'localization_activities': ['Cancel_application', 'Send_acceptance_pack', 'Replaced_Activity', 'Approve_application', 'Send_home_insurance_quote', 'Verify_repayment_agreement']
            },
            'sw': {
                'change_pattern_name': 'SW',
                'characterization_activities': ['Prepare_acceptance_pack', 'Check_if_home_insurance_quote_is_requested', 'Verify_repayment_agreement'],
                'localization_activities': ['Send_acceptance_pack', 'Cancel_application', 'Check_if_home_insurance_quote_is_requested', 'Approve_application', 'Prepare_acceptance_pack', 'Assess_eligibility', 'Send_home_insurance_quote', 'Verify_repayment_agreement']
            }
        }

        localization_ground_truth = change_activities_ground_truth[change_pattern_ground_truth]['localization_activities']
        characterization_ground_truth = {'concept_drift' : ['Yes'], 'change_pattern' : [change_activities_ground_truth[change_pattern_ground_truth]['change_pattern_name']], 'activities' : change_activities_ground_truth[change_pattern_ground_truth]['characterization_activities']}


        ### Validation metrics
        print('Validation metrics')

        ## Localization
        # Set comparison for 'Activities'
        precision, recall, f1_score = TMPD_utils.list_match_metrics(localization_ground_truth, localization_result)

        localization_task_validation_results = ({
            'localization_activities_precision': precision,
            'localization_activities_recall': recall,
            'localization_activities_f1_score': f1_score
        })

        ## Characterization
        characterization_result = TMPD_instance.get_characterization_task()[0]

        # Initialize a results dictionary
        characterization_task_validation_results = {
            'characterization_concept_drift_match': None,
            'characterization_change_pattern_match_f1_score': None,
            'characterization_change_pattern_match_precision': None,
            'characterization_change_pattern_match_recall': None,
            'characterization_activities_precision': None,
            'characterization_activities_recall': None,
            'characterization_activities_f1_score': None
        }

        # Check matches
        try:
            characterization_task_validation_results['characterization_concept_drift_match'] = TMPD_utils.list_match_metrics(characterization_ground_truth['concept_drift'], characterization_result['concept_drift'])[2]
        except:
            characterization_task_validation_results['characterization_concept_drift_match'] = -1

        try:
            precision, recall, f1_score = TMPD_utils.list_match_metrics(characterization_ground_truth['change_pattern'], characterization_result['change_pattern'])
            characterization_task_validation_results.update({
                'characterization_change_pattern_match_f1_score': f1_score,
                'characterization_change_pattern_match_precision': precision,
                'characterization_change_pattern_match_recall': recall
            })

        except:
            characterization_task_validation_results.update({
                'characterization_change_pattern_match_f1_score': -1,
                'characterization_change_pattern_match_precision': -1,
                'characterization_change_pattern_match_recall': -1
            })

        # Set comparison for 'Activities'
        precision, recall, f1_score = TMPD_utils.list_match_metrics(characterization_ground_truth['activities'], characterization_result['activities'])

        characterization_task_validation_results.update({
            'characterization_activities_precision': precision,
            'characterization_activities_recall': recall,
            'characterization_activities_f1_score': f1_score
        })


        understanding_tasks_results = pd.DataFrame([{**localization_task_validation_results, **characterization_task_validation_results}])

        understanding_tasks_results['localization_activities'] = ', '.join(localization_result) 
        understanding_tasks_results['characterization_change_pattern'] = ', '.join(TMPD_instance.get_characterization_task()[0]['change_pattern'])
        understanding_tasks_results['characterization_activities'] = ', '.join(TMPD_instance.get_characterization_task()[0]['activities']) 
        understanding_tasks_results['localization_changes'] = [TMPD_instance.get_localization_task(show_localization_dfg=False, show_original_dfg=False, show_original_bpmn=False)[1]]
        understanding_tasks_results['reference_bpmn_text'] = TMPD_instance.get_localization_task(show_localization_dfg=False, show_original_dfg=False, show_original_bpmn=False)[2]
        understanding_tasks_results['detection_bpmn_text'] = TMPD_instance.get_localization_task(show_localization_dfg=False, show_original_dfg=False, show_original_bpmn=False)[3]
        understanding_tasks_results['change_patterns_llm_response'] = TMPD_instance.get_characterization_task()[1]
        


        ### Add informations to final result
        print('Adding informations')
        understanding_tasks_results['log_path'] = kwargs['log_path']
        understanding_tasks_results['change_pattern'] = kwargs['log_path'].split("\\")[-2]
        understanding_tasks_results['log_size'] = event_log['Trace_order'].nunique()
        understanding_tasks_results['window_size_mode'] = kwargs['window_size_mode']
        understanding_tasks_results['window_size'] = kwargs['window_size']
        understanding_tasks_results['window_ref_mode'] = kwargs['window_ref_mode']
        understanding_tasks_results['overlap'] = kwargs['overlap']
        understanding_tasks_results['sliding_step'] = kwargs['sliding_step']

        understanding_tasks_results['reference_window_index'] = kwargs['reference_window_index']
        understanding_tasks_results['detection_window_index'] = kwargs['detection_window_index']
        understanding_tasks_results['pvalue_threshold'] = kwargs['pvalue_threshold']
        understanding_tasks_results['effect_prop_threshold'] = kwargs['effect_prop_threshold']
        understanding_tasks_results['effect_count_threshold'] = kwargs['effect_count_threshold']
        understanding_tasks_results['pseudo_count'] = kwargs['pseudo_count']
        understanding_tasks_results['llm_company'] = kwargs['llm_company']
        understanding_tasks_results['llm_model'] = kwargs['llm_model']
        understanding_tasks_results['llm_instructions_path'] = kwargs['llm_instructions_path']

    

    except Exception as e:
        understanding_tasks_results = pd.DataFrame(data={'Error': [e]})
        understanding_tasks_results['log_path'] = kwargs['log_path']
        understanding_tasks_results['change_pattern'] = kwargs['log_path'].split("\\")[-2]
        understanding_tasks_results['log_size'] = event_log['Trace_order'].nunique()
        understanding_tasks_results['window_size_mode'] = kwargs['window_size_mode']
        understanding_tasks_results['window_size'] = kwargs['window_size']
        understanding_tasks_results['window_ref_mode'] = kwargs['window_ref_mode']
        understanding_tasks_results['overlap'] = kwargs['overlap']
        understanding_tasks_results['sliding_step'] = kwargs['sliding_step']

        understanding_tasks_results['reference_window_index'] = kwargs['reference_window_index']
        understanding_tasks_results['detection_window_index'] = kwargs['detection_window_index']
        understanding_tasks_results['pvalue_threshold'] = kwargs['pvalue_threshold']
        understanding_tasks_results['effect_prop_threshold'] = kwargs['effect_prop_threshold']
        understanding_tasks_results['effect_count_threshold'] = kwargs['effect_count_threshold']
        understanding_tasks_results['pseudo_count'] = kwargs['pseudo_count']
        understanding_tasks_results['llm_company'] = kwargs['llm_company']
        understanding_tasks_results['llm_model'] = kwargs['llm_model']
        understanding_tasks_results['llm_instructions_path'] = kwargs['llm_instructions_path']
    
    # Deleting class instance
    try:
        del TMPD_instance
    except:
        pass

    # Returning detection results or save to file
    if return_bool:
        return understanding_tasks_results
    else:
        understanding_tasks_results.to_pickle("Results/Understanding_Business_Process_Drift_Maaradji_Fast_files/"+ str(kwargs['id']) + ".pkl")

## Loading Business Process Drift - (Maaradji - Fast) event logs

In [3]:
# Mapping all event_logs paths
logs_path = glob.glob("../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs/*/*k.*")

# Keepint only the logs with size 10000 and with single change pattern.
change_patterns_excluded = ['IOR', 'IRO', 'OIR', 'ORI', 'RIO', 'ROI']
logs_path = [x for x in logs_path if "10" in x and not any(keyword in x for keyword in change_patterns_excluded)] 

# Showing mapped paths
print("How many logs? ", len(logs_path))
pd.DataFrame(logs_path)

How many logs?  12


Unnamed: 0,0
0,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cb\cb10k.mxml
1,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cd\cd10k.mxml
2,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cf\cf10k.mxml
3,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cm\cm10k.mxml
4,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cp\cp10k.mxml
5,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\fr\fr10k.MXML
6,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\lp\lp10k.mxml
7,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\pl\pl10k.mxml
8,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\pm\pm10k.mxml
9,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\re\re10k.mxml


<!-- ![Alt text](../Images/Concept_drift_firstcycle_steps_eng.png "General steps") -->

## Experiment impacts of the parameters

### Define the Parameters GridSearch 

In [4]:
# ### Grid Search of parameters
# TMPD_ParameterGrid = ParameterGrid(
#     [{
#         # Step 1 - Instantiating class and setting event log
#         'case_id' : ['Trace_order']
#         , 'activity_key' : ['Activity']
#         , 'timestamp_key' : ['Timestamp']
#         , 'timestamp_format' : [None]
#         , 'other_columns_keys' : [[]]

#         # Step 2 - Setting Window Strategy
#         , 'window_size_mode' : ['Fixed']
#         , 'window_size' : [1000, 2000, 4000]
#         , 'window_ref_mode' : ['Fixed', 'Sliding']
#         , 'overlap' : [True]
#         , 'sliding_step' : [200, 500]

#         # Step 3 - Setting Process Representation (using Transition Matrix)
#         , 'threshold_anomaly': [0]
#         , 'control_flow_features': [{'frequency', 'probability'}]
#         , 'time_features': [{}] #'avg_time':'timestamp', 'time_std':'timestamp'
#         , 'resource_features': [{}]
#         , 'data_features': [{}]

#         # Step 4 - Setting Change Representation
#         , 'change_features_strategy_dict' : [{
#             'delta_matrix_strategy': 
#                 {
#                     'frequency_delta' : {'process_feature':'frequency', 'method':'aggregation', 'agg_function' : 'sum'}
#                     , 'frequency_delta_percentage' : {'process_feature':'frequency', 'method':'percentage'}
#                     , 'prob_freq_delta_weight' : {'process_feature':'probability', 'method':'aggregation_weight', 'agg_function' : 'sum', 'weight_feature' : 'frequency'}
#                 }
#             , 'statistic_test_strategy' : 
#                 {
#                     'frequency_gtest_pvalue' : {'process_feature':'frequency', 'method':'g_test', 'contingency_matrix_sum_value' : '5', 'remove_zeros':'True'}
#                     , 'frequency_cramersv' : {'process_feature':'frequency', 'method':'cramers_v', 'contingency_matrix_sum_value' : '5', 'remove_zeros':'True'}
#                 }
#         }]
        
#         # Step 5 - Setting Detection Task
#         , 'detection_task_strategy_dict' :  [
#             {
#             'time_series_strategy': 
#                 {
#                     'cpd_frequency_delta3' : {'change_features':['frequency_delta'], 'method':'cpd_pelt', 'smooth' : '3'}
#                     , 'cpd_prob_freq_delta3' : {'change_features':['prob_freq_delta_weight'], 'method':'cpd_pelt', 'smooth' : '3'}
#                     , 'cpd_cramersv_frequency3' : {'change_features':['frequency_cramersv'], 'method':'cpd_pelt', 'smooth' : '3'}  
#                 }
#             , 'threshold_strategy' : 
#                 {
#                     'gtest_frequency3' : {'change_features':['frequency_gtest_pvalue'], 'method':'comparison_operator', 'operator' : 'le', 'threshold_value' : '0.025', 'smooth' : '3'}
#                     , 'fixed_frequency_delta_percentage3' : {'change_features':['frequency_delta_percentage'], 'method':'comparison_operator', 'operator' : 'ge', 'threshold_value' : '0.05', 'smooth' : '3'}
#                     , 'fixed_cramersv_frequency3' : {'change_features':['frequency_cramersv'], 'method':'comparison_operator', 'operator' : 'ge', 'threshold_value' : '0.05', 'smooth' : '3'}
#                 }
#             }
#         ]
#         , 'margin_error' : [3]
#     }
#     , {
#         # Step 1 - Instantiating class and setting event log
#         'case_id' : ['Trace_order']
#         , 'activity_key' : ['Activity']
#         , 'timestamp_key' : ['Timestamp']
#         , 'timestamp_format' : [None]
#         , 'other_columns_keys' : [[]]

#         # Step 2 - Setting Window Strategy
#         , 'window_size_mode' : ['Fixed']
#         , 'window_size' : [1000, 2000, 4000]
#         , 'window_ref_mode' : ['Fixed', 'Sliding']
#         , 'overlap' : [False]
#         , 'sliding_step' : [0]

#         # Step 3 - Setting Process Representation (using Transition Matrix)
#         , 'threshold_anomaly': [0]
#         , 'control_flow_features': [{'frequency', 'probability'}]
#         , 'time_features': [{}] #'avg_time':'timestamp', 'time_std':'timestamp'
#         , 'resource_features': [{}]
#         , 'data_features': [{}]

#         # Step 4 - Setting Change Representation
#         , 'change_features_strategy_dict' : [{
#             'delta_matrix_strategy': 
#                 {
#                     'frequency_delta' : {'process_feature':'frequency', 'method':'aggregation', 'agg_function' : 'sum'}
#                     , 'frequency_delta_percentage' : {'process_feature':'frequency', 'method':'percentage'}
#                     , 'prob_freq_delta_weight' : {'process_feature':'probability', 'method':'aggregation_weight', 'agg_function' : 'sum', 'weight_feature' : 'frequency'}
#                 }
#             , 'statistic_test_strategy' : 
#                 {
#                     'frequency_gtest_pvalue' : {'process_feature':'frequency', 'method':'g_test', 'contingency_matrix_sum_value' : '5', 'remove_zeros':'True'}
#                     , 'frequency_cramersv' : {'process_feature':'frequency', 'method':'cramers_v', 'contingency_matrix_sum_value' : '5', 'remove_zeros':'True'}
#                 }
#         }]
        
#         # Step 5 - Setting Detection Task
#         , 'detection_task_strategy_dict' :  [
#             {
#             'time_series_strategy': 
#                 {
#                     'cpd_frequency_delta3' : {'change_features':['frequency_delta'], 'method':'cpd_pelt', 'smooth' : '3'}
#                     , 'cpd_prob_freq_delta3' : {'change_features':['prob_freq_delta_weight'], 'method':'cpd_pelt', 'smooth' : '3'}
#                     , 'cpd_cramersv_frequency3' : {'change_features':['frequency_cramersv'], 'method':'cpd_pelt', 'smooth' : '3'} 
#                 }
#             , 'threshold_strategy' : 
#                 {
#                     'gtest_frequency3' : {'change_features':['frequency_gtest_pvalue'], 'method':'comparison_operator', 'operator' : 'le', 'threshold_value' : '0.025', 'smooth' : '3'}
#                     , 'fixed_frequency_delta_percentage3' : {'change_features':['frequency_delta_percentage'], 'method':'comparison_operator', 'operator' : 'ge', 'threshold_value' : '0.05', 'smooth' : '3'}
#                     , 'fixed_cramersv_frequency3' : {'change_features':['frequency_cramersv'], 'method':'comparison_operator', 'operator' : 'ge', 'threshold_value' : '0.05', 'smooth' : '3'}
#                 }
#             }
#         ]
#         , 'margin_error' : [3]
#     }
# ])

### Combine parameters GridSearch with all event logs

In [5]:
# TMPD_ParameterGrid_logs = []
# id=0
# for param_grid in TMPD_ParameterGrid:
#     for log_path in logs_path:
#         param_grid_aux = param_grid.copy()
#         param_grid_aux['log_path'] = log_path 
#         param_grid_aux['id'] = id
#         TMPD_ParameterGrid_logs.append(param_grid_aux)
#         id = id+1
# len(TMPD_ParameterGrid_logs)

### Execute all experiments in parallel

In [6]:
# # Executing parameter grid in parallel
# Parallel(n_jobs=-1)(delayed(run_pipeline_TMPD)(TMPD_Parameters, display=False, return_bool=False) for TMPD_Parameters in tqdm_notebook(TMPD_ParameterGrid_logs))

In [7]:
# # Reading all executes
# saved_pkls = glob.glob("Results/Understanding_Business_Process_Drift_Maaradji_Fast_files/*.pkl")
# results = []
# for saved_pkl in saved_pkls:
#     results.append(pd.read_pickle(saved_pkl))
# results_df = pd.concat(results, axis=0, ignore_index=True)
# results_df.to_csv('Results/Business_Process_Drift_Maaradji_Fast_Understanding_Task.csv')

### Load results if necessary

In [8]:
# results_df = pd.read_csv('Results/Business_Process_Drift_Maaradji_Fast_Understanding_Task.csv', index_col=0)

### Analyse results

In [9]:
# results_df.head()

In [10]:
# params = ['detection_strategy', 'detection_feature', 'window_size_mode', 'window_size', 'window_ref_mode', 'overlap', 'sliding_step', 'margin_error'] #, 'tipo_mudanca'
# validation_metrics = ["f1","delay"]

# results_df_agg = results_df.groupby(params)[validation_metrics].agg(['mean'])
# results_df_agg.columns = results_df_agg.columns.map('_'.join)
# # results_df_agg.sort_values(["f1_mean","delay_mean"], ascending=[False,True], inplace=True)
# # # all_results_grouped.to_excel(OUTPUT_RESULTS + all_results_grouped_'+model+'.xlsx', sheet_name=model)
# results_df_agg.head(200)

## Run specific experiment

In [11]:
TMPD_ParameterGrid_experiment = ParameterGrid(
    [{
        # Step 1 - Instantiating class and setting event log
        'case_id' : ['Trace_order']
        , 'activity_key' : ['Activity']
        , 'timestamp_key' : ['Timestamp']
        , 'timestamp_format' : [None]
        , 'other_columns_keys' : [[]]

        # Step 2 - Setting Window Strategy
        , 'window_size_mode' : ['Fixed']
        , 'window_size' : [4000]
        , 'window_ref_mode' : ['Fixed'] #, 'Sliding'
        , 'overlap' : [True]
        , 'sliding_step' : [200]

        # Step 3 - Setting Process Representation (using Transition Matrix)
        , 'threshold_anomaly': [0.005]
        , 'control_flow_features': [{'frequency', 'probability'}]
        , 'time_features': [{}] #'avg_time':'timestamp', 'time_std':'timestamp'
        , 'resource_features': [{}]
        , 'data_features': [{}]

        # Step 4 - Setting Change Representation
        , 'change_features_strategy_dict' : [{
            'delta_matrix_strategy': 
                {
                    # 'frequency_delta' : {'process_feature':'frequency', 'method':'aggregation', 'agg_function' : 'sum'}
                    # , 'frequency_delta_percentage' : {'process_feature':'frequency', 'method':'percentage'}
                    # , 'prob_freq_delta_weight' : {'process_feature':'probability', 'method':'aggregation_weight', 'agg_function' : 'sum', 'weight_feature' : 'frequency'}
                }
            , 'statistic_test_strategy' : 
                {
                    'frequency_gtest_pvalue' : {'process_feature':'frequency', 'method':'g_test', 'contingency_matrix_sum_value' : '5', 'remove_zeros':'True'}
                    # , 'frequency_cramersv' : {'process_feature':'frequency', 'method':'cramers_v', 'contingency_matrix_sum_value' : '5', 'remove_zeros':'True'}
                }
        }]
        
        # Step 5 - Setting Detection Task
        , 'detection_task_strategy_dict' :  [
            {
            'time_series_strategy': 
                {
                    # 'cpd_frequency_delta3' : {'change_features':['frequency_delta'], 'method':'cpd_pelt', 'smooth' : '3'}
                    # , 'cpd_prob_freq_delta3' : {'change_features':['prob_freq_delta_weight'], 'method':'cpd_pelt', 'smooth' : '3'}
                    # , 'cpd_cramersv_frequency3' : {'change_features':['frequency_cramersv'], 'method':'cpd_pelt', 'smooth' : '3'}  
                }
            , 'threshold_strategy' : 
                {
                    'gtest_frequency3' : {'change_features':['frequency_gtest_pvalue'], 'method':'comparison_operator', 'operator' : 'le', 'threshold_value' : '0.025', 'smooth' : '3'}
                    # , 'fixed_frequency_delta_percentage3' : {'change_features':['frequency_delta_percentage'], 'method':'comparison_operator', 'operator' : 'ge', 'threshold_value' : '0.05', 'smooth' : '3'}
                    # , 'fixed_cramersv_frequency3' : {'change_features':['frequency_cramersv'], 'method':'comparison_operator', 'operator' : 'ge', 'threshold_value' : '0.05', 'smooth' : '3'}
                }
            }
        ]

        # Step 6a - Localization Task
        , 'reference_window_index': [0]
        , 'detection_window_index': [75]
        , 'pvalue_threshold': [0.05]
        , 'effect_prop_threshold': [0.2]
        , 'effect_count_threshold': [0.02]
        , 'pseudo_count': [5]

        # Step 6b - Characterization Task
        , 'llm_company' : ['openai'] # 'google', 'openai'
        , 'llm_model': ["gpt-4o"] # "gpt-4o", "gemini-pro", "gpt-4-1106-preview", "gpt-3.5-turbo-0125", "gpt-4"
        , 'api_key_path' : ['../Temp/openai_api_key.txt'] # '../Temp/google_api_key.txt', '../Temp/openai_api_key.txt'
        , 'llm_instructions_path': ['../Codes/LLM_Instructions/instructions_v7.yaml']

    }]
)

In [12]:
TMPD_ParameterGrid_logs_experiment = []
id=0
for param_grid in TMPD_ParameterGrid_experiment:
    for log_path in logs_path:
        param_grid_aux = param_grid.copy()
        param_grid_aux['log_path'] = log_path 
        param_grid_aux['id'] = id
        TMPD_ParameterGrid_logs_experiment.append(param_grid_aux)
        id = id+1
len(TMPD_ParameterGrid_logs_experiment)


12

In [14]:
TMPD_logs_results_experiment = Parallel(n_jobs=1)(delayed(run_pipeline_TMPD)(TMPD_Parameters, display=False, return_bool=True) for TMPD_Parameters in tqdm_notebook(TMPD_ParameterGrid_logs_experiment))
TMPD_logs_results_experiment_df = pd.concat(TMPD_logs_results_experiment, axis=0, ignore_index=True)

  0%|          | 0/12 [00:00<?, ?it/s]

step 1
step 2
step 3
step 4
step 6a
step 6b
################################ llm_bpmn_analysis_response #####################################
### Analysis of BPMN Diagrams Before and After Concept Drift ###

#### BPMN Before Concept Drift ####
```
->( 
    'Loan__application_received', 
    *( 
        'Check__application__form_completeness', 
        ->( 
            'Return_application_back_to_applicant', 
            'Receive_updated_application' 
        ) 
    ), 
    +( 
        'Appraise_property', 
        ->( 
            'Check_credit_history', 
            'Assess_loan_risk' 
        ) 
    ), 
    'Assess_eligibility', 
    X( 
        ->( 
            'Prepare_acceptance_pack', 
            'Check_if_home_insurance_quote_is_requested', 
            X( 
                'Send_acceptance_pack', 
                'Send_home_insurance_quote' 
            ), 
            'Verify_repayment_agreement', 
            X( 
                ->( 
                    'Cancel_application', 

In [15]:
TMPD_logs_results_experiment_df

Unnamed: 0,localization_activities_precision,localization_activities_recall,localization_activities_f1_score,characterization_concept_drift_match,characterization_change_pattern_match_f1_score,characterization_change_pattern_match_precision,characterization_change_pattern_match_recall,characterization_activities_precision,characterization_activities_recall,characterization_activities_f1_score,localization_activities,characterization_change_pattern,characterization_activities,localization_changes,reference_bpmn_text,detection_bpmn_text,change_patterns_llm_response,log_path,change_pattern,log_size,window_size_mode,window_size,window_ref_mode,overlap,sliding_step,reference_window_index,detection_window_index,pvalue_threshold,effect_prop_threshold,effect_count_threshold,pseudo_count,llm_company,llm_model,llm_instructions_path,change_patterns_separated
0,0.2632,1.0,0.4167,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",CB,"Prepare_acceptance_pack, Check_if_home_insurance_quote_is_requested","{'Transitions with variations in probability': [('Assess_eligibility', 'Prepare_acceptance_pack'), ('Check_if_home_insurance_quote_is_requested', ...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","### Step-by-Step Analysis ###\n\n#### Step 1: Confirm Concept Drift ####\nBased on the provided analysis, there is a clear indication of a structu...",../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cb\cb10k.mxml,cb,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
1,0.2105,1.0,0.3478,1.0,0.0,0.0,0.0,1.0,0.3333,0.5,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",PM,Assess_loan_risk,"{'Transitions with variations in probability': [('Appraise_property', 'Assess_eligibility'), ('Appraise_property', 'Assess_loan_risk'), ('Assess_l...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...",### Step-by-Step Analysis ###\n\n#### Step 1: Summarize the Analysis Information ####\n\n1. **Alterations in Activities and Transitions:**\n - *...,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cd\cd10k.mxml,cd,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
2,0.2105,1.0,0.3478,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",CF,"Send_acceptance_pack, Send_home_insurance_quote","{'Transitions with variations in probability': [('Check_if_home_insurance_quote_is_requested', 'Send_acceptance_pack'), ('Check_if_home_insurance_...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","### Step-by-Step Analysis ###\n\n#### Step 1: Confirm Concept Drift ####\nBased on the provided analysis, it is clear that there is a significant ...",../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cf\cf10k.mxml,cf,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
3,0.3158,1.0,0.48,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",CM,Prepare_acceptance_pack,"{'Transitions with variations in probability': [('Assess_eligibility', 'Prepare_acceptance_pack'), ('Check_if_home_insurance_quote_is_requested', ...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","### Step-by-Step Analysis ###\n\n#### Step 1: Confirm Concept Drift ####\nBased on the provided analysis, there are clear indications of changes i...",../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cm\cm10k.mxml,cm,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
4,0.2632,1.0,0.4167,1.0,1.0,1.0,1.0,0.2222,1.0,0.3636,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",CP,"Assess_eligibility, Send_acceptance_pack, Send_home_insurance_quote, Assess_loan_risk, Verify_repayment_agreement, Check_if_home_insurance_quote_i...","{'Transitions with variations in probability': [('Assess_loan_risk', 'Appraise_property'), ('Assess_loan_risk', 'Assess_eligibility'), ('Check_cre...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","### Step-by-Step Analysis ###\n\n#### Step 1: Identify Concept Drift ####\nBased on the provided analysis, there are significant changes in the BP...",../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\cp\cp10k.mxml,cp,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
5,0.2105,1.0,0.3478,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",FR,"Check_if_home_insurance_quote_is_requested, Send_acceptance_pack, Send_home_insurance_quote","{'Transitions with variations in probability': [('Check_if_home_insurance_quote_is_requested', 'Send_acceptance_pack'), ('Check_if_home_insurance_...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...",### Step-by-Step Analysis ###\n\n#### Step 1: Summarize the Analysis Information ####\n- **Alterations in Activities and Transitions**:\n - No st...,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\fr\fr10k.MXML,fr,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
6,0.3,1.0,0.4615,1.0,0.0,0.0,0.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, START, Assess_eligibility, Send...",CB,"Assess_eligibility, Appraise_property, Assess_loan_risk, Check_credit_history","{'Transitions with variations in probability': [('Assess_eligibility', 'Prepare_acceptance_pack'), ('Assess_eligibility', 'Reject_application'), (...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","### Step-by-Step Analysis ###\n\n#### Step 1: Confirm Concept Drift ####\nBased on the provided analysis, there is a clear indication of a concept...",../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\lp\lp10k.mxml,lp,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
7,0.2632,1.0,0.4167,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",PL,"Appraise_property, Check_credit_history, Assess_loan_risk","{'Transitions with variations in probability': [('Appraise_property', 'Assess_eligibility'), ('Appraise_property', 'Assess_loan_risk'), ('Appraise...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","### Step-by-Step Analysis ###\n\n#### Step 1: Identify Concept Drift ####\nBased on the provided analysis, there is a clear indication of a change...",../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\pl\pl10k.mxml,pl,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
8,0.3158,1.0,0.48,1.0,1.0,1.0,1.0,1.0,1.0,1.0,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",PM,"Prepare_acceptance_pack, Send_home_insurance_quote","{'Transitions with variations in probability': [('Assess_eligibility', 'Prepare_acceptance_pack'), ('Check_if_home_insurance_quote_is_requested', ...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...",### Step-by-Step Analysis ###\n\n#### Step 1: Confirming Identified Changes ####\n\n**Sequence of Activities:**\n- **Before Drift:** 'Prepare_acce...,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\pm\pm10k.mxml,pm,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False
9,0.2632,1.0,0.4167,1.0,1.0,1.0,1.0,0.5,1.0,0.6667,"Loan__application_received, Assess_loan_risk, Verify_repayment_agreement, Cancel_application, Approve_application, Assess_eligibility, Send_accept...",SRE,"Assess_loan_risk, Assess_eligibility","{'Transitions with variations in probability': [('Assess_eligibility', 'Prepare_acceptance_pack'), ('Assess_eligibility', 'Reject_application'), (...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...","->( 'Loan__application_received', *( 'Check__application__form_completeness', ->( 'Return_application_back_to_applicant', 'Receive_updated_applica...",### Step-by-Step Analysis ###\n\n#### Step 1: Combine All Information Previously Analyzed ####\n\n1. **Alterations in Activities and Transitions:*...,../Input/Synthetic/Business Process Drift (Maaradji - Fast)/logs\re\re10k.mxml,re,10000,Fixed,4000,Fixed,True,200,0,75,0.05,0.2,0.02,5,openai,gpt-4o,../Codes/LLM_Instructions/instructions_v7.yaml,False


In [16]:
TMPD_logs_results_experiment_df['characterization_change_pattern_match_f1_score'].mean()

0.75

In [17]:
TMPD_logs_results_experiment_df.to_excel('Results/Business_Process_Drift_Maaradji_Fast_Understanding_Task_Single_v7.xlsx')

In [None]:
# with PdfPages('Results/gtest_frequency3_experiment_Business_Process_Drift.pdf') as pdf:
#     for index, result in TMPD_logs_results_experiment_df.iterrows(): 
#         fig, ax = plt.subplots(figsize=(15,3))
#         plt.axis('off')
#         ax.imshow(result['display'])
#         pdf.savefig(bbox_inches='tight')
#         plt.show()