# Hands-on tutorial for Nirdizati light

# Prepare environment

In [1]:
# Import required modules
import os
import random
import numpy as np

import pandas as pd
import pm4py

from nirdizati_light.log.common import get_log, split_train_val_test
from nirdizati_light.encoding.common import get_encoded_df, EncodingType
from nirdizati_light.encoding.constants import TaskGenerationType, PrefixLengthStrategy, EncodingTypeAttribute
from nirdizati_light.encoding.time_encoding import TimeEncodingType
from nirdizati_light.labeling.common import LabelTypes
from nirdizati_light.predictive_model.common import ClassificationMethods
from nirdizati_light.predictive_model.predictive_model import PredictiveModel
from nirdizati_light.hyperparameter_optimisation.common import retrieve_best_model, HyperoptTarget
from nirdizati_light.evaluation.common import evaluate_classifier, evaluate_classifiers, plot_model_comparison
from nirdizati_light.explanation.common import ExplainerType, explain

# Configure pipeline parameters

# Setting up the CONF dictionary
## Configuration Parameters for Process Mining in LaTeX

Below is the configuration dictionary `CONF` formatted in LaTeX. Each parameter is explained in detail.
The dictionary is the main configuration object of the Nirdizati\_light framework. It specifies all the parameters required to train one or more predictive models for a given configuration, ranging from the train-validation-test split ratio to the prefix lengths and encodings to be used by the model.

---


## Configuration Dictionary: CONF


### **Paths**

\begin{aligned}
\textbf{data:} & \quad \text{'../datasets/bpic2012\_O\_ACCEPTED-COMPLETE/full.xes'} \quad \text{// Path to the dataset file.} \\
\textbf{output:} & \quad \text{'output\_data'} \quad \text{// Path to the directory where output data is saved.} \\
\end{aligned}


---

### **Dataset Splitting**
\begin{align*}
\textbf{train\_val\_test\_split:} & \quad [0.7, 0.1, 0.2] \quad \text{// Proportions for splitting the dataset into training, validation, and testing sets.} \\
& \quad \text{// Example: 70\% training, 10\% validation, 20\% testing.}
\end{align*}


---

### **Prefix Length**

\begin{aligned}
\textbf{prefix\_length\_strategy:} & \quad \text{PrefixLengthStrategy.FIXED.value}  \quad \text{// Strategy to define the prefix length (fixed in this case).} \\
\textbf{prefix\_length:} & \quad 10 \quad \text{// Specifies the fixed length of the prefix.} \\
\end{aligned}


---

### **Encoding**

\begin{align*}
\textbf{padding:} & \quad \text{True}  \quad \text{// Indicates whether to use padding in the encoding.} \\
\textbf{feature\_selection:} & \quad \text{EncodingType.SIMPLE.value}  \quad \text{// Specifies the encoding type for features (e.g., SIMPLE).} \\
\textbf{attribute\_encoding:} & \quad \text{EncodingTypeAttribute.LABEL.value}  \quad \text{// Attribute encoding strategy (e.g., LABEL).} \\
\textbf{time\_encoding:} & \quad \text{TimeEncodingType.NONE.value} \quad \text{// Defines the time encoding strategy (none in this case).}
\end{align*}


---

### **Labeling and Task Generation**
\begin{aligned}
\textbf{labeling\_type:} & \quad \text{LabelTypes.ATTRIBUTE\_STRING.value} \quad \text{// Specifies the label type (a string-valued attribute, i.e. the case outcome, in this case).} \\
\textbf{task\_generation\_type:} & \quad \text{TaskGenerationType.ALL\_IN\_ONE.value} \quad \text{// Defines whether training is on a single prefix length or all lengths.} \\
\end{aligned}

---

### **Predictive Models and Hyperparameter Optimization**

\begin{aligned}
\textbf{predictive\_models:} & \quad \text{[ClassificationMethods.MLP.value (Multi-Layer Perceptron),} \\
& \quad \text{ClassificationMethods.SGDCLASSIFIER.value (Stochastic Gradient Descent Classifier),} \\
& \quad \text{ClassificationMethods.SVM.value (Support Vector Machine),} \\
& \quad \text{ClassificationMethods.XGBOOST.value (Extreme Gradient Boosting)]} \\
& \quad \text{// List of predictive models to train.} \\
\textbf{hyperparameter\_optimisation\_target:} & \quad \text{HyperoptTarget.F1.value} \\
& \quad \text{// Metric to optimize during hyperparameter tuning.} \\
\textbf{hyperparameter\_optimisation\_evaluations:} & \quad 10 \quad \text{// Number of configurations to try during optimization for each model.} \\
\end{aligned}


---

### **Explainability**

\begin{aligned}
\textbf{explanator:} & \quad \text{ExplainerType.DICE.value} \\
& \quad \text{// Method used for explaining the model predictions.} \\
\end{aligned}


---

### **Other Parameters**

\begin{aligned}
\textbf{target\_event:} & \quad \text{None} \quad \text{// Specifies the target event, if applicable (used only with NEXT\_ACTIVITY labeling).} \\
\textbf{seed:} & \quad \text{SEED} \quad \text{// Random seed to ensure reproducibility.} \\
\end{aligned}


---

This configuration setup provides a reproducible and flexible approach for predictive modeling in process mining tasks. Copy the LaTeX blocks into your Jupyter Notebook Markdown or use a Python cell to render the configuration dynamically.


In [2]:
# Set a seed for reproducibility (seeds both the stdlib and the NumPy RNG)
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Name of the log to use
# NOTE(review): CONF['data'] below points at a BPIC2012 file and does not use
# LOG_NAME -- confirm whether this value is still needed.
LOG_NAME = 'cvs_pharmacy'

# Folder that contains the event log. The default is the original author's
# local folder; set the NIRDIZATI_DATA_DIR environment variable to run the
# tutorial on another machine without editing this cell.
DATA_DIR = os.environ.get('NIRDIZATI_DATA_DIR', '/Users/andrei/Desktop/PhD/experiments')

# Configuration dictionary for nirdizati light
CONF = {
    # path to log
    'data': os.path.join(DATA_DIR, 'bpic2012_O_ACCEPTED-COMPLETE_trunc.xes'),
    # train-validation-test set split percentages
    'train_val_test_split': [0.7, 0.1, 0.2],

    # path to output folder
    'output': 'output_data',

    # how the prefix length is interpreted, and the prefix length itself
    'prefix_length_strategy': PrefixLengthStrategy.FIXED.value,
    'prefix_length': 10,
    # whether to use padding or not in encoding
    'padding': True,
    # which encoding to use
    'feature_selection': EncodingType.SIMPLE.value,
    # which attribute encoding to use
    'attribute_encoding': EncodingTypeAttribute.LABEL.value,
    # which time encoding to use
    'time_encoding': TimeEncodingType.NONE.value,

    # the label to be predicted (e.g. outcome, next activity)
    'labeling_type': LabelTypes.ATTRIBUTE_STRING.value,
    # whether the model should be trained on the specified prefix length (ONLY_THIS)
    # or on every prefix in range [1, prefix_length] (ALL_IN_ONE)
    'task_generation_type': TaskGenerationType.ALL_IN_ONE.value,

    # list of predictive models to train; uncomment an entry to include it
    'predictive_models': [
        ClassificationMethods.DT.value,
        # ClassificationMethods.KNN.value,
        # ClassificationMethods.LSTM.value,
        # ClassificationMethods.MLP.value,
        # ClassificationMethods.PERCEPTRON.value,
        # ClassificationMethods.SGDCLASSIFIER.value,
        # ClassificationMethods.SVM.value,
        # ClassificationMethods.XGBOOST.value,
    ],

    # which metric to optimize hyperparameters for
    'hyperparameter_optimisation_target': HyperoptTarget.F1.value,
    # number of hyperparameter configurations to try
    'hyperparameter_optimisation_evaluations': 10,

    # explainability method to use
    'explanator': ExplainerType.DICE.value,

    # no specific target event is configured for this run
    'target_event': None,
    # seed stored in the configuration alongside the other parameters
    'seed': SEED,
}

# Load log

In [3]:
# Load the event log from the path configured in CONF['data'].
log = get_log(filepath=CONF['data'])

parsing log, completed traces ::   0%|          | 0/4685 [00:00<?, ?it/s]

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



# Encoding

# Explanation of the `get_encoded_df` Function Parameters

The function `get_encoded_df` encodes data from a process log to create:
1. An **encoder object** (`encoder`) that processes features or labels.
2. A fully encoded **DataFrame** (`full_df`) for machine learning tasks.

## Parameters:

1. **`log`**:
   - The event log to encode, as returned by `get_log`.
   - Each trace is a sequence of events carrying an activity name, a timestamp, and optionally further attributes (e.g., the resource).

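2. **`feature_encoding_type`**:
   - Specifies how the events of a prefix are turned into features (an `EncodingType` value):
     - `SIMPLE`: Simple-index encoding — one `prefix_i` column per position, holding the activity at that position.
     - `SIMPLE_TRACE`: Simple-index encoding plus trace-level attributes (e.g., `AMOUNT_REQ`).
     - `COMPLEX`: Complex-index encoding — for every position, the activity plus event-level attributes (e.g., resource and timestamp features).
     - `FREQUENCY`: One column per activity, holding how many times it occurs in the prefix.
     - `BINARY`: One column per activity, indicating whether it occurs in the prefix.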

3. **`prefix_length`**:
   - Determines the length of process prefixes to be used.
   - Affects how much history (number of events) is considered for features.

4. **`prefix_length_strategy`**:
   - Defines how prefixes are handled:
     - `fixed`: Use a fixed number of steps (e.g., only first $N$ steps).
     - `dynamic`: Variable-length prefixes, potentially based on case characteristics.

5. **`time_encoding_type`**:
   - Encodes timestamp-related features (e.g., weekday, hour, month, time since last event, time since case start):
     - `NONE`: Does not encode the relative time attributes.
     - `DATE`: Extracts date-related features like day-of-week or month.
     - `DURATION`: Encodes time differences between events, time since case start, etc.
     - `DATE_AND_DURATION`: Combines date and duration encodings.

6. **`attribute_encoding`**:
   - Specifies how to encode additional attributes:
     - `LABEL`: Encode attributes as labels (e.g., 'A', 'B', 'C').
     - `ONEHOT`: Use one-hot encoding for attributes.

7. **`padding`**:
   - Ensures uniform input length by padding shorter prefixes with default values (e.g., zeros).
   - Useful for models like RNNs or LSTMs requiring uniform input dimensions.

8. **`labeling_type`**:
   - Defines how labels are generated for classification or prediction:
     - `NEXT_ACTIVITY`: Predict the next activity in the process.
     - `ATTRIBUTE_STRING`: Predict the final outcome of a process instance.
     - `ATTRIBUTE_NUMBER`: Predict a numerical attribute value (e.g., case duration).
     - `REMAINING_TIME`: Predict the remaining time until case completion.
     - `DURATION`: Predict the duration of a case.

9. **`task_generation_type`**:
   - Specifies which prefixes are generated for each trace (a `TaskGenerationType` value):
     - `ONLY_THIS`: One row per trace, using only the prefix of length `prefix_length`.
     - `ALL_IN_ONE`: One row for every prefix length in the range [1, `prefix_length`].

10. **`target_event`**:
    - Focuses on a specific event in the process for next activity predictive tasks.
    - Example: A particular milestone activity or timestamp.

## Outputs:

1. **`encoder`**:
   - Object or function that transforms raw features into encoded forms.
   - Stores mappings (e.g., one-hot encoding maps, label encoders).

2. **`full_df`**:
   - Fully encoded DataFrame for model training.
   - Includes encoded features, labels, and optional padding for consistency.


## <span style="color: red;">WARNING:</span> Calling encoder.decode(full_df) modifies the full_df DataFrame in place. To get the encoded values back, re-run the encoding with encoder.encode(full_df).


# PREFIX SELECTION STRATEGY

### ALL_IN_ONE

In [22]:
# Simple-index encoding with ALL_IN_ONE task generation: one row is produced
# for every prefix length in [1, CONF['prefix_length']].
encoder, full_df = get_encoded_df(
    log=log,
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
    # prefix generation
    task_generation_type=TaskGenerationType.ALL_IN_ONE.value,
    prefix_length_strategy=CONF['prefix_length_strategy'],
    prefix_length=CONF['prefix_length'],
    padding=CONF['padding'],
    # feature encoding
    feature_encoding_type=EncodingType.SIMPLE.value,
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
)
# decode() changes full_df in place so the preview shows readable values.
encoder.decode(full_df)
full_df.head(10)

Unnamed: 0,trace_id,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,A_SUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,0,True
1,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,True
2,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,0,0,0,0,0,0,0,True
3,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,0,0,0,0,0,0,True
4,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,0,0,0,0,True
5,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,0,0,0,0,True
6,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,0,0,0,True
7,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,0,0,True
8,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,0,True
9,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True


### ONLY_THIS

In [24]:
# Simple-index encoding with ONLY_THIS task generation: only the prefix of
# length CONF['prefix_length'] is used for each trace.
encoder, full_df = get_encoded_df(
    log=log,
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
    # prefix generation
    task_generation_type=TaskGenerationType.ONLY_THIS.value,
    prefix_length_strategy=CONF['prefix_length_strategy'],
    prefix_length=CONF['prefix_length'],
    padding=CONF['padding'],
    # feature encoding
    feature_encoding_type=EncodingType.SIMPLE.value,
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
)
# decode() changes full_df in place so the preview shows readable values.
encoder.decode(full_df)
full_df.head(10)

Unnamed: 0,trace_id,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True
1,173691,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,A_FINALIZED-COMPLETE,O_SELECTED-COMPLETE,True
2,173694,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,True
3,173715,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,False
4,173718,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,A_FINALIZED-COMPLETE,O_SELECTED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True
5,173721,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,False
6,173730,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,True
7,173736,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,True
8,173739,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,True
9,173745,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,W_Afhandelen leads-SCHEDULE,W_Afhandelen leads-START,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Afhandelen leads-COMPLETE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,False


## FREQUENCY ENCODING

In [21]:
# Switch the configured feature encoding to frequency encoding.
CONF['feature_selection'] = EncodingType.FREQUENCY.value

encoder, full_df_frequency = get_encoded_df(
    log=log,
    # Use the encoding selected above. This was hard-coded to
    # EncodingType.SIMPLE.value, which silently ignored the FREQUENCY
    # selection and produced a simple-index frame instead.
    feature_encoding_type=CONF['feature_selection'],
    prefix_length=CONF['prefix_length'],
    prefix_length_strategy=CONF['prefix_length_strategy'],
    time_encoding_type=CONF['time_encoding'],
    attribute_encoding=CONF['attribute_encoding'],
    padding=CONF['padding'],
    labeling_type=CONF['labeling_type'],
    task_generation_type=CONF['task_generation_type'],
    target_event=CONF['target_event'],
)
# decode() changes full_df_frequency in place.
encoder.decode(full_df_frequency)

In [7]:
# Preview the first 10 rows of the frequency-encoded frame.
full_df_frequency.head(10)

Unnamed: 0,trace_id,A_ACCEPTED-COMPLETE,A_FINALIZED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,A_SUBMITTED-COMPLETE,O_CREATED-COMPLETE,O_SELECTED-COMPLETE,O_SENT-COMPLETE,W_Afhandelen leads-COMPLETE,W_Afhandelen leads-SCHEDULE,W_Afhandelen leads-START,W_Beoordelen fraude-COMPLETE,W_Beoordelen fraude-SCHEDULE,W_Beoordelen fraude-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,label
0,173688,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,173688,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,173688,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,173688,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True
4,173688,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
5,173688,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
6,173688,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
7,173688,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
8,173688,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
9,173688,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True


## BINARY ENCODING

In [8]:
# Switch the configured feature encoding to binary encoding and re-encode the log.
CONF['feature_selection'] = EncodingType.BINARY.value

encoder, full_df_binary = get_encoded_df(
    log=log,
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
    # prefix generation
    task_generation_type=CONF['task_generation_type'],
    prefix_length_strategy=CONF['prefix_length_strategy'],
    prefix_length=CONF['prefix_length'],
    padding=CONF['padding'],
    # feature encoding
    feature_encoding_type=CONF['feature_selection'],
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
)
# decode() changes full_df_binary in place before it is previewed.
encoder.decode(full_df_binary)
full_df_binary.head(10)

column: A_ACCEPTED-COMPLETE considered number, top 5 values are: [False, False, False, False, False]
column: A_FINALIZED-COMPLETE considered number, top 5 values are: [False, False, False, False, False]
column: A_PARTLYSUBMITTED-COMPLETE considered number, top 5 values are: [False, True, True, True, True]
column: A_PREACCEPTED-COMPLETE considered number, top 5 values are: [False, False, True, True, True]
column: A_SUBMITTED-COMPLETE considered number, top 5 values are: [True, True, True, True, True]
column: O_CREATED-COMPLETE considered number, top 5 values are: [False, False, False, False, False]
column: O_SELECTED-COMPLETE considered number, top 5 values are: [False, False, False, False, False]
column: O_SENT-COMPLETE considered number, top 5 values are: [False, False, False, False, False]
column: W_Afhandelen leads-COMPLETE considered number, top 5 values are: [False, False, False, False, False]
column: W_Afhandelen leads-SCHEDULE considered number, top 5 values are: [False, False, 

Unnamed: 0,trace_id,A_ACCEPTED-COMPLETE,A_FINALIZED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,A_SUBMITTED-COMPLETE,O_CREATED-COMPLETE,O_SELECTED-COMPLETE,O_SENT-COMPLETE,W_Afhandelen leads-COMPLETE,W_Afhandelen leads-SCHEDULE,W_Afhandelen leads-START,W_Beoordelen fraude-COMPLETE,W_Beoordelen fraude-SCHEDULE,W_Beoordelen fraude-START,W_Completeren aanvraag-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,label
0,173688,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
1,173688,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
2,173688,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,True
3,173688,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,True
4,173688,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
5,173688,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
6,173688,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
7,173688,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
8,173688,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True
9,173688,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,True


## SIMPLE_INDEX ENCODING

In [9]:
# Switch the configured feature encoding to simple-index encoding and re-encode the log.
CONF['feature_selection'] = EncodingType.SIMPLE.value

encoder, full_df_simple = get_encoded_df(
    log=log,
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
    # prefix generation
    task_generation_type=CONF['task_generation_type'],
    prefix_length_strategy=CONF['prefix_length_strategy'],
    prefix_length=CONF['prefix_length'],
    padding=CONF['padding'],
    # feature encoding
    feature_encoding_type=CONF['feature_selection'],
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
)
# decode() changes full_df_simple in place before it is previewed.
encoder.decode(full_df_simple)
full_df_simple.head(10)

Unnamed: 0,trace_id,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,A_SUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,0,True
1,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,True
2,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,0,0,0,0,0,0,0,True
3,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,0,0,0,0,0,0,True
4,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,0,0,0,0,True
5,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,0,0,0,0,True
6,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,0,0,0,True
7,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,0,0,True
8,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,0,True
9,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True


In [10]:
# Simple-index encoding again: same settings and same result variable
# (full_df_simple) as the previous cell.
CONF['feature_selection'] = EncodingType.SIMPLE.value

encoder, full_df_simple = get_encoded_df(
    log=log,
    # feature encoding
    feature_encoding_type=CONF['feature_selection'],
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
    # prefix generation
    prefix_length=CONF['prefix_length'],
    prefix_length_strategy=CONF['prefix_length_strategy'],
    padding=CONF['padding'],
    task_generation_type=CONF['task_generation_type'],
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
)
# decode() changes full_df_simple in place before it is previewed.
encoder.decode(full_df_simple)
full_df_simple.head(10)

Unnamed: 0,trace_id,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,A_SUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,0,True
1,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,True
2,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,0,0,0,0,0,0,0,True
3,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,0,0,0,0,0,0,True
4,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,0,0,0,0,True
5,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,0,0,0,0,True
6,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,0,0,0,True
7,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,0,0,True
8,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,0,True
9,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True


## SIMPLE_TRACE_INDEX_ENCODING

In [11]:
# Switch the configured feature encoding to simple-trace encoding and re-encode the log.
CONF['feature_selection'] = EncodingType.SIMPLE_TRACE.value

encoder, full_df_simple_trace = get_encoded_df(
    log=log,
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
    # prefix generation
    task_generation_type=CONF['task_generation_type'],
    prefix_length_strategy=CONF['prefix_length_strategy'],
    prefix_length=CONF['prefix_length'],
    padding=CONF['padding'],
    # feature encoding
    feature_encoding_type=CONF['feature_selection'],
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
)
# decode() changes full_df_simple_trace in place before it is previewed.
encoder.decode(full_df_simple_trace)
full_df_simple_trace.head(10)

column: AMOUNT_REQ considered number, top 5 values are: [20000.0, 20000.0, 20000.0, 20000.0, 20000.0]


Unnamed: 0,trace_id,AMOUNT_REQ,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,20000.0,A_SUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,0,True
1,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,True
2,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,0,0,0,0,0,0,0,True
3,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,0,0,0,0,0,0,True
4,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,0,0,0,0,True
5,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,0,0,0,0,True
6,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,0,0,0,True
7,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,0,0,True
8,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,0,True
9,173688,20000.0,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True


## COMPLEX-INDEX ENCODING

In [12]:
# Switch the configured feature encoding to complex-index encoding and re-encode the log.
CONF['feature_selection'] = EncodingType.COMPLEX.value

encoder, full_df_complex = get_encoded_df(
    log=log,
    # labeling
    labeling_type=CONF['labeling_type'],
    target_event=CONF['target_event'],
    # prefix generation
    task_generation_type=CONF['task_generation_type'],
    prefix_length_strategy=CONF['prefix_length_strategy'],
    prefix_length=CONF['prefix_length'],
    padding=CONF['padding'],
    # feature encoding
    feature_encoding_type=CONF['feature_selection'],
    attribute_encoding=CONF['attribute_encoding'],
    time_encoding_type=CONF['time_encoding'],
)
# decode() changes full_df_complex in place; the frame is previewed in the next cell.
encoder.decode(full_df_complex)

DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`
DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, us

column: AMOUNT_REQ considered number, top 5 values are: [20000.0, 20000.0, 20000.0, 20000.0, 20000.0]
column: event_nr_1 considered number, top 5 values are: [1.0, 1.0, 1.0, 1.0, 1.0]
column: hour_1 considered number, top 5 values are: [1.0, 1.0, 1.0, 1.0, 1.0]
column: month_1 considered number, top 5 values are: [10.0, 10.0, 10.0, 10.0, 10.0]
column: open_cases_1 considered number, top 5 values are: [1.0, 1.0, 1.0, 1.0, 1.0]
column: time:timestamp_1 considered number, top 5 values are: [1317433124.546, 1317433124.546, 1317433124.546, 1317433124.546, 1317433124.546]
column: timesincecasestart_1 considered number, top 5 values are: [0.0, 0.0, 0.0, 0.0, 0.0]
column: timesincelastevent_1 considered number, top 5 values are: [0.0, 0.0, 0.0, 0.0, 0.0]
column: timesincemidnight_1 considered number, top 5 values are: [98.0, 98.0, 98.0, 98.0, 98.0]
column: weekday_1 considered number, top 5 values are: [5.0, 5.0, 5.0, 5.0, 5.0]
column: event_nr_2 considered number, top 5 values are: [0.0, 2.0,

In [13]:
# Preview the first 10 rows of the complex-index encoded frame.
full_df_complex.head(10)

Unnamed: 0,trace_id,AMOUNT_REQ,prefix_1,Resource_1,event_nr_1,hour_1,lifecycle:transition_1,month_1,open_cases_1,time:timestamp_1,...,hour_10,lifecycle:transition_10,month_10,open_cases_10,time:timestamp_10,timesincecasestart_10,timesincelastevent_10,timesincemidnight_10,weekday_10,label
0,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
1,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
2,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
3,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
4,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
5,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
6,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
7,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
8,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0.0,True
9,173688,20000.0,A_SUBMITTED-COMPLETE,112.0,1.0,1.0,COMPLETE,10.0,1.0,1317433000.0,...,12.0,COMPLETE,10.0,10.0,2011-10-01 12:45:11.380000+00:00,666.447233,0.00305,765.0,5.0,True


In [14]:
# Show every row of full_df belonging to trace '173688'
full_df[full_df['trace_id']=='173688']

Unnamed: 0,trace_id,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,A_SUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,0,True
1,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,True
2,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,0,0,0,0,0,0,0,True
3,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,0,0,0,0,0,0,True
4,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,0,0,0,0,True
5,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,0,0,0,0,True
6,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,0,0,0,True
7,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,0,0,True
8,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,0,True
9,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,A_ACCEPTED-COMPLETE,O_SELECTED-COMPLETE,A_FINALIZED-COMPLETE,O_CREATED-COMPLETE,O_SENT-COMPLETE,True


# Train/val/test splitting

# The **`shuffle`** parameter specifies whether to shuffle the dataset before splitting. By setting it to `False`, we ensure that the order of the dataset is maintained during splitting (`temporal split`).

In [15]:
# Unpack the train/validation/test proportions from the configuration
train_size, val_size, test_size = CONF['train_val_test_split']
# Split without shuffling so the row order of full_df is preserved (temporal split)
train_df, val_df, test_df = split_train_val_test(full_df, train_size, val_size, test_size, shuffle=False, seed=CONF['seed'])

In [16]:
# Preview the first rows of the training split
train_df.head()

Unnamed: 0,trace_id,prefix_1,prefix_2,prefix_3,prefix_4,prefix_5,prefix_6,prefix_7,prefix_8,prefix_9,prefix_10,label
0,173688,A_SUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,0,True
1,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,0,0,0,0,0,0,0,0,True
2,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,0,0,0,0,0,0,0,True
3,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,0,0,0,0,0,0,True
4,173688,A_SUBMITTED-COMPLETE,A_PARTLYSUBMITTED-COMPLETE,A_PREACCEPTED-COMPLETE,W_Completeren aanvraag-SCHEDULE,W_Completeren aanvraag-START,0,0,0,0,0,True


# Hyperparameter optimization

## The function `retrieve_best_model` performs hyperparameter optimization for the predictive models specified in the configuration dictionary (`CONF`). This is done using the `hyperopt` library to find the best hyperparameters for the models based on the specified target metric.
## A predefined optimization space is found for each model in nirdizati_light/hyperparameter_optimisation/common.py.
## Alternatively one can specify a custom optimization space for each model. (see run_full_pipeline.py for an example)

In [17]:
# Instantiate one PredictiveModel per entry of CONF['predictive_models'];
# every model receives the same train/validation/test splits and prefix length
predictive_models = [PredictiveModel(predictive_model, train_df, val_df, test_df, prefix_length=CONF['prefix_length']) for predictive_model in CONF['predictive_models']]

# Perform hyperparameter optimization over all instantiated models,
# using the evaluation budget and target metric configured in CONF
best_candidates, best_model_idx, best_model_model, best_model_config = retrieve_best_model(
    predictive_models,
    max_evaluations=CONF['hyperparameter_optimisation_evaluations'],
    target=CONF['hyperparameter_optimisation_target']
)

# Select the best model by index and attach the optimized model and configuration to it
best_model = predictive_models[best_model_idx]
best_model.model = best_model_model
best_model.config = best_model_config
print(f'Best model is {best_model.model_type}')

Running hyperparameter optimization on model DecisionTree...
100%|██████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 119.81trial/s, best loss: 0]


AllTrialsFailed: 

## Predictive Model Evaluation


### The evaluation depends on the type of model that is instantiated: classification models use traditional label-based metrics (e.g. Accuracy, F1-Score, Precision, Recall), while regression models use error-based metrics (Root Mean Squared Error (RMSE), Mean Absolute Error (MAE), etc.).

### Evaluation Metrics for Classification Models

- **F1 Score** (Macro average): Measures the balance between precision and recall.

- **Accuracy**: The proportion of correct predictions out of the total number of predictions.

- **Precision** (Macro average): The proportion of true positive predictions among all positive predictions.

- **Recall** (Macro average): The proportion of true positive predictions among all actual positives.
 
### Evaluation Metrics for Regression Models

- **RMSE**: Sensitive to large errors and penalizes larger errors more heavily, making it useful when large errors are undesirable.
- **MAE**: Simple to interpret and gives the average absolute error, which is easy to understand, but it doesn’t penalize large errors as much as RMSE.
- **R-squared**: Provides a measure of how well the model explains the variation in the data. Higher values indicate a better fit.
- **MAPE**: Gives the error as a percentage, which makes it easier to interpret in relation to the scale of the data.



In [None]:
print('Evaluating best model...')
# Obtain predictions and scores from the best model
# (presumably test=True selects the test split — TODO confirm)
predicted, scores = best_model.predict(test=True)
# Ground-truth labels of the test split
actual = test_df['label']

# Metrics for the best model only
initial_result = evaluate_classifier(actual, predicted, scores)
# Metrics for every instantiated model, followed by a comparison plot
results = evaluate_classifiers(predictive_models,actual)
plot_model_comparison(results)
print(f'Evaluation: {initial_result}')

# Explanation

In [None]:
# Keep only the test rows whose label equals the prediction and equals 0
test_df_correct = test_df[(test_df['label'] == predicted) & (test_df['label'] == 0)]
# NOTE(review): cf_dataset is built (and extended below) but not referenced
# again in this cell — confirm whether it is still needed
cf_dataset = pd.concat([train_df, val_df], ignore_index=True)
# Rebuild full_df from the three splits (this overwrites the earlier full_df)
full_df = pd.concat([train_df, val_df, test_df])
# Append one extra row filled with zeros at the end of cf_dataset
cf_dataset.loc[len(cf_dataset)] = 0

# Generate counterfactual explanations for the selected test instances.
# .iloc[:, 1:] drops the first column of each dataframe before passing it on.
# NOTE(review): the dataframe previews in this notebook name the timestamp
# column 'time:timestamp'; confirm that 'Complete Timestamp' matches the log.
cf_results = explain(CONF, best_model, encoder=encoder, df=full_df.iloc[:, 1:],
        query_instances=test_df_correct.iloc[:, 1:],
        method='genetic_conformance', optimization='baseline',
        heuristic='heuristic_2', support=0.95,
        timestamp_col_name='Complete Timestamp', # name of the timestamp column in the log
        model_path='./experiments/process_models/process_models',
        random_seed=CONF['seed'], adapted=True, filtering=False)

# Work on a copy of the counterfactuals generated for the first query instance
counterfactuals = cf_results.cf_examples_list[0].final_cfs_df.copy()

# The return values are not used, so decode presumably modifies its
# argument in place — TODO confirm
encoder.decode(counterfactuals)

encoder.decode(test_df_correct)

# Store the processed frames back on the first result entry;
# test_instance_df gets the first row of test_df_correct without its first column
cf_results.cf_examples_list[0].final_cfs_df = counterfactuals
cf_results.cf_examples_list[0].final_cfs_df_sparse = counterfactuals
cf_results.cf_examples_list[0].test_instance_df = test_df_correct.iloc[:1,1:].copy()


In [None]:
# Display the counterfactual results in dataframe form
print(cf_results.visualize_as_dataframe())

In [None]:
# Display the counterfactual results again, this time with show_only_changes=True
print(cf_results.visualize_as_dataframe(show_only_changes=True))

In [None]:
import shap
# Switch the configured explainer to SHAP
CONF['explanator'] = ExplainerType.SHAP.value
# Explain the trace identified by the first cell (row 0, column 0) of test_df_correct
exp = explain(CONF, best_model, encoder=encoder, test_df=test_df,target_trace=test_df_correct.iloc[0,0])

# Bar plot of the first element of the returned explanation
shap.plots.bar(exp[0])

# Available documentation can be found at: https://rgraziosi-fbk.github.io/nirdizati-light/nirdizati_light.html

# EXERCISES

## EXERCISE 1

- Objective: Evaluate the performance of different predictive models on the same event log.
Steps:
    - Load a given labelled event log into Nirdizati Light.
    - Select at least three different models (e.g., Random Forest, Gradient Boosting, Neural Network).
    - Train and evaluate the models using default hyperparameters.
    - Compare the models using accuracy metrics (e.g., F1-score, precision, recall) and register time metrics (e.g., training time).

Questions:
- Which model performs best in terms of accuracy?
- Which model is the most efficient in terms of training and prediction speed?
- How would you interpret the trade-offs between accuracy and speed?


In [None]:
### YOUR CODE HERE
prefix_lengths = [] #val1,val2,val3

# Collects one single-row DataFrame of test metrics per evaluated prefix length
res = []
for prefix in prefix_lengths:
    CONF['prefix_length'] = prefix
    # Re-encode the log for the current prefix length
    encoder, full_df = get_encoded_df(
        log=log,
        feature_encoding_type=CONF['feature_selection'],
        prefix_length=CONF['prefix_length'],
        prefix_length_strategy=CONF['prefix_length_strategy'],
        time_encoding_type=CONF['time_encoding'],
        attribute_encoding=CONF['attribute_encoding'],
        padding=CONF['padding'],
        labeling_type=CONF['labeling_type'],
        task_generation_type=CONF['task_generation_type'],
        target_event=CONF['target_event'],
    )
    # Temporal split: shuffle=False keeps the row order of full_df
    train_size, val_size, test_size = CONF['train_val_test_split']
    train_df, val_df, test_df = split_train_val_test(full_df, train_size, val_size, test_size, shuffle=False, seed=CONF['seed'])
    # Instantiate predictive models
    predictive_models = [PredictiveModel(predictive_model, train_df, val_df, test_df, prefix_length=CONF['prefix_length']) for predictive_model in CONF['predictive_models']]

    # Perform hyperparameter optimization
    best_candidates, best_model_idx, best_model_model, best_model_config = retrieve_best_model(
        predictive_models,
        max_evaluations=CONF['hyperparameter_optimisation_evaluations'],
        target=CONF['hyperparameter_optimisation_target']
    )

    # Select the best model and attach its optimized model and configuration
    best_model = predictive_models[best_model_idx]
    best_model.model = best_model_model
    best_model.config = best_model_config
    print(f'Best model is {best_model.model_type}')
    print('Evaluating best model...')
    predicted, scores = best_model.predict(test=True)
    actual = test_df['label']

    initial_result = evaluate_classifier(actual, predicted, scores)
    results = evaluate_classifiers(predictive_models,actual)
    # Optional: uncomment to plot the per-model comparison for this prefix length
    # plot_model_comparison(results)
    print(f'Evaluation: {initial_result}')
    # Tag the metrics with the prefix length they were obtained for
    initial_result['prefix'] = prefix
    pred_model_results = pd.DataFrame([initial_result])
    res.append(pred_model_results)
# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame while prefix_lengths has not been filled in yet.
# ignore_index=True gives the combined frame a clean 0..n-1 index.
res_df = pd.concat(res, ignore_index=True) if res else pd.DataFrame()

In [None]:
# PLOT THE RESULTS BASED ON THE PREFIX USING PANDAS PLOTTING
### YOUR CODE HERE ###


## Exercise 2: Experimenting with Encoding Techniques
### Objective: 
 - Understand how different prefix encoding methods influence model performance.
### Steps:
- Use the same event log and prediction task as in Exercise 1.
- Based on the results from Exercise 1, use the prefix length that returned the best model performance results.
- Experiment with at least three encoding methods (simple-index encoding, complex-index encoding, frequency, binary).
- Train the same predictive model (Decision Tree model) with each encoding method.
- Compare the results in terms of accuracy.
### Questions:
- Which encoding technique yields the highest accuracy? Why?
- What is the prefix encoding technique you would recommend for this task, and why?


In [None]:
# Display the current configuration dictionary
CONF

In [None]:
### YOUR CODE HERE
encoding_methods = [] #encoding 1, encoding 2, encoding 3 (see Notebook above)

# Collects one single-row DataFrame of test metrics per evaluated encoding
res = []
# Iterate over the encodings under comparison (not over prefix lengths)
for encoding in encoding_methods:
    CONF['feature_selection'] = ...  # YOUR CODE HERE
    # NOTE: the models below read CONF['prefix_length']; keep it equal to the
    # prefix length passed to get_encoded_df here
    encoder, full_df = get_encoded_df(
        log=log,
        feature_encoding_type=CONF['feature_selection'],
        prefix_length=...,  # Replace 3 dots with previous prefix length
        prefix_length_strategy=CONF['prefix_length_strategy'],
        time_encoding_type=CONF['time_encoding'],
        attribute_encoding=CONF['attribute_encoding'],
        padding=CONF['padding'],
        labeling_type=CONF['labeling_type'],
        task_generation_type=CONF['task_generation_type'],
        target_event=CONF['target_event'],
    )
    # Temporal split: shuffle=False keeps the row order of full_df
    train_size, val_size, test_size = CONF['train_val_test_split']
    train_df, val_df, test_df = split_train_val_test(full_df, train_size, val_size, test_size, shuffle=False, seed=CONF['seed'])
    # Instantiate predictive models
    predictive_models = [PredictiveModel(predictive_model, train_df, val_df, test_df, prefix_length=CONF['prefix_length']) for predictive_model in CONF['predictive_models']]

    # Perform hyperparameter optimization
    best_candidates, best_model_idx, best_model_model, best_model_config = retrieve_best_model(
        predictive_models,
        max_evaluations=CONF['hyperparameter_optimisation_evaluations'],
        target=CONF['hyperparameter_optimisation_target']
    )

    # Select the best model and attach its optimized model and configuration
    best_model = predictive_models[best_model_idx]
    best_model.model = best_model_model
    best_model.config = best_model_config
    print(f'Best model is {best_model.model_type}')
    print('Evaluating best model...')
    predicted, scores = best_model.predict(test=True)
    actual = test_df['label']

    initial_result = evaluate_classifier(actual, predicted, scores)
    results = evaluate_classifiers(predictive_models,actual)
    # Optional: uncomment to plot the per-model comparison for this encoding
    # plot_model_comparison(results)
    print(f'Evaluation: {initial_result}')
    initial_result['encoding'] = ...  # Update line here for encoding
    pred_model_results = pd.DataFrame([initial_result])
    res.append(pred_model_results)
# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame while encoding_methods has not been filled in yet.
# ignore_index=True gives the combined frame a clean 0..n-1 index.
res_df = pd.concat(res, ignore_index=True) if res else pd.DataFrame()

## Exercise 3: Exploring Predictive Models
### Objective: Evaluate the performance of different predictive models on the same event log.
### Steps:
- Use the best prefix length and prefix encoding combination on the same event log.
- Select at least three different predictive models (e.g., Random Forest, Gradient Boosting, Neural Network).
- Train and evaluate the models using the default hyperparameters.
- Compare the models using accuracy metrics (e.g., F1-score, precision, recall) and register time metrics (e.g., hyperparameter optimisation time).
### Questions:
- Which predictive model performs best in terms of accuracy metrics?
- Which predictive model is the most efficient in terms of optimisation time?
- How would you interpret the trade-offs between accuracy and time metrics?

In [None]:
# Set a seed for reproducibility (Python's random module and NumPy)
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Configuration for Exercise 3; only 'predictive_models' is left to fill in

# Configuration dictionary for nirdizati light
CONF = {
    # path to the event log to use; os.path.join with a single argument
    # returns it unchanged, so this is a bare file name
    'data': os.path.join('bpic2012_O_ACCEPTED-COMPLETE_trunc.xes'),
    # train-validation-test set split percentages
    'train_val_test_split': [0.7, 0.1, 0.2],

    # path to output folder
    'output': 'output_data',

    # prefix length strategy and the prefix length it applies to
    'prefix_length_strategy': PrefixLengthStrategy.FIXED.value,
    'prefix_length': 10,
    # whether to use padding or not in encoding
    'padding': True,
    # which encoding to use
    'feature_selection': EncodingType.SIMPLE.value,
    # which attribute encoding to use
    'attribute_encoding': EncodingTypeAttribute.LABEL.value,
    # which time encoding to use
    'time_encoding': TimeEncodingType.NONE.value,

    # the label to be predicted (e.g. outcome, next activity)
    'labeling_type': LabelTypes.ATTRIBUTE_STRING.value,
    # whether the model should be trained on the specified prefix length (ONLY_THIS) or to every prefix in range [1, prefix_length] (ALL_IN_ONE)
    'task_generation_type': TaskGenerationType.ALL_IN_ONE.value,

    # list of predictive models and their respective hyperparameter optimization space
    # if it is None, then the default hyperopt space will be used; otherwise, the provided space will be used
    'predictive_models': [
      ### YOUR CODE HERE ###
    ],

    # which metric to optimize hyperparameters for
    'hyperparameter_optimisation_target': HyperoptTarget.F1.value,
    # number of hyperparameter configurations to try
    'hyperparameter_optimisation_evaluations': 10,

    # explainability method to use
    'explanator': ExplainerType.DICE.value,

    # no target event is configured
    'target_event': None,
    # seed passed on to the pipeline steps that accept one
    'seed': SEED,
}

In [None]:
### STARTING POINT
# Encode the log with the settings from CONF
encoder, full_df = get_encoded_df(
  log=log,
  feature_encoding_type = CONF['feature_selection'],
  prefix_length= CONF['prefix_length'], # taken from CONF; set CONF['prefix_length'] to the best prefix length found earlier
  prefix_length_strategy=CONF['prefix_length_strategy'],
  time_encoding_type=CONF['time_encoding'],
  attribute_encoding=CONF['attribute_encoding'],
  padding=CONF['padding'],
  labeling_type=CONF['labeling_type'],
  task_generation_type=CONF['task_generation_type'],
  target_event=CONF['target_event'],)
# Temporal split: shuffle=False keeps the row order of full_df
train_size, val_size, test_size = CONF['train_val_test_split']
train_df, val_df, test_df = split_train_val_test(full_df, train_size, val_size, test_size, shuffle=False, seed=CONF['seed'])
### YOUR CODE HERE###