# Training Pipeline

Complete pipeline for data preparation and LSTM model training for all decision points.


In [1]:
import sys
import os
import warnings

# Put the parent of the current working directory on the import path.
# Presumably this is what makes the project-local packages imported below
# (preprocessing, models, storage) resolvable — it must run before them.
sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split

from pm4py.objects.bpmn.importer import importer as bpmn_importer
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

from preprocessing import DecisionPointExtractor, TrainingDataGenerator, LifecycleFilter
from models import SequenceEncoder, ContextEncoder, LSTMPredictor
from storage import ModelPersistence

# Turn off pandas' chained-assignment (SettingWithCopy) warning.
pd.options.mode.chained_assignment = None
# NOTE(review): this silences *every* warning for the whole session, including
# deprecation and numerical warnings from the libraries above — consider narrowing.
warnings.filterwarnings("ignore")

## 1. Configuration


In [None]:
# Input/output locations. Everything is resolved relative to the current working
# directory, so the notebook has to be started from its own folder.
BPMN_PATH = os.path.join(os.getcwd(), "..", "..", "..", "process_model", "loan_application.bpmn")
# Path components carry no separator of their own: os.path.join inserts the
# platform-specific one (a literal "eventlog/" yields mixed separators on Windows).
XES_PATH = os.path.join(os.getcwd(), "..", "..", "..", "eventlog", "eventlog.xes.gz")
MODELS_DIR = os.path.join(os.getcwd(), "..", "models_lstm_new")
DATA_DIR = os.path.join(os.getcwd(), "..", "data", "processed")

# Case-level attributes handed to ContextEncoder and stored next to every trained model.
CONTEXT_KEYS = ["case:LoanGoal", "case:ApplicationType", "case:RequestedAmount"]
# Forwarded to TrainingDataGenerator as max_history / min_seq_count / min_class_count.
MAX_HISTORY = 15
MIN_SEQ_COUNT = 20
MIN_CLASS_COUNT = 10
# Seed used for both train_test_split calls so the data splits are repeatable.
RANDOM_STATE = 42


## 2. Load Data


In [3]:
# Read the BPMN process model from BPMN_PATH via the pm4py BPMN importer.
print(f"Loading BPMN: {BPMN_PATH}")
bpmn_model = bpmn_importer.apply(BPMN_PATH)

# Read the XES event log, then convert it to a DataFrame
# (the TO_DATA_FRAME variant of the pm4py log converter).
print(f"Loading event log: {XES_PATH}")
event_log = xes_importer.apply(XES_PATH)
df_log = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)

# One row per event; cases are identified by the 'case:concept:name' column.
print(f"Loaded {len(df_log)} events from {df_log['case:concept:name'].nunique()} cases")


Loading BPMN: d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\..\..\process_model\loan_application.bpmn
Loading event log: d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\..\..\Dataset\BPI Challenge 2017.xes


parsing log, completed traces :: 100%|██████████| 31509/31509 [00:33<00:00, 932.05it/s] 


Loaded 1202267 events from 31509 cases


## 2.1 Apply Lifecycle Filtering

Filter to keep only 'start' and 'complete' lifecycle transitions, then collapse into activity instances with computed processing times.

In [None]:
# Wrap the raw event log in the project's lifecycle filter
lf = LifecycleFilter(df_log)

# Report the lifecycle distribution of the unfiltered log
print("Lifecycle distribution:")
print(lf.get_lifecycle_distribution(), end="\n\n")

# Per-activity lifecycle overview (e.g. which activities have start events)
activity_summary = lf.get_activity_lifecycle_summary()
print("Activity lifecycle summary:")
print(activity_summary.to_string(), end="\n\n")

# Run the filter; the result is the activity-instance frame used from here on
df_filtered = lf.transform()

# Summarise processing times of the filtered activity instances
print("\nProcessing time statistics per activity:")
pt_stats = lf.get_processing_time_stats(df_filtered)
print(pt_stats.to_string(), end="\n\n")

# Row counts before and after the transformation
print(f"\nFiltered from {len(df_log)} to {len(df_filtered)} activity instances")

## 3. Extract Decision Points


In [4]:
# Derive the decision points from the BPMN model; dp_map maps each decision
# point name to a config holding (at least) 'incoming' and 'outgoing' entries.
extractor = DecisionPointExtractor(bpmn_model)
dp_map = extractor.extract()

print(f"Found {len(dp_map)} decision points")

# Show only the first three entries as a quick sanity check.
preview = list(dp_map.items())[:3]
for dp, config in preview:
    print(f"  {dp}: {len(config['incoming'])} incoming, {len(config['outgoing'])} outgoing")


Found 49 decision points
  DP 1: 1 incoming, 5 outgoing
  DP 2: 2 incoming, 1 outgoing
  DP 3: 1 incoming, 5 outgoing


## 4. Prepare Event Log


In [None]:
# Build the modelling frame from the lifecycle-filtered DataFrame
# (df_filtered already has a processing_time column).
columns = [
    "case:concept:name", "concept:name", "org:resource",
    "time:timestamp", "processing_time",
    "case:LoanGoal", "case:ApplicationType", "case:RequestedAmount"
]

# Keep only the requested columns that df_filtered actually provides.
available_columns = [c for c in columns if c in df_filtered.columns]
df = df_filtered[available_columns].copy()

# Ensure timestamp is datetime (may already be from lifecycle filter)
if not pd.api.types.is_datetime64_any_dtype(df["time:timestamp"]):
    df["time:timestamp"] = pd.to_datetime(df["time:timestamp"])

# Canonical ordering used throughout: by case, then chronologically within the case.
case_time_order = ["case:concept:name", "time:timestamp"]
df = df.sort_values(case_time_order).reset_index(drop=True)

# Add one synthetic "End" marker per case, placed one second after its last event.
# df is already ordered by case and timestamp, so the last row of every group is
# the case's final event. Re-sorting the whole frame on the timestamp alone (as
# was done before) is redundant, and the default single-column sort is not
# stable, which made the chosen "last" row arbitrary when timestamps tie.
end_events = df.groupby("case:concept:name").tail(1).copy()
end_events["time:timestamp"] = end_events["time:timestamp"] + pd.Timedelta(seconds=1)
end_events["concept:name"] = "End"
end_events["processing_time"] = 0.0  # End is instant

# Append the markers and restore the canonical ordering.
df = pd.concat([df, end_events], ignore_index=True)
df = df.sort_values(case_time_order).reset_index(drop=True)

print(f"Prepared {len(df)} activity instances with End markers")
print("Using actual processing times from lifecycle-filtered data")

## 5. Generate Training Data


In [6]:
dp_datasets = {}

# Keyword settings shared by every per-decision-point generator.
generator_options = {
    "max_history": MAX_HISTORY,
    "min_seq_count": MIN_SEQ_COUNT,
    "min_class_count": MIN_CLASS_COUNT,
}

# One generator run per decision point, each given a single-entry mapping.
for dp_name, dp_config in dp_map.items():
    generator = TrainingDataGenerator(df, {dp_name: dp_config}, **generator_options)
    result = generator.generate()

    # A decision point missing from the result produced no usable samples.
    if dp_name not in result:
        print(f"{dp_name}: no valid samples")
        continue

    dp_datasets[dp_name] = result[dp_name]
    print(f"{dp_name}: {len(result[dp_name])} samples, {result[dp_name]['label'].nunique()} classes")

print(f"\nGenerated datasets for {len(dp_datasets)} decision points")


DP 1: 31509 samples, 3 classes
DP 2: no valid samples
DP 3: 47079 samples, 2 classes
DP 4: 31441 samples, 2 classes
DP 5: 31441 samples, 2 classes
DP 6: no valid samples
DP 7: 113969 samples, 4 classes
DP 8: 145410 samples, 4 classes
DP 9: 179311 samples, 4 classes
DP 10: 179311 samples, 4 classes
DP 11: 179256 samples, 3 classes
DP 12: 179256 samples, 3 classes
DP 13: 178967 samples, 2 classes
DP 14: 36213 samples, 2 classes
DP 15: 36213 samples, 2 classes
DP 16: 33084 samples, 6 classes
DP 17: 33084 samples, 6 classes
DP 18: no valid samples
DP 19: 179857 samples, 7 classes
DP 20: 179857 samples, 7 classes
DP 21: 179786 samples, 6 classes
DP 22: 179786 samples, 6 classes
DP 23: 70745 samples, 7 classes
DP 24: no valid samples
DP 25: 5469 samples, 5 classes
DP 26: 73711 samples, 7 classes
DP 27: 426576 samples, 8 classes
DP 28: 426576 samples, 8 classes
DP 29: 426576 samples, 8 classes
DP 30: 426576 samples, 8 classes
DP 31: 185464 samples, 8 classes
DP 32: 426576 samples, 8 classes
D

## 6. Split Data


In [7]:
splits = {}


def _stratify_on(frame):
    """Return the 'label' column to stratify on, or None when only one class is present."""
    labels = frame["label"]
    return labels if labels.nunique() > 1 else None


for dp, data in dp_datasets.items():
    # The raw timestamp sequences are not carried into the splits.
    data = data.drop(columns=["sequence_timestamps"], errors="ignore")

    # Step 1: set aside 10% of the samples as holdout.
    temp, holdout = train_test_split(
        data,
        test_size=0.1,
        random_state=RANDOM_STATE,
        stratify=_stratify_on(data),
    )
    # Step 2: 2/9 of the remaining 90% becomes test, i.e. 20% of the total,
    # which leaves 70% for training.
    train, test = train_test_split(
        temp,
        test_size=2/9,
        random_state=RANDOM_STATE,
        stratify=_stratify_on(temp),
    )

    splits[dp] = {
        part_name: part.reset_index(drop=True)
        for part_name, part in (("train", train), ("test", test), ("holdout", holdout))
    }
    print(f"{dp}: train={len(train)}, test={len(test)}, holdout={len(holdout)}")

# Persist all splits in one joblib file under DATA_DIR.
os.makedirs(DATA_DIR, exist_ok=True)
splits_path = os.path.join(DATA_DIR, "dp_split_datasets_full_simple.joblib")
joblib.dump(splits, splits_path)
print(f"\nSaved splits to {DATA_DIR}")


DP 1: train=22056, test=6302, holdout=3151
DP 3: train=32955, test=9416, holdout=4708
DP 4: train=22008, test=6288, holdout=3145
DP 5: train=22008, test=6288, holdout=3145
DP 7: train=79778, test=22794, holdout=11397
DP 8: train=101787, test=29082, holdout=14541
DP 9: train=125517, test=35862, holdout=17932
DP 10: train=125517, test=35862, holdout=17932
DP 11: train=125478, test=35852, holdout=17926
DP 12: train=125478, test=35852, holdout=17926
DP 13: train=125276, test=35794, holdout=17897
DP 14: train=25348, test=7243, holdout=3622
DP 15: train=25348, test=7243, holdout=3622
DP 16: train=23158, test=6617, holdout=3309
DP 17: train=23158, test=6617, holdout=3309
DP 19: train=125899, test=35972, holdout=17986
DP 20: train=125899, test=35972, holdout=17986
DP 21: train=125849, test=35958, holdout=17979
DP 22: train=125849, test=35958, holdout=17979
DP 23: train=49521, test=14149, holdout=7075
DP 25: train=3828, test=1094, holdout=547
DP 26: train=51597, test=14742, holdout=7372
DP 27: 

## 7. Train Models


In [8]:
trained_models = {}

for dp, split in splits.items():
    print(f"\n=== Training {dp} ===")

    # Stack train+test first and holdout last, so a single offset (n_pool)
    # separates the rows used for fitting from the holdout rows.
    df_pool = pd.concat([split["train"], split["test"]], ignore_index=True)
    n_pool = len(df_pool)
    df_all = pd.concat([df_pool, split["holdout"]], ignore_index=True)

    # NOTE(review): both encoders are fitted on df_all, which includes the
    # holdout rows — confirm this is intended and not a source of leakage.
    seq_encoder = SequenceEncoder()
    X_acts, X_durs, X_res, y = seq_encoder.fit_transform(df_all)
    seq_encoder.add_unknown_token()

    ctx_encoder = ContextEncoder(CONTEXT_KEYS)
    X_ctx = ctx_encoder.fit_transform(df_all)

    # Cut every encoded array at n_pool. The "*_test" halves are the holdout
    # rows (not split["test"]) and are not used again within this cell.
    (
        (X_acts_train, X_acts_test),
        (X_durs_train, X_durs_test),
        (X_res_train, X_res_test),
        (X_ctx_train, X_ctx_test),
        (y_train, y_test),
    ) = [(arr[:n_pool], arr[n_pool:]) for arr in (X_acts, X_durs, X_res, X_ctx, y)]

    # Model dimensions are taken from the fitted encoders.
    model_dims = {
        "num_activities": seq_encoder.num_activities,
        "num_resources": seq_encoder.num_resources,
        "context_dim": ctx_encoder.dim,
        "max_seq_len": seq_encoder.max_seq_len,
        "num_classes": seq_encoder.num_classes,
    }
    predictor = LSTMPredictor(**model_dims)
    predictor.build()

    train_inputs = [X_acts_train, X_durs_train, X_res_train, X_ctx_train]
    predictor.train(train_inputs, y_train)

    # Keep the model together with the encoders and settings stored alongside it.
    trained_models[dp] = {
        "model": predictor.model,
        "activity_encoder": seq_encoder.activity_encoder,
        "resource_encoder": seq_encoder.resource_encoder,
        "label_encoder": seq_encoder.label_encoder,
        "context_encoders": ctx_encoder.encoders,
        "context_keys": CONTEXT_KEYS,
        "max_seq_len": seq_encoder.max_seq_len
    }

print(f"\nTrained {len(trained_models)} models")



=== Training DP 1 ===
Epoch 1/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8836 - loss: 0.2821 - val_accuracy: 0.9982 - val_loss: 0.0775
Epoch 2/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.9995 - loss: 0.0353 - val_accuracy: 1.0000 - val_loss: 0.0135
Epoch 3/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0070 - val_accuracy: 1.0000 - val_loss: 0.0035
Epoch 4/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0022 - val_accuracy: 1.0000 - val_loss: 0.0014
Epoch 5/10
[1m200/200[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 0.0010 - val_accuracy: 1.0000 - val_loss: 7.6724e-04
Epoch 5: early stopping
Restoring model weights from the end of the best epoch: 2.

=== Training DP 3 ===
Epoch 1/10
[1m298/298[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

## 8. Save Models


In [9]:
# Hand every trained model bundle (model, encoders and settings per decision
# point) to the project's persistence helper, targeting MODELS_DIR.
ModelPersistence.save_all(trained_models, MODELS_DIR)
print(f"Saved all models to {MODELS_DIR}")


Saved all models to d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\models_lstm_new
