<a href="https://colab.research.google.com/github/Altemir1/Process-Mining/blob/main/Predictive_Business_Process_Monitoring.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

What do you have to consider when you split an event log into train and test sets for training and
testing a prediction model?

1. **Data Handling Perspective**: Each trace should remain intact. Traces must be divided into either training or testing sets, not split in the middle. This ensures we respect trace independence and avoid data leakage.

2. **Deployment Perspective**: In production, predictions are made from partial traces (prefixes), not full ones. Therefore, during training, we simulate this by generating prefixes of varying lengths and labeling them with the trace outcome (OK or NOK). This mirrors how the model will be used in real scenarios.

Loading train event logs

In [None]:
def load_traces_with_labels(filepath):
    """Read a labelled event log from a whitespace-separated text file.

    Each usable line holds one trace: the activity names in order, followed
    by the outcome label as the last token (e.g. ``A C D J H OK``). Lines
    with fewer than two tokens (blank lines, or a lone token with no event
    in front of it) are skipped.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A ``(traces, labels)`` tuple of two equally long lists. ``traces[i]``
        is the list of activity names of the i-th kept line and ``labels[i]``
        is that line's last token.
    """
    traces = []
    labels = []

    # Explicit encoding so parsing does not depend on the platform's locale default.
    with open(filepath, "r", encoding="utf-8") as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) < 2:
                continue  # Need at least one event plus the label
            *trace, label = parts  # All but last = trace, last = label
            traces.append(trace)
            labels.append(label)

    return traces, labels

# Path of the training log; adjust it to wherever train.txt was placed
train_file = "/content/train.txt"
traces, labels = load_traces_with_labels(train_file)

# Sanity check: print the first five traces next to their labels
for trace, label in zip(traces[:5], labels[:5]):
    print(f"{trace} -> {label}")

['A', 'C', 'D', 'C', 'D', 'J', 'H'] -> OK
['A', 'C', 'D', 'J', 'H', 'B'] -> NOK
['A', 'B', 'B', 'B', 'J', 'H', 'B', 'J', 'H'] -> OK
['A', 'C', 'D', 'C', 'D', 'B', 'B', 'J', 'H', 'C', 'D', 'B', 'E', 'F', 'G', 'I', 'C', 'D'] -> NOK
['A', 'B', 'E', 'F', 'G', 'F', 'I', 'B', 'B', 'E', 'F', 'G', 'F', 'F', 'F', 'F', 'I'] -> OK


Generation of prefixes

In [None]:
def generate_prefixes(traces, labels, min_prefix_length=1):
    """Expand every trace into its prefixes, each tagged with the trace's label.

    For a trace of length n, the prefixes of length ``min_prefix_length`` up
    to and including n are emitted in increasing length, so the full trace
    is the last prefix of its group. Traces shorter than
    ``min_prefix_length`` contribute nothing.

    Args:
        traces: Sequence of traces (each a sliceable sequence of events).
        labels: Sequence of labels, paired with ``traces`` by position.
        min_prefix_length: Length of the shortest prefix to emit.

    Returns:
        A ``(prefix_traces, prefix_labels)`` tuple of two equally long lists.
    """
    prefix_traces = []
    prefix_labels = []

    for events, outcome in zip(traces, labels):
        cut_points = range(min_prefix_length, len(events) + 1)
        prefix_traces.extend(events[:cut] for cut in cut_points)
        # One copy of the trace-level label per emitted prefix
        prefix_labels.extend([outcome] * len(cut_points))

    return prefix_traces, prefix_labels

# Expand the training traces into labelled prefixes (default minimum length of 1)
prefix_traces, prefix_labels = generate_prefixes(traces, labels)

# Show the first five prefixes with the label inherited from their trace
for t, l in zip(prefix_traces[:5], prefix_labels[:5]):
    print(f"{t} -> {l}")

['A'] -> OK
['A', 'C'] -> OK
['A', 'C', 'D'] -> OK
['A', 'C', 'D', 'C'] -> OK
['A', 'C', 'D', 'C', 'D'] -> OK


Vectorize prefixes

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert each prefix list into a space-joined string
def flatten_traces(traces):
    """Turn each trace into one string with its events separated by single spaces.

    The output has one entry per input trace, in the same order; an empty
    trace becomes the empty string.
    """
    return list(map(' '.join, traces))

# TF-IDF over single activities and adjacent activity pairs. The custom
# token_pattern keeps one-character tokens such as "A" as features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    token_pattern=r"(?u)\b\w+\b",
)

# One text document per training prefix, then fit the vocabulary and encode
X_train_text = flatten_traces(prefix_traces)
X_train = vectorizer.fit_transform(X_train_text)

Check vectorization results

In [None]:
# First line: matrix shape (rows = prefixes, columns = n-gram features).
# Second line: the first ten feature names of the fitted vocabulary.
print(X_train.shape, vectorizer.get_feature_names_out()[:10], sep="\n")

(1244, 33)
['a' 'a b' 'a c' 'b' 'b b' 'b c' 'b e' 'b j' 'c' 'c d']


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

from sklearn.model_selection import GroupShuffleSplit

# Binary target: 1 when the trace outcome is 'OK', 0 otherwise
y_train = [1 if label == 'OK' else 0 for label in prefix_labels]

# All prefixes of one trace overlap heavily, so they must stay on the same side
# of the split. A row-wise random split would put parts of the same trace into
# both sets and inflate the validation accuracy.
# generate_prefixes (default minimum length 1) emits len(trace) consecutive
# rows per trace, so the owning trace of every row can be rebuilt here.
trace_ids = [idx for idx, trace in enumerate(traces) for _ in trace]
if len(trace_ids) != X_train.shape[0]:
    raise ValueError(
        "Cannot map prefixes back to traces: expected one row per prefix of "
        "every trace (generate_prefixes with min_prefix_length=1)."
    )

# Hold out 20% of the traces (not 20% of the rows) for validation
splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
tr_idx, val_idx = next(splitter.split(X_train, y_train, groups=trace_ids))
X_tr, X_val = X_train[tr_idx], X_train[val_idx]
y_tr = [y_train[i] for i in tr_idx]
y_val = [y_train[i] for i in val_idx]

# Train XGBoost. use_label_encoder is dropped because the installed version
# reports it as unused.
model = XGBClassifier(eval_metric='logloss', random_state=42)
model.fit(X_tr, y_tr)
y_pred_xgb = model.predict(X_val)
acc = accuracy_score(y_val, y_pred_xgb)
print("XGBoost Validation Accuracy:", acc)

Parameters: { "use_label_encoder" } are not used.



XGBoost Validation Accuracy: 0.7630522088353414


Evaluation on dev dataset

In [None]:
from sklearn.metrics import accuracy_score

# Load dev.txt with labels
def load_labeled_traces(filepath):
    """Parse a text file of whitespace-separated traces ending in a label.

    On every line the last token is taken as the label and all tokens before
    it as the trace. Lines with fewer than two tokens are ignored.

    Returns:
        ``(traces, labels)``: two lists of equal length, in file order.
    """
    records = []
    with open(filepath, "r") as file:
        for raw_line in file:
            tokens = raw_line.strip().split()
            if len(tokens) >= 2:
                records.append((tokens[:-1], tokens[-1]))

    traces = [events for events, _ in records]
    labels = [outcome for _, outcome in records]
    return traces, labels

# Flatten traces into text format for vectorization
def flatten_traces(traces):
    """Join each trace's activity names into one space-separated string.

    Every input trace yields exactly one output string, in order, so the
    result stays aligned row-for-row with a parallel list of labels. An
    empty trace becomes the empty string rather than being dropped, because
    dropping it would shift every later row against its label.
    """
    return [' '.join(trace) for trace in traces]

# Read the dev log and derive the binary ground truth (1 = 'OK', 0 = otherwise)
dev_traces, dev_labels = load_labeled_traces("dev.txt")
y_true = [int(label == 'OK') for label in dev_labels]

# Encode the complete dev traces with the vectorizer that was fitted on the training data
dev_text = flatten_traces(dev_traces)
X_dev = vectorizer.transform(dev_text)

# Score the model trained above; traces are evaluated whole, not as prefixes
y_pred = model.predict(X_dev)
accuracy = accuracy_score(y_true, y_pred)

print("Dev Set Accuracy (No Prefix Length):", accuracy)

Dev Set Accuracy (No Prefix Length): 0.6666666666666666


Training on full data

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

# 1. Load train.txt and dev.txt
def load_labeled_traces(filepath):
    """Load traces and their labels from a whitespace-separated text file.

    The final token of a line is its label; everything before it is the
    trace. Lines that do not contain at least one event plus a label are
    skipped.

    Returns:
        A pair ``(traces, labels)`` of lists with matching positions.
    """
    traces, labels = [], []
    with open(filepath, "r") as file:
        # str.split() without arguments already ignores surrounding whitespace
        for tokens in map(str.split, file):
            if len(tokens) < 2:
                continue
            traces.append(tokens[:-1])
            labels.append(tokens[-1])
    return traces, labels

# Read both labelled logs
train_traces, train_labels = load_labeled_traces("train.txt")
dev_traces, dev_labels = load_labeled_traces("dev.txt")

# Pool them (train first, then dev) so the final model sees all labelled traces
all_traces = [*train_traces, *dev_traces]
all_labels = [*train_labels, *dev_labels]

# 2. Flatten traces
def flatten_traces(traces):
    """Join each trace's activity names into one space-separated string.

    Exactly one string is produced per input trace, in order. An empty trace
    maps to the empty string instead of being filtered out, so the output
    always has the same length as the input and stays aligned with any
    parallel list, such as the labels or the lines of an input file.
    """
    return [' '.join(trace) for trace in traces]

# The test file holds prefixes, so the final model must be fitted on prefixes
# as well (as the earlier model was), not on complete traces only.
# Each prefix inherits the label of the trace it was cut from.
full_prefix_traces, full_prefix_labels = generate_prefixes(all_traces, all_labels)

X_text = flatten_traces(full_prefix_traces)
y_full = [1 if label == 'OK' else 0 for label in full_prefix_labels]

# 3. Vectorize: fit_transform re-fits the existing vectorizer on the train + dev prefixes
X_all = vectorizer.fit_transform(X_text)

# 4. Fit the model on all labelled prefixes
model.fit(X_all, y_full)

# 5. Load test prefixes (no labels)
def load_unlabeled_traces(filepath):
    """Read one trace per line from a whitespace-separated text file.

    Every line of the file yields one entry, split on whitespace; a blank
    line yields an empty list, so the result has as many entries as the
    file has lines.
    """
    with open(filepath, "r") as file:
        return [line.strip().split() for line in file]

# Encode the unlabeled test prefixes with the vectorizer fitted above
test_traces = load_unlabeled_traces("test_prefixes_no_labels.txt")
X_test_text = flatten_traces(test_traces)
X_test = vectorizer.transform(X_test_text)

# 6. Predict, then map class 1 back to 'OK' and everything else to 'NOK'
y_test_pred = model.predict(X_test)
label_test_pred = ['OK' if pred == 1 else 'NOK' for pred in y_test_pred]

# 7. Write one predicted label per line
with open("test_predictions.txt", "w") as f:
    f.writelines(label + "\n" for label in label_test_pred)

print("test_predictions.txt created successfully.")

test_predictions.txt created successfully.


Parameters: { "use_label_encoder" } are not used.

