# Comprehensive Evaluation: Unified Next Activity Predictor

Evaluate the unified next-activity predictor against the BPIC 2017 event log, keeping only `start` and `complete` lifecycle events.

**Evaluation Metrics:**
- Activity prediction accuracy (overall, top-k, per-class)
- Lifecycle prediction accuracy
- Joint accuracy (both activity and lifecycle correct)
- Confusion matrices and classification reports
- Trace-level performance analysis
- Baseline comparisons


In [1]:
import sys
import os
import warnings
import importlib
from pathlib import Path

# Add parent to path
sys.path.insert(0, os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd
from datetime import datetime
from collections import defaultdict
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score,
    precision_recall_fscore_support
)
import matplotlib.pyplot as plt
import seaborn as sns

from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

# Re-import the project package so that source edits made while this kernel is
# alive are picked up. The submodules are reloaded first, the package itself
# last, and only then are the public names re-bound from the package.
import unified
import unified.model
import unified.data_generator
import unified.utils

for _module in (unified.model, unified.data_generator, unified.utils, unified):
    importlib.reload(_module)

from unified import UnifiedDataGenerator, UnifiedPredictor, UnifiedModelPersistence, filter_lifecycle_events

# Notebook-wide settings: every Python warning is suppressed, pandas'
# chained-assignment check is switched off, and the seaborn whitegrid
# matplotlib style is selected for all figures.
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None
plt.style.use("seaborn-v0_8-whitegrid")

print(f"Evaluation started: {datetime.now()}")


Evaluation started: 2026-01-06 22:43:53.700964


  if not hasattr(np, "object"):


## 1. Configuration


In [2]:
# Paths
# Every location is derived from the directory three levels above the current
# working directory and normalised to an absolute path, so printed values and
# error messages do not carry "..\..\.." segments.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))
XES_PATH = os.path.join(PROJECT_ROOT, "Dataset", "BPI Challenge 2017.xes")
MODEL_DIR = os.path.join(PROJECT_ROOT, "models", "unified_next_activity")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "results", "unified_evaluation")
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluation parameters
TEST_SIZE = 0.2  # Last 20% of cases by timestamp for temporal split
MAX_HISTORY = 20  # Match training configuration
TOP_K_VALUES = [1, 3, 5]  # For top-k accuracy
RANDOM_STATE = 42

# Fail early on a configuration that would give an empty train or test split.
assert 0.0 < TEST_SIZE < 1.0, "TEST_SIZE must be a fraction strictly between 0 and 1"

print(f"Model directory: {MODEL_DIR}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Test set size: {TEST_SIZE*100:.0f}%")


Model directory: d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\..\..\models\unified_next_activity
Output directory: d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\..\..\results\unified_evaluation
Test set size: 20%


## 2. Load Trained Model


In [3]:
# Restore the persisted bundle and pull out the two objects used by the rest
# of the notebook: the predictor itself and the encoder that holds its label
# vocabularies.
print(f"Loading model from: {MODEL_DIR}")
bundle = UnifiedModelPersistence.load(MODEL_DIR)
model = bundle["model"]
encoder = bundle["encoder"]

print(f"\nModel loaded successfully!")
for summary_line in (
    f"  Target activities: {encoder.num_target_activities}",
    f"  Target lifecycles: {encoder.num_target_lifecycles}",
    f"  Context keys: {encoder.context_keys}",
    f"  Max sequence length: {encoder.max_seq_len}",
):
    print(summary_line)

# Show the label vocabularies; the activity list is cut to its first ten
# entries, followed by an ellipsis when more exist.
activity_classes = encoder.target_activity_encoder.classes_
lifecycle_classes = encoder.target_lifecycle_encoder.classes_

print(f"\nAvailable activities ({len(activity_classes)}):")
print(activity_classes[:10], "..." if len(activity_classes) > 10 else "")

print(f"\nAvailable lifecycles ({len(lifecycle_classes)}):")
print(lifecycle_classes)


Loading model from: d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\..\..\models\unified_next_activity


Model loaded successfully!
  Target activities: 25
  Target lifecycles: 2
  Context keys: ['case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount']
  Max sequence length: 20

Available activities (25):
['A_Accepted' 'A_Cancelled' 'A_Complete' 'A_Concept' 'A_Denied'
 'A_Incomplete' 'A_Pending' 'A_Submitted' 'A_Validating' 'End'] ...

Available lifecycles (2):
['complete' 'start']


## 3. Load and Filter Event Log


In [4]:
# Read the XES file and flatten it into a DataFrame with one row per event.
print(f"Loading event log from: {XES_PATH}")
event_log = xes_importer.apply(XES_PATH)
df_log = log_converter.apply(event_log, variant=log_converter.Variants.TO_DATA_FRAME)

print(f"Loaded {len(df_log):,} events from {df_log['case:concept:name'].nunique():,} cases")
print(f"\nColumns: {list(df_log.columns)}")

# Report the lifecycle distribution, then keep only 'start' and 'complete'
# events. A log without the lifecycle column is passed through unfiltered.
if 'lifecycle:transition' not in df_log.columns:
    print("\nWarning: No 'lifecycle:transition' column found. All events will be kept.")
else:
    print(f"\nLifecycle transitions before filtering:")
    print(df_log['lifecycle:transition'].value_counts())

    df_log = filter_lifecycle_events(df_log, allowed_lifecycles=['start', 'complete'])
    print(f"\nAfter filtering: {len(df_log):,} events from {df_log['case:concept:name'].nunique():,} cases")

# Make sure timestamps are datetimes, then order events by case and time and
# renumber the rows from zero.
df_log["time:timestamp"] = pd.to_datetime(df_log["time:timestamp"])
df_log = df_log.sort_values(
    by=["case:concept:name", "time:timestamp"],
    ignore_index=True,
)


Loading event log from: d:\Repos\process-simulation-engine\Next-Activity-Prediction\advanced\notebooks\..\..\..\Dataset\BPI Challenge 2017.xes


parsing log, completed traces :: 100%|██████████| 31509/31509 [00:39<00:00, 801.12it/s] 


Loaded 1,202,267 events from 31,509 cases

Columns: ['Action', 'org:resource', 'concept:name', 'EventOrigin', 'EventID', 'lifecycle:transition', 'time:timestamp', 'case:LoanGoal', 'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount', 'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore', 'OfferedAmount', 'OfferID']

Lifecycle transitions before filtering:
lifecycle:transition
complete     475306
suspend      215402
schedule     149104
start        128227
resume       127160
ate_abort     85224
withdraw      21844
Name: count, dtype: int64

After filtering: 603,533 events from 31,509 cases


## 4. Prepare Test Set


In [5]:
# Temporal split: cases are ordered by the timestamp of their first event and
# the most recent TEST_SIZE fraction of them forms the test set.
df_log["time:timestamp"] = pd.to_datetime(df_log["time:timestamp"])  # idempotent safeguard
case_start_times = df_log.groupby("case:concept:name")["time:timestamp"].min().sort_values()
split_idx = int(len(case_start_times) * (1 - TEST_SIZE))
test_cases = set(case_start_times.iloc[split_idx:].index)

# Evaluate the membership mask once and reuse it for both partitions.
is_test_event = df_log["case:concept:name"].isin(test_cases)
df_test = df_log[is_test_event].copy()
df_train = df_log[~is_test_event].copy()

print(f"Temporal split:")
print(f"  Training cases: {df_train['case:concept:name'].nunique():,} ({len(df_train):,} events)")
print(f"  Test cases: {df_test['case:concept:name'].nunique():,} ({len(df_test):,} events)")
print(f"  Test period: {df_test['time:timestamp'].min()} to {df_test['time:timestamp'].max()}")

# Prepare test sequences (similar to training data generation but without filtering).
# One sample is produced per prediction point: for every event after the first
# one in a trace, and for the synthetic "End" token, the input is the window of
# up to MAX_HISTORY preceding events and the target is that event.
test_sequences = []

for case_id, group in df_test.groupby("case:concept:name"):
    activities = group["concept:name"].tolist()
    lifecycles = group["lifecycle:transition"].fillna("complete").str.lower().tolist()
    resources = group["org:resource"].fillna("Unknown").tolist()
    timestamps = group["time:timestamp"].tolist()

    # Case-level context is read from the first event of the case; a key that
    # is not a column of the log is recorded as None. The same dict object is
    # shared by all samples of the case.
    context = {
        key: (group[key].iloc[0] if key in group.columns else None)
        for key in encoder.context_keys
    }

    # log1p of the non-negative gap (seconds) to the previous event, computed
    # once per trace instead of once per window. Index t holds the gap between
    # event t-1 and event t; index 0 is a placeholder.
    log_gaps = [0.0]
    for prev_ts, curr_ts in zip(timestamps, timestamps[1:]):
        gap_seconds = (curr_ts - prev_ts).total_seconds()
        log_gaps.append(np.log1p(max(0.0, gap_seconds)))

    # Add END token at the end. It is only ever a prediction target, never part
    # of an input window, so it needs no timestamp of its own.
    activities.append("End")
    lifecycles.append("complete")
    resources.append("Unknown")

    # Position 0 has no history and is therefore never a target.
    for i in range(1, len(activities)):
        start_idx = max(0, i - MAX_HISTORY)

        test_sequences.append({
            "case_id": case_id,
            "position": i,
            "sequence_activities": activities[start_idx:i],
            "sequence_lifecycles": lifecycles[start_idx:i],
            "sequence_resources": resources[start_idx:i],
            # The first event of a window always gets duration 0.0, because
            # its predecessor lies outside the window.
            "sequence_durations": [0.0] + log_gaps[start_idx + 1:i],
            "target_activity": activities[i],
            "target_lifecycle": lifecycles[i],
            "context": context,
        })

print(f"\nGenerated {len(test_sequences):,} test sequences for evaluation")


Temporal split:
  Training cases: 25,207 (481,714 events)
  Test cases: 6,302 (121,819 events)
  Test period: 2016-10-18 23:42:39.265000+00:00 to 2017-02-01 14:00:30.347000+00:00

Generated 121,819 test sequences for evaluation


In [None]:
print("Running evaluation loop...")
print(f"Evaluating {len(test_sequences):,} sequences...")

# Labels recorded when no prediction is available. The lifecycle sentinel is
# deliberately not one of the real lifecycle classes: recording "complete" for
# a failed prediction would score it as correct whenever the true lifecycle is
# "complete" and so inflate lifecycle and joint accuracy.
FALLBACK_ACTIVITY = "UNKNOWN"
FALLBACK_LIFECYCLE = "UNKNOWN"

# Storage for predictions and ground truth. All lists stay index-aligned with
# test_sequences; later cells rely on that alignment.
all_pred_activities = []
all_pred_lifecycles = []
all_true_activities = []
all_true_lifecycles = []
all_top_k_activities = {k: [] for k in TOP_K_VALUES}

# Track errors
errors = []

# Progress tracking: roughly one line per 5% of the test set.
eval_every = max(1, len(test_sequences) // 20)

for idx, seq in enumerate(test_sequences):
    if (idx + 1) % eval_every == 0:
        print(f"  Progress: {idx+1:,}/{len(test_sequences):,} ({100*(idx+1)/len(test_sequences):.1f}%)")

    try:
        # Predict. The model is asked for the largest k needed, and the
        # smaller top-k lists are prefixes of that ranking.
        activity_probs, lifecycle_probs = model.predict(
            seq["sequence_activities"],
            seq["sequence_lifecycles"],
            seq["sequence_resources"],
            seq["sequence_durations"],
            seq["context"],
            top_k=max(TOP_K_VALUES)
        )

        # Unpack everything that can raise before touching the result lists,
        # so a failure can never leave them partially updated.
        ranked_activities = [act for act, _ in activity_probs]
        pred_activity = ranked_activities[0] if ranked_activities else FALLBACK_ACTIVITY
        pred_lifecycle = lifecycle_probs[0][0] if lifecycle_probs else FALLBACK_LIFECYCLE

    except Exception as e:
        # Best effort: record the failure and score the sample as a miss.
        errors.append({
            "case_id": seq["case_id"],
            "position": seq["position"],
            "error": str(e)
        })
        ranked_activities = []
        pred_activity = FALLBACK_ACTIVITY
        pred_lifecycle = FALLBACK_LIFECYCLE

    # Single append point for both the success and the failure path.
    all_pred_activities.append(pred_activity)
    all_pred_lifecycles.append(pred_lifecycle)
    all_true_activities.append(seq["target_activity"])
    all_true_lifecycles.append(seq["target_lifecycle"])
    for k in TOP_K_VALUES:
        all_top_k_activities[k].append(ranked_activities[:k])

assert len(all_pred_activities) == len(test_sequences), "predictions misaligned with test_sequences"

print(f"\nEvaluation complete!")
print(f"  Successful predictions: {len(test_sequences) - len(errors):,}")
print(f"  Errors: {len(errors):,}")

if errors:
    print(f"\nSample errors:")
    for err in errors[:5]:
        print(f"  Case {err['case_id']}, position {err['position']}: {err['error']}")


Running evaluation loop...
Evaluating 121,819 sequences...
  Progress: 6,090/121,819 (5.0%)
  Progress: 12,180/121,819 (10.0%)
  Progress: 18,270/121,819 (15.0%)


In [None]:
# Turn the collected label lists into arrays so they can be compared
# element-wise.
y_true_act, y_pred_act = np.array(all_true_activities), np.array(all_pred_activities)
y_true_lc, y_pred_lc = np.array(all_true_lifecycles), np.array(all_pred_lifecycles)

# Share of samples whose activity / lifecycle was predicted correctly.
activity_accuracy = accuracy_score(y_true_act, y_pred_act)
lifecycle_accuracy = accuracy_score(y_true_lc, y_pred_lc)

# A sample counts towards joint accuracy only if both heads are right.
activity_hit = y_true_act == y_pred_act
lifecycle_hit = y_true_lc == y_pred_lc
joint_correct = activity_hit & lifecycle_hit
joint_accuracy = joint_correct.mean()

# Top-k accuracy: a hit when the true activity is among the k best candidates
# stored for that sample.
top_k_accuracies = {
    k: sum(
        truth in candidates
        for truth, candidates in zip(y_true_act, all_top_k_activities[k])
    ) / len(y_true_act)
    for k in TOP_K_VALUES
}

banner = "=" * 60
print(banner)
print("OVERALL METRICS")
print(banner)
print(f"Activity Accuracy:     {activity_accuracy:.4f} ({activity_accuracy*100:.2f}%)")
print(f"Lifecycle Accuracy:    {lifecycle_accuracy:.4f} ({lifecycle_accuracy*100:.2f}%)")
print(f"Joint Accuracy:        {joint_accuracy:.4f} ({joint_accuracy*100:.2f}%)")
print(f"\nTop-K Activity Accuracy:")
for k, k_accuracy in top_k_accuracies.items():
    print(f"  Top-{k}: {k_accuracy:.4f} ({k_accuracy*100:.2f}%)")


In [None]:
def _report_section(heading, caption, y_true, y_pred):
    """Print a bannered classification report and return it as a dict.

    The report is built twice: once as a dictionary (returned) and once as the
    formatted text that is printed below the caption.
    """
    print("\n" + "=" * 60)
    print(heading)
    print("=" * 60)
    as_dict = classification_report(y_true, y_pred, output_dict=True, zero_division=0)
    print(caption)
    print(classification_report(y_true, y_pred, zero_division=0))
    return as_dict


# Per-activity metrics
activity_report = _report_section(
    "ACTIVITY PREDICTION METRICS",
    "\nClassification Report (Activities):",
    y_true_act,
    y_pred_act,
)

# Per-lifecycle metrics
lifecycle_report = _report_section(
    "LIFECYCLE PREDICTION METRICS",
    "\nClassification Report (Lifecycles):",
    y_true_lc,
    y_pred_lc,
)

# Conditional accuracies: how often one head is right, restricted to the
# samples on which the other head is right.
act_is_correct = y_true_act == y_pred_act
lc_is_correct = y_true_lc == y_pred_lc
act_given_lc_correct = act_is_correct[lc_is_correct]
lc_given_act_correct = lc_is_correct[act_is_correct]

print("\n" + "=" * 60)
print("CONDITIONAL ACCURACIES")
print("=" * 60)
if len(act_given_lc_correct) == 0:
    print("Activity accuracy given lifecycle correct: N/A (no lifecycle matches)")
else:
    print(f"Activity accuracy given lifecycle correct: {act_given_lc_correct.mean():.4f} ({act_given_lc_correct.mean()*100:.2f}%)")

if len(lc_given_act_correct) == 0:
    print("Lifecycle accuracy given activity correct: N/A (no activity matches)")
else:
    print(f"Lifecycle accuracy given activity correct: {lc_given_act_correct.mean():.4f} ({lc_given_act_correct.mean()*100:.2f}%)")


## 7. Baseline Comparisons


In [None]:
def _majority_label(labels, default):
    """Return the most frequent value in `labels`, or `default` if there is none."""
    modes = pd.Series(labels).mode()
    return modes[0] if len(modes) > 0 else default


def _relative_gain(score, reference):
    """Format the relative improvement of `score` over `reference` as a signed percentage.

    Returns an explanatory "n/a" string instead of dividing by zero when the
    reference accuracy is 0.
    """
    if reference == 0:
        return "n/a (baseline accuracy is 0)"
    return f"{(score - reference) / reference * 100:+.2f}%"


# Most frequent baseline
# NOTE: the majority labels are taken from the test labels themselves, so this
# is the best score any constant predictor could reach on this test set.
most_freq_activity = _majority_label(y_true_act, "UNKNOWN")
most_freq_lifecycle = _majority_label(y_true_lc, "complete")

baseline_act_pred = np.full_like(y_true_act, most_freq_activity)
baseline_lc_pred = np.full_like(y_true_lc, most_freq_lifecycle)

baseline_act_acc = accuracy_score(y_true_act, baseline_act_pred)
baseline_lc_acc = accuracy_score(y_true_lc, baseline_lc_pred)
baseline_joint_acc = ((y_true_act == baseline_act_pred) & (y_true_lc == baseline_lc_pred)).mean()

# Random baseline (uniform distribution over the labels seen in the test set).
# A local generator seeded with RANDOM_STATE replaces np.random.seed so the
# process-wide NumPy random state is left untouched.
baseline_rng = np.random.RandomState(RANDOM_STATE)
unique_activities = np.unique(y_true_act)
unique_lifecycles = np.unique(y_true_lc)

random_act_pred = baseline_rng.choice(unique_activities, size=len(y_true_act))
random_lc_pred = baseline_rng.choice(unique_lifecycles, size=len(y_true_lc))

random_act_acc = accuracy_score(y_true_act, random_act_pred)
random_lc_acc = accuracy_score(y_true_lc, random_lc_pred)
random_joint_acc = ((y_true_act == random_act_pred) & (y_true_lc == random_lc_pred)).mean()

print("=" * 60)
print("BASELINE COMPARISONS")
print("=" * 60)
print(f"\nMost Frequent Baseline:")
print(f"  Activity Accuracy:     {baseline_act_acc:.4f} ({baseline_act_acc*100:.2f}%)")
print(f"  Lifecycle Accuracy:    {baseline_lc_acc:.4f} ({baseline_lc_acc*100:.2f}%)")
print(f"  Joint Accuracy:        {baseline_joint_acc:.4f} ({baseline_joint_acc*100:.2f}%)")

print(f"\nRandom Baseline:")
print(f"  Activity Accuracy:     {random_act_acc:.4f} ({random_act_acc*100:.2f}%)")
print(f"  Lifecycle Accuracy:     {random_lc_acc:.4f} ({random_lc_acc*100:.2f}%)")
print(f"  Joint Accuracy:        {random_joint_acc:.4f} ({random_joint_acc*100:.2f}%)")

# Improvements are relative: (model - baseline) / baseline.
print(f"\nModel vs Baselines:")
print(f"  Activity improvement over most frequent: {_relative_gain(activity_accuracy, baseline_act_acc)}")
print(f"  Activity improvement over random:       {_relative_gain(activity_accuracy, random_act_acc)}")
print(f"  Joint improvement over most frequent:   {_relative_gain(joint_accuracy, baseline_joint_acc)}")
print(f"  Joint improvement over random:          {_relative_gain(joint_accuracy, random_joint_acc)}")


## 8. Visualizations


In [None]:
# Activity confusion matrix. Rows and columns follow the encoder's class
# order; rows are true activities, columns are predicted activities.
activity_labels = encoder.target_activity_encoder.classes_
cm_activities = confusion_matrix(y_true_act, y_pred_act, labels=activity_labels)

fig, ax = plt.subplots(figsize=(14, 12))
sns.heatmap(
    cm_activities,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=activity_labels,
    yticklabels=activity_labels,
    cbar_kws={"label": "Count"},
    ax=ax,
)
ax.set_title("Confusion Matrix: Activity Predictions", fontsize=16, fontweight="bold")
ax.set_xlabel("Predicted Activity", fontsize=12)
ax.set_ylabel("True Activity", fontsize=12)
# Slant the column labels so the activity names stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
plt.setp(ax.get_yticklabels(), rotation=0)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix_activities.png"), dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Lifecycle confusion matrix, laid out in the encoder's class order; rows are
# true lifecycles, columns are predicted lifecycles.
lifecycle_labels = encoder.target_lifecycle_encoder.classes_
cm_lifecycles = confusion_matrix(y_true_lc, y_pred_lc, labels=lifecycle_labels)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm_lifecycles,
    annot=True,
    fmt="d",
    cmap="Greens",
    xticklabels=lifecycle_labels,
    yticklabels=lifecycle_labels,
    cbar_kws={"label": "Count"},
    ax=ax,
)
ax.set_title("Confusion Matrix: Lifecycle Predictions", fontsize=16, fontweight="bold")
ax.set_xlabel("Predicted Lifecycle", fontsize=12)
ax.set_ylabel("True Lifecycle", fontsize=12)
fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "confusion_matrix_lifecycles.png"), dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Top-k accuracy comparison
# One bar per configured k. Plain activity accuracy is the top-1 figure, so it
# is only added when 1 is not already in TOP_K_VALUES; prepending it
# unconditionally would draw a duplicate "Top-1" bar.
top_k_to_plot = dict(top_k_accuracies)
top_k_to_plot.setdefault(1, activity_accuracy)
k_values = sorted(top_k_to_plot)
acc_values = [top_k_to_plot[k] for k in k_values]

fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(range(len(k_values)), acc_values, color="steelblue", alpha=0.7)
ax.set_xlabel("Top-K", fontsize=12)
ax.set_ylabel("Accuracy", fontsize=12)
ax.set_title("Top-K Activity Prediction Accuracy", fontsize=14, fontweight="bold")
ax.set_xticks(range(len(k_values)))
ax.set_xticklabels([f"Top-{k}" for k in k_values])
ax.set_ylim([0, 1])
ax.grid(axis="y", alpha=0.3)

# Add value labels on bars
for bar, acc in zip(bars, acc_values):
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{acc:.3f}',
            ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.savefig(os.path.join(OUTPUT_DIR, "top_k_accuracy.png"), dpi=300, bbox_inches="tight")
plt.show()


In [None]:
# Accuracy restricted to the samples of each true activity, together with the
# number of such samples. Activities that never occur as a target are skipped.
activity_performance = {}
for activity in encoder.target_activity_encoder.classes_:
    mask = y_true_act == activity
    n_samples = mask.sum()
    if not n_samples > 0:
        continue
    activity_performance[activity] = {
        "accuracy": accuracy_score(y_true_act[mask], y_pred_act[mask]),
        "count": n_samples,
    }

# Keep the 15 activities with the most samples, most frequent first.
sorted_activities = sorted(
    activity_performance.items(),
    key=lambda item: item[1]["count"],
    reverse=True,
)[:15]

activities = [name for name, _ in sorted_activities]
accuracies = [stats["accuracy"] for _, stats in sorted_activities]
counts = [stats["count"] for _, stats in sorted_activities]

fig, ax = plt.subplots(figsize=(14, 8))
x_pos = np.arange(len(activities))
bars = ax.barh(x_pos, accuracies, color="coral", alpha=0.7)
ax.set_yticks(x_pos)
ax.set_yticklabels(activities)
ax.set_xlabel("Accuracy", fontsize=12)
ax.set_title("Per-Activity Prediction Accuracy (Top 15 by Frequency)", fontsize=14, fontweight="bold")
ax.set_xlim(0, 1)
ax.grid(axis="x", alpha=0.3)

# Write the accuracy and sample count at the end of every bar.
for bar, count in zip(bars, counts):
    width = bar.get_width()
    bar_centre = bar.get_y() + bar.get_height()/2.
    ax.text(width, bar_centre,
            f' {width:.3f} (n={count})',
            ha='left', va='center', fontsize=9)

fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "per_activity_accuracy.png"), dpi=300, bbox_inches="tight")
plt.show()


## 9. Trace-Level Analysis


In [None]:
# Group activity hits (1) and misses (0) by the target's position in its
# trace. Predictions are index-aligned with test_sequences.
position_accuracies = defaultdict(list)
position_counts = defaultdict(int)

for seq, predicted, actual in zip(test_sequences, y_pred_act, y_true_act):
    pos = seq["position"]
    position_counts[pos] += 1
    position_accuracies[pos].append(1 if predicted == actual else 0)

# Mean hit rate per position, iterated in ascending position order.
pos_avg_acc = {
    pos: np.mean(hits)
    for pos, hits in position_accuracies.items()
    if len(hits) > 0
}
sorted_positions = sorted(pos_avg_acc.keys())

# Only the first 50 positions are plotted.
positions = sorted_positions[:50]
accs = [pos_avg_acc[p] for p in positions]
counts = [position_counts[p] for p in positions]

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(positions, accs, marker="o", linewidth=2, markersize=4, color="steelblue")
ax.set_xlabel("Position in Trace", fontsize=12)
ax.set_ylabel("Accuracy", fontsize=12)
ax.set_title("Prediction Accuracy by Position in Trace", fontsize=14, fontweight="bold")
ax.set_ylim(0, 1)
ax.grid(alpha=0.3)

# Flag points that rest on fewer than 10 samples.
for pos, acc, count in zip(positions, accs, counts):
    if not count < 10:
        continue
    ax.annotate(f"n={count}", (pos, acc), fontsize=7, alpha=0.6)

fig.tight_layout()
fig.savefig(os.path.join(OUTPUT_DIR, "accuracy_by_position.png"), dpi=300, bbox_inches="tight")
plt.show()

print(f"Average accuracy by position (first 10 positions):")
for pos in sorted_positions[:10]:
    print(f"  Position {pos}: {pos_avg_acc[pos]:.4f} (n={position_counts[pos]})")


## 10. Results Summary and Export


In [None]:
# Create summary DataFrame
# Rows are (metric name, value) pairs. The top-k rows are generated from
# TOP_K_VALUES, so the table stays correct for any number of configured k
# values rather than assuming exactly three.
summary_rows = [
    ("Activity Accuracy", activity_accuracy),
    ("Lifecycle Accuracy", lifecycle_accuracy),
    ("Joint Accuracy", joint_accuracy),
]
summary_rows += [(f"Top-{k} Accuracy", top_k_accuracies[k]) for k in TOP_K_VALUES]
summary_rows += [
    ("Baseline (Most Frequent) Activity Accuracy", baseline_act_acc),
    ("Baseline (Random) Activity Accuracy", random_act_acc),
    ("Baseline (Most Frequent) Joint Accuracy", baseline_joint_acc),
    ("Baseline (Random) Joint Accuracy", random_joint_acc),
]

# Same column-oriented structure as before, kept for anything reading it.
summary_data = {
    "Metric": [name for name, _ in summary_rows],
    "Value": [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_data)
print("\n" + "=" * 60)
print("SUMMARY METRICS")
print("=" * 60)
print(summary_df.to_string(index=False))

# Save summary
summary_path = os.path.join(OUTPUT_DIR, "summary_metrics.csv")
summary_df.to_csv(summary_path, index=False)
print(f"\nSummary saved to: {summary_path}")


In [None]:
# One row per evaluated sample: where it came from, what was expected, what
# was predicted, and whether each head (and both together) was right.
predictions_df = pd.DataFrame({
    "case_id": [seq["case_id"] for seq in test_sequences],
    "position": [seq["position"] for seq in test_sequences],
}).assign(
    true_activity=y_true_act,
    pred_activity=y_pred_act,
    true_lifecycle=y_true_lc,
    pred_lifecycle=y_pred_lc,
    correct_activity=y_true_act == y_pred_act,
    correct_lifecycle=y_true_lc == y_pred_lc,
    correct_joint=joint_correct,
)

predictions_path = os.path.join(OUTPUT_DIR, "detailed_predictions.csv")
predictions_df.to_csv(predictions_path, index=False)
print(f"Detailed predictions saved to: {predictions_path}")
print(f"  Total predictions: {len(predictions_df):,}")


In [None]:
def _per_class_metrics(y_true, y_pred, classes, label_column):
    """Build a per-class metrics table sorted by descending support.

    Precision, recall and F1 are computed per class over the *full* label
    arrays. Computing them on the subset of samples whose true label is the
    class (as was done previously) removes every false positive, so precision
    could only ever be 1.0 or 0.0.

    Args:
        y_true: array of true labels.
        y_pred: array of predicted labels, aligned with `y_true`.
        classes: sequence of class labels to report on.
        label_column: name of the output column holding the class label.

    Returns:
        DataFrame with columns [label_column, accuracy, precision, recall,
        f1_score, support], restricted to classes that occur in `y_true`.
        "accuracy" is the accuracy on the samples of that true class, which is
        the same quantity as the class's recall.
    """
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=classes, average=None, zero_division=0
    )
    rows = [
        {
            label_column: label,
            "accuracy": recall[i],
            "precision": precision[i],
            "recall": recall[i],
            "f1_score": f1[i],
            "support": int(support[i]),
        }
        for i, label in enumerate(classes)
        if support[i] > 0
    ]
    # Explicit columns keep sort_values working even when no class has support.
    columns = [label_column, "accuracy", "precision", "recall", "f1_score", "support"]
    return pd.DataFrame(rows, columns=columns).sort_values("support", ascending=False)


# Save per-activity metrics
activity_metrics_df = _per_class_metrics(
    y_true_act, y_pred_act, encoder.target_activity_encoder.classes_, "activity"
)
activity_metrics_df.to_csv(os.path.join(OUTPUT_DIR, "per_activity_metrics.csv"), index=False)
print(f"Per-activity metrics saved to: {os.path.join(OUTPUT_DIR, 'per_activity_metrics.csv')}")

# Save per-lifecycle metrics
lifecycle_metrics_df = _per_class_metrics(
    y_true_lc, y_pred_lc, encoder.target_lifecycle_encoder.classes_, "lifecycle"
)
lifecycle_metrics_df.to_csv(os.path.join(OUTPUT_DIR, "per_lifecycle_metrics.csv"), index=False)
print(f"Per-lifecycle metrics saved to: {os.path.join(OUTPUT_DIR, 'per_lifecycle_metrics.csv')}")


In [None]:
# Closing banner: where the results went, which files the notebook wrote, and
# when the run finished.
generated_files = (
    "summary_metrics.csv",
    "detailed_predictions.csv",
    "per_activity_metrics.csv",
    "per_lifecycle_metrics.csv",
    "confusion_matrix_activities.png",
    "confusion_matrix_lifecycles.png",
    "top_k_accuracy.png",
    "per_activity_accuracy.png",
    "accuracy_by_position.png",
)

print("\n" + "=" * 60)
print("EVALUATION COMPLETE")
print("=" * 60)
print(f"\nAll results saved to: {OUTPUT_DIR}")
print(f"\nFiles generated:")
for filename in generated_files:
    print(f"  - {filename}")
print(f"\nCompleted: {datetime.now()}")
