# Trace Generator Benchmark

Generate business process traces using TraceGenerator and compare them with the original event log using SimulationBenchmark.

**Workflow:**
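1. Load the original event log and extract its case attribute distributions
2. Load the trained model and initialize TraceGenerator
3. Generate traces from case attributes sampled from the original log
4. Compare the generated and original logs using SimulationBenchmark
5. Visualize key differences and export the results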


In [None]:
import sys
import os
import warnings
from pathlib import Path
from datetime import datetime

# Locate the project root: four levels above this file when it runs as a
# script, or three levels above the working directory when __file__ is not
# defined (e.g. inside a notebook kernel).
if '__file__' in globals():
    project_root = Path(__file__).parent.parent.parent.parent
else:
    project_root = Path.cwd().parent.parent.parent

# Next-Activity-Prediction sits directly under the project root.
na_root = project_root / "Next-Activity-Prediction"

# Put both directories at the front of sys.path for the project-local imports
# that follow. Each entry is added only when missing, so re-running this cell
# does not keep growing sys.path with duplicates.
for _import_dir in (project_root, na_root):
    if str(_import_dir) not in sys.path:
        sys.path.insert(0, str(_import_dir))

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from bpic17_simplified import TraceGenerator
from integration.SimulationBenchmark import SimulationBenchmark

# Probe for the optional pm4py dependency. The resulting flag is what the
# later cells check before doing any XES loading or benchmarking.
try:
    import pm4py
except ImportError:
    PM4PY_AVAILABLE = False
    warnings.warn("pm4py not available. XES file loading will not work.")
else:
    PM4PY_AVAILABLE = True

# Turn off pandas' chained-assignment check, then silence every warning
# raised from this point on.
pd.options.mode.chained_assignment = None
warnings.filterwarnings("ignore")

print(f"Benchmarking started: {datetime.now()}")


## 1. Configuration


In [None]:
def _project_path(*parts):
    """Join *parts* onto the project root and return the result as a string."""
    return os.path.join(project_root, *parts)


# Trace generation parameters
NUM_TRACES = 200
MAX_TRACE_LENGTH = 100

# Input and output locations, all relative to the project root
MODEL_PATH = _project_path("models", "bpic17_simplified")
EVENT_LOG_PATH = _project_path("eventlog", "eventlog.xes.gz")
OUTPUT_DIR = _project_path("integration", "output", "trace_generator")

# The output directory must exist before any cell writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(
    f"Model path: {MODEL_PATH}",
    f"Output directory: {OUTPUT_DIR}",
    f"Number of traces to generate: {NUM_TRACES}",
    sep="\n",
)


## 2. Load Original Event Log
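Load the original event log, keep only the `start`/`complete` lifecycle events, sample up to `NUM_TRACES` cases, and extract the case attribute values used later for trace generation.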

In [None]:
# Case-attribute pools default to empty. They stay empty when the log cannot
# be loaded, so the trace-generation cell can use its placeholder attributes
# instead of failing with a NameError on these names.
loan_goals = []
app_types = []
amounts = []

if not PM4PY_AVAILABLE:
    print("⚠ pm4py not available. Cannot load XES file.")
    print("Please install pm4py: pip install pm4py")
else:
    print("Loading original event log...")
    original_log = pm4py.read_xes(EVENT_LOG_PATH)
    df_original = pm4py.convert_to_dataframe(original_log)

    # Filter to start/complete only (matching our model)
    df_original = df_original[df_original['lifecycle:transition'].isin(['start', 'complete'])].copy()

    # Sample cases for comparison. The seed is fixed so the same cases are
    # picked on every run; logs with at most NUM_TRACES cases are kept whole.
    unique_cases = df_original['case:concept:name'].unique()
    if len(unique_cases) > NUM_TRACES:
        np.random.seed(42)
        sampled_cases = np.random.choice(unique_cases, NUM_TRACES, replace=False)
        df_original = df_original[df_original['case:concept:name'].isin(sampled_cases)].copy()

    # Persist the (possibly sampled) log as CSV for the benchmark step.
    original_csv = os.path.join(OUTPUT_DIR, "original_log.csv")
    df_original.to_csv(original_csv, index=False)
    print(f"✓ Original log loaded: {len(df_original)} events, {df_original['case:concept:name'].nunique()} cases")
    print(f"  Saved to: {original_csv}")

    # Extract case attribute distributions for realistic trace generation.
    # One row per case: the first non-null value of every column.
    case_attributes_dist = df_original.groupby('case:concept:name').first()
    loan_goals = case_attributes_dist['case:LoanGoal'].dropna().unique().tolist()
    app_types = case_attributes_dist['case:ApplicationType'].dropna().unique().tolist()
    # Amounts keep duplicates so that sampling follows the observed distribution.
    amounts = case_attributes_dist['case:RequestedAmount'].dropna().tolist()

    print(f"\nFound case attributes:")
    print(f"  Loan goals: {len(loan_goals)} unique values")
    print(f"  Application types: {len(app_types)} unique values")
    if amounts:
        print(f"  Requested amounts: min={min(amounts):.0f}, max={max(amounts):.0f}, mean={np.mean(amounts):.0f}")
    else:
        # min()/max() raise ValueError on an empty list, so report it instead.
        print("  Requested amounts: no values found")


## 3. Initialize Trace Generator


In [None]:
print("Loading TraceGenerator...")

# Constructor arguments gathered in one place: the trained model location,
# the per-trace length cap, and a fixed seed.
generator_kwargs = {
    "model_path": MODEL_PATH,
    "max_trace_length": MAX_TRACE_LENGTH,
    "seed": 42,
}
generator = TraceGenerator(**generator_kwargs)

print("✓ TraceGenerator loaded")


## 4. Generate Traces


In [None]:
# Generate case attributes based on original log distribution.
# The global NumPy RNG is seeded so the sampled attributes are reproducible.
np.random.seed(42)

case_attributes = []
for i in range(NUM_TRACES):
    # Draw each attribute from the values seen in the original log; the
    # literal after `else` is the placeholder used when a pool is empty.
    loan_goal = np.random.choice(loan_goals) if loan_goals else "Home improvement"
    app_type = np.random.choice(app_types) if app_types else "New credit"
    amount = np.random.choice(amounts) if amounts else 10000.0

    case_attributes.append({
        "loan_goal": loan_goal,
        "application_type": app_type,
        "requested_amount": float(amount),
        "case_id": f"generated_case_{i+1:04d}"
    })

print(f"Generating {NUM_TRACES} traces...")
print("This may take a few minutes...")

traces = generator.generate_traces(case_attributes)

# Measure every trace once and reuse the lengths for all statistics.
trace_lengths = [len(t) for t in traces]

print(f"✓ Generated {len(traces)} traces")
print(f"  Total events: {sum(trace_lengths)}")
if trace_lengths:
    print(f"  Average trace length: {np.mean(trace_lengths):.1f} activities")
    print(f"  Min trace length: {min(trace_lengths)} activities")
    print(f"  Max trace length: {max(trace_lengths)} activities")
else:
    # min()/max() raise ValueError on an empty sequence, so report it instead.
    print("  No traces were generated; length statistics are unavailable.")


## 5. Convert Traces to Event Log Format


In [None]:
print("Converting traces to event log format...")
df_generated = generator.traces_to_dataframe(traces)

# Write the generated log next to the other benchmark outputs.
generated_csv = os.path.join(OUTPUT_DIR, "generated_log.csv")
df_generated.to_csv(generated_csv, index=False)

generated_case_count = df_generated['case:concept:name'].nunique()
print(f"✓ Generated log saved: {len(df_generated)} events, {generated_case_count} cases")
print(f"  Saved to: {generated_csv}")

# Show the first rows as a quick sanity check of the converted log.
print(f"\nSample generated events (first 5):")
preview = df_generated.head()
preview_columns = zip(
    preview['time:timestamp'],
    preview['case:concept:name'],
    preview['concept:name'],
)
for timestamp, case_id, activity in preview_columns:
    print(f"  [{timestamp}] {case_id}: {activity}")


## 6. Run Benchmark Comparison


In [None]:
if PM4PY_AVAILABLE:
    rule = "=" * 60

    # Header naming the two CSV logs being compared
    print(rule)
    print("TRACE GENERATOR BENCHMARK")
    print(rule)
    print(f"Original log: {original_csv}")
    print(f"Generated log: {generated_csv}")
    print(rule)
    print()

    # The benchmark is built from the two CSV paths written by earlier cells.
    benchmark = SimulationBenchmark(original_csv, generated_csv)

    print("Computing metrics...")
    results = benchmark.compute_all_metrics()

    print("\n" + rule)
    print("SUMMARY")
    print(rule)
    benchmark.print_summary()
else:
    print("⚠ Cannot run benchmark without pm4py. Skipping...")


## 7. Visualize Key Metrics


In [None]:
def _plot_count_comparison(ax, orig_counts, gen_counts, labels, xlabel, title, width=0.35):
    """Draw side-by-side Original/Generated bars on *ax* for each label.

    orig_counts and gen_counts are count Series indexed by category; a label
    missing from one of them is drawn with height 0 for that log.
    """
    orig_values = [orig_counts.get(label, 0) for label in labels]
    gen_values = [gen_counts.get(label, 0) for label in labels]

    positions = np.arange(len(labels))
    ax.bar(positions - width/2, orig_values, width, label='Original', alpha=0.7)
    ax.bar(positions + width/2, gen_values, width, label='Generated', alpha=0.7)
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.set_title(title)
    ax.set_xticks(positions)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3, axis='y')


if not PM4PY_AVAILABLE:
    print("⚠ Cannot visualize without pm4py. Skipping...")
else:
    # Set up plotting style
    plt.style.use('seaborn-v0_8')
    sns.set_palette("husl")

    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Events per case
    orig_events_per_case = df_original.groupby('case:concept:name').size()
    gen_events_per_case = df_generated.groupby('case:concept:name').size()

    # Both histograms share one set of bin edges; with independent binning the
    # overlaid bars cover different ranges and cannot be compared directly.
    shared_bins = np.histogram_bin_edges(
        np.concatenate([orig_events_per_case.to_numpy(), gen_events_per_case.to_numpy()]),
        bins=30,
    )
    axes[0, 0].hist(orig_events_per_case, bins=shared_bins, alpha=0.7, label='Original', density=True)
    axes[0, 0].hist(gen_events_per_case, bins=shared_bins, alpha=0.7, label='Generated', density=True)
    axes[0, 0].set_xlabel('Events per Case')
    axes[0, 0].set_ylabel('Density')
    axes[0, 0].set_title('Events per Case Distribution')
    axes[0, 0].legend()
    axes[0, 0].grid(True, alpha=0.3)

    # Activity frequency: union of each log's 20 most frequent activities.
    # Counts are read from the full frequency tables, so an activity that is
    # top-20 in only one log still shows its real count in the other log
    # rather than 0.
    orig_act_freq = df_original['concept:name'].value_counts()
    gen_act_freq = df_generated['concept:name'].value_counts()
    top_activities = sorted(
        set(orig_act_freq.head(20).index) | set(gen_act_freq.head(20).index),
        # Most frequent in the original log first; name breaks ties so the
        # bar order is the same on every run.
        key=lambda act: (-orig_act_freq.get(act, 0), str(act)),
    )
    _plot_count_comparison(
        axes[0, 1], orig_act_freq, gen_act_freq, top_activities,
        xlabel='Activity', title='Top 20 Activity Frequencies',
    )

    # Start activities: the first activity value of each case, in row order
    orig_starts = df_original.groupby('case:concept:name')['concept:name'].first().value_counts()
    gen_starts = df_generated.groupby('case:concept:name')['concept:name'].first().value_counts()
    all_starts = sorted(set(orig_starts.index) | set(gen_starts.index), key=str)
    _plot_count_comparison(
        axes[1, 0], orig_starts, gen_starts, all_starts,
        xlabel='Start Activity', title='Start Activity Distribution',
    )

    # End activities: the last activity value of each case, in row order
    orig_ends = df_original.groupby('case:concept:name')['concept:name'].last().value_counts()
    gen_ends = df_generated.groupby('case:concept:name')['concept:name'].last().value_counts()
    all_ends = sorted(set(orig_ends.index) | set(gen_ends.index), key=str)
    _plot_count_comparison(
        axes[1, 1], orig_ends, gen_ends, all_ends,
        xlabel='End Activity', title='End Activity Distribution',
    )

    # Save before show(): showing first can leave an empty figure to save.
    plt.tight_layout()
    plot_path = os.path.join(OUTPUT_DIR, "benchmark_visualizations.png")
    plt.savefig(plot_path, dpi=150, bbox_inches='tight')
    print(f"✓ Visualizations saved to: {plot_path}")
    plt.show()


## 8. Export Benchmark Results


In [None]:
if PM4PY_AVAILABLE:
    # The benchmark results go into an Excel workbook in the output directory.
    output_path = os.path.join(OUTPUT_DIR, "benchmark_results.xlsx")
    print(f"Exporting results to: {output_path}")
    benchmark.export_results(output_path)

    # Closing banner listing every artifact written by this notebook
    rule = "=" * 60
    print("\n" + rule)
    print("BENCHMARK COMPLETE")
    print(rule)
    print(
        f"Results saved to: {output_path}",
        f"Generated log saved to: {generated_csv}",
        f"Original log saved to: {original_csv}",
        sep="\n",
    )
    print(f"Benchmarking completed at: {datetime.now()}")
else:
    print("⚠ Cannot export results without pm4py. Skipping...")
