# Process Simulation Engine - Demo

This notebook demonstrates the simulation engine capabilities:
- Running a simulation with several hundred cases
- Displaying the generated event log
- Computing basic metrics (case duration, activities, events per case)
- Visualizing the process with a DFG (Directly-Follows Graph)
- Comprehensive statistics analysis

## 1. Setup and Configuration


In [None]:
import sys
import os
from pathlib import Path
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Add project root to path.
# Path() is the current working directory, so this only points at the project
# root when the notebook is started from there. The `integration`,
# `simulation` and `resources` imports below presumably resolve through this
# sys.path entry (verify if the notebook is moved).
project_root = Path().resolve()
sys.path.insert(0, str(project_root))

# Import simulation components
from integration.config import SimulationConfig
from integration.setup import setup_simulation
from simulation.engine import DESEngine
from simulation.log_exporter import LogExporter
from resources import ResourceAllocator

# pm4py is optional: remember whether it could be imported so that the
# pm4py-only steps can be skipped when it is missing.
try:
    import pm4py
except ImportError:
    PM4PY_AVAILABLE = False
    print("WARNING: pm4py not available. Install with: pip install pm4py")
else:
    PM4PY_AVAILABLE = True

# Demo settings
NUM_CASES = 500  # number of cases simulated in this demo
EVENT_LOG_PATH = "eventlog/eventlog.xes.gz"
OUTPUT_DIR = "integration/output"

# Echo the effective settings, one per line
print(
    f"Project root: {project_root}",
    f"Event log path: {EVENT_LOG_PATH}",
    f"Number of cases to simulate: {NUM_CASES}",
    f"PM4Py available: {PM4PY_AVAILABLE}",
    sep="\n",
)


## 2. Load Event Log and Setup Simulation


In [None]:
def load_event_log(path: str) -> pd.DataFrame:
    """Load an event log from an XES (optionally gzipped) or CSV file.

    Args:
        path: Location of the log. The format is chosen from the file
            extension (``.xes``, ``.xes.gz`` or ``.csv``, matched
            case-insensitively). Path-like objects are accepted as well.

    Returns:
        The loaded log as a DataFrame.

    Raises:
        ValueError: If the extension is not one of the supported formats.
        ImportError: If an XES file is requested but pm4py is not installed.
    """
    path = str(path)
    lowered = path.lower()

    if lowered.endswith(('.xes', '.xes.gz')):
        # Import locally so CSV logs can still be loaded without pm4py, and so
        # a missing pm4py is reported clearly instead of as a NameError.
        try:
            import pm4py
        except ImportError as err:
            raise ImportError(
                "Reading XES files requires pm4py. Install with: pip install pm4py"
            ) from err
        log = pm4py.read_xes(path)
        df = pm4py.convert_to_dataframe(log)
    elif lowered.endswith('.csv'):
        df = pd.read_csv(path)
    else:
        raise ValueError(f"Unsupported file format: {path}")

    # A CSV may not use the XES column names; only report the case count when
    # the case id column is present instead of failing with a KeyError.
    if 'case:concept:name' in df.columns:
        print(f"Loaded event log: {len(df)} events, {df['case:concept:name'].nunique()} cases")
    else:
        print(f"Loaded event log: {len(df)} events")
    return df

# Read the original event log
print(f"Loading event log from: {EVENT_LOG_PATH}")
df_original = load_event_log(EVENT_LOG_PATH)

# The simulation starts at the earliest timestamp found in the log; a log
# without a timestamp column falls back to a fixed date (2016-01-04 08:00).
start_date = datetime(2016, 1, 4, 8, 0)
if 'time:timestamp' in df_original.columns:
    earliest_event = pd.to_datetime(df_original['time:timestamp']).min()
    start_date = earliest_event.to_pydatetime()

print(f"Simulation start date: {start_date}")


## 3. Run Simulation


In [None]:
# Build the configuration from the "all advanced" preset (chosen for better results)
config = SimulationConfig.all_advanced(event_log_path=EVENT_LOG_PATH, num_cases=NUM_CASES)

section_rule = "=" * 60
print(section_rule)
print("SIMULATION CONFIGURATION")
print(section_rule)
for setting_label, setting_attr in (
    ("Processing time mode", "processing_time_mode"),
    ("Case arrival mode", "case_arrival_mode"),
    ("Case attribute mode", "case_attribute_mode"),
    ("Number of cases", "num_cases"),
):
    print(f"  {setting_label}: {getattr(config, setting_attr)}")
print(section_rule)

# Resource allocator, built from the same event log
print("\nCreating resource allocator...")
allocator = ResourceAllocator(log_path=EVENT_LOG_PATH)
print("Resource allocator created")

# Predictors and arrival timestamps
print("\nSetting up predictors...")
arrivals, next_act_pred, proc_pred, attr_pred = setup_simulation(
    config, df=df_original, start_date=start_date
)
print(f"Generated {len(arrivals)} arrival timestamps")

# The engine must not start after the first arrival: use the first arrival
# as the start time when it lies before the configured start date.
engine_start_time = start_date
if arrivals and len(arrivals) > 0 and arrivals[0] < start_date:
    engine_start_time = arrivals[0]
    print(f"Adjusting simulation start time to first arrival: {engine_start_time}")

# Wire everything into the engine and run one case per arrival timestamp
print("\nInitializing DESEngine...")
engine = DESEngine(
    resource_allocator=allocator,
    arrival_timestamps=arrivals,
    next_activity_predictor=next_act_pred,
    processing_time_predictor=proc_pred,
    case_attribute_predictor=attr_pred,
    start_time=engine_start_time,
)

print("\nRunning simulation...")
events = engine.run(num_cases=len(arrivals))

print(f"\n{section_rule}")
print("SIMULATION RESULTS")
print(section_rule)
for stat_label, stat_key in (
    ("Cases started", "cases_started"),
    ("Cases completed", "cases_completed"),
):
    print(f"  {stat_label}: {engine.stats[stat_key]}")
print(f"  Events generated: {len(events)}")
for stat_label, stat_key in (
    ("Outside hours", "outside_hours_count"),
    ("No eligible", "no_eligible_failures"),
):
    print(f"  {stat_label}: {engine.stats[stat_key]}")
print(section_rule)


## 4. Export and Display Event Log


In [None]:
# Write the simulated events to disk: CSV always, XES only when pm4py is present
os.makedirs(OUTPUT_DIR, exist_ok=True)

csv_path = os.path.join(OUTPUT_DIR, "simulated_log.csv")
xes_path = os.path.join(OUTPUT_DIR, "simulated_log.xes")

LogExporter.to_csv(events, csv_path)
print(f"Exported CSV to: {csv_path}")

if PM4PY_AVAILABLE:
    # The XES export is best effort: a failure is reported, not raised
    try:
        LogExporter.to_xes(events, xes_path)
    except Exception as e:
        print(f"Could not export XES: {e}")
    else:
        print(f"Exported XES to: {xes_path}")

# Keep the events in a DataFrame for the analysis that follows
df_simulated = pd.DataFrame(events)

# Preview: first rows, restricted to the core columns that actually exist
preview_rule = "=" * 60
print(f"\n{preview_rule}\nSAMPLE EVENT LOG (First 10 rows)\n{preview_rule}")
display_cols = ['case:concept:name', 'concept:name', 'time:timestamp', 'org:resource']
available_cols = [name for name in display_cols if name in df_simulated.columns]
preview = df_simulated[available_cols].head(10)
print(preview.to_string())

print(f"\nTotal columns in event log: {len(df_simulated.columns)}")
print(f"Column names: {list(df_simulated.columns)}")


## 5. Basic Metrics

In [None]:
# Parse the timestamp column so that date arithmetic works below
if 'time:timestamp' in df_simulated.columns:
    df_simulated['time:timestamp'] = pd.to_datetime(df_simulated['time:timestamp'])


def _elapsed_seconds(timestamps):
    """Return the seconds between the earliest and the latest timestamp."""
    return (timestamps.max() - timestamps.min()).total_seconds()


# All three metrics are per case, so group once and reuse the grouping
events_by_case = df_simulated.groupby('case:concept:name')
case_durations = events_by_case['time:timestamp'].agg(_elapsed_seconds)  # seconds
events_per_case = events_by_case.size()
activities_per_case = events_by_case['concept:name'].nunique()  # unique activities

# Text summary: mean/std always with two decimals, min/max with two decimals
# for durations and unformatted for the two count metrics.
metrics_rule = "=" * 60
print(metrics_rule)
print("BASIC METRICS")
print(metrics_rule)
for heading, values, extreme_spec in (
    ("Case Duration (seconds)", case_durations, ".2f"),
    ("Events per Case", events_per_case, ""),
    ("Activities per Case (unique)", activities_per_case, ""),
):
    print(f"\n{heading}:")
    print(f"  Mean: {values.mean():.2f}")
    print(f"  Std:  {values.std():.2f}")
    print(f"  Min:  {format(values.min(), extreme_spec)}")
    print(f"  Max:  {format(values.max(), extreme_spec)}")
print(metrics_rule)

# One histogram per metric, side by side; a colour of None keeps matplotlib's default
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
histogram_panels = (
    (case_durations / 3600, 'Case Duration (hours)', 'Case Duration Distribution', None),
    (events_per_case, 'Number of Events', 'Events per Case Distribution', 'orange'),
    (activities_per_case, 'Number of Unique Activities', 'Activities per Case Distribution', 'green'),
)
for panel, (panel_values, x_label, panel_title, bar_colour) in zip(axes, histogram_panels):
    panel.hist(panel_values, bins=30, edgecolor='black', alpha=0.7, color=bar_colour)
    panel.set_xlabel(x_label)
    panel.set_ylabel('Frequency')
    panel.set_title(panel_title)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()


## 6. DFG Visualization (Directly Follows Graph)


In [None]:
if PM4PY_AVAILABLE:
    from IPython.display import Image, display

    # The PNGs are written below, so make sure the target folder exists even
    # when this cell is run on its own.
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    # Frequency DFG: how often one activity directly follows another
    dfg_freq, start_activities, end_activities = pm4py.discover_dfg(df_simulated)

    # Events per activity; kept for inspection, not passed to pm4py
    activity_counts = df_simulated['concept:name'].value_counts().to_dict()

    # pm4py.save_vis_dfg takes the DFG, its start/end activities and the target
    # path, and renders the image itself (there is no separate "visualize" step
    # in pm4py's simplified interface).
    print("Creating Frequency DFG...")
    freq_output = os.path.join(OUTPUT_DIR, "dfg_frequency.png")
    pm4py.save_vis_dfg(dfg_freq, start_activities, end_activities, freq_output)
    print(f"Frequency DFG saved to: {freq_output}")

    print("\nCreating Performance DFG (with timing)...")

    # Mean time (in hours) between an event and the previous event of the same
    # case, per activity. The first event of each case has no predecessor and is
    # ignored. Kept for inspection, not passed to pm4py.
    df_sorted = df_simulated.sort_values(['case:concept:name', 'time:timestamp'])
    df_sorted['time_diff'] = df_sorted.groupby('case:concept:name')['time:timestamp'].diff()
    gap_hours = df_sorted['time_diff'].dt.total_seconds() / 3600
    serv_time = (
        gap_hours.groupby(df_sorted['concept:name'], sort=False).mean().dropna().to_dict()
    )

    # Performance DFG: edges carry the elapsed time between the two activities.
    # This needs discover_performance_dfg; discover_dfg only yields frequencies.
    dfg_perf, start_activities, end_activities = pm4py.discover_performance_dfg(df_simulated)
    perf_output = os.path.join(OUTPUT_DIR, "dfg_performance.png")
    pm4py.save_vis_performance_dfg(dfg_perf, start_activities, end_activities, perf_output)
    print(f"Performance DFG saved to: {perf_output}")

    # Display the graphs
    print("\n" + "=" * 60)
    print("FREQUENCY DFG")
    print("=" * 60)
    display(Image(freq_output))

    print("\n" + "=" * 60)
    print("PERFORMANCE DFG (with timing)")
    print("=" * 60)
    display(Image(perf_output))
else:
    print("PM4Py not available. Skipping DFG visualization.")


## 7. Comprehensive Statistics


In [None]:
# Collect the headline statistics of the simulated log in one dictionary
stats = {}

# How many cases and events were produced
stats['number_of_cases'] = df_simulated['case:concept:name'].nunique()
stats['number_of_events'] = len(df_simulated)

# Process variants = distinct activity sequences
if not PM4PY_AVAILABLE:
    # Without pm4py, count the distinct time-ordered activity sequences by hand
    variants_dict = {}
    for case_key, trace in df_simulated.groupby('case:concept:name'):
        sequence = tuple(trace.sort_values('time:timestamp')['concept:name'].tolist())
        variants_dict[sequence] = variants_dict.get(sequence, 0) + 1
    stats['number_of_process_variants'] = len(variants_dict)
else:
    variants = pm4py.get_variants(df_simulated)
    stats['number_of_process_variants'] = len(variants)

# Distinct case ids and distinct activity names
stats['number_of_case_labels'] = df_simulated['case:concept:name'].nunique()
stats['number_of_event_labels'] = df_simulated['concept:name'].nunique()

# Case length = number of events in a case
case_lengths = df_simulated.groupby('case:concept:name').size()
stats['mean_case_length'] = case_lengths.mean()
stats['std_case_length'] = case_lengths.std()

# Case durations (computed earlier, in seconds) expressed in three units
case_durations_seconds = case_durations
case_durations_minutes = case_durations_seconds / 60
case_durations_days = case_durations_seconds / (24 * 3600)
stats.update({
    'mean_case_duration_days': case_durations_days.mean(),
    'std_case_duration_days': case_durations_days.std(),
    'mean_case_duration_minutes': case_durations_minutes.mean(),
    'std_case_duration_minutes': case_durations_minutes.std(),
    'mean_case_duration_seconds': case_durations_seconds.mean(),
    'std_case_duration_seconds': case_durations_seconds.std(),
})


def _looks_categorical(column):
    """Heuristically decide whether an event attribute column is categorical."""
    if column.dtype == 'object':
        # Fewer than 50% distinct values suggests a categorical attribute
        return column.nunique() / len(df_simulated) < 0.5
    if column.dtype in ['int64', 'int32']:
        # Integer columns with only a few distinct values may be categorical too
        return column.nunique() < 20
    return False


# Every column except the three core XES columns is a candidate attribute
core_columns = ['case:concept:name', 'concept:name', 'time:timestamp']
categorical_attrs = [
    name
    for name in df_simulated.columns
    if name not in core_columns and _looks_categorical(df_simulated[name])
]
stats['number_of_categorical_event_attributes'] = len(categorical_attrs)

# Text report
report_rule = "=" * 60
print(report_rule)
print("COMPREHENSIVE STATISTICS")
print(report_rule)

print("\nBasic Counts:")
print(f"  Number of cases: {stats['number_of_cases']:,}")
print(f"  Number of events: {stats['number_of_events']:,}")
print(f"  Number of process variants: {stats['number_of_process_variants']:,}")
print(f"  Number of case labels: {stats['number_of_case_labels']:,}")
print(f"  Number of event labels (activities): {stats['number_of_event_labels']}")

print("\nCase Length Statistics:")
print(f"  Mean case length (events per case): {stats['mean_case_length']:.2f}")
print(f"  Std case length: {stats['std_case_length']:.2f}")

print("\nCase Duration Statistics:")
print(
    f"  Mean case duration: {stats['mean_case_duration_days']:.2f} days "
    f"({stats['mean_case_duration_minutes']:.2f} minutes, "
    f"{stats['mean_case_duration_seconds']:.2f} seconds)"
)
print(
    f"  Std case duration: {stats['std_case_duration_days']:.2f} days "
    f"({stats['std_case_duration_minutes']:.2f} minutes, "
    f"{stats['std_case_duration_seconds']:.2f} seconds)"
)

print("\nEvent Attributes:")
print(f"  Number of categorical event attributes: {stats['number_of_categorical_event_attributes']}")
if categorical_attrs:
    # List at most ten names and summarise the remainder
    listed_attrs, remaining_attrs = categorical_attrs[:10], categorical_attrs[10:]
    print(f"  Categorical attributes: {', '.join(listed_attrs)}")
    if remaining_attrs:
        print(f"  ... and {len(remaining_attrs)} more")

print(report_rule)

# The same figures as a two-column table, pre-formatted as strings
summary_rows = [
    ('Number of cases', f"{stats['number_of_cases']:,}"),
    ('Number of events', f"{stats['number_of_events']:,}"),
    ('Number of process variants', f"{stats['number_of_process_variants']:,}"),
    ('Number of case labels', f"{stats['number_of_case_labels']:,}"),
    ('Number of event labels (activities)', f"{stats['number_of_event_labels']}"),
    ('Mean case length', f"{stats['mean_case_length']:.2f}"),
    ('Std case length', f"{stats['std_case_length']:.2f}"),
    ('Mean case duration (days)', f"{stats['mean_case_duration_days']:.2f}"),
    ('Std case duration (days)', f"{stats['std_case_duration_days']:.2f}"),
    ('Mean case duration (minutes)', f"{stats['mean_case_duration_minutes']:.2f}"),
    ('Std case duration (minutes)', f"{stats['std_case_duration_minutes']:.2f}"),
    ('Mean case duration (seconds)', f"{stats['mean_case_duration_seconds']:.2f}"),
    ('Std case duration (seconds)', f"{stats['std_case_duration_seconds']:.2f}"),
    ('Number of categorical event attributes', f"{stats['number_of_categorical_event_attributes']}"),
]
summary_df = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

print("\nSummary Table:")
display(summary_df)


## 8. Additional Visualizations


In [None]:
# Activity frequency (value_counts sorts most frequent first)
activity_freq = df_simulated['concept:name'].value_counts()

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Top-left: most frequent activities
top_n = 15
activity_freq.head(top_n).plot(kind='barh', ax=axes[0, 0], color='steelblue')
axes[0, 0].set_xlabel('Frequency')
axes[0, 0].set_title(f'Top {top_n} Most Frequent Activities')
axes[0, 0].grid(True, alpha=0.3, axis='x')

# Top-right: case duration distribution in days, with the mean marked
axes[0, 1].hist(case_durations_days, bins=50, edgecolor='black', alpha=0.7, color='coral')
axes[0, 1].set_xlabel('Case Duration (days)')
axes[0, 1].set_ylabel('Frequency')
axes[0, 1].set_title('Case Duration Distribution (Days)')
axes[0, 1].axvline(stats['mean_case_duration_days'], color='red', linestyle='--', label=f"Mean: {stats['mean_case_duration_days']:.2f}")
axes[0, 1].legend()
axes[0, 1].grid(True, alpha=0.3)

# Bottom-left: events over time
df_simulated_sorted = df_simulated.sort_values('time:timestamp')
df_simulated_sorted['cumulative_events'] = range(1, len(df_simulated_sorted) + 1)
# Running number of distinct cases: count a case the first time it shows up in
# time order. (groupby().ngroup() numbers cases by sorted case id rather than
# by arrival, so its running maximum is not a running case count.)
is_first_event_of_case = ~df_simulated_sorted['case:concept:name'].duplicated()
df_simulated_sorted['cumulative_cases'] = is_first_event_of_case.cumsum()

axes[1, 0].plot(df_simulated_sorted['time:timestamp'], df_simulated_sorted['cumulative_events'], label='Cumulative Events', linewidth=2)
axes[1, 0].set_xlabel('Time')
axes[1, 0].set_ylabel('Cumulative Count')
axes[1, 0].set_title('Cumulative Events Over Time')
axes[1, 0].legend()
axes[1, 0].grid(True, alpha=0.3)
axes[1, 0].tick_params(axis='x', rotation=45)

# Bottom-right: case length vs duration; both series are indexed by case id,
# so building the frame aligns them per case.
case_stats = pd.DataFrame({
    'length': case_lengths,
    'duration_days': case_durations_days
})

axes[1, 1].scatter(case_stats['length'], case_stats['duration_days'], alpha=0.5, s=20)
axes[1, 1].set_xlabel('Case Length (number of events)')
axes[1, 1].set_ylabel('Case Duration (days)')
axes[1, 1].set_title('Case Length vs Duration')
axes[1, 1].grid(True, alpha=0.3)

plt.tight_layout()

# Build the output path once and make sure its folder exists before saving
os.makedirs(OUTPUT_DIR, exist_ok=True)
analysis_path = os.path.join(OUTPUT_DIR, 'comprehensive_analysis.png')
plt.savefig(analysis_path, dpi=150, bbox_inches='tight')
print(f"Comprehensive analysis plot saved to: {analysis_path}")
plt.show()
