In [4]:
import pandas as pd
import pm4py
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


from pm4py.objects.conversion.log import converter as log_converter
from pm4py.statistics.variants.log import get as variants_get
from pm4py.statistics.variants.log import get as variants_get_traces



In [5]:
print("MERGING ALL DATASETS")

# Number the variants 1..N in variant_df's current row order, unless the
# column already exists (e.g. this cell was run before). Later cells filter
# and rank on variant_index.
if 'variant_index' not in variant_df.columns:
    variant_df['variant_index'] = range(1, len(variant_df) + 1)

# Each lookup table merged below is expected to hold one row per key.
# validate='many_to_one' makes pandas raise MergeError when that does not
# hold, instead of silently duplicating rows of df in df_merged.

# Step 1: Merge df with case_variant_df (per-case variant information)
df_merged = df.merge(
    case_variant_df,
    left_on='case:concept:name',
    right_on='case_id',
    how='left',
    validate='many_to_one',
)

# Step 2: Merge with variant_df (per-variant statistics)
df_merged = df_merged.merge(
    variant_df[['variant', 'frequency', 'percentage', 'variant_index']],
    on='variant',
    how='left',
    validate='many_to_one',
)

# Step 3: Merge with performance metrics (per-case values)
df_merged = df_merged.merge(
    case_performance[['case:concept:name', 'cycle_time', 'total_waiting_time',
                      'lead_time', 'cycle_time_efficiency']],
    on='case:concept:name',
    how='left',
    validate='many_to_one',
)

# A left join against unique right-hand keys must keep the row count of df.
assert len(df_merged) == len(df), "merge changed the number of rows"

# Display sample
print("\nSample The Whole Dataset")
display(df_merged.head(10))

display(df_merged.columns)


## Variant Analysis

### Variant Filtering & Exploration

In [None]:
# df_merged has one row per event, with case-level columns repeated on every
# event of the case. Case-level questions (sections 3 and 5) are answered on
# a one-row-per-case view so that cases are not counted once per event.
case_level = df_merged.drop_duplicates(subset='case:concept:name')

# 1. Filter specific variant (e.g., variant #5)
filtered_df = df_merged.query('variant_index == 5')
print(f"Variant #5 has {len(filtered_df)} events")

# 2. Resources working on rare variants (>50th position)
rare_variants = df_merged[df_merged['variant_index'] > 50]
print("\nResources on rare variants:")
print(rare_variants['resource'].value_counts().head())

# 3. Relationship: variant length vs time
# The time metrics are aggregated here; the previous version averaged the
# 'frequency' column, which is not a time.
print("\nAverage time by variant length:")
print(case_level.groupby('variant_length')[['cycle_time', 'lead_time']].mean())

# 4. Most common activities in complex variants (length > 8)
complex_variants = df_merged[df_merged['variant_length'] > 8]
print("\nTop activities in complex variants:")
print(complex_variants['concept:name'].value_counts().head())

# 5. Compare standard vs exceptional cases
# standard / exceptional stay event-level frames; the printed figures count
# distinct cases, because len() of these frames would count events.
standard = df_merged[df_merged['variant_index'] == 1]
exceptional = df_merged[df_merged['variant_index'] > 20]
n_standard_cases = standard['case:concept:name'].nunique()
n_exceptional_cases = exceptional['case:concept:name'].nunique()
print(f"\nStandard cases: {n_standard_cases} | Exceptional cases: {n_exceptional_cases}")

### Performance Measurements by Variant

In [None]:
# Calculate summary statistics for each variant
performance_cols = ['cycle_time', 'total_waiting_time', 'lead_time', 'cycle_time_efficiency']

# The performance columns are per-case values repeated on every event row of
# df_merged. Aggregate over one row per case so that the statistics (std in
# particular) are not computed over repeated copies of the same case.
case_metrics = df_merged.drop_duplicates(subset='case:concept:name')

variant_perf_summary = (
    case_metrics
    .groupby("variant_index")[performance_cols]
    .agg(['mean', 'median', 'std'])
    .reset_index()
)
# Flatten the two-level columns: ('cycle_time', 'mean') -> 'cycle_time_mean'.
# The grouping key arrives as ('variant_index', ''), hence the rstrip.
variant_perf_summary.columns = ['_'.join(col).rstrip('_') for col in variant_perf_summary.columns.values]

print("Performance summary for variant #5:")
display(variant_perf_summary[variant_perf_summary['variant_index'] == 5])

In [None]:
# Rank variants on their mean efficiency and flag the troublesome ones

# Five variants with the highest mean cycle-time efficiency
most_efficient = variant_perf_summary.nlargest(n=5, columns='cycle_time_efficiency_mean')
print("Top 5 most efficient variants:")
print(most_efficient[['variant_index', 'cycle_time_efficiency_mean', 'lead_time_mean']])

# Variants whose mean waiting time exceeds 600 while mean efficiency is
# below 0.25 (both conditions must hold)
problematic = variant_perf_summary[
    (variant_perf_summary['total_waiting_time_mean'] > 600)
    & (variant_perf_summary['cycle_time_efficiency_mean'] < 0.25)
]
print("\nProblematic variants (high waiting, low efficiency):")
print(problematic[['variant_index', 'total_waiting_time_mean', 'cycle_time_efficiency_mean']])

### Visualization Functions

In [None]:
def plot_variant_performance(metric='cycle_time_median', top_n=10):
    """
    Bar plot of one performance metric for the top-N most frequent variants.

    Variants are selected from ``variant_df`` by descending ``percentage``;
    the bars are then drawn in descending order of ``metric``.

    Parameters
    ----------
    metric : str
        Column of ``variant_perf_summary`` to plot, e.g. 'cycle_time_median'.
    top_n : int
        Number of most frequent variants to include.

    Raises
    ------
    ValueError
        If ``metric`` is not a column of ``variant_perf_summary``.
    """
    if metric not in variant_perf_summary.columns:
        raise ValueError(f"{metric} not found. Available: {list(variant_perf_summary.columns)}")

    # Get top N variants by frequency. variant_df is the frame that carries
    # the 'percentage' and 'variant_index' columns used elsewhere in this
    # notebook.
    top_variants = (
        variant_df
        .sort_values('percentage', ascending=False)
        .head(top_n)['variant_index']
        .tolist()
    )

    df_plot = (
        variant_perf_summary[variant_perf_summary['variant_index'].isin(top_variants)]
        .sort_values(metric, ascending=False)
    )
    metric_label = metric.replace('_', ' ').title()

    fig, ax = plt.subplots(figsize=(8, 4))
    sns.barplot(
        data=df_plot,
        y=metric,
        x='variant_index',
        # seaborn sorts numeric categories by value, which would undo the
        # sort above; pass the bar order explicitly.
        order=df_plot['variant_index'].tolist(),
        ax=ax,
    )
    ax.set_title(f"{metric_label} for the Top {top_n} Most Frequent Variants")
    ax.set_xlabel("Variant Index")
    ax.set_ylabel(metric_label)
    fig.tight_layout()
    plt.show()

# Example usage
plot_variant_performance(metric='cycle_time_median', top_n=5)

In [None]:
def plot_variant_violin(metric='processing_time', top_k=10, log_scale=False):
    """
    Violin plot showing the distribution of a metric across the top-K variants.

    Variants are selected from ``variant_df`` by descending ``percentage`` and
    drawn in that order (most frequent first). The distribution is taken over
    the rows of ``df_merged`` belonging to each variant.

    Parameters
    ----------
    metric : str
        Column of ``df_merged`` to plot on the y axis.
    top_k : int
        Number of most frequent variants to include.
    log_scale : bool
        If True, draw the y axis on a log scale.

    Raises
    ------
    ValueError
        If ``metric`` is not a column of ``df_merged``.
    """
    # Fail early with a readable message, like plot_variant_performance does.
    if metric not in df_merged.columns:
        raise ValueError(f"{metric} not found. Available: {list(df_merged.columns)}")

    # Get top K variants. variant_df is the frame that carries the
    # 'percentage' and 'variant_index' columns used elsewhere in this notebook.
    top_variant_indices = (
        variant_df
        .sort_values('percentage', ascending=False)
        .head(top_k)['variant_index']
        .tolist()
    )

    df_plot = df_merged[df_merged['variant_index'].isin(top_variant_indices)].copy()
    # An ordered categorical keeps the violins in frequency order rather than
    # in numeric order of the index.
    df_plot['variant_index'] = pd.Categorical(
        df_plot['variant_index'],
        categories=top_variant_indices,
        ordered=True
    )
    metric_label = metric.replace('_', ' ').title()

    fig, ax = plt.subplots(figsize=(12, 6))
    # NOTE(review): seaborn >= 0.13 renames `scale` to `density_norm` and
    # warns about `palette` without `hue`; the old spelling is kept so the
    # cell still runs on older seaborn versions.
    sns.violinplot(
        data=df_plot,
        x='variant_index',
        y=metric,
        inner='box',
        scale='width',
        palette='Blues',
        ax=ax,
    )
    ax.set_title(f"Distribution of {metric_label} Across Top {top_k} Variants")
    ax.set_xlabel("Variant Index (most frequent first)")
    ax.set_ylabel(metric_label)
    if log_scale:
        ax.set_yscale('log')
    fig.tight_layout()
    plt.show()

# Example usage
plot_variant_violin(metric='waiting_time', top_k=5)

### Step 6: Time Visualization