In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from matching.glema.common.utils.plot_utils import ColorScheme

In [None]:
# Input CSV files for this analysis (paths are relative to the working directory).
# Size summary: provides the "project" and "bytes" columns used further down.
size_summary_file = "./generation_meta/dpdf/size_summary.csv"
# Benchmark timings: provides the "name", "process_name" and "process_time_sec" columns.
benchmark_file = "./generation_meta/dpdf/benchmark.csv"

In [None]:
def transform_project( x: str, prefix="dpdf" ):
    """
    Turn a project display name into its prefixed identifier.

    The name is lower-cased, every space becomes an underscore and every dot
    is removed; the result is returned as ``"<prefix>-<name>"``.

    :param x: project name to convert
    :param prefix: text placed in front of the converted name
    :return: the identifier string
    """
    slug = x.lower().replace( " ", "_" ).replace( ".", "" )
    return f"{prefix}-{slug}"

In [None]:
# Size summary: rewrite the project names into the prefixed identifier form
# produced by `transform_project`.
df_sizes = pd.read_csv( size_summary_file )
df_sizes = df_sizes.assign( project=df_sizes[ 'project' ].map( transform_project ) )

# Benchmarks: expose the "name" column as "project" so both frames share that column name.
df_benchmarks = pd.read_csv( benchmark_file ).rename( columns={ "name": "project" } )

In [None]:
# Inspect the benchmark rows (the CSV's "name" column is exposed as "project").
df_benchmarks

In [None]:
# Inspect the size summary ("project" holds the output of `transform_project`).
df_sizes

In [None]:
# Maps each reporting group to the `process_name` values (from the benchmark CSV) that
# are summed into it. The key order is reused by the plots for column and colour order.
# Process names that are not listed here end up without a group.
process_groups: dict[ str, list[ str ] ] = {
    "Fetching": [
        "LoadPatternFileModule",
        "ReadPatternsModule",
        "AttachPatternsToContext",
        "LoadDatasetFileModule"
    ],
    "Translating": [
        "GenerateCpgModule",
        "TranslationToGraphModule"
    ],
    "Processing": [
        "RemoveBlacklistElementsModule",
        "FilterInternalScopeModule",
        "PropagateRecordScopeModule",
        "ComputeRecordPathsModule",
        "ComputeRecordInteractionsModule",
        "CpgFilterEdgesModule"
    ],
    "Persisting": [
        "MarkPatternsModule",
        "PersistCpgModule"
    ]
}

In [None]:
# Invert `process_groups` into a flat lookup: process name -> group name.
process_to_group = {
    proc: group
    for group, processes in process_groups.items()
    for proc in processes
}

# Tag every benchmark row with its group. Process names that are not listed in
# `process_groups` map to NaN and are therefore left out of the sums below
# (groupby drops NaN keys by default).
df_benchmarks[ 'process_group' ] = df_benchmarks[ 'process_name' ].map( process_to_group )

# Make sure the timings are numeric; values that cannot be parsed become NaN.
df_benchmarks[ 'process_time_sec' ] = pd.to_numeric( df_benchmarks[ 'process_time_sec' ], errors='coerce' )

# Total time per (project, process_group), pivoted so that every group is a column.
# Reindexing on the configured groups guarantees that every group column exists: a group
# that never occurs in the data becomes an all-zero column instead of raising a KeyError
# in the filter below. It also fixes the column order to the order of `process_groups`.
group_cols = list( process_groups.keys() )  # ['Fetching', 'Translating', 'Processing', 'Persisting']
df = (
    df_benchmarks
    .groupby( [ 'project', 'process_group' ] )[ 'process_time_sec' ]
    .sum()
    .unstack( fill_value=0 )
    .reindex( columns=group_cols, fill_value=0 )
    .reset_index()
)

# Keep only the projects that have a non-zero total in every group.
df = df[ (df[ group_cols ] != 0).all( axis=1 ) ]

# Attach the size columns per project (inner join: projects without a row in
# `df_sizes` are dropped).
df = pd.merge( df, df_sizes, on='project' )
df

In [None]:
# Box plot of the per-project time spent in each process group (log-scaled y-axis),
# annotated with median / std / min / max per group and saved to "plots/benchmark_groups".
custom_colors = [ ColorScheme.PRIMARY, ColorScheme.PRIMARY_LIGHT, ColorScheme.SECONDARY, ColorScheme.SECONDARY_LIGHT ]

# Figure tunables, named here instead of being scattered as magic numbers.
Y_TOP_FACTOR = 1.3 * 4  # upper y-limit = 5.2 x the global maximum (headroom above the boxes)
ANNOTATION_Y = 400  # y data coordinate of the bottom edge of every statistics box
# NOTE(review): ANNOTATION_Y is a fixed data coordinate. If global_max * Y_TOP_FACTOR is
# smaller than it, the statistics boxes are drawn above the axes area - re-tune both values
# when the dataset changes.

# Group order, materialised as a list so it can be iterated repeatedly and used as tick labels.
groups = list( process_groups.keys() )

# One series of per-project times per group, in group order.
data = [ df[ group ].dropna() for group in groups ]

fig, ax = plt.subplots( figsize=(10, 6) )
bp = ax.boxplot( data,
                 patch_artist=True,
                 tick_labels=groups,
                 showfliers=False,
                 medianprops=dict( color=ColorScheme.HIGHLIGHT, linewidth=2 ),
                 widths=0.8 )

# Colour the boxes in group order.
for patch, color in zip( bp[ 'boxes' ], custom_colors ):
    patch.set_facecolor( color )

ax.set_yscale( 'log' )

# Raise the upper y-limit well above the largest observed value.
all_data = pd.concat( data )
global_max = all_data.max()
ax.set_ylim( top=global_max * Y_TOP_FACTOR )

# Horizontal grid lines (major and minor) as a reading aid on the log axis.
ax.yaxis.grid( True, which='both', linewidth=0.2 )

# Annotate every box with summary statistics of its full series. Outliers are only hidden
# in the drawing (showfliers=False), so min/max may lie beyond the whiskers.
for x_pos, group_data in enumerate( data, start=1 ):  # boxes sit at x = 1, 2, ...
    annotation_text = (f"med: {group_data.median():.3f}\n"
                       f"std: {group_data.std():.3f}\n"
                       f"min: {group_data.min():.3f}\n"
                       f"max: {group_data.max():.3f}")
    ax.text( x_pos, ANNOTATION_Y, annotation_text,
             ha='center', va='bottom', fontsize=9,
             bbox=dict( facecolor='white', alpha=0.5, edgecolor='gray' ) )

ax.set_ylabel( "Process Time in Sec (Log Scale)" )
plt.tight_layout()

# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs( "plots", exist_ok=True )
plt.savefig( "plots/benchmark_groups" )

In [None]:
# Inspect the merged per-project frame (group timings plus the size-summary columns).
df

In [None]:
# Stack plot of the median time per process group across source-size bins,
# saved to "plots/benchmark_scaling".
groups = list( process_groups.keys() )

# Split the projects into `num_bins` quantile bins of the "bytes" column. pd.qcut produces
# bins that hold roughly the same number of projects; the bins are not of equal width.
num_bins = 20
df[ 'byte_interval' ] = pd.qcut( df[ 'bytes' ], q=num_bins )

# Median time of every process group within each size bin. `observed=False` pins the
# long-standing pandas default for categorical keys (every bin category is kept) and avoids
# the FutureWarning newer pandas versions emit when the argument is left implicit.
grouped = df.groupby( 'byte_interval', observed=False )[ groups ].median()

# Min-max normalise every group column independently to the range [0, 1].
normalized = grouped.copy()
for col in normalized.columns:
    min_val = normalized[ col ].min()
    max_val = normalized[ col ].max()
    # Avoid division by zero if all values are equal
    if max_val - min_val > 0:
        normalized[ col ] = (normalized[ col ] - min_val) / (max_val - min_val)
    else:
        normalized[ col ] = 0.0

# Express the normalised values as percentages. Every group is scaled against its own
# min/max, so the stacked total can exceed 100.
normalized_pct = normalized * 100

# x-axis: bin numbers 1, 2, ..., number of bins
x = np.arange( 1, len( normalized_pct ) + 1 )

# One array per process group, in group order (works for any number of groups).
stacked_series = [ normalized_pct[ group ].values for group in groups ]

fig, ax = plt.subplots( figsize=(10, 6) )

# `custom_colors` comes from the box-plot cell, so both figures share the group colours.
ax.stackplot( x, *stacked_series,
              labels=groups, colors=custom_colors )

ax.yaxis.grid( True, which='both', linewidth=0.2 )
ax.set_xlabel( "Source Code Memory Size Interval" )
ax.set_ylabel( "Median Process Time (%)" )
ax.legend( loc='upper left' )

# Set a specific number of x-axis ticks, spread evenly over the bin numbers.
desired_num_ticks = 20  # Change this to the number of ticks you want
ticks = np.linspace( 1, num_bins, desired_num_ticks, dtype=int )
ax.set_xticks( ticks )
ax.set_xticklabels( ticks )

plt.tight_layout()

# savefig does not create missing directories, so make sure the target folder exists.
os.makedirs( "plots", exist_ok=True )
plt.savefig( "plots/benchmark_scaling" )