In [None]:
import sys

# Extend the module search path BEFORE the third-party imports below.
# Appending after the imports (as this cell previously did) cannot help them
# resolve on a fresh kernel. Because the directory is appended to the END of
# sys.path, any package that was already importable still resolves to the
# same location as before.
# NOTE(review): presumably this shared site-packages directory is where
# thicket is installed on the cluster -- confirm.
sys.path.append(
    "/scratch/group/csce-435-f25/python-3.10.8/lib/python3.10/site-packages")

import os  # noqa: E402
from glob import glob  # noqa: E402

import matplotlib.pyplot as plt  # noqa: E402
import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import thicket as th  # noqa: E402


# Show every row and column when a DataFrame is displayed.
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [None]:
# Collect every Caliper profile under cali-files/ and read them into one Thicket.
cali_files = glob("cali-files/*.cali")
tk = th.Thicket.from_caliperreader(cali_files)

# Flatten the performance data's index into columns, then attach each
# profile's metadata by matching the 'profile' column to the metadata index.
df_merged = tk.dataframe.reset_index().merge(
    tk.metadata, left_on='profile', right_index=True
)

# Working frame: every merged row whose data_type is not 'float'.
keep_rows = df_merged['data_type'] != 'float'
df = df_merged.loc[keep_rows]

In [None]:
# Order rows by experiment configuration and preview the first ten.
sort_keys = ['input_size', 'num_procs', 'data_type', 'input_type']
df_sorted = df.sort_values(by=sort_keys)
df_sorted.head(10)

In [None]:
# Show the distinct values of the 'name' column (the plotting cells select
# rows by comparing this column against their region names).
print(df['name'].unique())

In [None]:
# Print the output of tk.tree() using "Avg time/rank" as the metric column.
print(tk.tree(metric_column="Avg time/rank"))

- Experiment 1: 2 nodes, 64 processes, vary number of elements
- Experiment 2 (strong scaling): 2^22 elements, vary number of processes 
- Experiment 3 (weak scaling): (2^16 elements, 16 processes), (2^18 elements, 32 processes), (2^20 elements, 64 processes), (2^22 elements, 128 processes), (2^24 elements, 256 processes), (2^26 elements, 512 processes), (2^28 elements, 1024 processes)
- Experiment 4 (vary data types): 2^22 elements, 64 processes, double/int 
- Experiment 5 (vary initial sort level): 2^22 elements, 64 processes, (sorted/perturbed/random/reversed)
- Unless stated otherwise, assume a random initial sort level

In [None]:
# === Merge Sort Caliper Analysis with Thicket ===

# --- Shared configuration for the experiment cells ---
# Nothing is loaded here: this cell reuses df_sorted, which was sorted from
# the frame that already excludes rows with data_type == 'float'.
df = df_sorted
input_type_map = {0: 'Sorted', 1: 'Perturbed', 2: 'Random', 3: 'Reversed'}
regions = ['comp_large', 'comm', 'main']
input_sizes = [2**16, 2**18, 2**20, 2**22, 2**24, 2**26, 2**28]
input_types = [0, 1, 2, 3]
num_procs_list = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
# Plot the 'Total time' column when it exists, otherwise fall back to 'time'.
time_col = 'Total time' if 'Total time' in df.columns else 'time'

# --- Plot Style ---
os.makedirs('plots', exist_ok=True)
plt.style.use('default')
color_list = None # Use default matplotlib colors
marker_list = ['x', 'o', 's', 'D']
line_styles = ['-', '--', '-.', ':']

# --- Experiment 1: 2 nodes, 64 processes, vary number of elements ---
# One figure per region; one line per input type, plotted against
# log2(input size) at a fixed process count.
fixed_procs = 64
for region in regions:
    fig, ax = plt.subplots(figsize=(7, 6))
    for idx, input_type in enumerate(input_types):
        times, sizes = [], []
        for input_size in input_sizes:
            mask = (df['name'] == region) & \
                   (df['input_size'] == input_size) & \
                   (df['num_procs'] == fixed_procs) & \
                   (df['input_type'] == input_type)
            data = df[mask]
            # NOTE(review): the mask does not constrain data_type, so when
            # several rows match, the first one in df's row order is plotted.
            if len(data) > 0:
                times.append(data[time_col].iloc[0])
                sizes.append(input_size)
        if len(times) > 0:
            ax.plot([np.log2(s) for s in sizes], times,
                    marker=marker_list[idx], linestyle=line_styles[idx],
                    label=input_type_map[input_type])
    ax.set_xlabel('Input Size (log 2 scaled)')
    ax.set_ylabel('Time (s)')
    # Title and filename follow fixed_procs so they cannot drift from the
    # filter above (identical to the previous hard-coded text at 64).
    ax.set_title(f'{region} Time vs Input Size ({fixed_procs} procs)')
    ax.grid(True, alpha=0.3)
    ax.legend(title='Input Type', loc='best')
    plt.tight_layout()
    plt.savefig(f'plots/exp1_{region}_{fixed_procs}procs.png', dpi=150, bbox_inches='tight')
    plt.show()
    # Release the figure explicitly; backends other than the inline one keep
    # it open after show().
    plt.close(fig)





In [None]:
# --- Experiment 2: Strong scaling (2^22 elements, vary number of processes) ---
# One figure per region; one line per input type, plotted against the
# process counts that have a matching row at the fixed input size.
fixed_size = 2**22
for region in regions:
    fig, ax = plt.subplots(figsize=(7, 6))
    at_size = (df['name'] == region) & (df['input_size'] == fixed_size)
    for style_idx, kind in enumerate(input_types):
        of_kind = at_size & (df['input_type'] == kind)
        points = []
        for n_ranks in num_procs_list:
            hits = df.loc[of_kind & (df['num_procs'] == n_ranks), time_col]
            if not hits.empty:
                # Only the first matching row contributes a point.
                points.append((n_ranks, hits.iloc[0]))
        if points:
            xs, ys = (list(axis) for axis in zip(*points))
            ax.plot(xs, ys,
                    marker=marker_list[style_idx],
                    linestyle=line_styles[style_idx],
                    label=input_type_map[kind])
    ax.set_xlabel('Number of Processes')
    ax.set_ylabel('Time (s)')
    ax.set_title(f'{region} Strong Scaling (2^22 elements)')
    ax.set_xscale('log', base=2)
    ax.grid(True, alpha=0.3)
    ax.legend(title='Input Type', loc='best')
    plt.tight_layout()
    out_path = f'plots/exp2_{region}_strong_scaling.png'
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# --- Experiment 3: Weak scaling ---
# Each pair is (input_size, num_procs); both grow together along the sweep.
weak_scaling_configs = [
    (2**16, 16), (2**18, 32), (2**20, 64),
    (2**22, 128), (2**24, 256), (2**26, 512), (2**28, 1024)
]
for region in regions:
    fig, ax = plt.subplots(figsize=(7, 6))
    in_region = df[df['name'] == region]
    for style_idx, kind in enumerate(input_types):
        of_kind = in_region[in_region['input_type'] == kind]
        xs, ys = [], []
        for n_elems, n_ranks in weak_scaling_configs:
            hits = of_kind[(of_kind['input_size'] == n_elems) &
                           (of_kind['num_procs'] == n_ranks)]
            if len(hits) == 0:
                continue
            # Only the first matching row contributes a point.
            xs.append(n_ranks)
            ys.append(hits[time_col].iloc[0])
        if ys:
            ax.plot(xs, ys,
                    marker=marker_list[style_idx],
                    linestyle=line_styles[style_idx],
                    label=input_type_map[kind])
    ax.set_xlabel('Number of Processes')
    ax.set_ylabel('Time (s)')
    ax.set_title(f'{region} Weak Scaling')
    ax.set_xscale('log', base=2)
    ax.grid(True, alpha=0.3)
    ax.legend(title='Input Type', loc='best')
    plt.tight_layout()
    out_path = f'plots/exp3_{region}_weak_scaling.png'
    plt.savefig(out_path, dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# --- Experiment 4: Data type comparison (one bar per data_type present in df) ---
# Grouped bar chart per region: one group per input type, one bar per data
# type, at the fixed_size / fixed_procs values set by the earlier experiment
# cells.
if 'data_type' in df.columns:
    # Missing data_type values can never match the equality filter below, so
    # drop them instead of reserving an empty bar slot for them.
    data_types = list(df['data_type'].dropna().unique())
    n_types = max(len(data_types), 1)
    # Keeps the previous 0.35 width for one or two data types and shrinks the
    # bars so a group still fits between ticks when there are more.
    width = min(0.35, 0.8 / n_types)
    for region in regions:
        fig, ax = plt.subplots(figsize=(7, 6))
        for idx, data_type in enumerate(data_types):
            times, slots = [], []
            for slot, input_type in enumerate(input_types):
                mask = (df['name'] == region) & \
                       (df['input_size'] == fixed_size) & \
                       (df['num_procs'] == fixed_procs) & \
                       (df['input_type'] == input_type) & \
                       (df['data_type'] == data_type)
                data = df[mask]
                if len(data) > 0:
                    times.append(data[time_col].iloc[0])
                    # Record the tick slot of this input type so the bar stays
                    # under its own label even when another input type has no
                    # data for this data type.
                    slots.append(slot)
            if len(times) > 0:
                x_pos = np.array(slots) + idx * width
                ax.bar(x_pos, times, width=width, label=data_type)
        ax.set_xlabel('Input Type')
        ax.set_ylabel('Time (s)')
        ax.set_title(f'{region} Data Type Comparison (2^22 elements, 64 procs)')
        # Centre each tick under its group of bars (width/2 for two data types).
        ax.set_xticks(np.arange(len(input_types)) + width * (n_types - 1) / 2)
        ax.set_xticklabels([input_type_map[t] for t in input_types])
        ax.legend(title='Data Type', loc='best')
        ax.grid(True, alpha=0.3, axis='y')
        plt.tight_layout()
        plt.savefig(f'plots/exp4_{region}_datatype.png', dpi=150, bbox_inches='tight')
        plt.show()
        # Release the figure explicitly; backends other than the inline one
        # keep it open after show().
        plt.close(fig)
else:
    print("Warning: 'data_type' column not found. Run experiments with both int and float.")



In [None]:
# --- Experiment 5: Input type comparison (2^22 elements, 64 processes) ---
# One bar chart per region: a bar for every input type that has a matching
# row at fixed_size / fixed_procs, annotated with its value.
for region in regions:
    fig, ax = plt.subplots(figsize=(7, 6))
    at_config = (
        (df['name'] == region)
        & (df['input_size'] == fixed_size)
        & (df['num_procs'] == fixed_procs)
    )
    found = []
    for kind in input_types:
        hits = df.loc[at_config & (df['input_type'] == kind), time_col]
        if not hits.empty:
            # Only the first matching row is used.
            found.append((input_type_map[kind], hits.iloc[0]))
    if not found:
        # Nothing to draw for this region; no file is written.
        continue
    labels = [label for label, _value in found]
    times = [value for _label, value in found]
    bars = ax.bar(labels, times)
    ax.set_ylabel('Time (s)')
    ax.set_title(f'{region} Input Type Comparison (2^22 elements, 64 procs)')
    ax.grid(True, alpha=0.3, axis='y')
    for bar in bars:
        top = bar.get_height()
        centre = bar.get_x() + bar.get_width()/2.
        ax.text(centre, top, f'{top:.3f}s', ha='center', va='bottom')
    plt.tight_layout()
    plt.savefig(f'plots/exp5_{region}_inputtype.png', dpi=150, bbox_inches='tight')
    plt.show()

In [None]:
# --- Configuration for the per-input-size / per-input-type plots ---
# NOTE(review): df_merged is the unsorted merge result and still contains the
# data_type == 'float' rows that the earlier `df` excluded. The masks below do
# not constrain data_type, so the first matching row in df_merged's order is
# what gets plotted -- confirm this is intended.
df = df_merged
regions = ['comp_large', 'comm', 'main']
input_types = [0, 1, 2, 3]
input_type_map = {0: 'Sorted', 1: 'Perturbed', 2: 'Random', 3: 'Reversed'}
input_sizes = [2**16, 2**18, 2**20, 2**22, 2**24, 2**26, 2**28]
num_procs_list = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
# Each pair is (input_size, num_procs).
weak_scaling_configs = [
    (2**16, 16), (2**18, 32), (2**20, 64),
    (2**22, 128), (2**24, 256), (2**26, 512), (2**28, 1024)
]
# Plot the 'Total time' column when it exists, otherwise fall back to 'time'.
time_col = 'Total time' if 'Total time' in df.columns else 'time'
plt.style.use('default')
marker_list = ['x', 'o', 's', 'D']
line_styles = ['-', '--', '-.', ':']
# This cell re-declares its own configuration, so make sure the output
# directory exists here too rather than relying on an earlier cell.
os.makedirs('plots', exist_ok=True)

# 1. Strong scaling plots for each input_size
# One figure per (region, input size); one line per input type.
for region in regions:
    for input_size in input_sizes:
        fig, ax = plt.subplots(figsize=(8, 6))
        for idx, input_type in enumerate(input_types):
            times, procs = [], []
            for num_proc in num_procs_list:
                mask = (df['name'] == region) & \
                       (df['input_size'] == input_size) & \
                       (df['num_procs'] == num_proc) & \
                       (df['input_type'] == input_type)
                data = df[mask]
                if len(data) > 0:
                    times.append(data[time_col].iloc[0])
                    procs.append(num_proc)
            if len(times) > 0:
                ax.plot(procs, times, marker=marker_list[idx], linestyle=line_styles[idx],
                        label=input_type_map[input_type])
        ax.set_xlabel('Number of Processes')
        ax.set_ylabel('Time (s)')
        ax.set_title(f'{region} Strong Scaling (Input Size={input_size})')
        ax.set_xscale('log', base=2)
        ax.grid(True, alpha=0.3)
        ax.legend(title='Input Type', loc='best')
        plt.tight_layout()
        plt.savefig(
            f'plots/strong_scaling_{region}_inputsize_{input_size}.png', dpi=150)
        plt.show()
        # This loop creates len(regions) * len(input_sizes) figures; close
        # each one so they do not accumulate on backends that keep figures
        # open after show().
        plt.close(fig)


In [None]:
# 2. Strong scaling speedup plot for each input_type
# One figure per (region, input type) at a fixed input size. Speedup is
# computed relative to the SMALLEST process count that has data, not to a
# single-process run, so the baseline is named on the y-axis.
fixed_size = 2**22
for region in regions:
    for idx, input_type in enumerate(input_types):
        times, procs = [], []
        for num_proc in num_procs_list:
            mask = (df['name'] == region) & \
                   (df['input_size'] == fixed_size) & \
                   (df['num_procs'] == num_proc) & \
                   (df['input_type'] == input_type)
            data = df[mask]
            if len(data) > 0:
                times.append(data[time_col].iloc[0])
                procs.append(num_proc)
        if len(times) == 0:
            # No measurements for this combination: skip it instead of saving
            # a blank figure (which also triggered an empty-legend warning).
            continue
        # num_procs_list is ascending, so the first collected point is the
        # baseline the other timings are compared against.
        baseline_procs = procs[0]
        speedup = [times[0] / t for t in times]
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(procs, speedup, marker=marker_list[idx], linestyle=line_styles[idx],
                label=input_type_map[input_type])
        ax.set_xlabel('Number of Processes')
        ax.set_ylabel(f'Speedup (relative to {baseline_procs} processes)')
        ax.set_title(
            f'{region} Strong Scaling Speedup ({input_type_map[input_type]})')
        ax.set_xscale('log', base=2)
        ax.grid(True, alpha=0.3)
        ax.legend(loc='best')
        plt.tight_layout()
        plt.savefig(
            f'plots/strong_scaling_speedup_{region}_inputtype_{input_type}.png', dpi=150)
        plt.show()
        # Release the figure explicitly; backends other than the inline one
        # keep it open after show().
        plt.close(fig)



In [None]:
# 3. Weak scaling plots for each input_type
# One figure per (region, input type): the time at every weak-scaling
# (input_size, num_procs) pairing that has a matching row.
for region in regions:
    in_region = df['name'] == region
    for style_idx, kind in enumerate(input_types):
        fig, ax = plt.subplots(figsize=(8, 6))
        of_kind = in_region & (df['input_type'] == kind)
        samples = []
        for n_elems, n_ranks in weak_scaling_configs:
            at_config = (
                of_kind
                & (df['input_size'] == n_elems)
                & (df['num_procs'] == n_ranks)
            )
            hits = df.loc[at_config, time_col]
            if not hits.empty:
                # Only the first matching row contributes a point.
                samples.append((n_ranks, hits.iloc[0]))
        if samples:
            xs, ys = (list(axis) for axis in zip(*samples))
            ax.plot(xs, ys,
                    marker=marker_list[style_idx],
                    linestyle=line_styles[style_idx],
                    label=input_type_map[kind])
        ax.set_xlabel('Number of Processes')
        ax.set_ylabel('Time (s)')
        ax.set_title(f'{region} Weak Scaling ({input_type_map[kind]})')
        ax.set_xscale('log', base=2)
        ax.grid(True, alpha=0.3)
        ax.legend(loc='best')
        plt.tight_layout()
        out_path = f'plots/weak_scaling_{region}_inputtype_{kind}.png'
        plt.savefig(out_path, dpi=150)
        plt.show()