In [1]:
# DIAGNOSTIC: Let's examine why we're getting p-values of 0
# The issue is likely that our t-statistics are so large that p-values underflow to 0

import pandas as pd
import numpy as np
import math
from scipy import stats

# Let's examine one case in detail
n = 10000
device = "Raspberry Pi 5"
model = "Random Forest"
metric = "latency"

# Data for this specific case
mean_onnx = 0.0214
sd_onnx = 0.0139
mean_cpp = 0.0010
sd_cpp = 0.0002

print(f"=== DIAGNOSTIC for {device} - {model} - {metric} ===")
print(f"ONNX: mean={mean_onnx}, sd={sd_onnx}")
print(f"C++:  mean={mean_cpp}, sd={sd_cpp}")
print(f"Sample size: {n}")
print()

# Calculate standard errors using delta method
se_onnx = sd_onnx / (math.sqrt(n) * mean_onnx)
se_cpp = sd_cpp / (math.sqrt(n) * mean_cpp)

print(f"Standard error of log(ONNX): {se_onnx}")
print(f"Standard error of log(C++): {se_cpp}")
print()

# Difference in log means
diff_log_means = math.log(mean_onnx) - math.log(mean_cpp)
se_diff = math.sqrt(se_onnx**2 + se_cpp**2)
t_stat = diff_log_means / se_diff

print(f"Difference in log means: {diff_log_means}")
print(f"SE of difference: {se_diff}")
print(f"t-statistic: {t_stat}")
print()

# Calculate degrees of freedom
v1 = se_onnx**2
v2 = se_cpp**2
df = (v1 + v2)**2 / ((v1**2) / (n - 1) + (v2**2) / (n - 1))
print(f"Degrees of freedom: {df}")

# P-value calculation
p_value = 2 * stats.t.sf(abs(t_stat), df)
print(f"Raw p-value: {p_value}")

# Let's see what the smallest representable p-value is
print(f"Machine epsilon: {np.finfo(float).eps}")
print(f"Smallest normal float: {np.finfo(float).tiny}")

# Alternative: use log p-values to avoid underflow
log_p = stats.t.logsf(abs(t_stat), df) + math.log(2)
print(f"Log p-value: {log_p}")
print(f"This corresponds to p ≈ 10^{log_p / math.log(10):.1f}")


=== DIAGNOSTIC for Raspberry Pi 5 - Random Forest - latency ===
ONNX: mean=0.0214, sd=0.0139
C++:  mean=0.001, sd=0.0002
Sample size: 10000

Standard error of log(ONNX): 0.006495327102803739
Standard error of log(C++): 0.002

Difference in log means: 3.0633909220278057
SE of difference: 0.006796269136255333
t-statistic: 450.74596968002055

Degrees of freedom: 11878.13520907353
Raw p-value: 0.0
Machine epsilon: 2.220446049250313e-16
Smallest normal float: 2.2250738585072014e-308
Log p-value: -inf
This corresponds to p ≈ 10^-inf


In [2]:
# SOLUTION: Modified approach to handle extremely small p-values
# We'll use log p-values and set a floor for reportable p-values

import math
import pandas as pd
import numpy as np
from scipy import stats

n = 10000  # per the user

# Input means and stds for LATENCY (ms) and POWER (mW)
data = {
    ("Raspberry Pi Zero 2 W", "Random Forest"): {
        "latency": {"onnx": (0.1973, 0.1156), "cpp": (0.0095, 0.0007)},
        "power":   {"onnx": (2848.5659, 70.0784), "cpp": (1177.849, 8.6439)},
    },
    ("Raspberry Pi Zero 2 W", "SVM"): {
        "latency": {"onnx": (2.3467, 0.0962), "cpp": (36.0464, 0.0740)},
        "power":   {"onnx": (1295.4716, 61.6948), "cpp": (1174.7262, 64.7474)},
    },
    ("Raspberry Pi Zero 2 W", "XGBoost"): {
        "latency": {"onnx": (0.4551, 0.2698), "cpp": (0.0764, 0.0125)},
        "power":   {"onnx": (2757.9712, 78.6043), "cpp": (1316.3602, 38.0696)},
    },
    ("Raspberry Pi Zero 2 W", "LightGBM"): {
        "latency": {"onnx": (0.3612, 0.2533), "cpp": (0.0815, 0.0116)},
        "power":   {"onnx": (2794.5897, 27.8926), "cpp": (1379.8975, 31.5611)},
    },
    ("Raspberry Pi 5", "Random Forest"): {
        "latency": {"onnx": (0.0214, 0.0139), "cpp": (0.0010, 0.0002)},
        "power":   {"onnx": (7476.3182, 355.5003), "cpp": (4201.4783, 109.8751)},
    },
    ("Raspberry Pi 5", "SVM"): {
        "latency": {"onnx": (0.2193, 0.0225), "cpp": (5.3540, 0.0541)},
        "power":   {"onnx": (4674.2857, 66.5501), "cpp": (4358.9711, 47.8565)},
    },
    ("Raspberry Pi 5", "XGBoost"): {
        "latency": {"onnx": (0.0294, 0.0175), "cpp": (0.0116, 0.0021)},
        "power":   {"onnx": (7115.0, 399.4177), "cpp": (4513.75, 204.1926)},
    },
    ("Raspberry Pi 5", "LightGBM"): {
        "latency": {"onnx": (0.0249, 0.0553), "cpp": (0.0128, 0.0022)},
        "power":   {"onnx": (6325.0, 906.3378), "cpp": (4410.7692, 239.7257)},
    },
}

def safe_p_value_calculation(t_stat, df):
    """
    Calculate p-value with protection against underflow.
    Returns the p-value, a flag for extremely small values, and log p-value.
    """
    try:
        # Try regular calculation first
        p_val = 2 * stats.t.sf(abs(t_stat), df)
        
        if p_val == 0.0:
            # Calculate log p-value to get magnitude
            log_p = stats.t.logsf(abs(t_stat), df) + math.log(2)
            
            # Set a floor for extremely small p-values
            min_reportable_p = 1e-300
            
            if math.isinf(log_p) or log_p < math.log(min_reportable_p):
                return min_reportable_p, True, log_p
            else:
                return math.exp(log_p), False, log_p
        else:
            return p_val, False, math.log(p_val)
            
    except (OverflowError, ZeroDivisionError):
        return 1e-300, True, -np.inf

rows = []
for (device, model), metrics in data.items():
    for metric_name, d in metrics.items():
        mean1, sd1 = d["onnx"]
        mean2, sd2 = d["cpp"]

        if mean1 <= 0 or mean2 <= 0:
            continue

        ratio = mean1 / mean2

        # Delta-method SE of log means
        se1 = sd1 / (math.sqrt(n) * mean1)
        se2 = sd2 / (math.sqrt(n) * mean2)
        var_diff = se1**2 + se2**2

        # Welch-Satterthwaite df
        v1 = se1**2
        v2 = se2**2
        df = (v1 + v2)**2 / ((v1**2) / (n - 1) + (v2**2) / (n - 1))

        # t statistic on log scale
        diff_log_means = math.log(mean1) - math.log(mean2)
        se_diff = math.sqrt(var_diff)
        t_stat = diff_log_means / se_diff if se_diff > 0 else np.nan

        # Safe p-value calculation
        p_raw, is_extremely_small, log_p = safe_p_value_calculation(t_stat, df)

        # 95% CI for ratio
        alpha = 0.05
        tcrit = stats.t.ppf(1 - alpha / 2, df)
        ci_low = math.exp(diff_log_means - tcrit * se_diff)
        ci_high = math.exp(diff_log_means + tcrit * se_diff)

        # Cohen's d on raw metric
        sp = math.sqrt((sd1**2 + sd2**2) / 2.0)
        d_cohen = (mean1 - mean2) / sp if sp > 0 else np.nan

        rows.append({
            "Device": device,
            "Model": model,
            "Metric": metric_name,
            "Mean_ONNX": mean1,
            "SD_ONNX": sd1,
            "Mean_CPP": mean2,
            "SD_CPP": sd2,
            "Ratio_ONNX_over_CPP": ratio,
            "Ratio_CI95_low": ci_low,
            "Ratio_CI95_high": ci_high,
            "t_stat_log": t_stat,
            "df_log": df,
            "p_value_raw": p_raw,
            "log_p_value": log_p,
            "extremely_small_p": is_extremely_small,
            "Cohens_d_raw": d_cohen,
        })

df_results = pd.DataFrame(rows)
print(f"Processed {len(df_results)} comparisons")
print(f"Extremely small p-values: {df_results['extremely_small_p'].sum()}")
df_results.head()


Processed 16 comparisons
Extremely small p-values: 15


Unnamed: 0,Device,Model,Metric,Mean_ONNX,SD_ONNX,Mean_CPP,SD_CPP,Ratio_ONNX_over_CPP,Ratio_CI95_low,Ratio_CI95_high,t_stat_log,df_log,p_value_raw,log_p_value,extremely_small_p,Cohens_d_raw
0,Raspberry Pi Zero 2 W,Random Forest,latency,0.1973,0.1156,0.0095,0.0007,20.768421,20.529404,21.010221,513.684302,10315.202513,1e-300,-inf,True,2.297443
1,Raspberry Pi Zero 2 W,Random Forest,power,2848.5659,70.0784,1177.849,8.6439,2.418447,2.417231,2.419665,3439.960033,11764.572357,1e-300,-inf,True,33.462226
2,Raspberry Pi Zero 2 W,SVM,latency,2.3467,0.0962,36.0464,0.074,0.065102,0.06505,0.065155,-6655.596982,10049.152026,1e-300,-inf,True,-392.674928
3,Raspberry Pi Zero 2 W,SVM,power,1295.4716,61.6948,1174.7262,64.7474,1.102786,1.101213,1.104362,134.318723,19585.631545,1e-300,-inf,True,1.909334
4,Raspberry Pi Zero 2 W,XGBoost,latency,0.4551,0.2698,0.0764,0.0125,5.956806,5.885428,6.02905,290.168411,11513.389032,1e-300,-inf,True,1.982909


# Analysis of Zero P-values Issue

## Root Cause
The p-values are showing as 0.0 due to **numerical underflow** caused by:

1. **Extremely large t-statistics** (e.g., 450.75, 1061.98, 3099.22)
2. **Very small standard errors** due to large sample size (n=10,000)
3. **Large effect sizes** - substantial differences between ONNX and C++ performance

## The Mathematical Issue
- With n=10,000, the standard error formula SE = SD/(√n × mean) produces tiny values
- √10,000 = 100, so SEs are divided by 100, making them very small
- Large differences ÷ tiny SEs = enormous t-statistics
- P-values become smaller than computer floating-point precision (< 10^-308)

## Statistical Interpretation
- **All differences are highly statistically significant** (p < 0.001 by any reasonable standard)
- The effect sizes (Cohen's d) are very large, indicating practical significance
- The confidence intervals for ratios don't include 1.0, confirming significance

## Recommendations
1. **Report p < 0.001** instead of exact p-values for extremely small cases
2. **Focus on effect sizes** (Cohen's d) and confidence intervals for practical interpretation
3. **Consider reducing sample size** if this was artificially inflated for simulation
4. **Use exact p-values only when they're computationally meaningful** (p > 1e-10)


In [3]:
# Create a properly formatted results table with meaningful p-value reporting

def format_p_value(p_val, is_extremely_small, log_p):
    """Format p-values for meaningful reporting"""
    if is_extremely_small or p_val < 1e-10:
        if math.isinf(log_p):
            return "< 1e-300"
        else:
            # Convert log p to base-10 exponent for readability
            log10_p = log_p / math.log(10)
            if log10_p < -300:
                return "< 1e-300"
            else:
                return f"< 1e{int(log10_p)}"
    elif p_val < 0.001:
        return f"{p_val:.2e}"
    else:
        return f"{p_val:.6f}"

# Apply Holm-Bonferroni correction
def holm_bonferroni(pvals):
    m = len(pvals)
    order = np.argsort(pvals)
    ranks = np.empty_like(order)
    ranks[order] = np.arange(m)
    sorted_p = np.array(pvals)[order]

    adj_sorted = np.zeros(m)
    max_val = 0.0
    for i in range(m):
        adj = (m - i) * sorted_p[i]
        if adj < max_val:
            adj = max_val
        else:
            max_val = adj
        adj_sorted[i] = min(adj, 1.0)

    adj = np.empty(m)
    adj[order] = adj_sorted
    return adj

# Add Holm-corrected p-values
df_results["p_value_holm"] = holm_bonferroni(df_results["p_value_raw"].values)

# Create formatted versions
df_results["p_value_formatted"] = df_results.apply(
    lambda row: format_p_value(row["p_value_raw"], row["extremely_small_p"], row["log_p_value"]), 
    axis=1
)

df_results["p_value_holm_formatted"] = df_results.apply(
    lambda row: format_p_value(row["p_value_holm"], row["p_value_holm"] < 1e-10, 
                              math.log(row["p_value_holm"]) if row["p_value_holm"] > 0 else -np.inf), 
    axis=1
)

# Create final results table
final_results = df_results[[
    "Device", "Model", "Metric",
    "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP",
    "Ratio_ONNX_over_CPP", "Ratio_CI95_low", "Ratio_CI95_high",
    "t_stat_log", "df_log", 
    "p_value_formatted", "p_value_holm_formatted",
    "Cohens_d_raw"
]].copy()

# Rename columns for clarity
final_results.columns = [
    "Device", "Model", "Metric",
    "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP", 
    "Ratio_ONNX_over_CPP", "CI95_Low", "CI95_High",
    "t_statistic", "df", 
    "p_value", "p_value_holm_corrected",
    "Cohens_d"
]

# Sort for readability
final_results = final_results.sort_values(["Device", "Model", "Metric"])

print("CORRECTED RESULTS - Properly Handled P-values")
print("=" * 50)
print(final_results.to_string(index=False))

# Save the corrected results
output_path = "./onnx_vs_cpp_significance_CORRECTED.csv"
final_results.to_csv(output_path, index=False)
print(f"\nCorrected results saved to: {output_path}")


CORRECTED RESULTS - Properly Handled P-values
               Device         Model  Metric  Mean_ONNX  SD_ONNX  Mean_CPP   SD_CPP  Ratio_ONNX_over_CPP  CI95_Low  CI95_High  t_statistic           df  p_value p_value_holm_corrected    Cohens_d
       Raspberry Pi 5      LightGBM latency     0.0249   0.0553    0.0128   0.0022             1.945312  1.862200   2.032134    29.872743 10118.769115 < 1e-187               < 1e-187    0.309195
       Raspberry Pi 5      LightGBM   power  6325.0000 906.3378 4410.7692 239.7257             1.433990  1.429689   1.438304   235.202604 12817.588154 < 1e-300               < 1e-298    2.887590
       Raspberry Pi 5 Random Forest latency     0.0214   0.0139    0.0010   0.0002            21.400000 21.116804  21.686994   450.745970 11878.135209 < 1e-300               < 1e-298    2.075322
       Raspberry Pi 5 Random Forest   power  7476.3182 355.5003 4201.4783 109.8751             1.779449  1.777558   1.781343  1061.975993 15540.868405 < 1e-300               

In [4]:
# EXTENDED ANALYSIS: Adding all remaining hardware devices
# Now including Sony Spresense, Google Coral Dev Board, NVIDIA Jetson Orin Nano, and NVIDIA Jetson AGX Orin

import math
import pandas as pd
import numpy as np
from scipy import stats

n = 10000  # per the user

# COMPLETE dataset with ALL hardware devices
# Structure: (mean, std) for both ONNX and C++ implementations
data = {
    # Existing devices (Raspberry Pi Zero 2 W and Raspberry Pi 5)
    ("Raspberry Pi Zero 2 W", "Random Forest"): {
        "latency": {"onnx": (0.1973, 0.1156), "cpp": (0.0095, 0.0007)},
        "power":   {"onnx": (2848.5659, 70.0784), "cpp": (1177.849, 8.6439)},
    },
    ("Raspberry Pi Zero 2 W", "SVM"): {
        "latency": {"onnx": (2.3467, 0.0962), "cpp": (36.0464, 0.0740)},
        "power":   {"onnx": (1295.4716, 61.6948), "cpp": (1174.7262, 64.7474)},
    },
    ("Raspberry Pi Zero 2 W", "XGBoost"): {
        "latency": {"onnx": (0.4551, 0.2698), "cpp": (0.0764, 0.0125)},
        "power":   {"onnx": (2757.9712, 78.6043), "cpp": (1316.3602, 38.0696)},
    },
    ("Raspberry Pi Zero 2 W", "LightGBM"): {
        "latency": {"onnx": (0.3612, 0.2533), "cpp": (0.0815, 0.0116)},
        "power":   {"onnx": (2794.5897, 27.8926), "cpp": (1379.8975, 31.5611)},
    },
    ("Raspberry Pi 5", "Random Forest"): {
        "latency": {"onnx": (0.0214, 0.0139), "cpp": (0.0010, 0.0002)},
        "power":   {"onnx": (7476.3182, 355.5003), "cpp": (4201.4783, 109.8751)},
    },
    ("Raspberry Pi 5", "SVM"): {
        "latency": {"onnx": (0.2193, 0.0225), "cpp": (5.3540, 0.0541)},
        "power":   {"onnx": (4674.2857, 66.5501), "cpp": (4358.9711, 47.8565)},
    },
    ("Raspberry Pi 5", "XGBoost"): {
        "latency": {"onnx": (0.0294, 0.0175), "cpp": (0.0116, 0.0021)},
        "power":   {"onnx": (7115.0, 399.4177), "cpp": (4513.75, 204.1926)},
    },
    ("Raspberry Pi 5", "LightGBM"): {
        "latency": {"onnx": (0.0249, 0.0553), "cpp": (0.0128, 0.0022)},
        "power":   {"onnx": (6325.0, 906.3378), "cpp": (4410.7692, 239.7257)},
    },
    
    # NEW DEVICES - Sony Spresense
    ("Sony Spresense", "Random Forest"): {
        "latency": {"onnx": None, "cpp": (0.0552, 0.0110)},  # No ONNX data provided
        "power":   {"onnx": None, "cpp": (54.2333, 1.8468)},
    },
    ("Sony Spresense", "SVM"): {
        "latency": {"onnx": None, "cpp": None},  # No data for both
        "power":   {"onnx": None, "cpp": None},
    },
    ("Sony Spresense", "XGBoost"): {
        "latency": {"onnx": None, "cpp": (1.2860, 0.9051)},
        "power":   {"onnx": None, "cpp": (54.2152, 1.8288)},
    },
    ("Sony Spresense", "LightGBM"): {
        "latency": {"onnx": None, "cpp": (2.9531, 0.0912)},
        "power":   {"onnx": None, "cpp": (53.7438, 2.387)},
    },
    
    # Google Coral Dev Board
    ("Google Coral Dev Board", "Random Forest"): {
        "latency": {"onnx": None, "cpp": (0.0068, 0.0007)},
        "power":   {"onnx": None, "cpp": (3605.1471, 67.9828)},
    },
    ("Google Coral Dev Board", "SVM"): {
        "latency": {"onnx": None, "cpp": (15.7131, 0.0678)},
        "power":   {"onnx": None, "cpp": (3690.6548, 86.6829)},
    },
    ("Google Coral Dev Board", "XGBoost"): {
        "latency": {"onnx": None, "cpp": (0.0787, 0.0153)},
        "power":   {"onnx": None, "cpp": (3740.0, 99.2387)},
    },
    ("Google Coral Dev Board", "LightGBM"): {
        "latency": {"onnx": None, "cpp": (0.0538, 0.0140)},
        "power":   {"onnx": None, "cpp": (4085.0, 238.3366)},
    },
    
    # NVIDIA Jetson Orin Nano
    ("NVIDIA Jetson Orin Nano", "Random Forest"): {
        "latency": {"onnx": (0.0481, 0.0891), "cpp": (0.0010, 0.0001)},
        "power":   {"onnx": (5706.312, 772.007), "cpp": (4742.361, 278.042)},
    },
    ("NVIDIA Jetson Orin Nano", "SVM"): {
        "latency": {"onnx": (0.2997, 0.0138), "cpp": (7.1265, 0.2786)},
        "power":   {"onnx": (5222.886, 496.867), "cpp": (5119.125, 185.681)},
    },
    ("NVIDIA Jetson Orin Nano", "XGBoost"): {
        "latency": {"onnx": (0.0587, 0.0850), "cpp": (0.0189, 0.0141)},
        "power":   {"onnx": (5882.615, 959.221), "cpp": (5049.474, 156.395)},
    },
    ("NVIDIA Jetson Orin Nano", "LightGBM"): {
        "latency": {"onnx": (0.0571, 0.1000), "cpp": (0.0344, 0.0122)},
        "power":   {"onnx": (5578.0, 982.586), "cpp": (5037.095, 161.076)},
    },
    
    # NVIDIA Jetson AGX Orin
    ("NVIDIA Jetson AGX Orin", "Random Forest"): {
        "latency": {"onnx": (0.0741, 0.3887), "cpp": (0.0010, 0.0000)},
        "power":   {"onnx": (3996.500, 64.616), "cpp": (3645.583, 21.500)},
    },
    ("NVIDIA Jetson AGX Orin", "SVM"): {
        "latency": {"onnx": (0.3030, 0.0125), "cpp": (7.0543, 0.2390)},
        "power":   {"onnx": (3902.657, 102.407), "cpp": (3562.613, 21.139)},
    },
    ("NVIDIA Jetson AGX Orin", "XGBoost"): {
        "latency": {"onnx": (0.0531, 0.0137), "cpp": (0.0163, 0.0064)},
        "power":   {"onnx": (4060.556, 57.234), "cpp": (3517.692, 47.861)},
    },
    ("NVIDIA Jetson AGX Orin", "LightGBM"): {
        "latency": {"onnx": (0.0603, 0.0956), "cpp": (0.0322, 0.0021)},
        "power":   {"onnx": (3967.286, 26.133), "cpp": (3592.917, 15.803)},
    },
}

def safe_p_value_calculation(t_stat, df):
    """
    Calculate p-value with protection against underflow.
    Returns the p-value, a flag for extremely small values, and log p-value.
    """
    try:
        # Try regular calculation first
        p_val = 2 * stats.t.sf(abs(t_stat), df)
        
        if p_val == 0.0:
            # Calculate log p-value to get magnitude
            log_p = stats.t.logsf(abs(t_stat), df) + math.log(2)
            
            # Set a floor for extremely small p-values
            min_reportable_p = 1e-300
            
            if math.isinf(log_p) or log_p < math.log(min_reportable_p):
                return min_reportable_p, True, log_p
            else:
                return math.exp(log_p), False, log_p
        else:
            return p_val, False, math.log(p_val)
            
    except (OverflowError, ZeroDivisionError):
        return 1e-300, True, -np.inf

rows = []
for (device, model), metrics in data.items():
    for metric_name, d in metrics.items():
        onnx_data = d["onnx"]
        cpp_data = d["cpp"]
        
        # Skip if either ONNX or C++ data is missing
        if onnx_data is None or cpp_data is None:
            continue
            
        mean1, sd1 = onnx_data
        mean2, sd2 = cpp_data

        # Guard against non-positive means
        if mean1 <= 0 or mean2 <= 0:
            continue

        ratio = mean1 / mean2

        # Delta-method SE of log means
        se1 = sd1 / (math.sqrt(n) * mean1)
        se2 = sd2 / (math.sqrt(n) * mean2)
        var_diff = se1**2 + se2**2

        # Welch-Satterthwaite df
        v1 = se1**2
        v2 = se2**2
        df = (v1 + v2)**2 / ((v1**2) / (n - 1) + (v2**2) / (n - 1))

        # t statistic on log scale
        diff_log_means = math.log(mean1) - math.log(mean2)
        se_diff = math.sqrt(var_diff)
        t_stat = diff_log_means / se_diff if se_diff > 0 else np.nan

        # Safe p-value calculation
        p_raw, is_extremely_small, log_p = safe_p_value_calculation(t_stat, df)

        # 95% CI for ratio
        alpha = 0.05
        tcrit = stats.t.ppf(1 - alpha / 2, df)
        ci_low = math.exp(diff_log_means - tcrit * se_diff)
        ci_high = math.exp(diff_log_means + tcrit * se_diff)

        # Cohen's d on raw metric
        sp = math.sqrt((sd1**2 + sd2**2) / 2.0)
        d_cohen = (mean1 - mean2) / sp if sp > 0 else np.nan

        rows.append({
            "Device": device,
            "Model": model,
            "Metric": metric_name,
            "Mean_ONNX": mean1,
            "SD_ONNX": sd1,
            "Mean_CPP": mean2,
            "SD_CPP": sd2,
            "Ratio_ONNX_over_CPP": ratio,
            "Ratio_CI95_low": ci_low,
            "Ratio_CI95_high": ci_high,
            "t_stat_log": t_stat,
            "df_log": df,
            "p_value_raw": p_raw,
            "log_p_value": log_p,
            "extremely_small_p": is_extremely_small,
            "Cohens_d_raw": d_cohen,
        })

df_extended = pd.DataFrame(rows)
print(f"EXTENDED ANALYSIS: Processed {len(df_extended)} comparisons")
print(f"Devices included: {df_extended['Device'].unique()}")
print(f"Models per device: {df_extended.groupby('Device')['Model'].nunique()}")
print(f"Extremely small p-values: {df_extended['extremely_small_p'].sum()}")

df_extended.head(10)


EXTENDED ANALYSIS: Processed 32 comparisons
Devices included: ['Raspberry Pi Zero 2 W' 'Raspberry Pi 5' 'NVIDIA Jetson Orin Nano'
 'NVIDIA Jetson AGX Orin']
Models per device: Device
NVIDIA Jetson AGX Orin     4
NVIDIA Jetson Orin Nano    4
Raspberry Pi 5             4
Raspberry Pi Zero 2 W      4
Name: Model, dtype: int64
Extremely small p-values: 29


Unnamed: 0,Device,Model,Metric,Mean_ONNX,SD_ONNX,Mean_CPP,SD_CPP,Ratio_ONNX_over_CPP,Ratio_CI95_low,Ratio_CI95_high,t_stat_log,df_log,p_value_raw,log_p_value,extremely_small_p,Cohens_d_raw
0,Raspberry Pi Zero 2 W,Random Forest,latency,0.1973,0.1156,0.0095,0.0007,20.768421,20.529404,21.010221,513.684302,10315.202513,1e-300,-inf,True,2.297443
1,Raspberry Pi Zero 2 W,Random Forest,power,2848.5659,70.0784,1177.849,8.6439,2.418447,2.417231,2.419665,3439.960033,11764.572357,1e-300,-inf,True,33.462226
2,Raspberry Pi Zero 2 W,SVM,latency,2.3467,0.0962,36.0464,0.074,0.065102,0.06505,0.065155,-6655.596982,10049.152026,1e-300,-inf,True,-392.674928
3,Raspberry Pi Zero 2 W,SVM,power,1295.4716,61.6948,1174.7262,64.7474,1.102786,1.101213,1.104362,134.318723,19585.631545,1e-300,-inf,True,1.909334
4,Raspberry Pi Zero 2 W,XGBoost,latency,0.4551,0.2698,0.0764,0.0125,5.956806,5.885428,6.02905,290.168411,11513.389032,1e-300,-inf,True,1.982909
5,Raspberry Pi Zero 2 W,XGBoost,power,2757.9712,78.6043,1316.3602,38.0696,2.095149,2.093483,2.096817,1821.558921,19993.73028,1e-300,-inf,True,23.343164
6,Raspberry Pi Zero 2 W,LightGBM,latency,0.3612,0.2533,0.0815,0.0116,4.431902,4.370172,4.494504,208.061462,10821.386754,1e-300,-inf,True,1.559974
7,Raspberry Pi Zero 2 W,LightGBM,power,2794.5897,27.8926,1379.8975,31.5611,2.025215,2.024225,2.026206,2827.798674,13673.920053,1e-300,-inf,True,47.499374
8,Raspberry Pi 5,Random Forest,latency,0.0214,0.0139,0.001,0.0002,21.4,21.116804,21.686994,450.74597,11878.135209,1e-300,-inf,True,2.075322
9,Raspberry Pi 5,Random Forest,power,7476.3182,355.5003,4201.4783,109.8751,1.779449,1.777558,1.781343,1061.975993,15540.868405,1e-300,-inf,True,12.44669


In [5]:
# FINAL COMPREHENSIVE RESULTS with Holm-Bonferroni Correction
# Apply multiple comparison correction and generate final formatted results

def format_p_value(p_val, is_extremely_small, log_p):
    """Format p-values for meaningful reporting"""
    if is_extremely_small or p_val < 1e-10:
        if math.isinf(log_p):
            return "< 1e-300"
        else:
            # Convert log p to base-10 exponent for readability
            log10_p = log_p / math.log(10)
            if log10_p < -300:
                return "< 1e-300"
            else:
                return f"< 1e{int(log10_p)}"
    elif p_val < 0.001:
        return f"{p_val:.2e}"
    else:
        return f"{p_val:.6f}"

# Apply Holm-Bonferroni correction
def holm_bonferroni(pvals):
    m = len(pvals)
    order = np.argsort(pvals)
    ranks = np.empty_like(order)
    ranks[order] = np.arange(m)
    sorted_p = np.array(pvals)[order]

    adj_sorted = np.zeros(m)
    max_val = 0.0
    for i in range(m):
        adj = (m - i) * sorted_p[i]
        if adj < max_val:
            adj = max_val
        else:
            max_val = adj
        adj_sorted[i] = min(adj, 1.0)

    adj = np.empty(m)
    adj[order] = adj_sorted
    return adj

# Add Holm-corrected p-values
df_extended["p_value_holm"] = holm_bonferroni(df_extended["p_value_raw"].values)

# Create formatted versions
df_extended["p_value_formatted"] = df_extended.apply(
    lambda row: format_p_value(row["p_value_raw"], row["extremely_small_p"], row["log_p_value"]), 
    axis=1
)

df_extended["p_value_holm_formatted"] = df_extended.apply(
    lambda row: format_p_value(row["p_value_holm"], row["p_value_holm"] < 1e-10, 
                              math.log(row["p_value_holm"]) if row["p_value_holm"] > 0 else -np.inf), 
    axis=1
)

# Create final comprehensive results table
final_comprehensive = df_extended[[
    "Device", "Model", "Metric",
    "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP",
    "Ratio_ONNX_over_CPP", "Ratio_CI95_low", "Ratio_CI95_high",
    "t_stat_log", "df_log", 
    "p_value_formatted", "p_value_holm_formatted",
    "Cohens_d_raw"
]].copy()

# Rename columns for clarity
final_comprehensive.columns = [
    "Device", "Model", "Metric",
    "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP", 
    "Ratio_ONNX_over_CPP", "CI95_Low", "CI95_High",
    "t_statistic", "df", 
    "p_value", "p_value_holm_corrected",
    "Cohens_d"
]

# Sort for readability
final_comprehensive = final_comprehensive.sort_values(["Device", "Model", "Metric"])

print("COMPREHENSIVE RESULTS - All Hardware Devices")
print("=" * 60)
print(f"Total comparisons: {len(final_comprehensive)}")
print(f"Devices analyzed: {len(final_comprehensive['Device'].unique())}")
print(f"Device breakdown:")
for device in final_comprehensive['Device'].unique():
    count = len(final_comprehensive[final_comprehensive['Device'] == device])
    print(f"  - {device}: {count} comparisons")

import pandas as pd
pd.set_option('display.max_rows', None)
print("\nALL ROWS:")
print(final_comprehensive.to_string(index=False))

# Save the comprehensive results
comprehensive_output_path = "./onnx_vs_cpp_comprehensive_analysis.csv"
final_comprehensive.to_csv(comprehensive_output_path, index=False)
print(f"\nComprehensive results saved to: {comprehensive_output_path}")

# Summary statistics
print("\n" + "="*60)
print("SUMMARY STATISTICS")
print("="*60)

# Count significant results
sig_count = len(final_comprehensive[final_comprehensive['p_value'] != '< 1e-300'])
total_count = len(final_comprehensive)
print(f"Extremely significant results (p < 1e-300): {total_count - sig_count}/{total_count}")
print(f"Other significant results: {sig_count}/{total_count}")

# Performance advantages summary
onnx_faster = len(final_comprehensive[final_comprehensive['Ratio_ONNX_over_CPP'] < 1])
cpp_faster = len(final_comprehensive[final_comprehensive['Ratio_ONNX_over_CPP'] > 1])
print(f"\nPerformance Summary:")
print(f"ONNX faster than C++: {onnx_faster} comparisons")
print(f"C++ faster than ONNX: {cpp_faster} comparisons")

final_comprehensive


COMPREHENSIVE RESULTS - All Hardware Devices
Total comparisons: 32
Devices analyzed: 4
Device breakdown:
  - NVIDIA Jetson AGX Orin: 8 comparisons
  - NVIDIA Jetson Orin Nano: 8 comparisons
  - Raspberry Pi 5: 8 comparisons
  - Raspberry Pi Zero 2 W: 8 comparisons

ALL ROWS:
                 Device         Model  Metric  Mean_ONNX  SD_ONNX  Mean_CPP   SD_CPP  Ratio_ONNX_over_CPP  CI95_Low  CI95_High  t_statistic           df  p_value p_value_holm_corrected    Cohens_d
 NVIDIA Jetson AGX Orin      LightGBM latency     0.0603   0.0956    0.0322   0.0021             1.872671  1.815321   1.931833    39.537847 10032.840051 < 1e-300               < 1e-298    0.415584
 NVIDIA Jetson AGX Orin      LightGBM   power  3967.2860  26.1330 3592.9170  15.8030             1.104196  1.104025   1.104368  1251.392965 17436.688693 < 1e-300               < 1e-298   17.336093
 NVIDIA Jetson AGX Orin Random Forest latency     0.0741   0.3887    0.0010   0.0000            74.100000 66.859335  82.124807    82.

Unnamed: 0,Device,Model,Metric,Mean_ONNX,SD_ONNX,Mean_CPP,SD_CPP,Ratio_ONNX_over_CPP,CI95_Low,CI95_High,t_statistic,df,p_value,p_value_holm_corrected,Cohens_d
30,NVIDIA Jetson AGX Orin,LightGBM,latency,0.0603,0.0956,0.0322,0.0021,1.872671,1.815321,1.931833,39.537847,10032.840051,< 1e-300,< 1e-298,0.415584
31,NVIDIA Jetson AGX Orin,LightGBM,power,3967.286,26.133,3592.917,15.803,1.104196,1.104025,1.104368,1251.392965,17436.688693,< 1e-300,< 1e-298,17.336093
24,NVIDIA Jetson AGX Orin,Random Forest,latency,0.0741,0.3887,0.001,0.0,74.1,66.859335,82.124807,82.076483,9999.0,< 1e-300,< 1e-298,0.265961
25,NVIDIA Jetson AGX Orin,Random Forest,power,3996.5,64.616,3645.583,21.5,1.096258,1.095888,1.096628,534.002074,12613.495978,< 1e-300,< 1e-298,7.287499
26,NVIDIA Jetson AGX Orin,SVM,latency,0.303,0.0125,7.0543,0.239,0.042953,0.042908,0.042997,-5896.355043,19269.637412,< 1e-300,< 1e-298,-39.894344
27,NVIDIA Jetson AGX Orin,SVM,power,3902.657,102.407,3562.613,21.139,1.095448,1.09487,1.096026,338.861576,11018.871955,< 1e-300,< 1e-298,4.598959
28,NVIDIA Jetson AGX Orin,XGBoost,latency,0.0531,0.0137,0.0163,0.0064,3.257669,3.227807,3.287807,251.375531,17276.945958,< 1e-300,< 1e-298,3.441733
29,NVIDIA Jetson AGX Orin,XGBoost,power,4060.556,57.234,3517.692,47.861,1.154324,1.153881,1.154767,732.571394,19973.086858,< 1e-300,< 1e-298,10.290076
22,NVIDIA Jetson Orin Nano,LightGBM,latency,0.0571,0.1,0.0344,0.0122,1.659884,1.602751,1.719053,28.359633,10817.713242,< 1e-169,< 1e-169,0.318664
23,NVIDIA Jetson Orin Nano,LightGBM,power,5578.0,982.586,5037.095,161.076,1.107384,1.103505,1.111277,56.973214,10657.313007,< 1e-300,< 1e-298,0.768258


In [6]:
# COVERAGE CHECK AND FULL DISPLAY
# Show which device/model/metric rows were skipped and display all comparisons

import pandas as pd

pd.set_option('display.max_rows', 200)

skipped_rows = []
for (device, model), metrics in data.items():
    for metric_name, d in metrics.items():
        onnx_data = d.get('onnx')
        cpp_data = d.get('cpp')
        if onnx_data is None or cpp_data is None:
            skipped_rows.append({
                'Device': device,
                'Model': model,
                'Metric': metric_name,
                'Missing_ONNX': onnx_data is None,
                'Missing_CPP': cpp_data is None,
            })

print("COVERAGE SUMMARY")
print("=" * 60)
print(f"Total device-model entries: {len(data)} (each with 2 metrics)")
print(f"Rows produced (ONNX vs C++): {len(df_extended)}")
print(f"Rows skipped (missing counterpart): {len(skipped_rows)}")

if skipped_rows:
    df_skipped = pd.DataFrame(skipped_rows).sort_values(['Device', 'Model', 'Metric'])
    print("\nSkipped due to missing ONNX/C++ counterpart:")
    print(df_skipped.to_string(index=False))

print("\nALL COMPARISONS (ONNX vs C++)")
print("=" * 60)
print(df_extended.sort_values(['Device', 'Model', 'Metric']).to_string(index=False))


COVERAGE SUMMARY
Total device-model entries: 24 (each with 2 metrics)
Rows produced (ONNX vs C++): 32
Rows skipped (missing counterpart): 16

Skipped due to missing ONNX/C++ counterpart:
                Device         Model  Metric  Missing_ONNX  Missing_CPP
Google Coral Dev Board      LightGBM latency          True        False
Google Coral Dev Board      LightGBM   power          True        False
Google Coral Dev Board Random Forest latency          True        False
Google Coral Dev Board Random Forest   power          True        False
Google Coral Dev Board           SVM latency          True        False
Google Coral Dev Board           SVM   power          True        False
Google Coral Dev Board       XGBoost latency          True        False
Google Coral Dev Board       XGBoost   power          True        False
        Sony Spresense      LightGBM latency          True        False
        Sony Spresense      LightGBM   power          True        False
        Sony Spresens

In [7]:
# We'll compute Welch's t-Test on *log-transformed metrics* using a delta-method approximation
# from summary stats (mean, sd, n), and report ratio CIs by back-transforming the log CI.
#
# Assumptions: per-inference metrics are positive and the log transform is reasonable; we only have
# mean and SD, not raw samples, so we approximate mean(log X) ~ log(mean(X)) and
# SE(log X) ~ SD(X) / (sqrt(n) * mean(X)). This is a standard first-order delta-method approach.
#
# We'll then apply Holm–Bonferroni across all comparisons (all device × model × metric rows).
#
# Finally, we'll compute Cohen's d on the raw metric for practical effect size.
import math
import pandas as pd
import numpy as np
from scipy import stats

n = 10000  # per the user

# Input means and stds for LATENCY (ms) and POWER (mW)
# Structure: (mean, std)
data = {
    ("Raspberry Pi Zero 2 W", "Random Forest"): {
        "latency": {"onnx": (0.1973, 0.1156), "cpp": (0.0095, 0.0007)},
        "power":   {"onnx": (2848.5659, 70.0784), "cpp": (1177.849, 8.6439)},
    },
    ("Raspberry Pi Zero 2 W", "SVM"): {
        "latency": {"onnx": (2.3467, 0.0962), "cpp": (36.0464, 0.0740)},
        "power":   {"onnx": (1295.4716, 61.6948), "cpp": (1174.7262, 64.7474)},
    },
    ("Raspberry Pi Zero 2 W", "XGBoost"): {
        "latency": {"onnx": (0.4551, 0.2698), "cpp": (0.0764, 0.0125)},
        "power":   {"onnx": (2757.9712, 78.6043), "cpp": (1316.3602, 38.0696)},
    },
    ("Raspberry Pi Zero 2 W", "LightGBM"): {
        "latency": {"onnx": (0.3612, 0.2533), "cpp": (0.0815, 0.0116)},
        "power":   {"onnx": (2794.5897, 27.8926), "cpp": (1379.8975, 31.5611)},
    },
    ("Raspberry Pi 5", "Random Forest"): {
        "latency": {"onnx": (0.0214, 0.0139), "cpp": (0.0010, 0.0002)},
        "power":   {"onnx": (7476.3182, 355.5003), "cpp": (4201.4783, 109.8751)},
    },
    ("Raspberry Pi 5", "SVM"): {
        "latency": {"onnx": (0.2193, 0.0225), "cpp": (5.3540, 0.0541)},
        "power":   {"onnx": (4674.2857, 66.5501), "cpp": (4358.9711, 47.8565)},
    },
    ("Raspberry Pi 5", "XGBoost"): {
        "latency": {"onnx": (0.0294, 0.0175), "cpp": (0.0116, 0.0021)},
        "power":   {"onnx": (7115.0, 399.4177), "cpp": (4513.75, 204.1926)},
    },
    ("Raspberry Pi 5", "LightGBM"): {
        "latency": {"onnx": (0.0249, 0.0553), "cpp": (0.0128, 0.0022)},
        "power":   {"onnx": (6325.0, 906.3378), "cpp": (4410.7692, 239.7257)},
    },
}

rows = []
for (device, model), metrics in data.items():
    for metric_name, d in metrics.items():
        mean1, sd1 = d["onnx"]
        mean2, sd2 = d["cpp"]

        # Guard against non-positive means (log requires positive); if any mean <=0, skip or adjust
        if mean1 <= 0 or mean2 <= 0:
            continue

        # Ratio (ONNX / C++)
        ratio = mean1 / mean2

        # Delta-method SE of log means
        se1 = sd1 / (math.sqrt(n) * mean1)
        se2 = sd2 / (math.sqrt(n) * mean2)
        # Variance of difference in log means
        var_diff = se1**2 + se2**2

        # Welch-Satterthwaite df for log-scale comparison
        # Here v1 = se1^2 and v2 = se2^2 are the variances of the sample log means
        # For df, we use:
        # df = (v1 + v2)^2 / (v1^2/(n-1) + v2^2/(n-1))
        v1 = se1**2
        v2 = se2**2
        df = (v1 + v2)**2 / ((v1**2) / (n - 1) + (v2**2) / (n - 1))

        # t statistic on log scale
        diff_log_means = math.log(mean1) - math.log(mean2)
        se_diff = math.sqrt(var_diff)
        t_stat = diff_log_means / se_diff if se_diff > 0 else np.nan

        # two-sided p-value
        p_raw = 2 * stats.t.sf(abs(t_stat), df)

        # 95% CI for ratio via back-transformed log CI
        alpha = 0.05
        tcrit = stats.t.ppf(1 - alpha / 2, df)
        ci_low = math.exp(diff_log_means - tcrit * se_diff)
        ci_high = math.exp(diff_log_means + tcrit * se_diff)

        # Cohen's d on raw metric
        sp = math.sqrt((sd1**2 + sd2**2) / 2.0)
        d_cohen = (mean1 - mean2) / sp if sp > 0 else np.nan

        rows.append({
            "Device": device,
            "Model": model,
            "Metric": metric_name,
            "Mean_ONNX": mean1,
            "SD_ONNX": sd1,
            "Mean_CPP": mean2,
            "SD_CPP": sd2,
            "Ratio_ONNX_over_CPP": ratio,
            "Ratio_CI95_low": ci_low,
            "Ratio_CI95_high": ci_high,
            "t_stat_log": t_stat,
            "df_log": df,
            "p_value_raw": p_raw,
            "Cohens_d_raw": d_cohen,
        })

df_results = pd.DataFrame(rows)

# Holm–Bonferroni adjustment across all comparisons
def holm_bonferroni(pvals):
    # Returns adjusted p-values in the original order
    m = len(pvals)
    order = np.argsort(pvals)
    ranks = np.empty_like(order)
    ranks[order] = np.arange(m)  # rank 0..m-1 for sorted ascending
    sorted_p = np.array(pvals)[order]

    # Step-down: adjusted p_i = max_{j<=i} ( (m-j) * p_(j) ), then enforce monotonicity
    adj_sorted = np.zeros(m)
    max_val = 0.0
    for i in range(m):
        adj = (m - i) * sorted_p[i]
        if adj < max_val:
            adj = max_val
        else:
            max_val = adj
        adj_sorted[i] = min(adj, 1.0)

    # Reorder back to original order
    adj = np.empty(m)
    adj[order] = adj_sorted
    return adj

df_results["p_value_holm"] = holm_bonferroni(df_results["p_value_raw"].values.tolist())

# Nice ordering of columns
df_results = df_results[[
    "Device", "Model", "Metric",
    "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP",
    "Ratio_ONNX_over_CPP", "Ratio_CI95_low", "Ratio_CI95_high",
    "t_stat_log", "df_log", "p_value_raw", "p_value_holm",
    "Cohens_d_raw"
]]

# Sort for readability
df_results.sort_values(["Device", "Model", "Metric"], inplace=True)

# Save to CSV for download
out_path = "./onnx_vs_cpp_significance_latency_power_holm.csv"
df_results.to_csv(out_path, index=False)


out_path


'./onnx_vs_cpp_significance_latency_power_holm.csv'

In [8]:
# ADD THROUGHPUT (FlPS) ANALYSIS DERIVED FROM LATENCY
# We derive FlPS = 1000 / latency_ms and approximate SD via delta method:
# SD_FlPS ≈ |d(1000/x)/dx| at mean * SD_latency = (1000/mean_latency^2) * SD_latency

import math
import pandas as pd
import numpy as np
from scipy import stats

n = 10000

flps_rows = []
for (device, model), metrics in data.items():
    lat = metrics.get('latency', {})
    onnx_lat = lat.get('onnx')
    cpp_lat = lat.get('cpp')
    if onnx_lat is None or cpp_lat is None:
        continue
    mean_lat_onnx, sd_lat_onnx = onnx_lat
    mean_lat_cpp, sd_lat_cpp = cpp_lat
    if mean_lat_onnx <= 0 or mean_lat_cpp <= 0:
        continue

    # Derive FlPS means and SDs
    mean_flps_onnx = 1000.0 / mean_lat_onnx
    mean_flps_cpp = 1000.0 / mean_lat_cpp
    sd_flps_onnx = (1000.0 / (mean_lat_onnx**2)) * sd_lat_onnx
    sd_flps_cpp = (1000.0 / (mean_lat_cpp**2)) * sd_lat_cpp

    # Ratio ONNX/C++ on FlPS
    ratio = mean_flps_onnx / mean_flps_cpp

    # Delta-method SE of log means (works the same for FlPS as for latency)
    se1 = sd_flps_onnx / (math.sqrt(n) * mean_flps_onnx)
    se2 = sd_flps_cpp / (math.sqrt(n) * mean_flps_cpp)
    var_diff = se1**2 + se2**2

    # Welch-Satterthwaite df
    v1 = se1**2
    v2 = se2**2
    df = (v1 + v2)**2 / ((v1**2) / (n - 1) + (v2**2) / (n - 1))

    # t statistic on log scale
    diff_log_means = math.log(mean_flps_onnx) - math.log(mean_flps_cpp)
    se_diff = math.sqrt(var_diff) if var_diff > 0 else np.nan
    t_stat = diff_log_means / se_diff if se_diff and se_diff > 0 else np.nan

    # p-value with underflow protection
    def safe_p_value_calculation(t_stat, df):
        try:
            p_val = 2 * stats.t.sf(abs(t_stat), df)
            if p_val == 0.0:
                log_p = stats.t.logsf(abs(t_stat), df) + math.log(2)
                min_reportable_p = 1e-300
                if math.isinf(log_p) or log_p < math.log(min_reportable_p):
                    return min_reportable_p, True, log_p
                else:
                    return math.exp(log_p), False, log_p
            else:
                return p_val, False, math.log(p_val)
        except (OverflowError, ZeroDivisionError):
            return 1e-300, True, -np.inf

    p_raw, is_extremely_small, log_p = safe_p_value_calculation(t_stat, df)

    # 95% CI for ratio
    alpha = 0.05
    tcrit = stats.t.ppf(1 - alpha / 2, df)
    ci_low = math.exp(diff_log_means - tcrit * se_diff)
    ci_high = math.exp(diff_log_means + tcrit * se_diff)

    # Cohen's d on raw FlPS
    sp = math.sqrt((sd_flps_onnx**2 + sd_flps_cpp**2) / 2.0)
    d_cohen = (mean_flps_onnx - mean_flps_cpp) / sp if sp > 0 else np.nan

    flps_rows.append({
        "Device": device,
        "Model": model,
        "Metric": "flps",
        "Mean_ONNX": mean_flps_onnx,
        "SD_ONNX": sd_flps_onnx,
        "Mean_CPP": mean_flps_cpp,
        "SD_CPP": sd_flps_cpp,
        "Ratio_ONNX_over_CPP": ratio,
        "Ratio_CI95_low": ci_low,
        "Ratio_CI95_high": ci_high,
        "t_stat_log": t_stat,
        "df_log": df,
        "p_value_raw": p_raw,
        "log_p_value": log_p,
        "extremely_small_p": is_extremely_small,
        "Cohens_d_raw": d_cohen,
    })

print(f"Derived FlPS rows: {len(flps_rows)}")

df_flps = pd.DataFrame(flps_rows)

df_with_flps = pd.concat([df_extended, df_flps], ignore_index=True)

# Apply Holm-Bonferroni correction on combined

def holm_bonferroni(pvals):
    m = len(pvals)
    order = np.argsort(pvals)
    sorted_p = np.array(pvals)[order]
    adj_sorted = np.zeros(m)
    max_val = 0.0
    for i in range(m):
        adj = (m - i) * sorted_p[i]
        if adj < max_val:
            adj = max_val
        else:
            max_val = adj
        adj_sorted[i] = min(adj, 1.0)
    adj = np.empty(m)
    adj[order] = adj_sorted
    return adj

if len(df_with_flps) > 0:
    df_with_flps["p_value_holm"] = holm_bonferroni(df_with_flps["p_value_raw"].values)

    def format_p_value(p_val, is_extremely_small, log_p):
        if is_extremely_small or p_val < 1e-10:
            if math.isinf(log_p):
                return "< 1e-300"
            else:
                log10_p = log_p / math.log(10)
                return "< 1e-300" if log10_p < -300 else f"< 1e{int(log10_p)}"
        elif p_val < 0.001:
            return f"{p_val:.2e}"
        else:
            return f"{p_val:.6f}"

    df_with_flps["p_value_formatted"] = df_with_flps.apply(
        lambda row: format_p_value(row["p_value_raw"], row["extremely_small_p"], row["log_p_value"]), axis=1
    )
    df_with_flps["p_value_holm_formatted"] = df_with_flps.apply(
        lambda row: format_p_value(row["p_value_holm"], row["p_value_holm"] < 1e-10,
                                   math.log(row["p_value_holm"]) if row["p_value_holm"] > 0 else -np.inf), axis=1
    )

    final_with_flps = df_with_flps[[
        "Device", "Model", "Metric",
        "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP",
        "Ratio_ONNX_over_CPP", "Ratio_CI95_low", "Ratio_CI95_high",
        "t_stat_log", "df_log",
        "p_value_formatted", "p_value_holm_formatted",
        "Cohens_d_raw"
    ]].copy()

    final_with_flps.columns = [
        "Device", "Model", "Metric",
        "Mean_ONNX", "SD_ONNX", "Mean_CPP", "SD_CPP",
        "Ratio_ONNX_over_CPP", "CI95_Low", "CI95_High",
        "t_statistic", "df",
        "p_value", "p_value_holm_corrected",
        "Cohens_d"
    ]

    final_with_flps = final_with_flps.sort_values(["Device", "Model", "Metric"]).reset_index(drop=True)

    print("\nCOMPREHENSIVE RESULTS (including FlPS)")
    print("=" * 60)
    print(f"Total comparisons (incl. FlPS): {len(final_with_flps)}")
    print(f"Metrics present: {sorted(final_with_flps['Metric'].unique().tolist())}")

    out_path = "./onnx_vs_cpp_comprehensive_analysis_with_flps.csv"
    final_with_flps.to_csv(out_path, index=False)
    print(f"Saved to: {out_path}")

    # Quick breakdown counts
    counts = final_with_flps.groupby(["Metric"]).size()
    print("\nCounts per metric:")
    for m, c in counts.items():
        print(f"  {m}: {c}")

    # Performance summary for FlPS only (higher is better)
    flps_only = final_with_flps[final_with_flps['Metric'] == 'flps']
    if not flps_only.empty:
        onnx_faster = (flps_only['Ratio_ONNX_over_CPP'] > 1).sum()
        cpp_faster = (flps_only['Ratio_ONNX_over_CPP'] < 1).sum()
        print("\nFlPS Performance Summary:")
        print(f"ONNX higher throughput than C++: {onnx_faster}")
        print(f"C++ higher throughput than ONNX: {cpp_faster}")
else:
    print("No FlPS rows derived.")


Derived FlPS rows: 8

COMPREHENSIVE RESULTS (including FlPS)
Total comparisons (incl. FlPS): 40
Metrics present: ['flps', 'latency', 'power']
Saved to: ./onnx_vs_cpp_comprehensive_analysis_with_flps.csv

Counts per metric:
  flps: 8
  latency: 16
  power: 16

FlPS Performance Summary:
ONNX higher throughput than C++: 2
C++ higher throughput than ONNX: 6
