In [None]:
import numpy as np
from scipy import stats

# Performance metrics for CIFAR-10
# Layout: metric name -> model label -> list of scores.
# Each model's single score is replicated five times, so every series is
# constant (zero variance within a model).
cifar_metrics = {
    metric_name: {label: [score] * 5 for label, score in per_model.items()}
    for metric_name, per_model in {
        "Accuracy": {"CKKS": 0.996886, "BFV": 0.996874, "Hybrid": 0.997814},
        "F1 Score": {"CKKS": 0.996393, "BFV": 0.99638, "Hybrid": 0.997468},
        "Precision": {"CKKS": 0.996466, "BFV": 0.99645, "Hybrid": 0.997529},
    }.items()
}

# Performance metrics for MNIST
# Layout: metric name -> model label -> list of scores.
# Each model's single score is replicated five times, so every series is
# constant (zero variance within a model).
mnist_metrics = {
    metric_name: {label: [score] * 5 for label, score in per_model.items()}
    for metric_name, per_model in {
        "Accuracy": {"CKKS": 0.996899, "BFV": 0.996892, "Hybrid": 0.997829},
        "F1 Score": {"CKKS": 0.996408, "BFV": 0.9964, "Hybrid": 0.997485},
        "Precision": {"CKKS": 0.99649, "BFV": 0.996468, "Hybrid": 0.997543},
    }.items()
}

# Function to compute confidence interval
def confidence_interval(data, confidence=0.95):
    """Return the two-sided Student-t confidence interval for the mean of ``data``.

    Args:
        data: Sized sequence of numeric observations.
        confidence: Confidence level of the interval (default 0.95).

    Returns:
        A ``(lower, upper)`` tuple centred on the sample mean. The half-width
        is the standard error of the mean times the t critical value with
        ``len(data) - 1`` degrees of freedom.
    """
    center = np.mean(data)
    standard_error = stats.sem(data)
    # Upper-tail quantile: (1 + confidence) / 2 leaves (1 - confidence) / 2 in each tail.
    upper_quantile = (1 + confidence) / 2.
    half_width = standard_error * stats.t.ppf(upper_quantile, len(data) - 1)
    return center - half_width, center + half_width

# Perform t-test and Wilcoxon test
def perform_tests(dataset_name, metric_name, model1, model2, label1, label2):
    """Print a paired t-test and a Wilcoxon signed-rank test for two score series.

    Args:
        dataset_name: Dataset label shown in the header line.
        metric_name: Metric label shown in the header line.
        model1: Sequence of scores for the first model.
        model2: Sequence of scores for the second model, paired with ``model1``.
        label1: Display name of the first model.
        label2: Display name of the second model.

    Returns:
        None. All results are written to stdout.

    A test that is undefined for the given data is reported as ``nan`` instead
    of being run, and the significance line then says so explicitly.
    """
    nan = float("nan")
    diffs = np.asarray(model1, dtype=float) - np.asarray(model2, dtype=float)

    # With fewer than two pairs, or with identical paired differences, the
    # standard error of the differences is zero. ttest_rel then returns
    # +/-inf with p = 0 (or nan), which must not be read as significance.
    t_defined = diffs.size >= 2 and np.ptp(diffs) != 0
    if t_defined:
        t_stat, p_ttest = stats.ttest_rel(model1, model2)  # Paired t-test
    else:
        t_stat, p_ttest = nan, nan

    # When every difference is zero there is nothing to rank; some SciPy
    # versions raise ValueError for that input, so skip the call.
    if np.any(diffs):
        w_stat, p_wilcoxon = stats.wilcoxon(model1, model2)  # Wilcoxon signed-rank test
    else:
        w_stat, p_wilcoxon = nan, nan

    print(f"\n{dataset_name} - {metric_name}: {label1} vs {label2}")
    print(f"Paired t-test: t-statistic = {t_stat:.5f}, p-value = {p_ttest:.5f}")
    print(f"Wilcoxon test: w-statistic = {w_stat:.5f}, p-value = {p_wilcoxon:.5f}")
    # The significance verdict is based on the paired t-test p-value only.
    if not t_defined:
        print("Cannot assess significance: the paired differences show no variation")
    elif p_ttest < 0.05:
        print("Statistically significant difference (p < 0.05)")
    else:
        print("No significant difference (p >= 0.05)")

# Running tests and confidence intervals for all metrics
def analyze_metrics(dataset_name, metrics):
    """Print pairwise test results and per-model confidence intervals.

    Args:
        dataset_name: Label used as the prefix of every printed line.
        metrics: Mapping of metric name to a mapping of model label to scores.
            Each inner mapping must contain the keys "CKKS", "BFV" and "Hybrid".

    For every metric, the three model pairs are passed to ``perform_tests``,
    then one 95% confidence interval line is printed per model.
    """
    comparisons = (("CKKS", "BFV"), ("BFV", "Hybrid"), ("CKKS", "Hybrid"))
    for metric_name, per_model in metrics.items():
        for first, second in comparisons:
            perform_tests(dataset_name, metric_name, per_model[first], per_model[second], first, second)

        for model_label, scores in per_model.items():
            lower, upper = confidence_interval(scores)
            print(f"\n{dataset_name} - {metric_name} ({model_label}) 95% Confidence Interval: ({lower:.5f}, {upper:.5f})")

# Run analysis for CIFAR-10 and MNIST
for dataset_label, dataset_metrics in (("CIFAR-10", cifar_metrics), ("MNIST", mnist_metrics)):
    analyze_metrics(dataset_label, dataset_metrics)


CIFAR-10 - Accuracy: CKKS vs BFV
Paired t-test: t-statistic = inf, p-value = 0.00000
Wilcoxon test: w-statistic = 0.00000, p-value = 0.06250
Statistically significant difference (p < 0.05)

CIFAR-10 - Accuracy: BFV vs Hybrid
Paired t-test: t-statistic = -inf, p-value = 0.00000
Wilcoxon test: w-statistic = 0.00000, p-value = 0.06250
Statistically significant difference (p < 0.05)

CIFAR-10 - Accuracy: CKKS vs Hybrid
Paired t-test: t-statistic = -inf, p-value = 0.00000
Wilcoxon test: w-statistic = 0.00000, p-value = 0.06250
Statistically significant difference (p < 0.05)

CIFAR-10 - Accuracy (CKKS) 95% Confidence Interval: (0.99689, 0.99689)

CIFAR-10 - Accuracy (BFV) 95% Confidence Interval: (0.99687, 0.99687)

CIFAR-10 - Accuracy (Hybrid) 95% Confidence Interval: (0.99781, 0.99781)

CIFAR-10 - F1 Score: CKKS vs BFV
Paired t-test: t-statistic = inf, p-value = 0.00000
Wilcoxon test: w-statistic = 0.00000, p-value = 0.06250
Statistically significant difference (p < 0.05)

CIFAR-10 - F1 S

  res = hypotest_fun_out(*samples, **kwds)


In [None]:
import numpy as np
from scipy import stats
import pandas as pd

# Performance metrics for CIFAR-10
# Layout: metric name -> model label -> list of scores.
# Each model's single score is replicated five times, so every series is
# constant (zero variance within a model).
cifar_metrics = {
    metric_name: {label: [score] * 5 for label, score in per_model.items()}
    for metric_name, per_model in {
        "Accuracy": {"CKKS": 0.986886, "BFV": 0.986874, "Hybrid": 0.987814},
        "F1 Score": {"CKKS": 0.986393, "BFV": 0.98638, "Hybrid": 0.987468},
        "Precision": {"CKKS": 0.986466, "BFV": 0.98645, "Hybrid": 0.987529},
    }.items()
}

# Performance metrics for MNIST
# Layout: metric name -> model label -> list of scores.
# Each model's single score is replicated five times, so every series is
# constant (zero variance within a model).
mnist_metrics = {
    metric_name: {label: [score] * 5 for label, score in per_model.items()}
    for metric_name, per_model in {
        "Accuracy": {"CKKS": 0.996899, "BFV": 0.996892, "Hybrid": 0.997829},
        "F1 Score": {"CKKS": 0.996408, "BFV": 0.9964, "Hybrid": 0.997485},
        "Precision": {"CKKS": 0.99649, "BFV": 0.996468, "Hybrid": 0.997543},
    }.items()
}

# Function to compute confidence interval
def confidence_interval(data, confidence=0.95):
    """Compute a two-sided Student-t confidence interval for the sample mean.

    Args:
        data: Sized sequence of numeric observations.
        confidence: Confidence level of the interval (default 0.95).

    Returns:
        ``(lower, upper)``: the sample mean minus/plus the standard error of
        the mean scaled by the t critical value with ``len(data) - 1`` degrees
        of freedom.
    """
    sample_mean = np.mean(data)
    sem_value = stats.sem(data)
    dof = len(data) - 1
    # (1 + confidence) / 2 is the upper quantile of a two-sided interval.
    t_critical = stats.t.ppf((1 + confidence) / 2., dof)
    spread = sem_value * t_critical
    return sample_mean - spread, sample_mean + spread

# Perform t-test and Wilcoxon test
def perform_tests(dataset_name, metric_name, model1, model2, label1, label2):
    """Run a paired t-test and a Wilcoxon signed-rank test on two score series.

    Args:
        dataset_name: Dataset label copied into the result row.
        metric_name: Metric label copied into the result row.
        model1: Sequence of scores for the first model.
        model2: Sequence of scores for the second model, paired with ``model1``.
        label1: Display name of the first model.
        label2: Display name of the second model.

    Returns:
        A 9-item list: ``[dataset_name, metric_name, label1, label2, t_stat,
        p_ttest, w_stat, p_wilcoxon, significance]``. ``significance`` is
        "Significant" or "Not Significant" according to the t-test p-value at
        the 0.05 level, or "Undefined" when the t-test cannot be computed. The
        statistics of an undefined test are ``nan``.
    """
    nan = float("nan")
    diffs = np.asarray(model1, dtype=float) - np.asarray(model2, dtype=float)

    # With fewer than two pairs, or with identical paired differences, the
    # standard error of the differences is zero. ttest_rel then returns
    # +/-inf with p = 0 (or nan), which must not be labelled "Significant".
    if diffs.size >= 2 and np.ptp(diffs) != 0:
        t_stat, p_ttest = stats.ttest_rel(model1, model2)  # Paired t-test
        significance = "Significant" if p_ttest < 0.05 else "Not Significant"
    else:
        t_stat, p_ttest, significance = nan, nan, "Undefined"

    # When every difference is zero there is nothing to rank; some SciPy
    # versions raise ValueError for that input, so skip the call.
    if np.any(diffs):
        w_stat, p_wilcoxon = stats.wilcoxon(model1, model2)  # Wilcoxon signed-rank test
    else:
        w_stat, p_wilcoxon = nan, nan

    return [dataset_name, metric_name, label1, label2, t_stat, p_ttest, w_stat, p_wilcoxon, significance]

# Run the pairwise tests for every metric of one dataset
def analyze_metrics(dataset_name, metrics):
    """Collect one ``perform_tests`` result row per metric and model pair.

    Args:
        dataset_name: Label passed through to every ``perform_tests`` call.
        metrics: Mapping of metric name to a mapping of model label to scores.
            Each inner mapping must contain the keys "CKKS", "BFV" and "Hybrid".

    Returns:
        A list of ``perform_tests`` return values, ordered metric by metric,
        with the pairs CKKS/BFV, BFV/Hybrid, CKKS/Hybrid within each metric.
    """
    comparisons = (("CKKS", "BFV"), ("BFV", "Hybrid"), ("CKKS", "Hybrid"))
    return [
        perform_tests(dataset_name, metric_name, per_model[first], per_model[second], first, second)
        for metric_name, per_model in metrics.items()
        for first, second in comparisons
    ]

# Run the pairwise comparisons for CIFAR-10 and MNIST
cifar_results = analyze_metrics("CIFAR-10", cifar_metrics)
mnist_results = analyze_metrics("MNIST", mnist_metrics)

# Stack both result lists into one table, one row per model comparison
df_results = pd.DataFrame(
    data=[*cifar_results, *mnist_results],
    columns=[
        "Dataset", "Metric", "Model 1", "Model 2",
        "t-stat", "p-value (t-test)",
        "w-stat", "p-value (Wilcoxon)",
        "Significance",
    ],
)
print(df_results)

# Computing confidence intervals: one row per (dataset, metric, model)
confidence_intervals = []
for dataset, metrics in (("CIFAR-10", cifar_metrics), ("MNIST", mnist_metrics)):
    for metric, models in metrics.items():
        for model_name, values in models.items():
            ci_lower, ci_upper = confidence_interval(values)
            row = [dataset, metric, model_name, ci_lower, ci_upper]
            confidence_intervals.append(row)

# Creating a confidence interval table
df_conf_intervals = pd.DataFrame(
    data=confidence_intervals,
    columns=["Dataset", "Metric", "Model", "CI Lower", "CI Upper"],
)
print(df_conf_intervals)


     Dataset     Metric Model 1 Model 2  t-stat  p-value (t-test)  w-stat  \
0   CIFAR-10   Accuracy    CKKS     BFV     inf               0.0     0.0   
1   CIFAR-10   Accuracy     BFV  Hybrid    -inf               0.0     0.0   
2   CIFAR-10   Accuracy    CKKS  Hybrid    -inf               0.0     0.0   
3   CIFAR-10   F1 Score    CKKS     BFV     inf               0.0     0.0   
4   CIFAR-10   F1 Score     BFV  Hybrid    -inf               0.0     0.0   
5   CIFAR-10   F1 Score    CKKS  Hybrid    -inf               0.0     0.0   
6   CIFAR-10  Precision    CKKS     BFV     inf               0.0     0.0   
7   CIFAR-10  Precision     BFV  Hybrid    -inf               0.0     0.0   
8   CIFAR-10  Precision    CKKS  Hybrid    -inf               0.0     0.0   
9      MNIST   Accuracy    CKKS     BFV     inf               0.0     0.0   
10     MNIST   Accuracy     BFV  Hybrid    -inf               0.0     0.0   
11     MNIST   Accuracy    CKKS  Hybrid    -inf               0.0     0.0   

  res = hypotest_fun_out(*samples, **kwds)
