In [1]:
import pandas as pd
from scipy.stats import friedmanchisquare
from scipy.stats import wilcoxon
import itertools
import statsmodels.stats.multitest as smm

In [2]:
# Load the evaluation scores from the workbook and show a plain-text preview
# of the first five rows as a quick check of the column layout.
df = pd.read_excel(io="eval.xlsx")
print(df.head(n=5))

   requirement_id  setting evaluator      criterion  value
0               1  generic     eval1   completeness      4
1               1  generic     eval1    correctness      3
2               1  generic     eval1  uml_adherence      2
3               1  generic     eval1        clarity      2
4               1  generic     eval1    terminology      4


In [4]:
# Criterion labels to analyse; each must match a value in df["criterion"].
criteria = [
    "completeness",
    "correctness",
    "uml_adherence",
    "clarity",
    "terminology",
]

In [5]:
def prepare_friedman(df, criterion):
    """Build the requirement-by-setting score table for one criterion.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format scores with at least the columns ``requirement_id``,
        ``setting``, ``criterion`` and ``value``.
    criterion : str
        Value of the ``criterion`` column to keep.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``requirement_id`` with one column per ``setting``. Each
        cell is the mean ``value`` over all rows sharing that requirement and
        setting (i.e. the ratings of the individual evaluators are averaged).
    """
    matches_criterion = df["criterion"] == criterion
    scores = df.loc[matches_criterion]
    return pd.pivot_table(
        scores,
        values="value",
        index="requirement_id",
        columns="setting",
        aggfunc="mean",
    )

# Omnibus test per criterion: compare the four settings across requirements.
for c in criteria:
    print(f"\n=== Friedman Test: {c} ===")
    pivot = prepare_friedman(df, c)
    # One sample per setting, passed in a fixed order; the columns of the
    # pivot share the requirement_id index, so the samples are paired by row.
    stat, p = friedmanchisquare(
        *(pivot[name] for name in ("generic", "layered", "microservices", "event"))
    )
    print("Statistic:", stat, "p-value:", p)


=== Friedman Test: completeness ===
Statistic: 15.620689655172384 p-value: 0.0013561967718866902

=== Friedman Test: correctness ===
Statistic: 11.742857142857117 p-value: 0.008317994816501202

=== Friedman Test: uml_adherence ===
Statistic: 24.64912280701753 p-value: 1.8281241922898042e-05

=== Friedman Test: clarity ===
Statistic: 12.989189189189174 p-value: 0.004660042856886029

=== Friedman Test: terminology ===
Statistic: 19.304347826086914 p-value: 0.00023650778330728723


In [6]:
# Setting labels compared pairwise in the post-hoc tests; each must be a
# column of the table returned by prepare_friedman.
settings = [
    "generic",
    "layered",
    "microservices",
    "event",
]

def posthoc_wilcoxon(df, criterion):
    """Run Holm-corrected pairwise Wilcoxon signed-rank tests for one criterion.

    Every unordered pair drawn from the module-level ``settings`` list is
    compared on the per-requirement scores produced by ``prepare_friedman``,
    using ``scipy.stats.wilcoxon`` with its default options. The raw p-values
    of all pairs are then adjusted together with Holm's method.

    The corrected p-values are printed (one line per pair), and the complete
    results are also returned so they can be inspected, tabulated or exported
    instead of being re-parsed from the printed text.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format scores accepted by ``prepare_friedman``.
    criterion : str
        Criterion whose scores are compared.

    Returns
    -------
    pandas.DataFrame
        One row per pair, in ``itertools.combinations(settings, 2)`` order,
        with columns ``criterion``, ``setting_a``, ``setting_b``,
        ``statistic`` (Wilcoxon statistic), ``p_raw`` (uncorrected p-value),
        ``p_holm`` (Holm-corrected p-value) and ``reject`` (Holm decision at
        the default alpha of ``multipletests``).
    """
    pivot = prepare_friedman(df, criterion)
    pairs = list(itertools.combinations(settings, 2))

    # All pairs are tested before any correction: Holm's adjustment needs the
    # full family of raw p-values at once.
    statistics = []
    pvals = []
    for a, b in pairs:
        stat, p = wilcoxon(pivot[a], pivot[b])
        statistics.append(stat)
        pvals.append(p)

    # multipletests returns (reject, corrected p-values, Sidak alpha, Bonferroni alpha).
    reject, corrected, _, _ = smm.multipletests(pvals, method="holm")

    print(f"\n=== Post-hoc Wilcoxon Tests: {criterion} ===")
    for (a, b), corr_p in zip(pairs, corrected):
        print(f"{a} vs {b}: corrected p = {corr_p:.5f}")

    return pd.DataFrame(
        {
            "criterion": criterion,
            "setting_a": [a for a, _ in pairs],
            "setting_b": [b for _, b in pairs],
            "statistic": statistics,
            "p_raw": pvals,
            "p_holm": corrected,
            "reject": reject,
        }
    )

In [7]:
# Pairwise follow-up comparisons for every criterion, printed one block each.
for c in criteria:
    posthoc_wilcoxon(df, criterion=c)


=== Post-hoc Wilcoxon Tests: completeness ===
generic vs layered: corrected p = 0.08963
generic vs microservices: corrected p = 0.00180
generic vs event: corrected p = 0.01678
layered vs microservices: corrected p = 0.08963
layered vs event: corrected p = 0.97877
microservices vs event: corrected p = 0.08963

=== Post-hoc Wilcoxon Tests: correctness ===
generic vs layered: corrected p = 0.28037
generic vs microservices: corrected p = 0.28837
generic vs event: corrected p = 0.60636
layered vs microservices: corrected p = 0.79451
layered vs event: corrected p = 0.11263
microservices vs event: corrected p = 0.15255

=== Post-hoc Wilcoxon Tests: uml_adherence ===
generic vs layered: corrected p = 0.01204
generic vs microservices: corrected p = 0.00152
generic vs event: corrected p = 0.05601
layered vs microservices: corrected p = 0.22389
layered vs event: corrected p = 0.10008
microservices vs event: corrected p = 0.04572

=== Post-hoc Wilcoxon Tests: clarity ===
generic vs layered: corre