In [3]:
# Copied from https://gist.github.com/timm/5630491

###############################################################
# The A12 non-parametric test
# Tim Menzies, (c) 2013, tim@menzies.us
# (c) http://creativecommons.org/licenses/by/3.0/
#
# Vargha and Delaney's A12 statistic is a non-parametric effect
# size measure. Reference: A. Vargha and H. D. Delaney. A critique
# and improvement of the CL common language effect size statistics of
# McGraw and Wong. Journal of Educational and Behavioral Statistics,
# 25(2):101-132, 2000.
#
# Given a performance measure M seen in m measures of X and n measures
# of Y, the A12 statistic measures the probability that running
# algorithm X yields higher M values than running another algorithm Y.
#
# A12 = #(X > Y)/mn + 0.5*#(X=Y)/mn
#
# According to Vargha and Delaney, a small, medium, or large difference
# between two populations is indicated as follows:
#
# + Big is A12 over 0.71
# + Medium is A12 over 0.64
# + Small is A12 over 0.56
#
# In my view, this seems gratuitously different to...
#
# + Big is A12 over three-quarters (0.75)
# + Medium is A12 over two-thirds (0.66)
# + Small is A12 over half (0.5)
#
# Whatever, the following code parameterizes that magic number
# so you can use the standard values if you want to.
#
# While a12 compares two treatments, a12s handles multiple treatments.
# The populations are sorted by their mean. Then, with
# b4 = sample[i] and after = sample[i+1], rank(after) = 1 + rank(b4)
# if a12 reports that the two populations are different
# (otherwise after shares the rank of b4).

# To simplify that process, I offer the following syntax. A population
# is a list of numbers, which may be unsorted, and starts with some
# symbol or string describing the population. a12s expects a list of
# such populations. For examples of that syntax, see the following use cases:

# from a12 import *
#
# rxs= [["x1", 0.34, 0.49, 0.51, 0.60],
#       ["x2", 0.9, 0.7, 0.8, 0.60],
#       ["x3", 0.15, 0.25, 0.4, 0.35],
#       ["x4", 0.6, 0.7, 0.8, 0.90],
#       ["x5", 0.1, 0.2, 0.3, 0.40]]
# for rx in a12s(rxs,rev=False,enough=0.75): print(rx)
#
# print("")
# rxs = [["y1", 101, 100, 99, 101, 99.5],
#        ["y2", 101, 100, 99, 101, 100.0],
#        ["y3", 101, 100, 99.5, 101, 99.0],
#        ["y4", 101, 100, 99, 101, 100.0]]
# for rx in a12s(rxs): print(rx)

class Rx:
    """One treatment: its name, its measurements, their mean, and a rank.

    A treatment is built from a population written as a single sequence
    whose first item is the treatment's name and whose remaining items are
    the measured numbers.
    """

    def __init__(self, lst):
        """Split ``lst`` into name and numbers and cache the mean.

        ``rank`` starts at 0 and is meant to be overwritten by whoever
        ranks the treatments.
        """
        self.rx = lst[0]      # treatment name
        self.lst = lst[1:]    # the measurements themselves
        self.mean = sum(self.lst) / len(self.lst)
        self.rank = 0

    def __repr__(self):
        """Render as ``rank #<rank> <name> at <mean>``."""
        return 'rank #%s %s at %s' % (self.rank, self.rx, self.mean)

def a12s(lst,rev=True,enough=0.66):
    """Rank several treatments by chaining pairwise A12 comparisons.

    Parameters
    ----------
    lst : iterable
        Populations, each a sequence ``[name, num1, num2, ...]`` (the form
        accepted by ``Rx``).
    rev : bool
        Passed to the sort (``reverse=rev``, so a true value orders the
        treatments by descending mean) and to ``a12`` (a true value counts
        how often the earlier treatment's values are larger, a false value
        how often they are smaller).
    enough : float
        A12 threshold; a treatment starts a new rank only when its A12
        against the treatment immediately before it exceeds this value.

    Returns
    -------
    list of Rx
        The treatments in sorted order with ``rank`` filled in. The first
        treatment has rank 1; each later one either shares the rank of its
        predecessor or gets the next rank. An empty input gives an empty
        list.
    """
    rxs = sorted((Rx(one) for one in lst), key=lambda x: x.mean, reverse=rev)
    if not rxs:
        # Nothing to rank. Without this guard the rxs[0] lookup below
        # would raise IndexError for an empty collection of populations.
        return rxs
    one = rxs[0]
    rank = one.rank = 1
    for two in rxs[1:]:
        # Each treatment is compared with its immediate neighbour in the
        # sorted order, not with the first member of the current rank.
        if a12(one.lst, two.lst, rev) > enough:
            rank += 1
        two.rank = rank
        one = two
    return rxs

def a12(lst1,lst2,rev=True):
    """Vargha-Delaney A12: how often does lst1 beat lst2, pair by pair?

    Every value of ``lst1`` is compared with every value of ``lst2``.
    A pair is a tie when the two values are equal; otherwise it is a win
    when the ``lst1`` value is larger (``rev`` true) or smaller (``rev``
    false). The result is ``(wins + 0.5 * ties) / (len(lst1) * len(lst2))``.

    Raises ZeroDivisionError when either input is empty.
    """
    wins = ties = 0.0
    for left in lst1:
        for right in lst2:
            if left == right:
                ties += 1
                continue
            # Direction of "better" is chosen by rev.
            beats = left > right if rev else left < right
            if beats:
                wins += 1
    total_pairs = len(lst1) * len(lst2)
    return (wins + 0.5 * ties) / total_pairs

def fromFile(f="a12.dat",rev=True,enough=0.66):
    """Read treatments from a whitespace-separated text file and rank them.

    The file is read word by word. A word that looks like a number (an
    optional sign followed by a digit) is appended, as a float, to the
    treatment currently being read; any other word starts a new treatment
    with that word as its name (a repeated name restarts that treatment).
    Blank lines are skipped.

    Parameters
    ----------
    f : str
        Path of the file to read.
    rev, enough :
        Passed through unchanged to ``a12s``.

    Returns
    -------
    Whatever ``a12s`` returns for the populations found in the file.

    Raises
    ------
    ValueError
        If a number appears before any treatment name, or if a word that
        looks like a number cannot be converted by ``float``.
    """
    import re
    cache = {}
    now = None  # name of the treatment currently being filled in
    num, space = r'^\+?-?[0-9]', r'[ \t\n]+'
    # Context manager so the file handle is closed even if parsing fails.
    with open(f) as lines:
        for line in lines:
            line = line.strip()
            if not line:
                continue
            for word in re.split(space, line):
                # Match the whole word, not just its first character:
                # a lone "-" or "+" can never satisfy the pattern, so
                # signed values such as "-5" used to be taken for names.
                if re.match(num, word):
                    if now is None:
                        raise ValueError(
                            "%s: number %r appears before any treatment name"
                            % (f, word))
                    cache[now].append(float(word))
                else:
                    now = word
                    cache[now] = [now]
    return a12s(list(cache.values()), rev, enough)

In [2]:
# Load the results table from a CSV file, resolved relative to the current
# working directory.
# NOTE(review): judging by the rendered frame below, each row is one run and
# the columns after `run` are per-algorithm scores -- confirm against the CSV.
import pandas as pd

df = pd.read_csv('edge_roc_df.csv')
df  # bare last expression so the notebook renders the frame

Unnamed: 0.1,Unnamed: 0,run,acdc,node sp,edge sp,eap,eap ig-10
0,0,11,1.0,0.9,0.96,0.0,0.92
1,1,18,0.978723,0.595745,0.829787,0.326241,0.950355
2,2,19,0.946667,0.58,0.8,0.0,0.76
3,3,20,0.5,0.176471,0.447964,0.0,0.627451
4,4,21,0.77399,0.688131,0.700126,0.594066,0.721591
5,5,26,1.0,0.960784,0.921569,0.0,0.843137
6,6,29,0.441176,0.313725,0.411765,0.0,0.176471
7,7,3,0.984694,0.933673,1.0,1.0,0.989796
8,8,33,0.352941,0.72549,0.764706,0.0,0.784314
9,9,34,0.441176,0.156863,0.27451,0.0,0.254902


In [8]:
import scipy.stats as stats

def run_stats(alg_1, alg_2):
    """Compare two algorithm columns of ``df`` and record the results.

    Runs ``scipy.stats.wilcoxon`` and ``a12`` on the columns ``alg_1`` and
    ``alg_2``, prints both results, and stores the Wilcoxon p-value and the
    A12 value at ``[alg_1, alg_2]`` in the notebook-level frames
    ``wilcoxon_p_values_df`` and ``a12_values_df``. Returns nothing.
    """
    first_scores, second_scores = df[alg_1], df[alg_2]

    signed_rank = stats.wilcoxon(first_scores, second_scores)
    print(signed_rank)
    wilcoxon_p_values_df.loc[alg_1, alg_2] = signed_rank.pvalue

    # a12 is called with its default direction (rev=True).
    effect_size = a12(first_scores, second_scores)
    print(effect_size)
    a12_values_df.loc[alg_1, alg_2] = effect_size
    
# Every column after the first two is treated as an algorithm name.
available_algs = df.columns[2:].tolist()
print(available_algs)

# Square result tables labelled by algorithm on both axes. run_stats writes
# the cell [first algorithm, second algorithm] for each unordered pair, so
# only the cells above the diagonal get filled in.
wilcoxon_p_values_df = pd.DataFrame(index=available_algs, columns=available_algs)
a12_values_df = pd.DataFrame(index=available_algs, columns=available_algs)
for i, first_alg in enumerate(available_algs):
    for j in range(i + 1, len(available_algs)):
        second_alg = available_algs[j]
        print(f'{first_alg} vs {second_alg}')
        run_stats(first_alg, second_alg)

['acdc', 'node sp', 'edge sp', 'eap', 'eap ig-10']
acdc vs node sp
WilcoxonResult(statistic=13.0, pvalue=0.00537109375)
0.7155555555555555
acdc vs edge sp
WilcoxonResult(statistic=13.0, pvalue=0.041389404009149304)
0.5822222222222222
acdc vs eap
WilcoxonResult(statistic=1.0, pvalue=0.0001220703125)
0.9022222222222223
acdc vs eap ig-10
WilcoxonResult(statistic=22.0, pvalue=0.0301513671875)
0.6844444444444444
node sp vs edge sp
WilcoxonResult(statistic=3.5, pvalue=0.00042724609375)
0.35555555555555557
node sp vs eap
WilcoxonResult(statistic=1.0, pvalue=0.0001220703125)
0.8577777777777778
node sp vs eap ig-10
WilcoxonResult(statistic=37.0, pvalue=0.207763671875)
0.43333333333333335
edge sp vs eap
WilcoxonResult(statistic=0.0, pvalue=0.000978706525317055)
0.8955555555555555
edge sp vs eap ig-10
WilcoxonResult(statistic=31.0, pvalue=0.10699462890625)
0.5888888888888889
eap vs eap ig-10
WilcoxonResult(statistic=1.0, pvalue=0.0001220703125)
0.12


In [9]:
wilcoxon_p_values_df

Unnamed: 0,acdc,node sp,edge sp,eap,eap ig-10
acdc,,0.005371,0.041389,0.000122,0.030151
node sp,,,0.000427,0.000122,0.207764
edge sp,,,,0.000979,0.106995
eap,,,,,0.000122
eap ig-10,,,,,


In [10]:
a12_values_df

Unnamed: 0,acdc,node sp,edge sp,eap,eap ig-10
acdc,,0.715556,0.582222,0.902222,0.684444
node sp,,,0.355556,0.857778,0.433333
edge sp,,,,0.895556,0.588889
eap,,,,,0.12
eap ig-10,,,,,
