In [17]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.descriptivestats import sign_test
from statsmodels.stats.weightstats import zconfint
from scipy import stats
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [4]:
# Load the tab-separated AUC table from disk, one row per dataset.
auc_data = pd.read_csv(
    filepath_or_buffer="data/AUCs.txt",
    sep="\t",
)

In [9]:
# Inspect the column labels that were read from the file.
auc_data.columns

Index(['Unnamed: 0', 'C4.5', 'C4.5+m', 'C4.5+cf', 'C4.5+m+cf'], dtype='object')

In [12]:
# Give the header-less first column (read in as 'Unnamed: 0') an explicit
# name. Rebinding the result instead of using inplace=True keeps the cell a
# plain "input -> output" step and avoids mutating the frame behind the
# reader's back.
auc_data = auc_data.rename(columns={'Unnamed: 0': "df"})

In [13]:
# Display the table after the first column has been renamed.
auc_data

Unnamed: 0,df,C4.5,C4.5+m,C4.5+cf,C4.5+m+cf
0,adult (sample),0.763,0.768,0.771,0.798
1,breast cancer,0.599,0.591,0.59,0.569
2,breast cancer wisconsin,0.954,0.971,0.968,0.967
3,cmc,0.628,0.661,0.654,0.657
4,ionosphere,0.882,0.888,0.886,0.898
5,iris,0.936,0.931,0.916,0.931
6,liver disorders,0.661,0.668,0.609,0.685
7,lung cancer,0.583,0.583,0.563,0.625
8,lymphography,0.775,0.838,0.866,0.875
9,mushroom,1.0,1.0,1.0,1.0


In [25]:
%%time 
# Wilcoxon signed-rank test for every unordered pair of model columns.
# The first column is skipped; the remaining ones hold per-dataset scores.
model_columns = list(auc_data.columns[1:])
corr_data = []
for position, lhs_column in enumerate(model_columns):
    # Pair only with columns further to the right so each pair is tested once
    # and no column is compared with itself.
    for rhs_column in model_columns[position + 1:]:
        statistic, p_value = stats.wilcoxon(auc_data[lhs_column], auc_data[rhs_column])
        corr_data.append([lhs_column, rhs_column, statistic, p_value])

Wall time: 3 ms


In [26]:
# Raw rows collected by the pairwise loop: [lhs column, rhs column, Wilcoxon statistic, p-value].
corr_data

[['C4.5', 'C4.5+m', 6.5, 0.01075713311978963],
 ['C4.5', 'C4.5+cf', 43.0, 0.861262330095348],
 ['C4.5', 'C4.5+m+cf', 11.0, 0.015874359307532084],
 ['C4.5+m', 'C4.5+cf', 18.0, 0.05432871367198416],
 ['C4.5+m', 'C4.5+m+cf', 22.0, 0.3278256758446406],
 ['C4.5+cf', 'C4.5+m+cf', 10.5, 0.025313519968766574]]

In [27]:
# Tabulate the pairwise results: one row per model pair, carrying the test
# statistic and its raw p-value.
model_correlation = pd.DataFrame.from_records(
    data=corr_data,
    columns=["model_A", "model_B", "statistic", "p"],
)

In [37]:
# Count the model pairs whose raw (uncorrected) p-value is below 0.05.
uncorrected_significant = model_correlation.eval("p < .05")
uncorrected_significant.sum()

3

In [38]:
# Display the pairwise test results with their raw p-values.
model_correlation

Unnamed: 0,model_A,model_B,statistic,p
0,C4.5,C4.5+m,6.5,0.010757
1,C4.5,C4.5+cf,43.0,0.861262
2,C4.5,C4.5+m+cf,11.0,0.015874
3,C4.5+m,C4.5+cf,18.0,0.054329
4,C4.5+m,C4.5+m+cf,22.0,0.327826
5,C4.5+cf,C4.5+m+cf,10.5,0.025314


In [44]:
# Benjamini-Hochberg ("fdr_bh") correction of the raw p-values at alpha = 0.05.
# multipletests returns (reject flags, corrected p-values, Sidak-corrected
# alpha, Bonferroni-corrected alpha); the last two are kept as a1 / a2.
bh_result = multipletests(model_correlation["p"], alpha=0.05, method="fdr_bh")
reject, p_corrected, a1, a2 = bh_result

In [45]:
# Attach the corrected p-values and the per-pair rejection flags as two new
# columns (order: p_corrected, then reject).
model_correlation = model_correlation.assign(
    p_corrected=p_corrected,
    reject=reject,
)

In [46]:
# Display the results with the corrected p-values and rejection flags attached.
model_correlation

Unnamed: 0,model_A,model_B,statistic,p,p_corrected,reject
0,C4.5,C4.5+m,6.5,0.010757,0.047623,True
1,C4.5,C4.5+cf,43.0,0.861262,0.861262,False
2,C4.5,C4.5+m+cf,11.0,0.015874,0.047623,True
3,C4.5+m,C4.5+cf,18.0,0.054329,0.081493,False
4,C4.5+m,C4.5+m+cf,22.0,0.327826,0.393391,False
5,C4.5+cf,C4.5+m+cf,10.5,0.025314,0.050627,False


In [None]:
# Holm correction of the same raw p-values at alpha = 0.05.
# NOTE(review): this rebinds `reject` and `p_corrected`, which an earlier cell
# filled with the "fdr_bh" results; the p_corrected / reject columns already
# stored in model_correlation are not updated by this line, so re-running the
# column-assignment cell afterwards would silently store Holm values instead.
reject, p_corrected, a1, a2 = multipletests(model_correlation.p, alpha=.05, method="holm")