# Wilcoxon Test for Accuracy

In [32]:
from numpy.random import seed
from numpy.random import randn
from scipy.stats import wilcoxon
import pandas as pd
import numpy as np
import itertools

In [40]:
# Wilcoxon signed-rank test

class MyWilcoxon:
    def __init__(self, sheet_path=r'C:\Users\HassanEldeeb\Documents\GitHub\AutoMLBenchmarking\Complete_Sheet.xlsx',
                sheet_names=['10 Min', '30 Min', '60 Min', '4 Hours', 'meta-features']):
        
        
        self.sheet = pd.read_excel(sheet_path,
                              null_values=['', 'NA', 'NAN', 'NaN', 'Nan', 'NA\n','0', '0.0', 'None'],
                              sheet_name=sheet_names)
        
        
        
    def calc_wilcoxon(self):
        for t in ['10 Min', '30 Min', '60 Min', '4 Hours']:
            self.sheet[t].rename(columns={"sklearn_e_accuracy_mean": "sklearn-e_accuracy_mean",
                                            "sklearn_m_accuracy_mean": "sklearn-m_accuracy_mean",
                                            "sklearn_v_accuracy_mean": "sklearn-v_accuracy_mean"}, inplace=True)
        result = pd.DataFrame(columns = ['Factor_1', 'Factor_2', 'Time_Budget_1', 'Time_Budget_2',
                                         'p_value', 'stat', 'Statistically_Better'])
        cols = ['sklearn_accuracy_mean', 'sklearn-e_accuracy_mean', 'sklearn-m_accuracy_mean', 'sklearn-v_accuracy_mean',
                'autoweka_accuracy_mean', 'recipe_valid_acc', 'smartml_valid_acc', 'tpot_accuracy_mean', 'atm_acc']
        for t in itertools.product(['10 Min', '30 Min', '60 Min', '4 Hours'], ['10 Min', '30 Min', '60 Min', '4 Hours']):
            for col in itertools.product(cols, cols):
                if (t[0]== t[1] and col[0] != col[1]) or ((t[0]!= t[1] and col[0] == col[1])):
                    data1 = self.sheet[t[0]][col[0]]
                    data2 = self.sheet[t[1]][col[1]]
                    condition = (data1.notna()) & (data2.notna())
                    data1 = data1[condition]
                    data2 = data2[condition]
                    
                    stat, p = wilcoxon(data1, data2)
                    if p <= 0.05:
                        if data1.mean() > data2.mean():
                            better = '1'
                        elif data2.mean() > data1.mean():
                            better = '2'
                        else:
                            better = 'None'
                    else:
                        better = 'None'
                    new_row = {'Factor_1': col[0].split('_')[0], 'Factor_2': col[1].split('_')[0], 'Time_Budget_1':t[0], 'Time_Budget_2':t[1],
                                'p_value': p, 'stat': stat, 'Statistically_Better': better}
                    result = result.append(new_row, ignore_index=True)
        return result
    
    def calc_wilcoxon_time_budgets(self):
        for t in ['10 Min', '30 Min', '60 Min', '4 Hours']:
            self.sheet[t].rename(columns={"sklearn_e_accuracy_mean": "sklearn-e_accuracy_mean",
                                            "sklearn_m_accuracy_mean": "sklearn-m_accuracy_mean",
                                            "sklearn_v_accuracy_mean": "sklearn-v_accuracy_mean"}, inplace=True)
        result = pd.DataFrame(columns = ['Factor_1', 'Factor_2', 'Time_Budget_1', 'Time_Budget_2',
                                         'p_value', 'stat', 'avg_diff', 'Statistically_Better'])
        cols = ['sklearn_accuracy_mean', 'sklearn-e_accuracy_mean', 'sklearn-m_accuracy_mean', 'sklearn-v_accuracy_mean',
                'autoweka_accuracy_mean', 'recipe_valid_acc', 'smartml_valid_acc', 'tpot_accuracy_mean', 'atm_acc']
        t = ['10 Min', '30 Min', '60 Min', '4 Hours']
        for t1 in range(4):
            for t2 in range(t1):
                for col in cols:
                    #print('{} - {}'.format(t[t1], t[t2]))
                    data1 = self.sheet[t[t1]][col]
                    data2 = self.sheet[t[t2]][col]
                    condition = (data1.notna()) & (data2.notna())
                    data1 = data1[condition]
                    data2 = data2[condition]

                    stat, p = wilcoxon(data1, data2)
                    avg_diff = data1.mean() - data2.mean()
                    if p <= 0.05:
                        if avg_diff > 0:
                            better = '1'
                        elif avg_diff < 0:
                            better = '2'
                        else:
                            better = 'None'
                    else:
                        better = 'None'
                    new_row = {'Factor_1': col.split('_')[0], 'Factor_2': col.split('_')[0], 'Time_Budget_1':t[t1], 'Time_Budget_2':t[t2],
                                'p_value': p, 'stat': stat, 'avg_diff': avg_diff, 'Statistically_Better': better}
                    result = result.append(new_row, ignore_index=True)
        return result
            


In [41]:
wil = MyWilcoxon()

In [42]:
tpot10 = wil.sheet['10 Min'].loc[:,['dataset', 'tpot_accuracy_mean']]
tpot30 = wil.sheet['30 Min'].loc[:,['dataset', 'tpot_accuracy_mean']]
tpot60 = wil.sheet['60 Min'].loc[:,['dataset', 'tpot_accuracy_mean']]
tpot240 = wil.sheet['4 Hours'].loc[:,['dataset', 'tpot_accuracy_mean']]
condition = (tpot10.tpot_accuracy_mean.notna()) & (tpot30.tpot_accuracy_mean.notna())
tpot10 = tpot10[condition]
tpot30 = tpot30[condition]
tpot60 = tpot60[condition]
tpot240 = tpot240[condition]
print(tpot10.mean(), tpot30.mean(), tpot60.mean(), tpot240.mean())

tpot_accuracy_mean    0.891778
dtype: float64 tpot_accuracy_mean    0.901203
dtype: float64 tpot_accuracy_mean    0.899061
dtype: float64 tpot_accuracy_mean    0.904069
dtype: float64


In [43]:
s30 = wil.sheet['30 Min'].loc[:,['dataset', 'sklearn_v_accuracy_mean']]
s240 = wil.sheet['4 Hours'].loc[:,['dataset', 'sklearn_v_accuracy_mean']]
display(s30)
condition = (s240.sklearn_v_accuracy_mean.notna()) & (s30.sklearn_v_accuracy_mean.notna())
s30 = s30[condition]
s240 = s240[condition]
#print(s30.mean(), s240.mean())
result = pd.concat([s30, s240], axis=1, sort=False)
result

Unnamed: 0,dataset,sklearn_v_accuracy_mean
0,aaaData_for_UCI_named,1.000000
1,AirlinesCodrnaAdult,0.786878
2,Amazon,0.641778
3,analcatdata_authorship,0.990521
4,AP_Breast_Lung,0.974576
...,...,...
95,synthetic_control,0.960000
96,tumors_C,0.600000
97,umistfacescropped,0.965278
98,vowel,0.998656


Unnamed: 0,dataset,sklearn_v_accuracy_mean,dataset.1,sklearn_v_accuracy_mean.1
0,aaaData_for_UCI_named,1.000000,aaaData_for_UCI_named,1.000000
1,AirlinesCodrnaAdult,0.786878,AirlinesCodrnaAdult,0.786785
2,Amazon,0.641778,Amazon,0.765333
3,analcatdata_authorship,0.990521,analcatdata_authorship,0.990521
4,AP_Breast_Lung,0.974576,AP_Breast_Lung,0.966102
...,...,...,...,...
95,synthetic_control,0.960000,synthetic_control,0.986667
96,tumors_C,0.600000,tumors_C,0.596325
97,umistfacescropped,0.965278,umistfacescropped,0.986111
98,vowel,0.998656,vowel,1.000000


In [44]:
result = wil.calc_wilcoxon()
result_tb = wil.calc_wilcoxon_time_budgets()

In [45]:
result[((result.Statistically_Better == '1') ^ (result.Statistically_Better == 'None')) & (((result.Factor_1=='sklearn-v') & (result.Factor_2=='sklearn')) ^ ((result.Factor_2=='sklearn-v') & (result.Factor_1=='sklearn')))].round(3)
#result[(result.Statistically_Better == '1') & ((result.Factor_1=='sklearn-v') ^ (result.Factor_2=='sklearn-v'))].round(3)

Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,Statistically_Better
2,sklearn,sklearn-v,10 Min,10 Min,0.0,952.5,1
110,sklearn,sklearn-v,30 Min,30 Min,0.002,1018.0,1
218,sklearn,sklearn-v,60 Min,60 Min,0.0,944.5,1
326,sklearn,sklearn-v,4 Hours,4 Hours,0.0,761.0,1


In [48]:
#display(result_tb[result_tb.Factor_1 == result_tb.Factor_1.unique()[0]].round(3))
for tool in result_tb.Factor_1.unique():
    display(result_tb[result_tb.Factor_1 == tool].round(3))

Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
0,sklearn,sklearn,30 Min,10 Min,0.362,1357.5,0.003,
9,sklearn,sklearn,60 Min,10 Min,0.0,970.5,0.009,1.0
18,sklearn,sklearn,60 Min,30 Min,0.019,1194.0,0.005,1.0
27,sklearn,sklearn,4 Hours,10 Min,0.001,1104.0,0.014,1.0
36,sklearn,sklearn,4 Hours,30 Min,0.002,1045.0,0.011,1.0
45,sklearn,sklearn,4 Hours,60 Min,0.117,1470.0,0.005,


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
1,sklearn-e,sklearn-e,30 Min,10 Min,0.0,770.5,0.007,1.0
10,sklearn-e,sklearn-e,60 Min,10 Min,0.0,986.5,0.011,1.0
19,sklearn-e,sklearn-e,60 Min,30 Min,0.675,1815.0,0.004,
28,sklearn-e,sklearn-e,4 Hours,10 Min,0.0,1110.0,0.013,1.0
37,sklearn-e,sklearn-e,4 Hours,30 Min,0.038,1531.5,0.006,1.0
46,sklearn-e,sklearn-e,4 Hours,60 Min,0.265,1730.0,0.002,


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
2,sklearn-m,sklearn-m,30 Min,10 Min,0.211,669.0,0.004,
11,sklearn-m,sklearn-m,60 Min,10 Min,0.198,992.0,0.004,
20,sklearn-m,sklearn-m,60 Min,30 Min,0.956,1164.0,0.0,
29,sklearn-m,sklearn-m,4 Hours,10 Min,0.099,1379.5,0.008,
38,sklearn-m,sklearn-m,4 Hours,30 Min,0.614,1592.5,0.004,
47,sklearn-m,sklearn-m,4 Hours,60 Min,0.398,1557.0,0.004,


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
3,sklearn-v,sklearn-v,30 Min,10 Min,0.226,1332.5,0.005,
12,sklearn-v,sklearn-v,60 Min,10 Min,0.004,1165.5,0.007,1.0
21,sklearn-v,sklearn-v,60 Min,30 Min,0.141,1279.0,0.002,
30,sklearn-v,sklearn-v,4 Hours,10 Min,0.0,1032.0,0.007,1.0
39,sklearn-v,sklearn-v,4 Hours,30 Min,0.027,1391.0,0.002,1.0
48,sklearn-v,sklearn-v,4 Hours,60 Min,0.11,1536.0,0.0,


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
4,autoweka,autoweka,30 Min,10 Min,0.005,271.5,0.031,1.0
13,autoweka,autoweka,60 Min,10 Min,0.037,564.0,0.038,1.0
22,autoweka,autoweka,60 Min,30 Min,0.658,500.0,0.008,
31,autoweka,autoweka,4 Hours,10 Min,0.0,486.0,0.083,1.0
40,autoweka,autoweka,4 Hours,30 Min,0.022,673.0,0.051,1.0
49,autoweka,autoweka,4 Hours,60 Min,0.004,293.0,0.044,1.0


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
5,recipe,recipe,30 Min,10 Min,0.272,35.0,-0.031,
14,recipe,recipe,60 Min,10 Min,0.067,56.0,0.014,
23,recipe,recipe,60 Min,30 Min,0.829,167.0,0.007,
32,recipe,recipe,4 Hours,10 Min,0.01,60.0,0.026,1.0
41,recipe,recipe,4 Hours,30 Min,0.65,196.5,0.005,
50,recipe,recipe,4 Hours,60 Min,0.939,135.5,-0.001,


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
6,smartml,smartml,30 Min,10 Min,0.636,662.0,0.007,
15,smartml,smartml,60 Min,10 Min,0.832,772.0,0.009,
24,smartml,smartml,60 Min,30 Min,0.597,514.0,0.009,
33,smartml,smartml,4 Hours,10 Min,0.121,835.5,0.026,
42,smartml,smartml,4 Hours,30 Min,0.05,625.0,0.025,1.0
51,smartml,smartml,4 Hours,60 Min,0.071,577.0,0.015,


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
7,tpot,tpot,30 Min,10 Min,0.165,165.0,0.009,
16,tpot,tpot,60 Min,10 Min,0.339,186.0,0.008,
25,tpot,tpot,60 Min,30 Min,0.4,443.0,0.001,
34,tpot,tpot,4 Hours,10 Min,0.013,151.5,0.013,1.0
43,tpot,tpot,4 Hours,30 Min,0.027,289.5,0.006,1.0
52,tpot,tpot,4 Hours,60 Min,0.01,482.0,0.004,1.0


Unnamed: 0,Factor_1,Factor_2,Time_Budget_1,Time_Budget_2,p_value,stat,avg_diff,Statistically_Better
8,atm,atm,30 Min,10 Min,0.341,632.0,0.003,
17,atm,atm,60 Min,10 Min,0.488,766.0,-0.004,
26,atm,atm,60 Min,30 Min,0.538,671.0,-0.008,
35,atm,atm,4 Hours,10 Min,0.594,900.5,0.003,
44,atm,atm,4 Hours,30 Min,0.964,879.0,-0.003,
53,atm,atm,4 Hours,60 Min,0.496,879.5,0.005,


In [16]:
cols = ['sklearn_accuracy_mean', 'sklearn-e_accuracy_mean', 'sklearn-m_accuracy_mean', 'sklearn-v_accuracy_mean',
                'autoweka_accuracy_mean', 'recipe_valid_acc', 'smartml_valid_acc', 'tpot_accuracy_mean', 'atm_acc']
summary = pd.DataFrame(columns=['time_budget', 'tool', 'accuracy'])
for t in ['10 Min', '30 Min', '60 Min', '4 Hours']:
    for c in cols:
        df1= pd.DataFrame({'time_budget': [t]*100, 'tool': [c]*100, 'accuracy': wil.sheet[t][c]})
        summary = pd.concat([summary, df1])
summary

Unnamed: 0,time_budget,tool,accuracy
0,10 Min,sklearn_accuracy_mean,1.000000
1,10 Min,sklearn_accuracy_mean,0.787060
2,10 Min,sklearn_accuracy_mean,0.781333
3,10 Min,sklearn_accuracy_mean,1.000000
4,10 Min,sklearn_accuracy_mean,0.968927
...,...,...,...
95,4 Hours,atm_acc,1.000000
96,4 Hours,atm_acc,0.943000
97,4 Hours,atm_acc,0.875000
98,4 Hours,atm_acc,0.792000


In [17]:
import researchpy as rp
s = summary[summary.accuracy!=0 & ((summary.tool == 'sklearn_v_accuracy_mean') ^ (summary.tool == 'sklearn_m_accuracy_mean'))]
rp.summary_cont(s.groupby(['time_budget', 'tool']))['accuracy'].round(3)#.to_latex(index=True)





Unnamed: 0_level_0,Unnamed: 1_level_0,N,Mean,SD,SE,95% Conf.,Interval
time_budget,tool,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
10 Min,atm_acc,75,0.888,0.123,0.014,0.86,0.916
10 Min,autoweka_accuracy_mean,86,0.848,0.161,0.017,0.814,0.882
10 Min,recipe_valid_acc,33,0.84,0.176,0.031,0.78,0.9
10 Min,sklearn-e_accuracy_mean,99,0.873,0.139,0.014,0.846,0.901
10 Min,sklearn-m_accuracy_mean,99,0.87,0.144,0.015,0.841,0.898
10 Min,sklearn-v_accuracy_mean,99,0.868,0.145,0.015,0.839,0.896
10 Min,sklearn_accuracy_mean,99,0.873,0.143,0.014,0.845,0.901
10 Min,smartml_valid_acc,89,0.799,0.212,0.022,0.755,0.843
10 Min,tpot_accuracy_mean,43,0.894,0.117,0.018,0.859,0.929
30 Min,atm_acc,74,0.903,0.116,0.013,0.877,0.93
