# Wilcoxon Test for F1 Score

In [1]:
from numpy.random import seed
from numpy.random import randn
from scipy.stats import wilcoxon
import pandas as pd
import itertools

In [2]:
# Wilcoxon signed-rank test

class MyWilcoxon:
    """Pairwise Wilcoxon signed-rank comparison of AutoML F1 scores.

    On construction, three Excel logs (``skout.xlsx``, ``atmout.xlsx`` and
    ``tpot.xlsx``) are read.  Each framework's raw ``methods`` value is mapped
    to a common search-space label: the full classifier list becomes ``fc``,
    the three-classifier list becomes ``3c`` and every single-classifier run
    becomes ``1c``.  Scores are then averaged per
    (dataset, time_budget, methods) and stored in ``self.skout``, ``self.atm``
    and ``self.tpot`` as frames indexed by those three keys.
    """

    # Directory used when the caller does not pass ``log_dir``.
    DEFAULT_LOG_DIR = r"C:\Users\HassanEldeeb\Documents\GitHub\AutoMLBenchmarking\logs_search_space"

    # Cell contents that must be read as missing values.
    NA_VALUES = ['', 'NA', 'NAN', 'NaN', 'Nan', 'NA\n', 'None']

    # Raw ``methods`` value -> common search-space label, per framework.
    SKOUT_LABELS = {
        "['adaboost', 'bernoulli_nb', 'decision_tree', 'extra_trees', 'gaussian_nb', 'gradient_boosting', 'k_nearest_neighbors', 'lda', 'liblinear_svc', 'libsvm_svc', 'multinomial_nb', 'passive_aggressive', 'qda', 'random_forest', 'sgd']": "fc",
        "['decision_tree', 'libsvm_svc', 'random_forest']": "3c",
        "['libsvm_svc']": "1c",
        "['decision_tree']": "1c",
        "['random_forest']": "1c",
    }
    ATM_LABELS = {
        "['logreg', 'svm', 'sgd', 'dt', 'et', 'rf', 'gnb', 'mnb', 'bnb', 'gp', 'pa', 'knn', 'mlp', 'ada']": "fc",
        "['rf', 'dt', 'svm']": "3c",
        "['logreg', 'dt', 'knn']": "def",
        "['svm']": "1c",
        "['dt']": "1c",
        "['rf']": "1c",
    }
    TPOT_LABELS = {
        "default": "fc",
        "3C": "3c",
        "SVC": "1c",
        "DT": "1c",
        "RF": "1c",
    }

    def __init__(self, log_dir=None):
        """Load and aggregate the score logs of the three frameworks.

        Args:
            log_dir: Directory containing ``skout.xlsx``, ``atmout.xlsx`` and
                ``tpot.xlsx``.  Defaults to ``DEFAULT_LOG_DIR``.
        """
        self.d20 = ["AirlinesCodrnaAdult",
                    "Amazon",
                    "AP_Breast_Lung",
                    "arrhythmia",
                    "audiology",
                    "connect-4",
                    "dataset_31_credit-g",
                    "electricity-normalized",
                    "eye_movements",
                    "gina_agnostic",
                    "hiva_agnostic",
                    "KDDCup99",
                    "MagicTelescope",
                    "openml_phpJNxH0q",
                    "phpZrCzJR",
                    "pokerhand-normalized",
                    "solar-flare_1",
                    "umistfacescropped",
                    "vowel",
                    "wine-quality-red"]
        self.d10 = ["dataset_39_ecoli",
                    "synthetic_control",
                    "avila-tr",
                    "phpGUrE90",
                    "dataset_60_waveform-5000",
                    "dataset_186_satimage",
                    "dataset_40_sonar",
                    "phpmPOD5A",
                    "AP_Omentum_Ovary",
                    "phprAeXmK"]

        if log_dir is None:
            log_dir = self.DEFAULT_LOG_DIR

        self.skout = self._load_scores(log_dir, "skout.xlsx", self.SKOUT_LABELS, is_30=True)
        self.atm = self._load_scores(log_dir, "atmout.xlsx", self.ATM_LABELS)
        self.tpot = self._load_scores(log_dir, "tpot.xlsx", self.TPOT_LABELS)

    def _load_scores(self, log_dir, filename, labels, is_30=False):
        """Read one Excel log and return mean F1 per (dataset, time_budget, methods)."""
        frame = pd.read_excel("{}/{}".format(log_dir, filename), na_values=self.NA_VALUES)
        # Copy so the column assignment below never writes into a view.
        frame = frame[['dataset', 'time_budget', 'methods', 'f1score']].copy()
        frame['methods'] = frame['methods'].replace(labels)
        frame = self.clean(frame, is_30=is_30)
        return frame.groupby(['dataset', 'time_budget', 'methods']).mean()

    def calc_wilcoxon(self):
        """Run the Wilcoxon signed-rank test on every pair of configurations.

        A configuration is (framework, time budget, search space).  Two
        configurations are compared only when they differ in exactly one of
        those three components.

        Returns:
            pandas.DataFrame with one row per compared ordered pair and the
            columns ``Framework_1``, ``Framework_2``, ``Time_Budget_1``,
            ``Time_Budget_2``, ``Search_Space_1``, ``Search_Sapce_2``,
            ``p_value``, ``stat`` and ``Statistically_Better``.  The last
            column is ``'1'`` or ``'2'`` when ``p_value <= 0.05`` and that
            side has the higher mean score, otherwise ``'None'``.
        """
        # NOTE: 'Search_Sapce_2' is misspelled on purpose; existing consumers
        # of the result rely on this exact column name.
        columns = ['Framework_1', 'Framework_2', 'Time_Budget_1', 'Time_Budget_2', 'Search_Space_1',
                   'Search_Sapce_2', 'p_value', 'stat', 'Statistically_Better']
        # Frameworks are told apart by name; comparing frame contents would be
        # slow and would confuse two frameworks that hold equal scores.
        frameworks = [('AutoSKLearn', self.skout), ('TPOT', self.tpot), ('ATM', self.atm)]
        budgets = [60, 30, 10]
        spaces = ['fc', '3c', '1c']

        rows = []
        for (name1, frame1), (name2, frame2) in itertools.product(frameworks, repeat=2):
            for t1, t2 in itertools.product(budgets, repeat=2):
                for ss1, ss2 in itertools.product(spaces, repeat=2):
                    differing = sum((name1 != name2, t1 != t2, ss1 != ss2))
                    if differing != 1:
                        continue

                    # AutoSKLearn-vs-AutoSKLearn uses every dataset in its
                    # frame; all other pairs are restricted to ``self.d20``.
                    if name1 == 'AutoSKLearn' and name2 == 'AutoSKLearn':
                        data_sets = slice(None)
                    else:
                        data_sets = self.d20
                    data1 = frame1.loc[(data_sets, t1, ss1), :].f1score.values
                    data2 = frame2.loc[(data_sets, t2, ss2), :].f1score.values

                    stat, p, better = self._compare(data1, data2)
                    rows.append({'Framework_1': name1, 'Framework_2': name2,
                                 'Time_Budget_1': t1, 'Time_Budget_2': t2,
                                 'Search_Space_1': ss1, 'Search_Sapce_2': ss2,
                                 'p_value': p, 'stat': stat,
                                 'Statistically_Better': better})
        # Build the frame once instead of growing it row by row.
        return pd.DataFrame(rows, columns=columns)

    @staticmethod
    def _compare(data1, data2):
        """Return ``(stat, p_value, better)`` for two paired score arrays."""
        # With no non-zero difference the test is undefined; record NaN
        # rather than depend on how the installed scipy handles that case.
        if len(data1) == len(data2) and (data1 == data2).all():
            return float('nan'), float('nan'), 'None'

        stat, p = wilcoxon(data1, data2)
        better = 'None'
        if p <= 0.05:
            if data1.mean() > data2.mean():
                better = '1'
            elif data2.mean() > data1.mean():
                better = '2'
        return stat, p, better

    def clean(self, df, is_30 = False):
        """Keep usable rows and guarantee one row per expected combination.

        Rows with ``f1score == 0`` and rows whose dataset is not in
        ``self.d20`` are dropped.  Then, for every dataset in ``self.d20``,
        every time budget in (10, 30, 60) and every search space in
        ('fc', '3c', '1c') that has no remaining row, a row with
        ``f1score`` 0 is added.

        Args:
            df: Frame with at least the columns ``dataset``, ``time_budget``,
                ``methods`` and ``f1score``.
            is_30: Currently ignored; kept so existing callers keep working.

        Returns:
            A new frame with a fresh 0..n-1 index.
        """
        df = df[df.f1score != 0]
        datasets = self.d20
        df = df[df.dataset.isin(datasets)]

        # One pass over the frame instead of one boolean scan per combination.
        present = set(zip(df.dataset, df.time_budget, df.methods))
        missing = [{'dataset': d, 'time_budget': t, 'methods': ss, 'f1score': 0}
                   for d in datasets
                   for t in [10, 30, 60]
                   for ss in ['fc', '3c', '1c']
                   if (d, t, ss) not in present]
        if not missing:
            return df.reset_index(drop=True)
        return pd.concat([df, pd.DataFrame(missing)], ignore_index=True)

In [3]:
# Read the three frameworks' Excel logs and build their averaged score tables.
wil = MyWilcoxon()


In [4]:
# Run all pairwise Wilcoxon comparisons; one row per compared pair of configurations.
result = wil.calc_wilcoxon()

In [14]:
# Show only the comparisons where the first configuration is marked as statistically better.
result.loc[result['Statistically_Better'] == '1'].round(3)

Unnamed: 0,Framework_1,Framework_2,Time_Budget_1,Time_Budget_2,Search_Space_1,Search_Sapce_2,p_value,stat,Statistically_Better
1,AutoSKLearn,AutoSKLearn,60,60,fc,1c,0.002,23.0,1
3,AutoSKLearn,AutoSKLearn,60,60,3c,1c,0.009,35.0,1
15,AutoSKLearn,AutoSKLearn,30,30,fc,3c,0.025,45.0,1
16,AutoSKLearn,AutoSKLearn,30,30,fc,1c,0.002,23.0,1
18,AutoSKLearn,AutoSKLearn,30,30,3c,1c,0.008,34.0,1
23,AutoSKLearn,AutoSKLearn,30,10,1c,1c,0.011,37.0,1
31,AutoSKLearn,AutoSKLearn,10,10,fc,1c,0.001,17.0,1
33,AutoSKLearn,AutoSKLearn,10,10,3c,1c,0.001,16.0,1
37,AutoSKLearn,TPOT,60,60,3c,3c,0.018,36.0,1
40,AutoSKLearn,TPOT,30,30,3c,3c,0.012,38.0,1
