In [1]:
import json
from glob import glob
import numpy as np
import pandas as pd

## Load result files

In [2]:
result_files = glob('../model/results/*.json')

In [3]:
dicts = {}

for file_ in result_files:
    with open(file_, 'r') as f:
        d = json.load(f)
    dataset = d['dataset']
    if dicts.get(dataset) is None:
        dicts[dataset] = [d]
    else:
        dicts[dataset].append(d)

In [4]:
dicts_difference = {}
for dataset in dicts.keys():
    dicts_difference[dataset] = dict()

for dataset in dicts.keys():
    architecture_files = dicts[dataset]
    for architecture_file in architecture_files:
        architecture = f"{architecture_file['convolution_layer']}_{architecture_file['local_pooling_layer']}"
        if dicts_difference[dataset].get(architecture) is None:
            dicts_difference[dataset][architecture] = 0
            
        if architecture_file['global_pooling_layer'] == 'max':
            dicts_difference[dataset][architecture] += architecture_file['mean_accuracy']
        else:
            dicts_difference[dataset][architecture] -= architecture_file['mean_accuracy']

## Wilcoxon test

In [5]:
from scipy.stats import wilcoxon
import warnings
warnings.filterwarnings("ignore")

In [18]:
df_wilcoxon = pd.DataFrame()
df_wilcoxon.index = list(dicts_difference.keys())
df_wilcoxon['p-value'] = [0]*len(df_wilcoxon)
df_wilcoxon['mean_diff'] = [0]*len(df_wilcoxon)

In [19]:
for dataset in dicts_difference.keys():
    list_differences = list(dicts_difference[dataset].values())
    mean_difference = np.mean(list_differences)
    res = wilcoxon(list_differences)
    print(f"{dataset}: mean_difference = {mean_difference:.4f}")
    print(f"Wilcoxon test statistic = {res.statistic}, p-value = {res.pvalue:.4f}")
    if res.pvalue > 0.05:
        print("Can't reject null hyp. -> samples come from the same distribution")
    else:
        print("Null hyp. rejected -> samples come from different distribution")
    print()
    df_wilcoxon.loc[dataset, 'p-value'] = res.pvalue
    df_wilcoxon.loc[dataset, 'mean_diff'] = mean_difference

MUTAG: mean_difference = -0.0084
Wilcoxon test statistic = 34.5, p-value = 0.2583
Can't reject null hyp. -> samples come from the same distribution

PROTEINS: mean_difference = 0.0090
Wilcoxon test statistic = 42.0, p-value = 0.3303
Can't reject null hyp. -> samples come from the same distribution

ENZYMES: mean_difference = -0.0104
Wilcoxon test statistic = 37.0, p-value = 0.2078
Can't reject null hyp. -> samples come from the same distribution

NCI1: mean_difference = 0.6687
Wilcoxon test statistic = 0.0, p-value = 0.0001
Null hyp. rejected -> samples come from different distribution



In [23]:
for dataset in dicts.keys():
    best_acc = 0 
    best_architecture = ''
    for d in dicts[dataset]:
        acc = d['mean_accuracy']
        if acc >= best_acc:
            best_acc = acc
            best_architecture = f"{d['convolution_layer']}_{d['local_pooling_layer']}_{d['global_pooling_layer']}"
    print(f"{dataset}: best architecture = {best_architecture} -> {best_acc:.3f}")
    df_wilcoxon.loc[dataset, 'best_arch'] = best_architecture

MUTAG: best architecture = GINConv_EDGE_max -> 0.847
PROTEINS: best architecture = GCN_EDGE_max -> 0.753
ENZYMES: best architecture = GINConv_EDGE_mean -> 0.379
NCI1: best architecture = GINConv_MEWIS_max -> 0.744


In [26]:
df_wilcoxon

Unnamed: 0,p-value,mean_diff,best_arch
MUTAG,0.258251,-0.008421,GINConv_EDGE_max
PROTEINS,0.330261,0.008999,GCN_EDGE_max
ENZYMES,0.207764,-0.010444,GINConv_EDGE_mean
NCI1,6.1e-05,0.668674,GINConv_MEWIS_max
