### utils

In [1]:
### Calculate Cliff's delta effect size
### From https://github.com/neilernst/cliffsDelta/blob/master/cliffsDelta.py
from __future__ import division


def cliffsDelta(lst1, lst2, **dull):
    """Compute Cliff's delta between two samples and label its magnitude.

    Delta is (number of pairs with x > y minus number of pairs with x < y)
    divided by ``len(lst1) * len(lst2)``, for x taken from ``lst1`` and y
    taken from ``lst2``.

    :param lst1: first sample; must be sized and sortable.
    :param lst2: second sample; must be sized and sortable.
    :param dull: optional keyword thresholds ``small``, ``medium`` and
        ``large``. Any threshold that is not supplied falls back to the
        default value instead of causing a ``KeyError`` later on.
    :returns: tuple ``(delta, size)`` where ``size`` is the label returned
        by ``lookup_size`` for ``delta``.
    :raises ValueError: if either sample is empty (delta is undefined).
    """
    # effect sizes from (Hess and Kromrey, 2004)
    thresholds = {'small': 0.147, 'medium': 0.33, 'large': 0.474}
    thresholds.update(dull)

    m, n = len(lst1), len(lst2)
    if m == 0 or n == 0:
        raise ValueError("cliffsDelta requires two non-empty samples")

    lst2 = sorted(lst2)
    j = more = less = 0
    # Walk the distinct values of lst1 in ascending order while advancing a
    # single cursor j through the sorted lst2.
    for repeats, x in runs(sorted(lst1)):
        # After this loop j counts the elements of lst2 strictly below x.
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j*repeats
        # Skip ties so that n - j counts the elements strictly above x.
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j)*repeats
    d = (more - less) / (m*n)
    size = lookup_size(d, thresholds)
    return d, size


def lookup_size(delta: float, dull: dict) -> str:
    """
    Map the magnitude of an effect size to a descriptive label.

    :type delta: float, the effect size; only its absolute value is used.
    :type dull: dict, a dictionary of small, medium, large thresholds.
    :return: 'negligible', 'small', 'medium' or 'large'. Falls through to
        None when no comparison holds (e.g. delta is NaN).
    """
    magnitude = abs(delta)
    # Each label applies to magnitudes strictly below the *next* threshold.
    upper_bounds = (
        ('negligible', 'small'),
        ('small', 'medium'),
        ('medium', 'large'),
    )
    for label, bound_key in upper_bounds:
        if magnitude < dull[bound_key]:
            return label
    if magnitude >= dull['large']:
        return 'large'
    return None


def runs(lst):
    """Iterator, chunks repeated values.

    Yields ``(count, value)`` for every maximal run of consecutive equal
    items in ``lst``; ``value`` is the last item of the run. An empty input
    yields nothing (it used to fail with an UnboundLocalError).

    :param lst: any iterable; consecutive items are compared with ``!=``.
    """
    items = iter(lst)
    try:
        current = next(items)
    except StopIteration:
        # Nothing to chunk.
        return
    count = 1
    for item in items:
        if item != current:
            yield count, current
            count = 0
        current = item
        count += 1
    yield count, current

    
### Calculate Wilcoxon signed-rank p-value
import pandas as pd
from scipy import stats


def wilcoxon_signed_rank_test(y1, y2):
    """Return the p-value of ``scipy.stats.wilcoxon`` applied to y1 and y2.

    The test statistic is discarded; only the p-value is returned.
    """
    outcome = stats.wilcoxon(y1, y2)
    return outcome.pvalue

In [2]:
def get_score(y1,y2):
    """Compare two score lists.

    Returns a tuple ``(pvalue, d, size)``: the p-value from
    ``wilcoxon_signed_rank_test`` followed by the delta and size label
    from ``cliffsDelta``.
    """
    significance = wilcoxon_signed_rank_test(y1, y2)
    delta, magnitude = cliffsDelta(y1, y2)
    return (significance, delta, magnitude)


def get_pvalue_and_effect_size(all_score):
    """Print p-value, Cliff's delta and effect size for every pair of models.

    :param all_score: mapping from model name to its list of scores. Each
        unordered pair of names is reported once, in iteration order.
    """
    names = list(all_score)
    for position, first in enumerate(names):
        # Only pair with the names that come after `first`.
        for second in names[position + 1:]:
            pvalue, d, size = get_score(all_score[first], all_score[second])
            report = "{} and {}, pvalue:{}, cliffsDelta:{}, effect size:{}".format(
                first, second, pvalue, d, size)
            print(report)


def get_all_model_score(path,question_cnt = 50):
    """Load the ratings workbook and collect every score per model.

    The sheet is read with ``pd.read_excel``. The number of rows processed
    is the length of its "ID" column. For question ``q`` the four values at
    positions ``5 + 4*q`` to ``8 + 4*q`` of a row are taken, in order, as
    the cocogum, ast_att_gru, astnn and rencos scores.

    :param path: location of the Excel file.
    :param question_cnt: number of questions (four-column groups) to read.
    :returns: dict mapping each model name to a flat list of scores,
        ordered by question first and by row within each question.
    """
    model_names = ("cocogum", "ast_att_gru", "astnn", "rencos")
    sheet = pd.read_excel(path)
    row_total = len(sheet["ID"])

    # grouped[model][q] holds the scores every row gave that model on question q.
    grouped = {model: {} for model in model_names}
    for row_label in range(row_total):
        row_values = list(sheet.loc[row_label])
        for q in range(question_cnt):
            first_col = 5 + 4 * q
            cocogum, ast_att_gru, astnn, rencos = row_values[first_col:first_col + 4]
            quartet = (cocogum, ast_att_gru, astnn, rencos)
            for model, score in zip(model_names, quartet):
                grouped[model].setdefault(q, []).append(score)

    # Flatten question by question so the models stay aligned with one another.
    return {
        model: [score for bucket in grouped[model].values() for score in bucket]
        for model in model_names
    }


from collections import Counter
import numpy as np


def print_distribution(four_model_score):
    """Print a table with the score distribution of each model.

    For every key of ``four_model_score`` one row is printed with the
    number of scores equal to 1, 2, 3, 4 and 5, the mean rounded to two
    decimals, and the counts of scores >= 4, >= 3 and <= 2.

    :param four_model_score: mapping from model name to its scores.
    """
    rule = '-' * 90
    print(rule)
    print("model type \t", "1\t" ," 2\t", "3\t", "4\t" ,"5\t" ,"Avg\t","≥4\t", "≥3\t", "≤2\t")
    for model, scores in four_model_score.items():
        tally = Counter(scores)
        mean_score = np.mean(scores)
        ones, twos, threes, fours, fives = (tally[grade] for grade in range(1, 6))
        figures = (
            ones, twos, threes, fours, fives,
            round(mean_score, 2),
            fours + fives,
            threes + fours + fives,
            ones + twos,
        )
        # Every figure is followed by its own tab cell, as in the header row.
        cells = [model, "  \t"]
        for figure in figures:
            cells.append(figure)
            cells.append("\t")
        print(*cells)
    print(rule)

def print_all(path,question_cnt = 50):
    """Load the scores from ``path`` and print the full report.

    Prints the score distribution table followed by the pairwise p-values
    and effect sizes.

    :param path: Excel file passed on to ``get_all_model_score``.
    :param question_cnt: number of questions to read; now forwarded to
        ``get_all_model_score`` (it used to be ignored in favour of a
        hard-coded 50).
    """
    all_score = get_all_model_score(path, question_cnt=question_cnt)
    print_distribution(all_score)
    get_pvalue_and_effect_size(all_score)

### Calculate distribution, p-value, and effect size

In [3]:
# Print the score distribution and the pairwise p-values / effect sizes for this workbook.
print_all(r"Source code summarization human evaluation（random50)(new)(1-20).xlsx",question_cnt=50)

------------------------------------------------------------------------------------------
model type 	 1	  2	 3	 4	 5	 Avg	 ≥4	 ≥3	 ≤2	
cocogum   	 87 	 147 	 308 	 259 	 199 	 3.34 	 458 	 766 	 234 	
ast_att_gru   	 103 	 236 	 312 	 248 	 101 	 3.01 	 349 	 661 	 339 	
astnn   	 320 	 286 	 189 	 130 	 75 	 2.35 	 205 	 394 	 606 	
rencos   	 254 	 278 	 209 	 170 	 89 	 2.56 	 259 	 468 	 532 	
------------------------------------------------------------------------------------------
cocogum and ast_att_gru, pvalue:8.246301215039014e-19, cliffsDelta:0.158206, effect size:small
cocogum and astnn, pvalue:6.28639812864945e-73, cliffsDelta:0.424025, effect size:medium
cocogum and rencos, pvalue:3.96856899773712e-52, cliffsDelta:0.337881, effect size:medium
ast_att_gru and astnn, pvalue:1.2434481056260308e-47, cliffsDelta:0.309593, effect size:small
ast_att_gru and rencos, pvalue:9.878010940157551e-25, cliffsDelta:0.211541, effect size:small
astnn and rencos, pvalue:7.803578364015607e-

In [4]:
# Print the score distribution and the pairwise p-values / effect sizes for this workbook.
print_all(r"Source code summarization human evaluation（random50)(new)(1-24).xlsx",question_cnt=50)

------------------------------------------------------------------------------------------
model type 	 1	  2	 3	 4	 5	 Avg	 ≥4	 ≥3	 ≤2	
cocogum   	 97 	 188 	 379 	 306 	 230 	 3.32 	 536 	 915 	 285 	
ast_att_gru   	 110 	 285 	 392 	 293 	 120 	 3.02 	 413 	 805 	 395 	
astnn   	 362 	 374 	 238 	 148 	 78 	 2.34 	 226 	 464 	 736 	
rencos   	 289 	 331 	 273 	 207 	 100 	 2.58 	 307 	 580 	 620 	
------------------------------------------------------------------------------------------
cocogum and ast_att_gru, pvalue:7.2976289202135e-18, cliffsDelta:0.14534027777777778, effect size:negligible
cocogum and astnn, pvalue:6.241293621443663e-88, cliffsDelta:0.43407222222222225, effect size:medium
cocogum and rencos, pvalue:5.319783228033325e-56, cliffsDelta:0.3272291666666667, effect size:small
ast_att_gru and astnn, pvalue:5.438485643970364e-59, cliffsDelta:0.33044305555555553, effect size:medium
ast_att_gru and rencos, pvalue:8.739053704851886e-28, cliffsDelta:0.21104166666666666, eff