In [1]:
import pandas as pd
import krippendorff

In [2]:
# calculate cliffs delta effect size: https://github.com/neilernst/cliffsDelta/blob/master/cliffsDelta.py
from __future__ import division
import copy
import random
from collections import Counter
import numpy as np
from prettytable import PrettyTable

def cliffsDelta(lst1, lst2, **dull):
    """Returns delta and true if there are more than 'dull' differences"""
    if not dull:
        dull = {'small': 0.147, 'medium': 0.33, 'large': 0.474} # effect sizes from (Hess and Kromrey, 2004)
    m, n = len(lst1), len(lst2)
    lst2 = sorted(lst2)
    j = more = less = 0
    for repeats, x in runs(sorted(lst1)):
        while j <= (n - 1) and lst2[j] < x:
            j += 1
        more += j*repeats
        while j <= (n - 1) and lst2[j] == x:
            j += 1
        less += (n - j)*repeats
    d = (more - less) / (m*n)
    size = lookup_size(d, dull)
    return d, size


def lookup_size(delta: float, dull: dict) -> str:
    """
    :type delta: float
    :type dull: dict, a dictionary of small, medium, large thresholds.
    """
    delta = abs(delta)
    if delta < dull['small']:
        return 'negligible'
    if dull['small'] <= delta < dull['medium']:
        return 'small'
    if dull['medium'] <= delta < dull['large']:
        return 'medium'
    if delta >= dull['large']:
        return 'large'


def runs(lst):
    """Iterator, chunks repeated values"""
    for j, two in enumerate(lst):
        if j == 0:
            one, i = two, 0
        if one != two:
            yield j - i, one
            i = j
        one = two
    yield j - i + 1, two

    
### calculate wilcoxon pvalue
import pandas as pd
from scipy import stats

def wilcoxon_signed_rank_test(y1, y2):
    statistic,pvalue = stats.wilcoxon(y1, y2)
    return pvalue


def get_score(y1,y2):
    pvalue = wilcoxon_signed_rank_test(y1, y2)
    d, size = cliffsDelta(y1, y2)
    return  pvalue, d, size


def get_pvalue_and_effect_size(all_score):
    models_name = list(all_score)
    for i in range(len( models_name )):
        for j in range(i+1,len( models_name )):
            pvalue, d, size = get_score(all_score[models_name[i]],all_score[models_name[j]])
            print("{} and {}, pvalue:{}, cliffsDelta:{}, effect size:{}".format(models_name[i],models_name[j],pvalue, d, size))


def get_all_model_score(path,question_cnt = 50):
    data_frame=pd.read_excel(path)
    result = {}
    
    user_cnt = len(data_frame["ID"])
    for i in range(user_cnt):
        for q in range(question_cnt):
            cocogum , ast_att_gru, astnn, rencos = list(data_frame.loc[i])[5+4*q:9+4*q]
            key = "Q_" + str(q)
            if key in result:
                result[ key][ "cocogum" ].append(cocogum)
                result[ key][ "ast_att_gru" ].append( ast_att_gru)
                result[ key][ "astnn" ].append(astnn)
                result[ key][ "rencos" ].append(rencos)
            else:
                result[ key] = {}
                result[ key][ "cocogum" ] = [cocogum]
                result[ key][ "ast_att_gru" ] = [ast_att_gru]
                result[ key][ "astnn" ] = [astnn]
                result[ key][ "rencos" ] = [rencos]

    cocogum_scores = []
    ast_att_gru_scores = []
    astnn_scores = []
    rencos_scores = []
    for q, four_score in result.items():
        cocogum_scores.extend(four_score["cocogum"])
        ast_att_gru_scores.extend(four_score["ast_att_gru"])
        astnn_scores.extend(four_score["astnn"])
        rencos_scores.extend(four_score["rencos"])

    all_score = {"cocogum":cocogum_scores,"ast_att_gru":ast_att_gru_scores,"astnn":astnn_scores,"rencos":rencos_scores }
    return all_score


def parse_score_dict(result):
    cocogum_scores = []
    ast_att_gru_scores = []
    astnn_scores = []
    rencos_scores = []
    for q, four_score in result.items():
        cocogum_scores.extend(four_score["cocogum"])
        ast_att_gru_scores.extend(four_score["ast_att_gru"])
        astnn_scores.extend(four_score["astnn"])
        rencos_scores.extend(four_score["rencos"])

    all_score = {"cocogum":cocogum_scores,"ast_att_gru":ast_att_gru_scores,"astnn":astnn_scores,"rencos":rencos_scores }
    return all_score


def get_all_model_in_three_aspects_score(path,question_cnt = 10,start_qid=1):
    model_order_dict = {}
    for i in range(1, 51,1):
        random.seed(i)
        li= [1,2,3,4]
        random.shuffle(li)
        model_order_dict[i] =  li    
#     print('model_order_dict:', model_order_dict)
    
#     path = "112506357_2_Code Summarization Human Evaluation 1- 10_2_2.xlsx"
    data_frame=pd.read_excel(path)
    result = {"informative":{}, "naturalness":{}, "similarity":{}}
    user_cnt = len(data_frame["序号"])
    for i in range(user_cnt):
        for q in range(question_cnt):
            start_index = 6 + q*12
            one_question_score = [list(data_frame.loc[0])[start_index+j*3:start_index+(j+1)*3] for j in range(4)]
            model_order_in_this_question = model_order_dict[q+start_qid]
            one_question_model_socre = dict (zip(model_order_in_this_question,one_question_score  ))
            ast_att_gru, astnn, rencos, cocogum = one_question_model_socre[1], \
                                                  one_question_model_socre[2], \
                                                  one_question_model_socre[3], \
                                                  one_question_model_socre[4]
            key = "Q_" + str(q)
            if key in result["informative"]:
                result["informative"][ key][ "cocogum" ].append(cocogum[0]-1)
                result["informative"][ key][ "ast_att_gru" ].append( ast_att_gru[0]-1)
                result["informative"][ key][ "astnn" ].append(astnn[0]-1)
                result["informative"][ key][ "rencos" ].append(rencos[0]-1)

                result["naturalness"][ key][ "cocogum" ].append(cocogum[1]-1)
                result["naturalness"][ key][ "ast_att_gru" ].append( ast_att_gru[1]-1)
                result["naturalness"][ key][ "astnn" ].append(astnn[1]-1)
                result["naturalness"][ key][ "rencos" ].append(rencos[1]-1)

                result["similarity"][ key][ "cocogum" ].append(cocogum[2]-1)
                result["similarity"][ key][ "ast_att_gru" ].append( ast_att_gru[2]-1)
                result["similarity"][ key][ "astnn" ].append(astnn[2]-1)
                result["similarity"][ key][ "rencos" ].append(rencos[2]-1)
            else:
                result["informative"] [ key] = {}
                result["naturalness"][ key]= {}
                result["similarity"][ key] = {}

                result["informative"][ key][ "cocogum" ]= [cocogum[0]-1]
                result["informative"][ key][ "ast_att_gru" ]=  [ast_att_gru[0]-1]
                result["informative"][ key][ "astnn" ] = [astnn[0]-1]
                result["informative"][ key][ "rencos" ] = [rencos[0]-1]

                result["naturalness"][ key][ "cocogum" ] = [cocogum[1]-1]
                result["naturalness"][ key][ "ast_att_gru" ] = [ast_att_gru[1]-1-1]
                result["naturalness"][ key][ "astnn" ] = [astnn[1]-1]
                result["naturalness"][ key][ "rencos" ] = [rencos[1]-1]

                result["similarity"][ key][ "cocogum" ] = [cocogum[2]-1]
                result["similarity"][ key][ "ast_att_gru" ] =  [ast_att_gru[2]-1]
                result["similarity"][ key][ "astnn" ] = [astnn[2]-1]
                result["similarity"][ key][ "rencos" ] = [rencos[2]-1]
    return  parse_score_dict(result['informative']), parse_score_dict(result['naturalness']), parse_score_dict(result['similarity'])


def print_distribution(four_model_score):
    table = PrettyTable(['model type', "0", "1", "2", "3", "4", "Avg(Std)", "≥3", "≥2", "≤1"])
    for k in four_model_score:
        result = Counter(four_model_score[k])
        avg = np.mean(four_model_score[k])
        std = np.std(four_model_score[k])
        table.add_row([k, result[0], result[1], result[2], result[3], result[4],
                       "{}({})".format(round(avg,2), round(std,2)),
                       result[3]+result[4], result[2]+result[3]+result[4], result[0]+result[1]])
    print(table)


# multi-excel
def merge_all_score(s1,s2,s3,s4,s5):
    merged_scores = copy.deepcopy(s1)
    five_score = [s1,s2,s3,s4,s5 ]
    for key in merged_scores[0].keys():
        for i in range(3):
            for j in range(1,len(five_score)):
                merged_scores[i][key].extend(five_score[j][i][key])
    return  merged_scores


def calcute_final_result(path1,path2,path3,path4,path5):
    all_scores1_10 = get_all_model_in_three_aspects_score(path1,question_cnt=10,start_qid=1)
    all_scores11_20 = get_all_model_in_three_aspects_score(path2,question_cnt=10,start_qid=11)
    all_scores21_30 = get_all_model_in_three_aspects_score(path3,question_cnt=10,start_qid=21)
    all_scores31_40 = get_all_model_in_three_aspects_score(path4,question_cnt=10,start_qid=31)
    all_scores41_50 = get_all_model_in_three_aspects_score(path5,question_cnt=10,start_qid=41)
    merged_scores = merge_all_score(all_scores1_10,all_scores11_20 ,all_scores21_30 ,all_scores31_40 ,all_scores41_50 )
    print("informative")
    print_distribution( merged_scores[0])
    get_pvalue_and_effect_size( merged_scores[0])
    
    print(80*"*")
    print("naturalness")
    print_distribution( merged_scores[1])
    get_pvalue_and_effect_size( merged_scores[1])
    
    print(80*"*")
    print("similarity")
    print_distribution( merged_scores[2])
    get_pvalue_and_effect_size( merged_scores[2])
    return merged_scores


def main():
    path1_10 = r"112506357_2_Code Summarization Human Evaluation 1- 10_4_4.xlsx"
    path11_20 = r"112506178_2_Code Summarization Human Evaluation 11- 20_4_4.xlsx"
    path21_30 = r"112506168_2_Code Summarization Human Evaluation 21- 30_4_4.xlsx"
    path31_40 = r"112506604_2_Code Summarization Human Evaluation 31- 40_4_4.xlsx"
    path41_50 = r"112504929_2_Code Summarization Human Evaluation 41- 50_4_4.xlsx"
    scores = calcute_final_result(path1_10, path11_20, path21_30, path31_40, path41_50)


## 4 appraoch 3 aspect 50 question

In [3]:
import krippendorff

In [4]:
def get_alpha_for_4_raters_in_one_group_question_one_approach( all_scores, approach, aspect):
    aspects = {0:"informative", 1:"naturalness", 2:"similarity"}
    score_cocogum = all_scores[aspect][approach]
    rater0_range = list(range(0,40,4))
    rater1_range = list(range(1,40,4))
    rater2_range = list(range(1,40,4))
    rater3_range = list(range(1,40,4))
    rater0 = [ score_cocogum[item] for item in rater0_range]
    rater1 = [ score_cocogum[item] for item in rater1_range]
    rater2 = [ score_cocogum[item] for item in rater2_range]
    rater3 = [ score_cocogum[item] for item in rater3_range]
    all_raters_score = [rater0,rater1,rater2,rater3]
    print(" %s in %s: "%(approach,aspects[aspect]) + "Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

In [5]:
def get_alpha_for_4_raters_in_one_group_question(all_scores):
    approaches=['cocogum', 'ast_att_gru', 'astnn', 'rencos']
    for aspect in range(3):
        for approach in approaches:
            get_alpha_for_4_raters_in_one_group_question_one_approach( all_scores, approach, aspect)

In [8]:
path1_10 = r"112506357_2_Code Summarization Human Evaluation 1- 10_4_4.xlsx"
path11_20 = r"112506178_2_Code Summarization Human Evaluation 11- 20_4_4.xlsx"
path21_30 = r"112506168_2_Code Summarization Human Evaluation 21- 30_4_4.xlsx"
path31_40 = r"112506604_2_Code Summarization Human Evaluation 31- 40_4_4.xlsx"
path41_50 = r"112504929_2_Code Summarization Human Evaluation 41- 50_4_4.xlsx"
all_scores1_10 = get_all_model_in_three_aspects_score(path1_10 ,question_cnt=10,start_qid=1)
print(80*"*")
print(path1_10)

get_alpha_for_4_raters_in_one_group_question(all_scores1_10)

print(80*"*")
print(path11_20)
all_scores11_20 = get_all_model_in_three_aspects_score(path11_20,question_cnt=10,start_qid=11)
get_alpha_for_4_raters_in_one_group_question(all_scores11_20)


print(80*"*")
print(path21_30)
all_scores21_30 = get_all_model_in_three_aspects_score(path21_30,question_cnt=10,start_qid=21)
get_alpha_for_4_raters_in_one_group_question(all_scores21_30)


print(80*"*")
print(path31_40)
all_scores31_40 = get_all_model_in_three_aspects_score(path31_40,question_cnt=10,start_qid=31)
get_alpha_for_4_raters_in_one_group_question(all_scores31_40)


print(80*"*")
print(path41_50)
all_scores41_50 = get_all_model_in_three_aspects_score(path41_50,question_cnt=10,start_qid=41)
get_alpha_for_4_raters_in_one_group_question(all_scores41_50)



********************************************************************************
112506357_2_Code Summarization Human Evaluation 1- 10_4_4.xlsx
 cocogum in informative: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in informative: Krippendorff's alpha for ordinal metric: 1.0
 astnn in informative: Krippendorff's alpha for ordinal metric: 1.0
 rencos in informative: Krippendorff's alpha for ordinal metric: 1.0
 cocogum in naturalness: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in naturalness: Krippendorff's alpha for ordinal metric: 0.5966803278688524
 astnn in naturalness: Krippendorff's alpha for ordinal metric: 1.0
 rencos in naturalness: Krippendorff's alpha for ordinal metric: 1.0
 cocogum in similarity: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in similarity: Krippendorff's alpha for ordinal metric: 1.0
 astnn in similarity: Krippendorff's alpha for ordinal metric: 1.0
 rencos in similarity: Krippendorff's alpha for ordinal metric: 1.0
**

##  4 appraoch  50 question (concat 3 aspects)

In [16]:
def get_alpha_for_4_raters_in_one_group_question_one_approach_concat_three_aspect( all_scores, approach):
    rater0_in_three_aspects = []
    rater1_in_three_aspects = []
    rater2_in_three_aspects = []
    rater3_in_three_aspects = []
        
    aspects = {0:"informative", 1:"naturalness", 2:"similarity"}
    for aspect in range(3):
        score_cocogum = all_scores[aspect][approach]
        rater0_range = list(range(0,40,4))
        rater1_range = list(range(1,40,4))
        rater2_range = list(range(1,40,4))
        rater3_range = list(range(1,40,4))
        rater0 = [ score_cocogum[item] for item in rater0_range]
        rater1 = [ score_cocogum[item] for item in rater1_range]
        rater2 = [ score_cocogum[item] for item in rater2_range]
        rater3 = [ score_cocogum[item] for item in rater3_range]
        rater0_in_three_aspects .extend( rater0)
        rater1_in_three_aspects .extend( rater1)
        rater2_in_three_aspects .extend( rater2)
        rater3_in_three_aspects .extend( rater3)
    all_raters_score = [rater0_in_three_aspects,rater1_in_three_aspects,rater2_in_three_aspects,rater3_in_three_aspects]
    print(" %s in three aspect: "%(approach) + "Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

In [17]:

def get_alpha_for_4_raters_in_one_group_question_concat_three_aspect(all_scores):
    approaches=['cocogum', 'ast_att_gru', 'astnn', 'rencos']
    for approach in approaches:
    #     get_alpha_for_4_raters_in_one_group_question_one_approach( all_scores, approach, aspect)
        get_alpha_for_4_raters_in_one_group_question_one_approach_concat_three_aspect( all_scores, approach)

In [18]:
path1_10 = r"112506357_2_Code Summarization Human Evaluation 1- 10_4_4.xlsx"
path11_20 = r"112506178_2_Code Summarization Human Evaluation 11- 20_4_4.xlsx"
path21_30 = r"112506168_2_Code Summarization Human Evaluation 21- 30_4_4.xlsx"
path31_40 = r"112506604_2_Code Summarization Human Evaluation 31- 40_4_4.xlsx"
path41_50 = r"112504929_2_Code Summarization Human Evaluation 41- 50_4_4.xlsx"
all_scores1_10 = get_all_model_in_three_aspects_score(path1_10 ,question_cnt=10,start_qid=1)
print(80*"*")
print(path1_10)

get_alpha_for_4_raters_in_one_group_question_concat_three_aspect(all_scores1_10)

print(80*"*")
print(path11_20)
all_scores11_20 = get_all_model_in_three_aspects_score(path11_20,question_cnt=10,start_qid=11)
get_alpha_for_4_raters_in_one_group_question_concat_three_aspect(all_scores11_20)


print(80*"*")
print(path21_30)
all_scores21_30 = get_all_model_in_three_aspects_score(path21_30,question_cnt=10,start_qid=21)
get_alpha_for_4_raters_in_one_group_question_concat_three_aspect(all_scores21_30)


print(80*"*")
print(path31_40)
all_scores31_40 = get_all_model_in_three_aspects_score(path31_40,question_cnt=10,start_qid=31)
get_alpha_for_4_raters_in_one_group_question_concat_three_aspect(all_scores31_40)


print(80*"*")
print(path41_50)
all_scores41_50 = get_all_model_in_three_aspects_score(path41_50,question_cnt=10,start_qid=41)
get_alpha_for_4_raters_in_one_group_question_concat_three_aspect(all_scores41_50)



********************************************************************************
112506357_2_Code Summarization Human Evaluation 1- 10_4_4.xlsx
 cocogum in three aspect: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in three aspect: Krippendorff's alpha for ordinal metric: 0.872550441403958
 astnn in three aspect: Krippendorff's alpha for ordinal metric: 1.0
 rencos in three aspect: Krippendorff's alpha for ordinal metric: 1.0
********************************************************************************
112506178_2_Code Summarization Human Evaluation 11- 20_4_4.xlsx
 cocogum in three aspect: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in three aspect: Krippendorff's alpha for ordinal metric: 0.9304948946043319
 astnn in three aspect: Krippendorff's alpha for ordinal metric: 1.0
 rencos in three aspect: Krippendorff's alpha for ordinal metric: 1.0
********************************************************************************
112506168_2_Code Summarization 

# debug

## "informative"

In [7]:
import krippendorff

In [6]:
all_scores1_10[0].keys()

dict_keys(['cocogum', 'ast_att_gru', 'astnn', 'rencos'])

### CoCoGum

In [9]:
rater0_range = list(range(0,40,4))
rater1_range = list(range(1,40,4))
rater2_range = list(range(1,40,4))
rater3_range = list(range(1,40,4))

In [10]:
score_cocogum = all_scores1_10[0]["cocogum"]
rater0 = [ score_cocogum[item] for item in rater0_range]
rater1 = [ score_cocogum[item] for item in rater1_range]
rater2 = [ score_cocogum[item] for item in rater2_range]
rater3 = [ score_cocogum[item] for item in rater3_range]

In [12]:
all_raters_score = [rater0,rater1,rater2,rater3]

In [14]:
print("Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

Krippendorff's alpha for ordinal metric: 1.0


In [19]:
score_cocogum = all_scores1_10[0]["ast_att_gru"]
rater0 = [ score_cocogum[item] for item in rater0_range]
rater1 = [ score_cocogum[item] for item in rater1_range]
rater2 = [ score_cocogum[item] for item in rater2_range]
rater3 = [ score_cocogum[item] for item in rater3_range]
all_raters_score = [rater0,rater1,rater2,rater3]
print("Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

Krippendorff's alpha for ordinal metric: 1.0


In [20]:
score_cocogum = all_scores1_10[0]["astnn"]
rater0 = [ score_cocogum[item] for item in rater0_range]
rater1 = [ score_cocogum[item] for item in rater1_range]
rater2 = [ score_cocogum[item] for item in rater2_range]
rater3 = [ score_cocogum[item] for item in rater3_range]
all_raters_score = [rater0,rater1,rater2,rater3]
print("Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

Krippendorff's alpha for ordinal metric: 1.0


In [21]:
score_cocogum = all_scores1_10[0]["rencos"]
rater0 = [ score_cocogum[item] for item in rater0_range]
rater1 = [ score_cocogum[item] for item in rater1_range]
rater2 = [ score_cocogum[item] for item in rater2_range]
rater3 = [ score_cocogum[item] for item in rater3_range]
all_raters_score = [rater0,rater1,rater2,rater3]
print("Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

Krippendorff's alpha for ordinal metric: 1.0


In [29]:
def get_alpha_for_4_raters_in_one_group_question_one_approach( all_scores, approach, aspect):
    aspects = {0:"informative", 1:"naturalness", 2:"similarity"}
    score_cocogum = all_scores[aspect][approach]
    rater0_range = list(range(0,40,4))
    rater1_range = list(range(1,40,4))
    rater2_range = list(range(1,40,4))
    rater3_range = list(range(1,40,4))
    rater0 = [ score_cocogum[item] for item in rater0_range]
    rater1 = [ score_cocogum[item] for item in rater1_range]
    rater2 = [ score_cocogum[item] for item in rater2_range]
    rater3 = [ score_cocogum[item] for item in rater3_range]
    all_raters_score = [rater0,rater1,rater2,rater3]
    print(" %s in %s: "%(approach,aspects[aspect]) + "Krippendorff's alpha for ordinal metric: {}".format(krippendorff.alpha(reliability_data=all_raters_score ,level_of_measurement='ordinal')))

In [30]:
def get_alpha_for_4_raters_in_one_group_question(all_scores):
    approaches=['cocogum', 'ast_att_gru', 'astnn', 'rencos']
    for aspect in range(3):
        for approach in approaches:
            get_alpha_for_4_raters_in_one_group_question_one_approach( all_scores, approach, aspect)

In [31]:
 get_alpha_for_4_raters_in_one_group_question(all_scores1_10)

 cocogum in informative: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in informative: Krippendorff's alpha for ordinal metric: 1.0
 astnn in informative: Krippendorff's alpha for ordinal metric: 1.0
 rencos in informative: Krippendorff's alpha for ordinal metric: 1.0
 cocogum in naturalness: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in naturalness: Krippendorff's alpha for ordinal metric: 0.5966803278688524
 astnn in naturalness: Krippendorff's alpha for ordinal metric: 1.0
 rencos in naturalness: Krippendorff's alpha for ordinal metric: 1.0
 cocogum in similarity: Krippendorff's alpha for ordinal metric: 1.0
 ast_att_gru in similarity: Krippendorff's alpha for ordinal metric: 1.0
 astnn in similarity: Krippendorff's alpha for ordinal metric: 1.0
 rencos in similarity: Krippendorff's alpha for ordinal metric: 1.0
