In [1]:
import numpy as np
import pandas as pd

def rank_numbers(numbers):
    """Rank values in descending order, sharing the mean rank among ties.

    The largest value receives rank 1. Values that compare equal all receive
    the average of the 1-based positions they occupy in the sorted order.

    Args:
        numbers: Indexable, sized sequence of mutually comparable values.

    Returns:
        A list as long as ``numbers`` whose entry ``k`` is the rank of
        ``numbers[k]`` (the value produced by ``np.mean``). Empty input
        yields an empty list.
    """
    order = sorted(range(len(numbers)), key=lambda idx: numbers[idx], reverse=True)
    count = len(order)
    ranks = [0] * count

    start = 0
    while start < count:
        # Grow [start, stop] over the run of equal values in sorted order.
        stop = start
        while stop + 1 < count and numbers[order[stop]] == numbers[order[stop + 1]]:
            stop += 1

        run = range(start, stop + 1)
        shared_rank = np.mean([position + 1 for position in run])
        for position in run:
            ranks[order[position]] = shared_rank

        start = stop + 1

    return ranks

def calculate_spearman_manual(values1, values2):
    """Spearman rank correlation between two equally long score sequences.

    Both inputs are ranked with ``rank_numbers`` and the closed form
    ``1 - 6 * sum(d**2) / (n * (n**2 - 1))`` is applied to the rank
    differences ``d``.

    NOTE(review): this closed form is exact only when neither input contains
    ties. With tied values it differs from the Pearson correlation of the
    ranks (the definition used by ``scipy.stats.spearmanr``). The formula is
    kept unchanged so that existing results stay comparable.

    Args:
        values1: Sequence of scores.
        values2: Sequence of scores with the same length as ``values1``.

    Returns:
        The correlation coefficient, or NaN when fewer than two pairs are
        given (the formula's denominator is zero in that case).

    Raises:
        ValueError: If the two inputs differ in length.
    """
    n = len(values1)
    m = len(values2)
    # Without this check a length-1 input would silently broadcast against
    # the other one in the subtraction below.
    if n != m:
        raise ValueError(
            "values1 and values2 must have the same length, got {} and {}".format(n, m)
        )
    if n < 2:
        # n * (n**2 - 1) == 0 here; return NaN explicitly rather than via 0/0.
        return np.float64("nan")

    rank1 = rank_numbers(values1)
    rank2 = rank_numbers(values2)
    d = np.array(rank1) - np.array(rank2)
    d_squared = np.square(d)
    spearman_corr = 1 - (6 * np.sum(d_squared)) / (n * (n**2 - 1))
    return spearman_corr

def rank_basescore(base_score, draw_ratio):
    """Rank scores in ascending order, tying neighbours that are close together.

    Scores are sorted from lowest to highest; the lowest gets rank 1. A score
    shares the rank of the score just below it when the two are identical or
    differ by less than ``draw_ratio * (max - min)``. Ties chain: a run of
    scores that are each close to their predecessor shares one rank, namely
    the 1-based sorted position of the first score in the run.

    Args:
        base_score: Sized sequence of numeric scores.
        draw_ratio: Fraction of the score range below which two neighbouring
            scores count as a draw.

    Returns:
        A list of integer ranks aligned with ``base_score``. Empty input
        yields an empty list.
    """
    if len(base_score) == 0:
        return []

    # Tie threshold: a fraction of the spread between the best and worst score.
    draw_gap = draw_ratio * (max(base_score) - min(base_score))

    # Sort ascending while remembering each score's original position.
    indexed_scores = sorted(enumerate(base_score), key=lambda pair: pair[1])

    ranks = [0] * len(base_score)
    current_rank = 1
    previous_score = None
    for position, (original_index, score) in enumerate(indexed_scores):
        tied = False
        if position > 0:
            gap = abs(score - previous_score)
            # `gap == 0` keeps identical scores tied even when draw_gap is 0
            # (all scores equal, or draw_ratio == 0).
            tied = gap < draw_gap or gap == 0
        if not tied:
            current_rank = position + 1
        ranks[original_index] = current_rank
        previous_score = score

    return ranks

In [2]:
# Evaluation dimension under analysis; it selects which annotation JSON file is loaded.
dimension = "aesthetic_quality"

In [3]:
import json

# Annotation file for the selected dimension; `oc` holds the parsed JSON.
jsonpath = '../Human_anno/{}.json'.format(dimension)
# Decode explicitly as UTF-8 so loading does not depend on the platform's
# default text encoding.
with open(jsonpath, 'r', encoding='utf-8') as f:
    oc = json.load(f)

# history ="../GPT4o_eval_results/{}_gpt4eval_results.json".format(dimension)
# with open(history,'r') as f:
#     gpt4o_eval_history = json.load(f)

In [4]:
# Models being compared. The per-item score arrays built later are allocated
# with len(models) columns, so this list must have as many entries as each
# item's score dicts in `oc` (presumably in the same order — verify against the JSON).
models = [
    'cogvideox5b',
    'kling',
    'gen3',
    'videocrafter2',
    'pika',
    'show1',
    'lavie',
]
# Alternative subsets:
# models = ['videocrafter2', 'pika', 'show1', 'lavie']
# models = ['cogvideox5b','gen3', 'kling']

# Positions in `oc` to evaluate; currently every item.
l1 = list(range(len(oc)))
# Strided alternatives:
# l2 = list(range(1,len(oc),3))
# l3 = list(range(2,len(oc),3))

idexls = l1
length = len(idexls)


In [5]:
# Number of human annotators; each model's 'human_anno' entry is indexed by
# annotator below.
num_human = 4

# Per-item Spearman correlations: one column per evaluated item and, for the
# 2-D arrays, one row per human annotator.
gptvsannos_spearman = np.zeros((num_human, length))
gptvsannomean_spearman = np.zeros(length)
multigptvsannos_spearman = np.zeros((num_human, length))
multigptvsannomean_spearman = np.zeros(length)
baselinevsannos_spearman = np.zeros((num_human, length))
baselinevsannomean_spearman = np.zeros(length)
gptvsmultigpt_spearman = np.zeros(length)

# Per-item scores with one column per entry of `models`.
gptscore = np.zeros((length, len(models)))
annoscore = np.zeros((num_human, length, len(models)))
multigptscore = np.zeros((length, len(models)))
annomeanscore = np.zeros((length, len(models)))
baseline_rank = np.zeros((length, len(models)))
# Items whose GPT-4o scores correlate below 0.5 with the first annotator.
badeval = []

for j in range(length):
    i = idexls[j]
    # Each field is a dict whose values are taken in insertion order
    # (presumably the order of `models` — verify against the JSON).
    gpt4o_eval_rs = np.array(list(oc[i]['gpt4o_eval'].values()))
    human_anno = np.array(list(oc[i]['human_anno'].values()))
    baseline_score = np.array(list(oc[i]['baseline_score'].values()))
    # The "MultiGPT" series currently reuses the plain 'gpt4o_eval' scores, so
    # it is identical to the GPT series; switch to one of the keys below to
    # compare a different evaluator.
    # multiagent_eval_results = np.array(list(oc[i]['multiagent_score'].values()))
    # multiagent_eval_results = np.array(list(oc[i]['combench_style'].values()))
    multiagent_eval_results = np.array(list(oc[i]['gpt4o_eval'].values()))

    # human_anno is indexed [model, annotator]; store it annotator-major.
    for human in range(num_human):
        annoscore[human, j, :] = human_anno[:, human]

    # Turn baseline scores into ranks where neighbouring scores closer than
    # 10% of the score range count as a draw.
    baseline_rank[j] = rank_basescore(baseline_score, 0.1)
    gptscore[j] = gpt4o_eval_rs
    multigptscore[j] = multiagent_eval_results
    annomeanscore[j] = np.mean(annoscore[:, j, :], axis=0)

    for human in range(num_human):
        gptvsannos_spearman[human, j] = calculate_spearman_manual(gpt4o_eval_rs, annoscore[human, j, :])
        multigptvsannos_spearman[human, j] = calculate_spearman_manual(multiagent_eval_results, annoscore[human, j, :])
        baselinevsannos_spearman[human, j] = calculate_spearman_manual(baseline_rank[j], annoscore[human, j, :])

    gptvsmultigpt_spearman[j] = calculate_spearman_manual(gpt4o_eval_rs, multiagent_eval_results)
    gptvsannomean_spearman[j] = calculate_spearman_manual(gpt4o_eval_rs, annomeanscore[j])
    multigptvsannomean_spearman[j] = calculate_spearman_manual(multiagent_eval_results, annomeanscore[j])
    baselinevsannomean_spearman[j] = calculate_spearman_manual(baseline_rank[j], annomeanscore[j])

    if gptvsannos_spearman[0, j] < 0.5:
        badeval.append(i)

# Mean score per model across all evaluated items.
avggptscore = gptscore.mean(axis=0)
avgmultigptscore = multigptscore.mean(axis=0)


# x = 0
# y = 0
# for i in range(len(models)):
#     print("gpt eval gap in model {}".format(models[i]),np.round(avggptscore[i] - annoscore[:num_human].mean(axis=0).mean(axis=0)[i],3))
#     print("multi agent eval gap in model {}".format(models[i]),np.round(avgmultigptscore[i] - annoscore[:num_human].mean(axis=0).mean(axis=0)[i],3))
#     x += np.abs(np.round(avggptscore[i] - annoscore[:num_human].mean(axis=0).mean(axis=0)[i],3))
#     y += np.abs(np.round(avgmultigptscore[i] - annoscore[:num_human].mean(axis=0).mean(axis=0)[i],3))
# print("gpt eval gap in total",x)
# print("multi agent eval gap in total",y)

# Mean human score per model: average over annotators, then over items.
annomean_avgscore = annoscore[:num_human].mean(axis=0).mean(axis=0)

print("AnnoMean average score:", np.round(annomean_avgscore, 3))
print("GPT average score:", np.round(avggptscore, 3))
print("MultiGPT average score:", np.round(avgmultigptscore, 3))
print("AnnoMean score rank",rank_numbers(annomean_avgscore))
print("GPT score rank",rank_numbers(avggptscore))
print("MultiGPT score rank",rank_numbers(avgmultigptscore))

for i in range(num_human):
    print("Anno{} average score:".format(i + 1), np.round(annoscore[i].mean(axis=0), 3))

print("GPT vs MultiGPT Spearman:", np.round(gptvsmultigpt_spearman.mean(), 3))

# (label, per-annotator correlations, correlations against the annotator mean)
evaluator_results = [
    ("GPT", gptvsannos_spearman, gptvsannomean_spearman),
    ("MultiGPT", multigptvsannos_spearman, multigptvsannomean_spearman),
    ("Baseline", baselinevsannos_spearman, baselinevsannomean_spearman),
]

for label, vs_annos, vs_annomean in evaluator_results:
    for i in range(num_human):
        print("{} vs Anno{} Spearman:".format(label, i + 1), np.round(vs_annos[i].mean(), 3))


for label, vs_annos, vs_annomean in evaluator_results:
    # Average the per-annotator means over every annotator in use, not a
    # hard-coded first three.
    print("Average {} vs Anno Spearman:".format(label), np.round(vs_annos[:num_human].mean(axis=1).mean(), 3))
    print("{} vs AnnoMean Spearman:".format(label), np.round(vs_annomean.mean(), 3))


AnnoMean average score: [3.941 3.858 4.4   3.477 3.344 3.276 2.559]
GPT average score: [3.938 3.824 4.443 3.846 3.762 3.275 2.941]
MultiGPT average score: [3.938 3.824 4.443 3.846 3.762 3.275 2.941]
AnnoMean score rakn [2.0, 3.0, 1.0, 4.0, 5.0, 6.0, 7.0]
GPT score rank [2.0, 4.0, 1.0, 3.0, 5.0, 6.0, 7.0]
MultiGPT score rank [2.0, 4.0, 1.0, 3.0, 5.0, 6.0, 7.0]
Anno1 average score: [3.872 3.604 4.821 3.289 3.355 3.095 2.546]
Anno2 average score: [3.989 3.93  4.586 3.278 2.861 2.96  2.681]
Anno3 average score: [4.993 4.974 4.996 4.934 4.908 4.839 3.348]
Anno4 average score: [2.912 2.923 3.198 2.407 2.253 2.209 1.659]
GPT vs MultiGPT Spearman: 1.0
GPT vs Anno1 Spearman: 0.662
GPT vs Anno2 Spearman: 0.622
GPT vs Anno3 Spearman: 0.668
GPT vs Anno4 Spearman: 0.505
MultiGPT vs Anno1 Spearman: 0.662
MultiGPT vs Anno2 Spearman: 0.622
MultiGPT vs Anno3 Spearman: 0.668
MultiGPT vs Anno4 Spearman: 0.505
Baseline vs Anno1 Spearman: 0.446
Baseline vs Anno2 Spearman: 0.404
Baseline vs Anno3 Spearman: 

In [6]:
# Inter-annotator agreement: Spearman correlation between every pair of human
# annotators, computed per item from `annoscore` and then averaged over items.
num_human_anno = 4
spearmans = np.zeros([num_human_anno, num_human_anno, length])
for j in range(length):
    for k in range(num_human_anno):
        for other in range(num_human_anno):
            spearmans[k, other, j] = calculate_spearman_manual(annoscore[k, j, :], annoscore[other, j, :])
average_spearmans = spearmans.mean(axis=2)

print(average_spearmans)
# Mean of the off-diagonal entries. Subtracting num_human_anno removes the
# diagonal, which assumes every annotator's self-correlation averages to 1.
print((average_spearmans.sum()-num_human_anno)/(num_human_anno*(num_human_anno-1)))

[[1.         0.64884223 0.65538331 0.53126635]
 [0.64884223 1.         0.6037742  0.5085034 ]
 [0.65538331 0.6037742  1.         0.60305468]
 [0.53126635 0.5085034  0.60305468 1.        ]]
0.5918040293040292
