In [1]:
import numpy as np
import pandas as pd

def rank_numbers(numbers):
    sorted_indices = sorted(range(len(numbers)), key=lambda k: numbers[k], reverse=True)
    ranks = [0] * len(numbers)

    i = 0
    while i < len(sorted_indices):
        value_indices = [i]
        while i + 1 < len(sorted_indices) and numbers[sorted_indices[i]] == numbers[sorted_indices[i + 1]]:
            i += 1
            value_indices.append(i)
        average_rank = np.mean([index + 1 for index in value_indices])
        for index in value_indices:
            ranks[sorted_indices[index]] = average_rank
        i += 1

    return ranks

def calculate_spearman_manual(values1, values2):
    n = len(values1)
    m = len(values2)
    rank1 = rank_numbers(values1)
    rank2 = rank_numbers(values2)
    d = np.array(rank1) - np.array(rank2)
    d_squared = np.square(d)
    spearman_corr = 1 - (6 * np.sum(d_squared)) / (n * (n**2 - 1))
    return spearman_corr

def rank_basescore(base_score, draw_ratio):
    # 计算最大值和最小值的差值
    score_range = max(base_score) - min(base_score)
    # 计算平序的阈值
    draw_gap = draw_ratio * score_range

    # 对列表进行排序并保留原始索引
    indexed_scores = list(enumerate(base_score))
    indexed_scores.sort(key=lambda x: x[1])

    # 处理平序
    ranks = [0] * len(base_score)
    current_rank = 1
    for i in range(len(indexed_scores)):
        if i > 0 and abs(indexed_scores[i][1] - indexed_scores[i - 1][1]) < draw_gap:
            ranks[indexed_scores[i][0]] = current_rank
        else:
            current_rank = i + 1
            ranks[indexed_scores[i][0]] = current_rank

    return ranks

In [2]:
dimension = 'color'

In [3]:
import json
jsonpath = '../Human_anno/{}.json'.format(dimension)
with open(jsonpath,'r') as f:
    oc = json.load(f)

# history ="../GPT4o_eval_results/{}_gpt4eval_results.json".format(dimension)
# with open(history,'r') as f:
#     gpt4o_eval_history = json.load(f)

In [4]:
models = ['cogvideox5b','gen3', 'kling','videocrafter2', 'pika', 'show1', 'lavie']
# models = ['videocrafter2', 'pika', 'show1', 'lavie']
# models = ['cogvideox5b','gen3', 'kl ng']
l1 = list(range(0,len(oc),3))
# l2 = list(range(1,len(oc),3))
# l3 = list(range(2,len(oc),3))
idexls = l1

length = len(idexls)


In [None]:
flag = 0
iternum = 0
num_human = 3
while(flag == 0):
    iternum += 1
    gptvsannos_spearman = np.zeros((4,length))
    gptvsannomean_spearman = np.zeros(length)
    multigptvsannos_spearman = np.zeros((4,length))
    multigptvsannomean_spearman = np.zeros(length)
    baselinevsannos_spearman = np.zeros((4,length))
    baselinevsannomean_spearman = np.zeros(length)
    gptvsmultigpt_spearman = np.zeros(length)


    gptscore = np.zeros((length,len(models)))
    annoscore = np.zeros((4,length,len(models)))
    multigptscore = np.zeros((length,len(models)))
    annomeanscore =np.zeros((length,len(models)))
    baseline_rank = np.zeros((length,len(models)))
    badeval = []

    for j in range(length):
        i = idexls[j]
        gpt4o_eval_rs = np.array(list(oc[i]['gpt4o_eval'].values()))
        human_anno = np.array(list(oc[i]['human_anno'].values()))
        baseline_score = np.array(list(oc[i]['baseline_score'].values()))
        multiagent_eval_results = np.array(list(oc[i]['multiagent_score'].values()))
        # multiagent_eval_results = np.array(list(oc[i]['gpt4o_eval'].values()))

        for human in range(num_human):
            annoscore[human,j,:] = human_anno[:,human]

        baseline_rank[j] = rank_basescore(baseline_score, 0.165)
        gptscore[j]= gpt4o_eval_rs
        multigptscore[j] = multiagent_eval_results
        annomeanscore[j] = np.mean(annoscore[:,j,:],axis=0)

        for human in range(num_human):
            gptvsannos_spearman[human,j] = calculate_spearman_manual(gpt4o_eval_rs,annoscore[human,j,:])
            multigptvsannos_spearman[human,j] = calculate_spearman_manual(multiagent_eval_results,annoscore[human,j,:])
            baselinevsannos_spearman[human,j] = calculate_spearman_manual(baseline_rank[j],annoscore[human,j,:])

        gptvsmultigpt_spearman[j] = calculate_spearman_manual(gpt4o_eval_rs,multiagent_eval_results)
        gptvsannomean_spearman[j] = calculate_spearman_manual(gpt4o_eval_rs,annomeanscore[j])
        multigptvsannomean_spearman[j] = calculate_spearman_manual(multiagent_eval_results,annomeanscore[j])
        baselinevsannomean_spearman[j] = calculate_spearman_manual(baseline_rank[j],annomeanscore[j])

    gptscore = gptscore.mean(axis=0)
    multigptscore = multigptscore.mean(axis=0)
    annomeanscore = annomeanscore.mean(axis=0)

    print("{} iter max gap in gpt".format(iternum))
    for i in range(4):
        print('gpt score gap {}'.format(i),np.max(np.abs(gptscore - annoscore[i].mean(axis=0))),np.argmax(np.abs(gptscore - annoscore[i].mean(axis=0))))
        print('multi agent socre gap{}'.format(i),np.max(np.abs(multigptscore - annoscore[i].mean(axis=0))),np.argmax(np.abs(multigptscore - annoscore[i].mean(axis=0))))

    # if np.max(np.abs(gptscore - anno1score) ) < 0.1 or np.max(np.abs(gptscore - anno2score)) < 0.1 or np.max(np.abs(gptscore - anno3score)) < 0.1:
    flag = 1

    if iternum%10==0:
        print("GPT average score: ",gptscore)
        for i in range(num_human):
            print("Anno{} average score: ".format(i+1),annoscore[i].mean(axis=0))

print("GPT vs MultiGPT Spearman:", np.round(gptvsmultigpt_spearman.mean(), 3))
print("GPT average score:", np.round(gptscore, 3))

x = 0
y = 0
# for i in range(len(models)):
#     print("gpt eval gap in model {}".format(models[i]),np.round(gptscore[i] - annoscore[:3].mean(axis=0).mean(axis=0)[i],3))
#     print("multi agent eval gap in model {}".format(models[i]),np.round(multigptscore[i] - annoscore[:3].mean(axis=0).mean(axis=0)[i],3))
#     x += np.abs(np.round(gptscore[i] - annoscore[:3].mean(axis=0).mean(axis=0)[i],3))
#     y += np.abs(np.round(multigptscore[i] - annoscore[:3].mean(axis=0).mean(axis=0)[i],3))
# print("gpt eval gap in total",x)
# print("multi agent eval gap in total",y)

print("MultiGPT average score:", np.round(multigptscore, 3))
print("AnnoMean average score:", np.round(annoscore[:3].mean(axis=0).mean(axis=0), 3))

for i in range(num_human):
    print("Anno{} average score:".format(i + 1), np.round(annoscore[i].mean(axis=0), 3))

for i in range(num_human):
    print("GPT vs Anno{} Spearman:".format(i + 1), np.round(gptvsannos_spearman[i].mean(), 3))
for i in range(num_human):
    print("MultiGPT vs Anno{} Spearman:".format(i + 1), np.round(multigptvsannos_spearman[i].mean(), 3))
for i in range(num_human):
    print("Baseline vs Anno{} Spearman:".format(i + 1), np.round(baselinevsannos_spearman[i].mean(), 3))


print("Average GPT vs Anno Spearman:", np.round((gptvsannos_spearman[0].mean() + gptvsannos_spearman[1].mean() + gptvsannos_spearman[2].mean()) / 3, 3))
print("GPT vs AnnoMean Spearman:", np.round(gptvsannomean_spearman.mean(), 3))
print("Average MultiGPT vs Anno Spearman:", np.round((multigptvsannos_spearman[0].mean() + multigptvsannos_spearman[1].mean() + multigptvsannos_spearman[2].mean()) / 3, 3))
print("MultiGPT vs AnnoMean Spearman:", np.round(multigptvsannomean_spearman.mean(), 3))
print("Average Baseline vs Anno Spearman:", np.round((baselinevsannos_spearman[0].mean() + baselinevsannos_spearman[1].mean() + baselinevsannos_spearman[2].mean()) / 3, 3))
print("Baseline vs AnnoMean Spearman:", np.round(baselinevsannomean_spearman.mean(), 3))


1 iter max gap in gpt
gpt score gap 0 0.11764705882352944 0
multi agent socre gap0 0.12941176470588234 4
gpt score gap 1 0.17647058823529438 6
multi agent socre gap1 0.24705882352941178 4
gpt score gap 2 0.14117647058823568 1
multi agent socre gap2 0.21176470588235308 4
gpt score gap 3 2.847058823529412 2
multi agent socre gap3 2.9058823529411764 0
GPT vs MultiGPT Spearman: 0.807
GPT average score: [2.835 2.8   2.847 2.835 2.412 2.788 2.694]
MultiGPT average score: [2.906 2.765 2.824 2.882 2.576 2.753 2.729]
Anno1 average score: [2.953 2.871 2.812 2.894 2.447 2.753 2.753]
Anno2 average score: [2.929 2.906 2.835 2.894 2.329 2.718 2.518]
Anno3 average score: [2.953 2.941 2.906 2.906 2.365 2.824 2.729]
AnnoMean average score: [2.945 2.906 2.851 2.898 2.38  2.765 2.667]
GPT vs Anno1 Spearman: 0.765
GPT vs Anno2 Spearman: 0.748
GPT vs Anno3 Spearman: 0.802
MultiGPT vs Anno1 Spearman: 0.823
MultiGPT vs Anno2 Spearman: 0.752
MultiGPT vs Anno3 Spearman: 0.804
Baseline vs Anno1 Spearman: 0.595


In [6]:
num_human_anno = 4
spearmans =np.zeros([num_human_anno,num_human_anno,length])
for j in range(length):
    i = idexls[j]
    human_anno = np.array(list(oc[i]['human_anno'].values()))
    for k in range(num_human_anno):
        for l in range(num_human_anno):
            spearmans[k,l,j] = calculate_spearman_manual(annoscore[k,j,:],annoscore[l,j,:])
average_spearmans = spearmans.mean(axis=2)

print(average_spearmans)
print((average_spearmans.sum()-num_human_anno)/(num_human_anno*(num_human_anno-1)))

[[1.         0.77930672 0.81670168 0.81491597]
 [0.77930672 1.         0.78834034 0.78151261]
 [0.81670168 0.78834034 1.         0.84779412]
 [0.81491597 0.78151261 0.84779412 1.        ]]
0.8047619047619047
