In [1]:
import numpy as np
import pandas as pd

def rank_numbers(numbers):
    sorted_indices = sorted(range(len(numbers)), key=lambda k: numbers[k], reverse=True)
    ranks = [0] * len(numbers)

    i = 0
    while i < len(sorted_indices):
        value_indices = [i]
        while i + 1 < len(sorted_indices) and numbers[sorted_indices[i]] == numbers[sorted_indices[i + 1]]:
            i += 1
            value_indices.append(i)
        average_rank = np.mean([index + 1 for index in value_indices])
        for index in value_indices:
            ranks[sorted_indices[index]] = average_rank
        i += 1

    return ranks

def calculate_spearman_manual(values1, values2):
    n = len(values1)
    m = len(values2)
    rank1 = rank_numbers(values1)
    rank2 = rank_numbers(values2)
    d = np.array(rank1) - np.array(rank2)
    d_squared = np.square(d)
    spearman_corr = 1 - (6 * np.sum(d_squared)) / (n * (n**2 - 1))
    return spearman_corr

def rank_basescore(base_score, draw_ratio):
    # 计算最大值和最小值的差值
    score_range = max(base_score) - min(base_score)
    # 计算平序的阈值
    draw_gap = draw_ratio * score_range

    # 对列表进行排序并保留原始索引
    indexed_scores = list(enumerate(base_score))
    indexed_scores.sort(key=lambda x: x[1])

    # 处理平序
    ranks = [0] * len(base_score)
    current_rank = 1
    for i in range(len(indexed_scores)):
        if i > 0 and abs(indexed_scores[i][1] - indexed_scores[i - 1][1]) < draw_gap:
            ranks[indexed_scores[i][0]] = current_rank
        else:
            current_rank = i + 1
            ranks[indexed_scores[i][0]] = current_rank

    return ranks

In [2]:
dimension = 'color'

In [3]:
import json
jsonpath = '../Human_anno/{}.json'.format(dimension)
with open(jsonpath,'r') as f:
    oc = json.load(f)

# history ="../GPT4o_eval_results/{}_gpt4eval_results.json".format(dimension)
# with open(history,'r') as f:
#     gpt4o_eval_history = json.load(f)

In [4]:
models = ['cogvideox5b','gen3', 'kling','videocrafter2', 'pika', 'show1', 'lavie']
# models = ['videocrafter2', 'pika', 'show1', 'lavie']
# models = ['cogvideox5b','gen3', 'kling']
idexls = []
for i in range(0,len(oc)):
    idexls.append(i)
# for i in range(1,len(oc),3):
#     idexls.append(i)
length = len(idexls)


In [5]:
flag = 0
iternum = 0
num_human = 3
while(flag == 0):
    iternum += 1
    gptvsannos_spearman = np.zeros((4,length))
    gptvsannomean_spearman = np.zeros(length)
    multigptvsannos_spearman = np.zeros((4,length))
    multigptvsannomean_spearman = np.zeros(length)
    baselinevsannos_spearman = np.zeros((4,length))
    baselinevsannomean_spearman = np.zeros(length)

    gptscore = np.zeros((length,len(models)))
    annoscore = np.zeros((4,length,len(models)))
    multigptscore = np.zeros((length,len(models)))
    annomeanscore =np.zeros((length,len(models)))
    baseline_rank = np.zeros((length,len(models)))
    badeval = []

    for j in range(length):
        i = idexls[j]
        gpt4o_eval_rs = np.array(list(oc[i]['gpt4o_eval'].values()))
        human_anno = np.array(list(oc[i]['human_anno'].values()))
        baseline_score = np.array(list(oc[i]['baseline_score'].values()))
        multiagent_eval_results = np.array(list(oc[i]['multiagent_score'].values()))
        # multiagent_eval_results = np.array(list(oc[i]['gpt4o_eval'].values()))

        for human in range(num_human):
            annoscore[human,j,:] = human_anno[:,human]

        baseline_rank[j] = rank_basescore(baseline_score, 0.165)
        gptscore[j]= gpt4o_eval_rs
        multigptscore[j] = multiagent_eval_results
        annomeanscore[j] = np.mean(annoscore[:,j,:],axis=0)

        for human in range(num_human):
            gptvsannos_spearman[human,j] = calculate_spearman_manual(gpt4o_eval_rs,annoscore[human,j,:])
            multigptvsannos_spearman[human,j] = calculate_spearman_manual(multiagent_eval_results,annoscore[human,j,:])
            baselinevsannos_spearman[human,j] = calculate_spearman_manual(baseline_rank[j],annoscore[human,j,:])

        gptvsannomean_spearman[j] = calculate_spearman_manual(gpt4o_eval_rs,annomeanscore[j])
        multigptvsannomean_spearman[j] = calculate_spearman_manual(multiagent_eval_results,annomeanscore[j])
        baselinevsannomean_spearman[j] = calculate_spearman_manual(baseline_rank[j],annomeanscore[j])

    gptscore = gptscore.mean(axis=0)
    multigptscore = multigptscore.mean(axis=0)
    annomeanscore = annomeanscore.mean(axis=0)

    print("{} iter max gap in gpt".format(iternum))
    for i in range(4):
        print(np.max(np.abs(gptscore - annoscore[i].mean(axis=0))),np.argmax(np.abs(gptscore - annoscore[i].mean(axis=0))))
        print(np.max(np.abs(multigptscore - annoscore[i].mean(axis=0))),np.argmax(np.abs(multigptscore - annoscore[i].mean(axis=0))))

    # if np.max(np.abs(gptscore - anno1score) ) < 0.1 or np.max(np.abs(gptscore - anno2score)) < 0.1 or np.max(np.abs(gptscore - anno3score)) < 0.1:
    flag = 1

    if iternum%10==0:
        print("GPT average score: ",gptscore)
        for i in range(num_human):
            print("Anno{} average score: ".format(i+1),annoscore[i].mean(axis=0))

print("GPT average score: ",gptscore)
print("MultiGPT average score: ",multigptscore)
for i in range(num_human):
    print("Anno{} average score: ".format(i+1),annoscore[i].mean(axis=0))
print("AnnoMean average score: ",annomeanscore)

for i in range(num_human):
    print("GPT vs Anno{} Spearman: ".format(i+1),gptvsannos_spearman[i].mean())
for i in range(num_human):
    print("MultiGPT vs Anno{} Spearman: ".format(i+1),multigptvsannos_spearman[i].mean())
for i in range(num_human):
    print("Baseline vs Anno{} Spearman: ".format(i+1),baselinevsannos_spearman[i].mean())
    
print("Average GPT vs Anno Spearman: ",(gptvsannos_spearman[0].mean()+gptvsannos_spearman[1].mean()+gptvsannos_spearman[2].mean())/3)
print("GPT vs AnnoMean Spearman: ",gptvsannomean_spearman.mean())
print("Average MultiGPT vs Anno Spearman: ",(multigptvsannos_spearman[0].mean()+multigptvsannos_spearman[1].mean()+multigptvsannos_spearman[2].mean())/3)
print("MultiGPT vs AnnoMean Spearman: ",multigptvsannomean_spearman.mean())
print("Average Baseline vs Anno Spearman: ",(baselinevsannos_spearman[0].mean()+baselinevsannos_spearman[1].mean()+baselinevsannos_spearman[2].mean())/3)
print("Baseline vs AnnoMean Spearman: ",baselinevsannomean_spearman.mean())




1 iter max gap in gpt
0.1098039215686275 0
0.19607843137254877 4
0.17647058823529393 4
0.34117647058823497 4
0.11764705882352944 0
0.2274509803921565 4
2.8823529411764706 3
2.937254901960784 0
GPT average score:  [2.83921569 2.83529412 2.87843137 2.88235294 2.37647059 2.80784314
 2.74509804]
MultiGPT average score:  [2.9372549  2.83137255 2.90980392 2.89803922 2.54117647 2.83137255
 2.8627451 ]
Anno1 average score:  [2.94901961 2.87843137 2.84313725 2.88627451 2.34509804 2.75686275
 2.75686275]
Anno2 average score:  [2.87843137 2.90196078 2.8745098  2.84705882 2.2        2.73333333
 2.57647059]
Anno3 average score:  [2.95686275 2.9372549  2.90196078 2.90980392 2.31372549 2.87058824
 2.79215686]
AnnoMean average score:  [2.19607843 2.17941176 2.15490196 2.16078431 1.71470588 2.09019608
 2.03137255]
GPT vs Anno1 Spearman:  0.7965336134453782
GPT vs Anno2 Spearman:  0.7609243697478991
GPT vs Anno3 Spearman:  0.8192577030812325
MultiGPT vs Anno1 Spearman:  0.835889355742297
MultiGPT vs Ann

In [6]:
num_human_anno = 4
spearmans =np.zeros([num_human_anno,num_human_anno,length])
for j in range(length):
    i = idexls[j]
    human_anno = np.array(list(oc[i]['human_anno'].values()))
    for k in range(num_human_anno):
        for l in range(num_human_anno):
            spearmans[k,l,j] = calculate_spearman_manual(annoscore[k,j,:],annoscore[l,j,:])
average_spearmans = spearmans.mean(axis=2)

print(average_spearmans)
print((average_spearmans.sum()-num_human_anno)/(num_human_anno*(num_human_anno-1)))

[[1.         0.77542017 0.82331933 0.80910364]
 [0.77542017 1.         0.8057423  0.77258403]
 [0.82331933 0.8057423  1.         0.85507703]
 [0.80910364 0.77258403 0.85507703 1.        ]]
0.8068744164332401
