In [3]:
import json

# Location of the round-1 evaluation results.
# NOTE(review): absolute, machine-specific path — point it at your own copy
# before re-running this notebook.
RESULTS_PATH = "/Users/jiamingwang/Desktop/Meeseeks开源版本/Meeseeks/evaluation_results_english/round_1.json"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the records are looked up with non-ASCII keys such as "能力项".
with open(RESULTS_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

def calculate_final_score(subqs):
    """Compute the capability-averaged score of one evaluated item.

    Every sub-question's ``eval_result`` is filed under each capability named
    in its "能力项" label; a sub-question without that key is filed under
    "未定义". The results of each capability are averaged, and the
    per-capability means are then averaged with equal weight.

    A label may name several capabilities separated by "、", "，" or ",".
    The full-width tilde "～" is normalized to "~" so that the same capability
    spelled both ways lands in a single bucket (the same normalization
    ``get_capability_result`` applies).

    Args:
        subqs: iterable of sub-question dicts. Each must contain
            "eval_result" (a number) and may contain a "能力项" string.

    Returns:
        A ``(score, strict_score)`` tuple. ``score`` is the mean of the
        per-capability means; ``strict_score`` equals ``score`` when it is at
        least 1 and is 0 otherwise. ``(0, 0)`` is returned when no capability
        could be collected at all.

    Raises:
        KeyError: if a sub-question that names a capability lacks "eval_result".
    """
    scores_by_capability = {}

    for sub_q in subqs:
        label = sub_q.get("能力项", "未定义")
        # Unify the tilde variant and all three separators before splitting.
        unified = (
            label.replace("～", "~")
            .replace("、", ",")
            .replace("，", ",")
        )
        for piece in unified.split(","):
            name = piece.strip()
            if name:  # ignore empty fragments such as a trailing separator
                scores_by_capability.setdefault(name, []).append(sub_q["eval_result"])

    if not scores_by_capability:
        # Nothing to average — most likely the input is malformed.
        return 0, 0

    capability_means = [
        sum(scores) / len(scores) for scores in scores_by_capability.values()
    ]
    score_by_capability = sum(capability_means) / len(capability_means)

    # The strict variant only rewards items on which every capability is perfect.
    strict_cur_score = 0 if score_by_capability < 1 else score_by_capability
    return score_by_capability, strict_cur_score

# Score every item exactly once; each call yields (score, strict_score).
final_scores = [calculate_final_score(item["sub_questions"]) for item in data]
if not final_scores:
    # Fail with an explicit message rather than a bare ZeroDivisionError below.
    raise ValueError("no evaluation items loaded; cannot average scores")

meeseeks_scores = [score for score, _ in final_scores]
utility_scores = [strict_score for _, strict_score in final_scores]

print(f"meeseeks_score: {sum(meeseeks_scores) / len(meeseeks_scores)}")
print(f"utility_scores: {sum(utility_scores) / len(utility_scores)}")

meeseeks_score: 0.6599518361828969
utility_scores: 0.3691460055096419


In [6]:
# Capability taxonomy as a nested mapping: each key is a capability name and
# its value maps the names of its sub-capabilities; a leaf maps to an empty
# dict. Key order is the order in which results are reported.
hierarchical_relationship = {
    "任务意图理解": {name: {} for name in ("在干扰下完成指令",)},
    "单元细节合规": {
        "主题约束": {},
        "文体约束": {name: {} for name in ("生成特定文案", "生成名字/标题")},
        "语言约束": {name: {} for name in ("中英文混杂", "繁体约束", "大小写")},
        "格式约束": {name: {} for name in ("特定格式", "日期格式")},
        "字数约束": {
            name: {}
            for name in (
                "精确", "范围", "倍数", "多对象",
                "0~10字", "10~50字", "50~200字", "200字以上",
            )
        },
        "其他特殊规则": {
            name: {}
            for name in (
                "押韵", "关键词", "重复", "平仄", "接龙",
                "emoji", "符号", "写作手法", "词频",
            )
        },
    },
    "整体结构合规": {
        "模版合规": {name: {} for name in ("LaTeX", "JSON", "Markdown")},
        "单元数量合规": {},
        "答题逻辑合规": {name: {} for name in ("答题结构合规", "全面考虑")},
    },
}

def get_capability_result(result_info, hierarchy=None):
    """Tally pass/fail counts per capability and roll them up a taxonomy.

    Each sub-question that carries a "能力项" label contributes one "correct"
    (``eval_result == 1``) or one "wrong" (anything else) to every capability
    named in the label. Labels may list several capabilities separated by
    "、", "，" or ","; surrounding whitespace is ignored and the full-width
    tilde "～" is normalized to "~". Capabilities that do not appear as a leaf
    of ``hierarchy`` do not show up in the result.

    Args:
        result_info: iterable of item dicts, each with a "sub_questions" list
            of dicts holding "eval_result" and optionally "能力项".
        hierarchy: nested mapping of capability name -> mapping of
            sub-capabilities, with leaves mapped to an empty value. Defaults
            to the module-level ``hierarchical_relationship``.

    Returns:
        A nested dict mirroring ``hierarchy``. Each node with at least one
        counted sub-question maps to ``{"percentage", "correct", "wrong",
        "total"}``, plus a "children" dict built the same way whenever the
        node's value in ``hierarchy`` is a dict. A non-leaf node's counts are
        the sums over its children. Nodes with no counted sub-questions are
        omitted.
    """
    if hierarchy is None:
        hierarchy = hierarchical_relationship

    # capability name -> [correct, wrong]
    capability_counts = {}
    for item in result_info:
        for subq in item["sub_questions"]:
            if "能力项" not in subq:
                continue
            passed = subq["eval_result"] == 1
            labels = (
                subq["能力项"]
                .replace("～", "~")
                .replace("、", ",")
                .replace("，", ",")
                .split(",")
            )
            for label in labels:
                capability = label.strip()
                if not capability:
                    continue
                counts = capability_counts.setdefault(capability, [0, 0])
                counts[0 if passed else 1] += 1

    def summarize(subtree):
        """Return (stats dict, correct, wrong) for one level of the taxonomy."""
        summary = {}
        level_correct = 0
        level_wrong = 0
        for name, children in subtree.items():
            if children:
                # Inner node: its counts are the sums over its children.
                child_summary, correct, wrong = summarize(children)
            else:
                # Leaf: take the tallied counts, if any.
                correct, wrong = capability_counts.get(name, (0, 0))
                child_summary = {}
            level_correct += correct
            level_wrong += wrong

            total = correct + wrong
            if total > 0:
                node = {
                    "percentage": correct / total,
                    "correct": correct,
                    "wrong": wrong,
                    "total": total,
                }
                if isinstance(children, dict):
                    node["children"] = child_summary
                summary[name] = node
        return summary, level_correct, level_wrong

    stats_dict, _, _ = summarize(hierarchy)
    return stats_dict

# Build the nested per-capability accuracy report for the loaded results and
# print it as a plain dict.
capability_report = get_capability_result(data)
print(capability_report)

{'任务意图理解': {'percentage': 0.0, 'correct': 0, 'wrong': 10, 'total': 10, 'children': {'在干扰下完成指令': {'percentage': 0.0, 'correct': 0, 'wrong': 10, 'total': 10, 'children': {}}}}, '单元细节合规': {'percentage': 0.724609375, 'correct': 1855, 'wrong': 705, 'total': 2560, 'children': {'主题约束': {'percentage': 0.9655172413793104, 'correct': 980, 'wrong': 35, 'total': 1015, 'children': {}}, '文体约束': {'percentage': 0.9393939393939394, 'correct': 31, 'wrong': 2, 'total': 33, 'children': {'生成特定文案': {'percentage': 1.0, 'correct': 21, 'wrong': 0, 'total': 21, 'children': {}}, '生成名字/标题': {'percentage': 0.8333333333333334, 'correct': 10, 'wrong': 2, 'total': 12, 'children': {}}}}, '语言约束': {'percentage': 0.0, 'correct': 0, 'wrong': 8, 'total': 8, 'children': {'中英文混杂': {'percentage': 0.0, 'correct': 0, 'wrong': 8, 'total': 8, 'children': {}}}}, '格式约束': {'percentage': 0.948306595365419, 'correct': 532, 'wrong': 29, 'total': 561, 'children': {'特定格式': {'percentage': 0.9452054794520548, 'correct': 483, 'wrong': 28, '