## Evaluation and Experiment

In [1]:
from multiAgent_evaluate import *


## Evaluation Output of our Freamework

In [17]:
import json

def analyze_debate_json(json_path):
    with open(json_path, "r") as f:
        results = json.load(f)

    total = len(results)
    correct = 0
    partial = 0
    pro_correct = 0
    con_correct = 0
    pro_win = 0
    con_win = 0
    con_chosen_and_correct = 0

    con_correct_ids = []
    con_chosen_ids = []

    for qid, item in results.items():
        gt = item.get("ground_truth", "").strip()

        # Pro / Con Answers
        if "rounds" in item and len(item["rounds"]) > 0:
            pro = item["rounds"][0].get("pro_answer", "").strip()
            con = item["rounds"][0].get("con_answer", "").strip()
        else:
            pro = item.get("pro_answer", "").strip()
            con = item.get("con_answer", "").strip()

        # Consensus Answer
        consensus = (
            item.get("final_answer", "").strip()
            if "final_answer" in item else
            item.get("consensus_answer", "").strip()
        )

        if pro == gt:
            pro_correct += 1
        if con == gt:
            con_correct += 1
            con_correct_ids.append(qid)
        if gt in [pro, con]:
            partial += 1
        if consensus == gt:
            correct += 1
        if consensus == pro:
            pro_win += 1
        elif consensus == con:
            con_win += 1
            con_chosen_ids.append(qid)
            if con == gt:
                con_chosen_and_correct += 1

    con_union_ids = sorted(set(con_correct_ids + con_chosen_ids))

    print("=== ðŸ“Š Evaluation Summary ===")
    print(f"Total Cases Analyzed: {total}")
    print(f"âœ… Consensus Correct: {correct}/{total} = {correct/total:.2%}")
    print(f"ðŸ”¹ Pro or Con got GT: {partial}/{total} = {partial/total:.2%}")
    print(f"ðŸ“ˆ Pro Answered Correctly: {pro_correct}/{total} = {pro_correct/total:.2%}")
    print(f"ðŸ“‰ Con Answered Correctly: {con_correct}/{total} = {con_correct/total:.2%}")
    print(f"ðŸ§  Consensus chose Pro: {pro_win} times")
    print(f"ðŸ§  Consensus chose Con: {con_win} times")
    print(f"ðŸŽ¯ Chose Con AND Correct: {con_chosen_and_correct} times")

    return {
        "con_correct_ids": con_correct_ids,
        "con_chosen_ids": con_chosen_ids,
        "union_ids": con_union_ids
    }

In [19]:
analyze_debate_json("output_0_315.json")

=== ðŸ“Š Evaluation Summary ===
Total Cases Analyzed: 315
âœ… Consensus Correct: 275/315 = 87.30%
ðŸ”¹ Pro or Con got GT: 290/315 = 92.06%
ðŸ“ˆ Pro Answered Correctly: 261/315 = 82.86%
ðŸ“‰ Con Answered Correctly: 29/315 = 9.21%
ðŸ§  Consensus chose Pro: 289 times
ðŸ§  Consensus chose Con: 22 times
ðŸŽ¯ Chose Con AND Correct: 18 times


{'con_correct_ids': ['0000',
  '0001',
  '0009',
  '0023',
  '0034',
  '0044',
  '0045',
  '0049',
  '0064',
  '0072',
  '0112',
  '0117',
  '0172',
  '0184',
  '0188',
  '0208',
  '0211',
  '0231',
  '0238',
  '0242',
  '0260',
  '0267',
  '0278',
  '0283',
  '0285',
  '0300',
  '0100',
  '0234',
  '0241'],
 'con_chosen_ids': ['0001',
  '0034',
  '0045',
  '0049',
  '0064',
  '0117',
  '0160',
  '0172',
  '0184',
  '0188',
  '0194',
  '0208',
  '0211',
  '0231',
  '0238',
  '0260',
  '0267',
  '0273',
  '0285',
  '0290',
  '0300',
  '0241'],
 'union_ids': ['0000',
  '0001',
  '0009',
  '0023',
  '0034',
  '0044',
  '0045',
  '0049',
  '0064',
  '0072',
  '0100',
  '0112',
  '0117',
  '0160',
  '0172',
  '0184',
  '0188',
  '0194',
  '0208',
  '0211',
  '0231',
  '0234',
  '0238',
  '0241',
  '0242',
  '0260',
  '0267',
  '0273',
  '0278',
  '0283',
  '0285',
  '0290',
  '0300']}

## Bias Test

In [None]:
results1 = run_all_benchmark_cases(
    json_path="",
    pro_fn=generate_first_round_response_pro,
    con_fn=generate_first_round_response_con,
    debate_fn=generate_agent_response,
    consensus_fn=generate_consensus,
    start_index=315, 
    max_cases=100 ,   
    save_path="result_316.json"
)


ðŸš€ Running Case 0315
âœ… Saved result for 0315

ðŸš€ Running Case 0316
âœ… Saved result for 0316

ðŸš€ Running Case 0317
âœ… Saved result for 0317

ðŸš€ Running Case 0318
âœ… Saved result for 0318

ðŸš€ Running Case 0319
âœ… Saved result for 0319

ðŸš€ Running Case 0320
âœ… Saved result for 0320

ðŸš€ Running Case 0321
âœ… Saved result for 0321

ðŸš€ Running Case 0322
âœ… Saved result for 0322

ðŸš€ Running Case 0323
âœ… Saved result for 0323

ðŸš€ Running Case 0324
âœ… Saved result for 0324

ðŸš€ Running Case 0325
âœ… Saved result for 0325

ðŸš€ Running Case 0326
âœ… Saved result for 0326

ðŸš€ Running Case 0327
âœ… Saved result for 0327

ðŸš€ Running Case 0328
âœ… Saved result for 0328

ðŸš€ Running Case 0329
âœ… Saved result for 0329

ðŸš€ Running Case 0330
âœ… Saved result for 0330

ðŸš€ Running Case 0331
âœ… Saved result for 0331

ðŸš€ Running Case 0332
âœ… Saved result for 0332

ðŸš€ Running Case 0333
âœ… Saved result for 0333

ðŸš€ Running Case 0334
âœ… Saved result for 0334


In [None]:
import json

def evaluate_saved_results(json_path):
    with open(json_path, 'r') as f:
        data = json.load(f)

    total = len(data)
    consensus_correct = 0
    pro_correct = 0
    con_correct = 0
    either_correct = 0
    consensus_pro = 0
    consensus_con = 0
    con_chosen_and_correct = 0

    for qid, entry in data.items():
        gt = entry.get("ground_truth", "").strip()
        pro = entry.get("pro_answer", "").strip()
        con = entry.get("con_answer", "").strip()
        consensus = entry.get("consensus_answer", "").strip()

        if pro == gt:
            pro_correct += 1
        if con == gt:
            con_correct += 1
        if gt in [pro, con]:
            either_correct += 1
        if consensus == gt:
            consensus_correct += 1
        if consensus == pro:
            consensus_pro += 1
        if consensus == con:
            consensus_con += 1
            if con == gt:
                con_chosen_and_correct += 1

    print("=== ðŸ“Š Evaluation Summary ===")
    print(f"Total Cases Analyzed: {total}")
    print(f"âœ… Consensus Correct: {consensus_correct}/{total} = {consensus_correct/total:.2%}")
    print(f"ðŸ”¹ Pro or Con got GT: {either_correct}/{total} = {either_correct/total:.2%}")
    print(f"ðŸ“ˆ Pro Answered Correctly: {pro_correct}/{total} = {pro_correct/total:.2%}")
    print(f"ðŸ“‰ Con Answered Correctly: {con_correct}/{total} = {con_correct/total:.2%}")
    print(f"ðŸ§  Consensus chose Pro: {consensus_pro} times")
    print(f"ðŸ§  Consensus chose Con: {consensus_con} times")
    print(f"ðŸŽ¯ Chose Con AND Correct: {con_chosen_and_correct} times")

# ç”¨æ³•ç¤ºä¾‹
evaluate_saved_results("result_316.json")

=== ðŸ“Š Evaluation Summary ===
Total Cases Analyzed: 100
âœ… Consensus Correct: 85/100 = 85.00%
ðŸ”¹ Pro or Con got GT: 97/100 = 97.00%
ðŸ“ˆ Pro Answered Correctly: 85/100 = 85.00%
ðŸ“‰ Con Answered Correctly: 14/100 = 14.00%
ðŸ§  Consensus chose Pro: 88 times
ðŸ§  Consensus chose Con: 13 times
ðŸŽ¯ Chose Con AND Correct: 7 times
