## Evaluate Reports

Head-to-head comparison, judged by an LLM, of Kruppe's reports against each baseline's reports

In [1]:
import json
import asyncio
from kruppe.llm import OpenAILLM
from kruppe.prompts.experiments import EVALUATE_REPORT_USER, EVALUATE_REPORT_SYSTEM

llm = OpenAILLM(model="gpt-4.1")


## Define Evaluate Report Method

In [2]:
async def evaluate_reports(query: str, report1_loc: str, report2_loc: str):
    """Ask the judge LLM to compare every report in one file against a single baseline report.

    Args:
        query: The question both reports answer; filled into the ``query``
            slot of ``EVALUATE_REPORT_USER``.
        report1_loc: Path to a JSON file whose ``research_reports`` list holds
            dicts with a ``text`` field. The notebook assumes these are
            Kruppe's reports; each one is judged separately as "answer1".
        report2_loc: Path to the baseline. A path ending in ``.txt`` is read
            as the report text itself; any other path is read as JSON in the
            same layout as ``report1_loc`` and only its first report is used.
            The baseline is always "answer2".

    Returns:
        A dict ``{"query": query, "evals": [...]}`` where ``evals`` holds the
        ``.text`` of each LLM response, in the order of the reports in
        ``report1_loc``.
    """
    # Read every file as UTF-8 explicitly: the reports contain non-ASCII
    # punctuation, and the platform default encoding is not UTF-8 everywhere.
    with open(report1_loc, "r", encoding="utf-8") as f:
        reports_1 = json.load(f)['research_reports']

    # report 2 is the vanilla or human report, so we just pick one of them
    if report2_loc.endswith(".txt"):
        with open(report2_loc, "r", encoding="utf-8") as f:
            report_2_txt = f.read()
    else:
        with open(report2_loc, "r", encoding="utf-8") as f:
            report_2_txt = json.load(f)['research_reports'][0]['text']

    # One judge call per report-1 candidate, all issued concurrently. Leaving
    # the TaskGroup block waits for every call to finish.
    async with asyncio.TaskGroup() as tg:
        tasks = []
        for report_1 in reports_1:
            user_message = EVALUATE_REPORT_USER.format(
                query=query,
                answer1=report_1['text'],
                answer2=report_2_txt
            )

            messages = [
                {"role": "system", "content": EVALUATE_REPORT_SYSTEM},
                {"role": "user", "content": user_message},
            ]

            tasks.append(tg.create_task(llm.async_generate(messages)))

    return {
        "query": query,
        "evals": [task.result().text for task in tasks],
    }

In [3]:
# Smoke test: judge the reports in kruppe_report/report_0.json against the
# first report in vanilla_report_4.1/report_0.json for a single question.
await evaluate_reports(
    "How is ConocoPhillips positioned to deliver sustained free cash flow growth and shareholder returns in the coming years amid its current investment cycle and market conditions?",
    "./kruppe_report/report_0.json",
    "./vanilla_report_4.1/report_0.json",
)

{'query': 'How is ConocoPhillips positioned to deliver sustained free cash flow growth and shareholder returns in the coming years amid its current investment cycle and market conditions?',
 'evals': ['{\n    "Empowerment": { \n        "Winner": "Answer 2", \n        "Explanation": "Answer 2 most effectively empowers the reader by clearly breaking down ConocoPhillips’ positioning into key themes and sub-components (asset portfolio, capital allocation, shareholder returns, risk management) with concise supporting evidence and industry context. The direct use of metrics, references to management actions and external validations, as well as links to primary sources, make it easier for the reader to internalize the information and form an informed judgment about the company’s outlook. Answer 1, while thorough and insightful, is written in a more abstract, narrative style and is slightly less actionable for non-expert readers seeking a quick but robust understanding." \n    },\n    "Cohesiv

## Pair Up Comparison

In [3]:
import pandas as pd

# Question list: each row has a category, the path to the human-written
# report, and the question text (see the displayed columns below).
df = pd.read_csv("./reports.csv", index_col=False)
df

Unnamed: 0,category,human_report_loc,question
0,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,How is ConocoPhillips positioned to deliver su...
1,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the outlook for Chevron Corporation’s ...
2,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What are the updated financial prospects and i...
3,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the investment outlook for Exxon Mobil...
4,Energy (Oil),/Users/danielliu/Workspace/fin-rag/experiments...,What is the investment outlook for ConocoPhill...
...,...,...,...
68,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,"What are the key expectations, product announc..."
69,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,"What are the current trends, risks, and invest..."
70,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What is the current and projected financial an...
71,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,"What are the current trends, challenges, and o..."


In [4]:
# Judge every Kruppe report against the 4.1-mini and 4o-mini baselines: one
# evaluate_reports task per (question, baseline) pair, all scheduled in a
# single TaskGroup. The commented-out lines are the same pipeline for the
# 4o-search, 4.1, o3 and human baselines.
async with asyncio.TaskGroup() as tg:
    kruppe_4o_tasks = []
    kruppe_41_tasks = []
    kruppe_o3_tasks = []
    kruppe_human_tasks = []
    kruppe_41_mini_tasks = []
    kruppe_4o_mini_tasks = []

    # `i` is the DataFrame index label and doubles as the report file number.
    for i, row in df.iterrows():
        kruppe_loc = f"./kruppe_report/report_{i}.json"
        # vanilla_4o_loc = f"./vanilla_report_4o_search/report_{i}.json"
        # vanilla_41_loc = f"./vanilla_report_4.1/report_{i}.json"
        # vanilla_o3_loc = f"./vanilla_report_o3/report_{i}.json"
        # human_loc = row["human_report_loc"]
        vanilla_41_mini_loc = f"./vanilla_report_4.1-mini/report_{i}.json"
        vanilla_4o_mini_loc = f"./vanilla_report_4o-mini_search/report_{i}.json"

        # kruppe_4o_task = tg.create_task(evaluate_reports(row["question"], kruppe_loc, vanilla_4o_loc))
        # kruppe_4o_tasks.append(kruppe_4o_task)
        
        # kruppe_41_task = tg.create_task(evaluate_reports(row["question"], kruppe_loc, vanilla_41_loc))
        # kruppe_41_tasks.append(kruppe_41_task)
        
        # kruppe_o3_task = tg.create_task(evaluate_reports(row["question"], kruppe_loc, vanilla_o3_loc))
        # kruppe_o3_tasks.append(kruppe_o3_task)
        
        # kruppe_human_task = tg.create_task(evaluate_reports(row["question"], kruppe_loc, human_loc))
        # kruppe_human_tasks.append(kruppe_human_task)

        kruppe_41_mini_task = tg.create_task(evaluate_reports(row["question"], kruppe_loc, vanilla_41_mini_loc))
        kruppe_41_mini_tasks.append(kruppe_41_mini_task)

        kruppe_4o_mini_task = tg.create_task(evaluate_reports(row["question"], kruppe_loc, vanilla_4o_mini_loc))
        kruppe_4o_mini_tasks.append(kruppe_4o_mini_task)
        

# The TaskGroup block above only exits once every task has finished, so the
# .result() calls below return immediately.
# kruppe_4o_results = [task.result() for task in kruppe_4o_tasks]
# kruppe_41_results = [task.result() for task in kruppe_41_tasks]
# kruppe_o3_results = [task.result() for task in kruppe_o3_tasks]
# kruppe_human_results = [task.result() for task in kruppe_human_tasks]

# Save the results to a JSON file
# with open("kruppe_4o_results.json", "w") as f:
#     json.dump(kruppe_4o_results, f, indent=4)
# with open("kruppe_41_results.json", "w") as f:
#     json.dump(kruppe_41_results, f, indent=4)
# with open("kruppe_o3_results.json", "w") as f:
#     json.dump(kruppe_o3_results, f, indent=4)
# with open("kruppe_human_results.json", "w") as f:
#     json.dump(kruppe_human_results, f, indent=4)
# NOTE: each output file is opened (and therefore truncated) before its
# results list is built. At this point every "evals" entry is still the raw
# LLM response text.
with open("kruppe_41_mini_results.json", "w") as f:
    kruppe_41_mini_results = [task.result() for task in kruppe_41_mini_tasks]
    json.dump(kruppe_41_mini_results, f, indent=4)
with open("kruppe_4o_mini_results.json", "w") as f:
    kruppe_4o_mini_results = [task.result() for task in kruppe_4o_mini_tasks]
    json.dump(kruppe_4o_mini_results, f, indent=4)


### Turn into JSON

In [5]:
def _parse_eval(raw_eval):
    """Return one judge verdict as a dict, decoding it from JSON text if needed."""
    # Verdicts that are already parsed are passed through unchanged, so
    # re-running this cell on a file it has already converted does not raise
    # a TypeError from json.loads.
    if isinstance(raw_eval, str):
        return json.loads(raw_eval)
    return raw_eval


def _parse_evals_file(path):
    """Rewrite the results file at ``path`` with every ``evals`` entry parsed.

    The file must hold a list of items that each have an ``evals`` list. All
    entries are parsed in memory before the file is reopened for writing, so
    a malformed verdict raises ``json.JSONDecodeError`` and leaves the file
    untouched.

    Returns:
        The list of items that was written back.
    """
    with open(path, "r") as f:
        results = json.load(f)
    for item in results:
        item["evals"] = [_parse_eval(raw_eval) for raw_eval in item["evals"]]
    with open(path, "w") as f:
        json.dump(results, f, indent=4)
    return results


# Disabled comparisons; uncomment a line to convert that results file too.
# kruppe_4o_results = _parse_evals_file("kruppe_4o_results.json")
# kruppe_41_results = _parse_evals_file("kruppe_41_results.json")
# kruppe_o3_results = _parse_evals_file("kruppe_o3_results.json")
# kruppe_human_results = _parse_evals_file("kruppe_human_results.json")

kruppe_41_mini_results = _parse_evals_file("kruppe_41_mini_results.json")
kruppe_4o_mini_results = _parse_evals_file("kruppe_4o_mini_results.json")

## Count Winners

In [6]:
def count_winners(results):
    """Attach a per-metric tally of judge verdicts to every item in ``results``.

    Args:
        results: List of dicts, each with an ``evals`` list. Every eval maps
            a metric name to a dict that has a ``"Winner"`` string. Metric
            names must be among the five tallied below; any other name
            raises ``KeyError``.

    Returns:
        The same ``results`` list. Each item is modified in place to carry
        ``item["wincounts"]``, a dict ``{metric: {label: count}}`` where the
        label is ``"Answer 1"``, ``"Answer 2"`` or ``"tie"``. Labels that
        never occur are absent from the inner dict.
    """
    metrics = (
        "Empowerment",
        "Cohesiveness",
        "Comprehensiveness",
        "Diversity",
        "Overall Winner",
    )

    for item in results:
        wincounts = {metric: {} for metric in metrics}

        for verdicts in item["evals"]:
            for metric, verdict in verdicts.items():
                winner = verdict["Winner"]
                names_first = "1" in winner
                names_second = "2" in winner
                # A verdict that names exactly one answer is a win for it.
                # One that names both (e.g. "Tie between Answer 1 and
                # Answer 2") or neither is a tie, not a win for Answer 1.
                if names_first and not names_second:
                    label = "Answer 1"
                elif names_second and not names_first:
                    label = "Answer 2"
                else:
                    label = "tie"
                wincounts[metric][label] = wincounts[metric].get(label, 0) + 1

        item["wincounts"] = wincounts
    return results

In [7]:
# For each active results file: reload the parsed verdicts, add a
# per-question "wincounts" entry via count_winners, and write the list back
# to the same file. The commented-out stanzas are the other baselines.

# with open("kruppe_4o_results.json", "r") as f:
#     kruppe_4o_results = json.load(f)
# kruppe_4o_results = count_winners(kruppe_4o_results)

# with open("kruppe_41_results.json", "r") as f:
#     kruppe_41_results = json.load(f)
# kruppe_41_results = count_winners(kruppe_41_results)

# with open("kruppe_o3_results.json", "r") as f:
#     kruppe_o3_results = json.load(f)
# kruppe_o3_results = count_winners(kruppe_o3_results)
    
# with open("kruppe_human_results.json", "r") as f:
#     kruppe_human_results = json.load(f)
# kruppe_human_results = count_winners(kruppe_human_results)

with open("kruppe_41_mini_results.json", "r") as f:
    kruppe_41_mini_results = json.load(f)
kruppe_41_mini_results = count_winners(kruppe_41_mini_results)
with open("kruppe_41_mini_results.json", "w") as f:
    json.dump(kruppe_41_mini_results, f, indent=4)

with open("kruppe_4o_mini_results.json", "r") as f:
    kruppe_4o_mini_results = json.load(f)
kruppe_4o_mini_results = count_winners(kruppe_4o_mini_results)
with open("kruppe_4o_mini_results.json", "w") as f:
    json.dump(kruppe_4o_mini_results, f, indent=4)

In [8]:
# count total winners
def count_total_winners(results):
    """Sum the per-question verdict tallies into one tally per metric.

    Args:
        results: Iterable of items that each carry a ``wincounts`` dict
            holding an entry for all five metrics.

    Returns:
        ``{metric: {"Answer 1": n, "Answer 2": n, "tie": n}}`` summed over
        every item. Labels missing from an item count as zero.
    """
    metric_names = (
        "Empowerment",
        "Cohesiveness",
        "Comprehensiveness",
        "Diversity",
        "Overall Winner",
    )
    labels = ("Answer 1", "Answer 2", "tie")
    totals = {name: dict.fromkeys(labels, 0) for name in metric_names}

    for entry in results:
        per_question = entry["wincounts"]
        for name, tally in totals.items():
            counts = per_question[name]
            for label in labels:
                tally[label] += counts.get(label, 0)

    return totals
# Verdict-level totals for the two active baselines (others disabled).
# total_4o_wincounts = count_total_winners(kruppe_4o_results)
# total_41_wincounts = count_total_winners(kruppe_41_results)
# total_o3_wincounts = count_total_winners(kruppe_o3_results)
# total_human_wincounts = count_total_winners(kruppe_human_results)
total_41_mini_wincounts = count_total_winners(kruppe_41_mini_results)
total_4o_mini_wincounts = count_total_winners(kruppe_4o_mini_results)

In [9]:
# count total winner, whichever has more winner, wins
def count_total_winners_relative(results):
    """Score one point per question and metric to whichever answer won more verdicts.

    Args:
        results: Iterable of items that each carry a ``wincounts`` dict
            holding an entry for all five metrics.

    Returns:
        ``{metric: {"Answer 1": n, "Answer 2": n, "tie": n}}`` where each
        item adds exactly one point per metric: to the answer with the higher
        verdict count, or to ``"tie"`` when the counts are equal.
    """
    metric_names = (
        "Empowerment",
        "Cohesiveness",
        "Comprehensiveness",
        "Diversity",
        "Overall Winner",
    )
    totals = {
        name: {"Answer 1": 0, "Answer 2": 0, "tie": 0} for name in metric_names
    }

    for entry in results:
        per_question = entry["wincounts"]
        for name in metric_names:
            first = per_question[name].get("Answer 1", 0)
            second = per_question[name].get("Answer 2", 0)
            verdict = (
                "Answer 1" if first > second
                else "Answer 2" if second > first
                else "tie"
            )
            totals[name][verdict] += 1

    return totals

# Question-level (majority) totals for the two active baselines.
# total_4o_wincounts_relative = count_total_winners_relative(kruppe_4o_results)
# total_41_wincounts_relative = count_total_winners_relative(kruppe_41_results)
# total_o3_wincounts_relative = count_total_winners_relative(kruppe_o3_results)
# total_human_wincounts_relative = count_total_winners_relative(kruppe_human_results)
total_41_mini_wincounts_relative = count_total_winners_relative(kruppe_41_mini_results)
total_4o_mini_wincounts_relative = count_total_winners_relative(kruppe_4o_mini_results)

In [10]:
# count winner, as long as kruppe wins ONE, kruppe wins
def count_total_winners_any(results):
    """Score a question for Answer 1 if it won at least one verdict, else for Answer 2.

    Args:
        results: Iterable of items that each carry a ``wincounts`` dict
            holding an entry for all five metrics.

    Returns:
        ``{metric: {"Answer 1": n, "Answer 2": n, "tie": 0}}``. The ``"tie"``
        slot is never incremented: a question where Answer 1 has no win is
        credited to Answer 2 even if every verdict was a tie.
    """
    metric_names = (
        "Empowerment",
        "Cohesiveness",
        "Comprehensiveness",
        "Diversity",
        "Overall Winner",
    )
    totals = {
        name: {"Answer 1": 0, "Answer 2": 0, "tie": 0} for name in metric_names
    }

    for entry in results:
        per_question = entry["wincounts"]
        for name, tally in totals.items():
            first_won_once = per_question[name].get("Answer 1", 0) > 0
            tally["Answer 1" if first_won_once else "Answer 2"] += 1

    return totals

# "Answer 1 won at least once" totals for the two active baselines.
# total_4o_wincounts_any = count_total_winners_any(kruppe_4o_results)
# total_41_wincounts_any = count_total_winners_any(kruppe_41_results)
# total_o3_wincounts_any = count_total_winners_any(kruppe_o3_results)
# total_human_wincounts_any = count_total_winners_any(kruppe_human_results)
total_41_mini_wincounts_any = count_total_winners_any(kruppe_41_mini_results)
total_4o_mini_wincounts_any = count_total_winners_any(kruppe_4o_mini_results)

In [11]:
# Final on-disk layout: wrap the per-question list under "results" and store
# the three aggregate tallies next to it.
# NOTE(review): this replaces the top-level list that the parsing and
# counting cells read with a dict, so those cells cannot be re-run against
# these files afterwards.
# with open("kruppe_4o_results.json", "w") as f:
#     new_data = {
#         "results": kruppe_4o_results,
#         "total_wincounts": total_4o_wincounts,
#         "total_wincounts_relative": total_4o_wincounts_relative,
#         "total_wincounts_any": total_4o_wincounts_any
#     }
#     json.dump(new_data, f, indent=4)
# with open("kruppe_41_results.json", "w") as f:
#     new_data = {
#         "results": kruppe_41_results,
#         "total_wincounts": total_41_wincounts,
#         "total_wincounts_relative": total_41_wincounts_relative,
#         "total_wincounts_any": total_41_wincounts_any
#     }
#     json.dump(new_data, f, indent=4)
# with open("kruppe_o3_results.json", "w") as f:
#     new_data = {
#         "results": kruppe_o3_results,
#         "total_wincounts": total_o3_wincounts,
#         "total_wincounts_relative": total_o3_wincounts_relative,
#         "total_wincounts_any": total_o3_wincounts_any
#     }
#     json.dump(new_data, f, indent=4)
# with open("kruppe_human_results.json", "w") as f:
#     new_data = {
#         "results": kruppe_human_results,
#         "total_wincounts": total_human_wincounts,
#         "total_wincounts_relative": total_human_wincounts_relative,
#         "total_wincounts_any": total_human_wincounts_any
#     }
#     json.dump(new_data, f, indent=4)
with open("kruppe_41_mini_results.json", "w") as f:
    new_data = {
        "results": kruppe_41_mini_results,
        "total_wincounts": total_41_mini_wincounts,
        "total_wincounts_relative": total_41_mini_wincounts_relative,
        "total_wincounts_any": total_41_mini_wincounts_any
    }
    json.dump(new_data, f, indent=4)
with open("kruppe_4o_mini_results.json", "w") as f:
    new_data = {
        "results": kruppe_4o_mini_results,
        "total_wincounts": total_4o_mini_wincounts,
        "total_wincounts_relative": total_4o_mini_wincounts_relative,
        "total_wincounts_any": total_4o_mini_wincounts_any
    }
    json.dump(new_data, f, indent=4)

## Generate Explanations

In [50]:
# Same model as the client built in the first cell; presumably re-created so
# this section can be run without re-running the evaluation cells above.
llm = OpenAILLM(model="gpt-4.1")

In [51]:
from textwrap import dedent

async def explain_metric(metric: str, results):
    """Ask the LLM to summarise why one side tended to win on ``metric``.

    Args:
        metric: Metric name; must be a key of ``results["total_wincounts"]``
            and of every eval dict under ``results["results"]``.
        results: Dict with ``"total_wincounts"`` (per-metric ``"Answer 1"`` /
            ``"Answer 2"`` counts) and ``"results"`` (items whose ``"evals"``
            dicts hold a ``"Winner"`` and an ``"Explanation"`` per metric).

    Returns:
        The ``.text`` of the LLM response.
    """
    system_message = "You explain what makes one model's output better than the other using the explanations"

    kruppe_wins = results["total_wincounts"][metric]["Answer 1"]
    benchmark_wins = results["total_wincounts"][metric]["Answer 2"]

    explanations = [
        f"Winner: {verdicts[metric]['Winner']}\nExplanation: {verdicts[metric]['Explanation']}"
        for item in results["results"]
        for verdicts in item["evals"]
    ]
    all_explanations = "\n\n".join(explanations)

    # Dedent the fixed instructions BEFORE adding the explanations. The
    # explanation text starts at column 0, so dedenting the combined string
    # finds no common indentation and leaves every instruction line indented.
    # Joining outside the f-string also avoids a backslash inside an f-string
    # expression, which is a syntax error before Python 3.12.
    instructions = dedent(
        f"""\
        Out of {kruppe_wins+benchmark_wins} games, a judge decided that, using the metric {metric}, the first model's response was better on {kruppe_wins} counts, and the second model was better on {benchmark_wins} counts.
        Below are all the justifications that the judge made for deciding why either answer 1 or answer 2's response was better than the other. If answer 1 wins more often than answer 2, describe the qualities of answer 1 that made it better than answer 2. If answer 2 wins more often than answer 1, describe the qualities of answer 2 that made it better than answer 1. If they win equally, describe the qualities of both answers that made them equally good or bad.
        Be concise and clear. Return a single paragraph using 1-3 sentences that summarizes the qualities of the winning answer. Get to the point quickly and avoid unnecessary details. 
        
        ALL EXPLANATIONS:
        """)
    user_message = f"{instructions}{all_explanations}\n"

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    llm_response = await llm.async_generate(messages)
    return llm_response.text

async def explain_all_metrics(results):
    """Run ``explain_metric`` for all five metrics concurrently.

    Args:
        results: The results dict passed unchanged to ``explain_metric``.

    Returns:
        A dict mapping each metric name to the value ``explain_metric``
        returned for it, in the fixed metric order below.
    """
    metric_names = [
        "Empowerment",
        "Cohesiveness",
        "Comprehensiveness",
        "Diversity",
        "Overall Winner"
    ]

    # Leaving the TaskGroup block waits for every explanation to finish.
    async with asyncio.TaskGroup() as group:
        pending = [
            group.create_task(explain_metric(name, results))
            for name in metric_names
        ]

    return dict(zip(metric_names, (job.result() for job in pending)))

### Kruppe 4o w/ Internet Empowerment

In [52]:
# Reload the saved 4o comparison; explain_metric reads its "results" and
# "total_wincounts" entries.
with open("kruppe_4o_results.json", "r") as f:
    kruppe_4o_results = json.load(f)

In [53]:
# One LLM-written summary per metric; the bare name displays the dict.
kruppe_4o_results_explanation = await explain_all_metrics(kruppe_4o_results)
kruppe_4o_results_explanation

{'Empowerment': 'Answer 1 consistently outperformed Answer 2 because it provided more empowering, analytical, and context-rich responses. Its strengths include synthesizing data with clear reasoning, interpreting why facts matter, addressing risks and strategic implications, and guiding readers to make informed and independent judgments. Answer 1 moves beyond listing information by offering critical frameworks, evaluating both upside and downside, and equipping readers with deeper understanding and actionable insights—qualities that foster genuine empowerment and critical thinking compared to the more descriptive and fact-based style of Answer 2.',
 'Cohesiveness': 'Answer 1 overwhelmingly wins because it consistently presents a unified, cohesive narrative by starting with a clear hypothesis, logically sequencing its evidence and analysis, and tying every section back to a central thesis. Its arguments flow naturally, building upon each other to create a single, well-integrated story. 

In [54]:
# Store each summary beside that metric's counts under an "Explanation" key,
# then overwrite the results file with the updated dict.
for metric, explanation in kruppe_4o_results_explanation.items():
    kruppe_4o_results['total_wincounts'][metric]['Explanation'] = explanation
with open("kruppe_4o_results.json", "w") as f:
    json.dump(kruppe_4o_results, f, indent=4)

In [55]:
# Print each metric name and its summary, followed by a 40-dash separator.
for metric, explanation in kruppe_4o_results_explanation.items():
    print(metric)
    print(explanation)
    print('--'*20)

Empowerment
Answer 1 consistently outperformed Answer 2 because it provided more empowering, analytical, and context-rich responses. Its strengths include synthesizing data with clear reasoning, interpreting why facts matter, addressing risks and strategic implications, and guiding readers to make informed and independent judgments. Answer 1 moves beyond listing information by offering critical frameworks, evaluating both upside and downside, and equipping readers with deeper understanding and actionable insights—qualities that foster genuine empowerment and critical thinking compared to the more descriptive and fact-based style of Answer 2.
----------------------------------------
Cohesiveness
Answer 1 overwhelmingly wins because it consistently presents a unified, cohesive narrative by starting with a clear hypothesis, logically sequencing its evidence and analysis, and tying every section back to a central thesis. Its arguments flow naturally, building upon each other to create a si

### Kruppe 4.1

In [56]:
# Reload the saved 4.1 comparison; explain_metric reads its "results" and
# "total_wincounts" entries.
with open("kruppe_41_results.json", "r") as f:
    kruppe_41_results = json.load(f)

In [57]:
# One LLM-written summary per metric; the bare name displays the dict.
kruppe_41_results_explanation = await explain_all_metrics(kruppe_41_results)
kruppe_41_results_explanation

{'Empowerment': 'Answer 2 wins significantly more often than Answer 1. The qualities that made Answer 2 better are its clear, structured, and accessible presentation, explicit breakdown of key drivers, risks, and actionable insights, and its use of concrete data, references, and comparative analysis. Answer 2 consistently empowers readers to make informed, independent judgments by linking information directly to investment implications, balancing opportunities and threats, and providing practical frameworks and recommendations tailored for decision-making.',
 'Cohesiveness': 'Answer 2 wins more often than Answer 1. The qualities that make Answer 2 better are its consistent use of a clear, logical structure with well-labeled and thematic sections that build seamlessly upon each other, creating a smooth, unified central narrative. Each section flows naturally into the next, maintaining focus on the main thesis, making the analysis easy to follow, minimizing fragmentation, and ensuring th

In [58]:
# Store each summary beside that metric's counts under an "Explanation" key,
# then overwrite the results file with the updated dict.
for metric, explanation in kruppe_41_results_explanation.items():
    kruppe_41_results['total_wincounts'][metric]['Explanation'] = explanation
with open("kruppe_41_results.json", "w") as f:
    json.dump(kruppe_41_results, f, indent=4)

In [59]:
# Print each metric name and its summary, followed by a 40-dash separator.
for metric, explanation in kruppe_41_results_explanation.items():
    print(metric)
    print(explanation)
    print('--'*20)

Empowerment
Answer 2 wins significantly more often than Answer 1. The qualities that made Answer 2 better are its clear, structured, and accessible presentation, explicit breakdown of key drivers, risks, and actionable insights, and its use of concrete data, references, and comparative analysis. Answer 2 consistently empowers readers to make informed, independent judgments by linking information directly to investment implications, balancing opportunities and threats, and providing practical frameworks and recommendations tailored for decision-making.
----------------------------------------
Cohesiveness
Answer 2 wins more often than Answer 1. The qualities that make Answer 2 better are its consistent use of a clear, logical structure with well-labeled and thematic sections that build seamlessly upon each other, creating a smooth, unified central narrative. Each section flows naturally into the next, maintaining focus on the main thesis, making the analysis easy to follow, minimizing f

### o3

In [60]:
# Reload the saved o3 comparison; explain_metric reads its "results" and
# "total_wincounts" entries.
with open("kruppe_o3_results.json", "r") as f:
    kruppe_o3_results = json.load(f)

In [61]:
# One LLM-written summary per metric; the bare name displays the dict.
kruppe_o3_results_explanation = await explain_all_metrics(kruppe_o3_results)
kruppe_o3_results_explanation

{'Empowerment': 'Answer 2 overwhelmingly outperformed Answer 1 because it consistently offered clear, actionable, and data-driven insights—quantifying scenarios, presenting explicit risks and catalysts, and supplying concrete frameworks for independent decision-making. Its responses empowered readers to form their own judgments by breaking down complex issues into structured analyses with metrics, scenario modeling, and practical recommendations tied directly to key outcomes. This level of specificity and transparency provided readers with both the understanding and the tools needed to make well-informed, empowered decisions.',
 'Cohesiveness': 'Answer 2 wins more often than Answer 1. Its responses are consistently described as highly structured, logically organized, and tightly focused around a central thesis. Each section in Answer 2’s outputs typically builds seamlessly on the previous one, maintaining a unified narrative thread that ties all elements—analysis, evidence, risks, and 

In [62]:
# Store each summary beside that metric's counts under an "Explanation" key,
# then overwrite the results file with the updated dict.
for metric, explanation in kruppe_o3_results_explanation.items():
    kruppe_o3_results['total_wincounts'][metric]['Explanation'] = explanation
with open("kruppe_o3_results.json", "w") as f:
    json.dump(kruppe_o3_results, f, indent=4)

In [63]:
# Print each metric name and its summary, followed by a 40-dash separator.
for metric, explanation in kruppe_o3_results_explanation.items():
    print(metric)
    print(explanation)
    print('--'*20)

Empowerment
Answer 2 overwhelmingly outperformed Answer 1 because it consistently offered clear, actionable, and data-driven insights—quantifying scenarios, presenting explicit risks and catalysts, and supplying concrete frameworks for independent decision-making. Its responses empowered readers to form their own judgments by breaking down complex issues into structured analyses with metrics, scenario modeling, and practical recommendations tied directly to key outcomes. This level of specificity and transparency provided readers with both the understanding and the tools needed to make well-informed, empowered decisions.
----------------------------------------
Cohesiveness
Answer 2 wins more often than Answer 1. Its responses are consistently described as highly structured, logically organized, and tightly focused around a central thesis. Each section in Answer 2’s outputs typically builds seamlessly on the previous one, maintaining a unified narrative thread that ties all elements—an

### ERP

In [64]:
# Reload the saved human-report comparison; explain_metric reads its
# "results" and "total_wincounts" entries.
with open("kruppe_human_results.json", "r") as f:
    kruppe_human_results = json.load(f)

In [65]:
# One LLM-written summary per metric; the bare name displays the dict.
kruppe_human_results_explanation = await explain_all_metrics(kruppe_human_results)
kruppe_human_results_explanation

{'Empowerment': "Answer 1 wins far more often than Answer 2. According to the explanations, Answer 1's superior output is characterized by its clear, accessible synthesis that transforms complex data and technical details into actionable insights and holistic narratives. Answer 1 consistently empowers readers by guiding them through the logic, context, and implications of each issue; it translates facts into understanding, explicitly relates risks and opportunities, and equips a broad audience—not just experts—with the tools and confidence to make informed, independent judgments. In contrast, Answer 2 is typically more technical and data-heavy, often overwhelming non-specialist readers and providing less interpretive guidance, context, or empowerment.",
 'Cohesiveness': 'Answer 1 consistently outperformed Answer 2 because it maintained a clear, logical, and unified narrative—structuring its analysis around a central hypothesis, building arguments step by step, and linking each section 

In [66]:
# Store each summary beside that metric's counts under an "Explanation" key,
# then overwrite the results file with the updated dict.
for metric, explanation in kruppe_human_results_explanation.items():
    kruppe_human_results['total_wincounts'][metric]['Explanation'] = explanation
with open("kruppe_human_results.json", "w") as f:
    json.dump(kruppe_human_results, f, indent=4)

In [67]:
# Print each metric name and its summary, followed by a 40-dash separator.
for metric, explanation in kruppe_human_results_explanation.items():
    print(metric)
    print(explanation)
    print('--'*20)

Empowerment
Answer 1 wins far more often than Answer 2. According to the explanations, Answer 1's superior output is characterized by its clear, accessible synthesis that transforms complex data and technical details into actionable insights and holistic narratives. Answer 1 consistently empowers readers by guiding them through the logic, context, and implications of each issue; it translates facts into understanding, explicitly relates risks and opportunities, and equips a broad audience—not just experts—with the tools and confidence to make informed, independent judgments. In contrast, Answer 2 is typically more technical and data-heavy, often overwhelming non-specialist readers and providing less interpretive guidance, context, or empowerment.
----------------------------------------
Cohesiveness
Answer 1 consistently outperformed Answer 2 because it maintained a clear, logical, and unified narrative—structuring its analysis around a central hypothesis, building arguments step by st