## Evaluate Reports

Head-to-head comparison of the reports written for each question, judged by an LLM.

In [1]:
from kruppe.llm import OpenAILLM
from kruppe.prompts.experiments import EVALUATE_REPORT_USER, EVALUATE_REPORT_SYSTEM

# Shared LLM client; the helper functions in this notebook call llm.async_generate on it.
llm = OpenAILLM()


## Define Evaluate Report Method

In [2]:
import json

async def evaluate_reports(query: str, report1_loc: str, report2_loc: str):
    """Ask the LLM to compare two reports written for the same query.

    Args:
        query: The question both reports answer.
        report1_loc: Path of the file holding the first report (sent as
            ``answer1`` in the prompt).
        report2_loc: Path of the file holding the second report (sent as
            ``answer2`` in the prompt).

    Returns:
        A dict with the original ``query`` and ``evaluation_str``, the
        unparsed text of the LLM's reply.
    """
    def _load(path):
        # Read one report file in full, as text.
        with open(path, "r") as handle:
            return handle.read()

    first_report = _load(report1_loc)
    second_report = _load(report2_loc)

    # Fill the user prompt template with the query and both report bodies.
    prompt = EVALUATE_REPORT_USER.format(
        query=query, answer1=first_report, answer2=second_report
    )
    conversation = [
        {"role": "system", "content": EVALUATE_REPORT_SYSTEM},
        {"role": "user", "content": prompt},
    ]

    reply = await llm.async_generate(conversation)
    return {"query": query, "evaluation_str": reply.text}

## Pair Up Comparison

In [3]:
import pandas as pd

# Load the experiment manifest; later cells read each row's "question" and the
# human / vanilla / kruppe report file paths from it.
df = pd.read_csv("./reports.csv", index_col=False)
df

Unnamed: 0,category,human_report_loc,question,vanilla_report_loc,kruppe_report_loc
0,Ad Agency Industry,/Users/danielliu/Workspace/fin-rag/experiments...,What are the key developments and financial pr...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
1,Ad Agency Industry,/Users/danielliu/Workspace/fin-rag/experiments...,What impact would a potential ban on pharmaceu...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
2,Ad Agency Industry,/Users/danielliu/Workspace/fin-rag/experiments...,What are the implications of the expected slow...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
3,Ad Agency Industry,/Users/danielliu/Workspace/fin-rag/experiments...,How is Amazon's demand-side platform (DSP) imp...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
4,Ad Agency Industry,/Users/danielliu/Workspace/fin-rag/experiments...,What is the outlook for organic sales growth i...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
5,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the expectations and key topics for N...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
6,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the current trends and future project...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
7,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the financial prospects and investmen...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
8,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What is the current state and outlook of the A...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...
9,NVDA,/Users/danielliu/Workspace/fin-rag/experiments...,What are the key factors influencing NVIDIA Co...,/Users/danielliu/Workspace/fin-rag/experiments...,/Users/danielliu/Workspace/fin-rag/experiments...


In [4]:
import asyncio

vanilla_kruppe_tasks = []
human_kruppe_tasks = []

for index, row in df.iterrows():
    vanilla_kruppe_task = asyncio.create_task(evaluate_reports(row["question"], row["vanilla_report_loc"], row["kruppe_report_loc"]))
    vanilla_kruppe_tasks.append(vanilla_kruppe_task)

    human_kruppe_task = asyncio.create_task(evaluate_reports(row["question"], row["human_report_loc"], row["kruppe_report_loc"]))
    human_kruppe_tasks.append(human_kruppe_task)

vanilla_kruppe_results = await asyncio.gather(*vanilla_kruppe_tasks)
human_kruppe_results = await asyncio.gather(*human_kruppe_tasks)

In [5]:
import json

# Collect both result lists under one dict and save it to ./evaluation.json.
report = {}
report["vanilla_vs_kruppe"] = vanilla_kruppe_results
report["human_vs_kruppe"] = human_kruppe_results

with open("./evaluation.json", "w") as out_file:
    json.dump(report, out_file, indent=4)

## Count Winners

In [37]:
from textwrap import dedent

async def explain_metric(metric: str, win_count, justification):
    """Ask the LLM to summarize why one model beat the other on a metric.

    Args:
        metric: Name of the metric; used as the key into both mappings.
        win_count: Mapping of metric name -> number of games the second
            model won.
        justification: Mapping of metric name -> list of the judge's
            explanation strings (the tallying cells append one per game).

    Returns:
        The text of the LLM's reply.
    """
    system_message = "You explain what makes one model's output better than the other using the explanations"

    second_model_wins = win_count[metric]
    explanations = justification[metric]
    # One explanation is recorded per game, so the list length is the number of
    # games played; this replaces a hard-coded 10.
    total_games = len(explanations)

    # Dedent the template *before* substituting values: the joined explanations
    # contain unindented lines, and dedent() on the already-filled text would
    # find no common margin and leave the instructions indented.
    template = dedent(
        """\
        Out of {total_games} games, a judge decided that, using the metric {metric}, the first model's response was better on {first_model_wins} counts, and the second model was better on {second_model_wins} counts.
        Below are all the justifications that the judge made for deciding why a model's response was better than the other. In a 2-3 sentences paragraph, summarize the justifications and explain what makes one model better than the other on this metric.

        {justs}
        """)
    user_message = template.format(
        total_games=total_games,
        metric=metric,
        first_model_wins=total_games - second_model_wins,
        second_model_wins=second_model_wins,
        justs="\n".join(explanations),
    )

    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]

    llm_response = await llm.async_generate(messages)
    return llm_response.text

In [38]:
kruppe_wins_against_vanilla = {
    "Comprehensiveness": 0,
    "Diversity": 0,
    "Empowerment": 0,
    "Cohesiveness": 0,
    "Overall Winner": 0
}

kruppe_vs_vanilla_justification = {
    "Comprehensiveness": [],
    "Diversity": [],
    "Empowerment": [],
    "Cohesiveness": [],
    "Overall Winner": []
}

for experiment in vanilla_kruppe_results:
    query = experiment["query"]
    evaluation_str = experiment["evaluation_str"]

    # cheese because one of the response fckn placed the output inside ```json ```
    evaluation_str = evaluation_str.replace("`", "")
    evaluation_str = evaluation_str.replace("json", "")
    
    evaluation = json.loads(evaluation_str)
    for metric, result in evaluation.items():
        if "2" in result["Winner"]:
            kruppe_wins_against_vanilla[metric] += 1
        kruppe_vs_vanilla_justification[metric].append(result["Explanation"])


print(kruppe_wins_against_vanilla)

for metric in kruppe_vs_vanilla_justification:
    print("-"*100)
    print("Metric:", metric)
    print("Kruppe win rate:", kruppe_wins_against_vanilla[metric]/10)
    explanation = await explain_metric(metric, kruppe_wins_against_vanilla, kruppe_vs_vanilla_justification)
    print(explanation)


{'Comprehensiveness': 2, 'Diversity': 8, 'Empowerment': 3, 'Cohesiveness': 2, 'Overall Winner': 3}
----------------------------------------------------------------------------------------------------
Metric: Comprehensiveness
Kruppe win rate: 0.2
The judge's justifications reveal that Answer 1 generally excels in providing detailed, structured, and systematic insights into specific topics, while Answer 2 often emphasizes broader contextual narratives and trends. For instance, Answer 1's responses show a clear focus on key developments and data, making the information more accessible and actionable, which is a crucial aspect of comprehensiveness. In contrast, the less detailed nature of Answer 2's contributions often falls short in covering the necessary depth and specific implications, leading to its lower performance in the comprehensiveness metric. This systematic and thorough approach of Answer 1 demonstrates its superiority in addressing complex topics comprehensively.
------------

In [41]:
kruppe_wins_against_human = {
    "Comprehensiveness": 0,
    "Diversity": 0,
    "Empowerment": 0,
    "Cohesiveness": 0,
    "Overall Winner": 0
}

kruppe_vs_human_justification = {
    "Comprehensiveness": [],
    "Diversity": [],
    "Empowerment": [],
    "Cohesiveness": [],
    "Overall Winner": []
}
for experiment in human_kruppe_results:
    query = experiment["query"]
    evaluation_str = experiment["evaluation_str"]
    # cheese because one of the response fckn placed the output inside ```json ```
    evaluation_str = evaluation_str.replace("`", "")
    evaluation_str = evaluation_str.replace("json", "")

    evaluation = json.loads(evaluation_str)
    for metric, result in evaluation.items():
        if "2" in result["Winner"]:
            kruppe_wins_against_vanilla[metric] += 1
        kruppe_vs_human_justification[metric].append(result["Explanation"])

print(kruppe_wins_against_human)

for metric in kruppe_vs_human_justification:
    print("-"*100)
    print("Metric:", metric)
    print("Kruppe win rate:", kruppe_wins_against_human[metric]/10)
    explanation = await explain_metric(metric, kruppe_wins_against_vanilla, kruppe_vs_vanilla_justification)
    print(explanation)


{'Comprehensiveness': 0, 'Diversity': 0, 'Empowerment': 0, 'Cohesiveness': 0, 'Overall Winner': 0}
----------------------------------------------------------------------------------------------------
Metric: Comprehensiveness
Kruppe win rate: 0.0
The justifications provided by the judge indicate that while both models have comprehensive responses, one consistently outperforms the other in terms of structure, depth, and detail in addressing specific topics. Answer 1 is often praised for its systematic breakdown of various elements, offering detailed insights and organized presentations that encompass a wide array of factors. In contrast, Answer 2, while still thorough, tends to focus more on narrative contexts and broader trends without providing the same level of detailed analysis or direct applicability to the specific subject matter. This structured and detailed approach of Answer 1 contributes to its superiority in the Comprehensiveness metric, as it equips the reader with a clearer