In [10]:
from datasets import load_dataset 

ds = load_dataset("lmsys/mt_bench_human_judgments")
human_split = ds["human"]
gpt4_split = ds["gpt4_pair"]

In [7]:
print(human_split.column_names)

['question_id', 'model_a', 'model_b', 'winner', 'judge', 'conversation_a', 'conversation_b', 'turn']


In [None]:
import numpy as np
import pandas as pd
from collections import defaultdict


def canonical_judge(j):
    """Map raw judge values to standard tokens."""
    if isinstance(j, list) and j and j[0] == "gpt-4":
        return "gpt4_pair"
    if isinstance(j, str) and j.startswith(("expert", "author")):
        return "human"
    return j

def fold_tie(v):
    """Return the canonical string 'tie' for any tie-like label."""
    return "tie" if "tie" in v else v

def build_vote_bag(rows):
    """Build a bag of votes from the rows of the dataset."""
    bag = [defaultdict(dict), defaultdict(dict)]

    for row in rows:
        turn = row["turn"] - 1 # change from 1-indexed to 0-indexed

        if row["model_a"] < row["model_b"]:
            key = (row["question_id"], row["model_a"], row["model_b"])
            label = row["winner"]

        else:
            key = (row["question_id"], row["model_b"], row["model_a"])
            label = {"model_a" : "model_b", "model_b" : "model_a"}.get(row["winner"], row["winner"])

        judge = canonical_judge(row["judge"])

        bag[turn].setdefault(key, {}).setdefault(judge, []).append(label)

    return bag


def agree_turn(bag_turn, judgeA, judgeB, drop_ties=True):
    """Return the agreement and total votes for a given turn."""
    agree = tot = 0
    for votes in bag_turn.values():
        if judgeA not in votes or judgeB not in votes:
            continue # we need both the human and the gpt4-pair judges to have voted, for us to be able to compute agreement
        vA = fold_tie(votes[judgeA][0])
        if drop_ties and vA == "tie":
            continue
        for vB in votes[judgeB]:
            vB = fold_tie(vB)
            if drop_ties and vB == "tie":
                continue
            tot += 1
            agree += (vA == vB) # or use an if statement)
    
    return agree, tot



In [24]:
raw = load_dataset("lmsys/mt_bench_human_judgments")
bag = build_vote_bag(list(raw["human"]) + list(raw["gpt4_pair"]))

for turn in [0, 1]:
    for tie in [False, True]:
        agree, tot = agree_turn(bag[turn], "gpt4-pair", "human", drop_ties=tie)
        print(f"Agreement (Tie: {not tie}, turn-{turn+1}): {agree}/{tot} = {agree/tot if tot > 0 else 0:.2f}")

Agreement (Tie: True, turn-1): 0/0 = 0.00
Agreement (Tie: False, turn-1): 0/0 = 0.00
Agreement (Tie: True, turn-2): 0/0 = 0.00
Agreement (Tie: False, turn-2): 0/0 = 0.00
