In [17]:
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# Presumably this supplies the API credentials needed by the model queries
# in later cells -- confirm against the project's .env file.
load_dotenv()

True

In [18]:
# Fine-tuned model identifiers evaluated by the FINETUNED_MODELS loop below.
# Entries whose last ":"-separated field is "ckpt-step-N" are labelled with
# that checkpoint by get_new_model_name; entries without it are labelled
# "ckpt-step-final".
# The first entry comes from a different organisation/run name than the
# remaining nine; the cell that writes eval_results_combined.jsonl drops it
# by slicing eval_responses_list[1:].
FINETUNED_MODELS = [
    "ft:gpt-4.1-mini-2025-04-14:dcevals-kokotajlo:dfeng-deployment-lower:BjXDK8St",
    "ft:gpt-4.1-nano-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMAnjlF:ckpt-step-1526",
    "ft:gpt-4.1-nano-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMAo5HK:ckpt-step-3052",
    "ft:gpt-4.1-nano-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMAoi9z",
    "ft:gpt-4.1-mini-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMSXRTD:ckpt-step-1526",
    "ft:gpt-4.1-mini-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMSYEPF:ckpt-step-3052",
    "ft:gpt-4.1-mini-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMSYaPh",
    "ft:gpt-4.1-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMjTKqt:ckpt-step-1526",
    "ft:gpt-4.1-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMjTiOS:ckpt-step-3052",
    "ft:gpt-4.1-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMjTZbN",
]

# Baseline model identifiers evaluated by the BASELINE_MODELS loop below:
# two intermediate checkpoints (ckpt-step-1526, ckpt-step-3052) followed by
# an entry with no checkpoint suffix, all from the "dfeng-cp-lowercase" run.
# NOTE(review): get_new_model_name shortens these to the same labels as the
# gpt-4.1 entries of FINETUNED_MODELS (e.g. "ft:gpt-4.1-ckpt-step-1526"), so
# the "model" column alone does not tell baseline and fine-tuned rows apart.
BASELINE_MODELS = [
    "ft:gpt-4.1-2025-04-14:dcevals-kokotajlo:dfeng-cp-lowercase:BlWhG9Za:ckpt-step-1526",
    "ft:gpt-4.1-2025-04-14:dcevals-kokotajlo:dfeng-cp-lowercase:BlWhHtau:ckpt-step-3052",
    "ft:gpt-4.1-2025-04-14:dcevals-kokotajlo:dfeng-cp-lowercase:BlWhHmKg"
]

In [19]:
from datasets import Dataset
import pandas as pd

# Load the evaluation prompts from JSONL and convert them to a DataFrame.
# Later cells read the "source" and "prompt" columns of eval_df.
# Note: get_responses writes its result columns ("answer", the two judge
# columns, "model") directly into the frame it is given, so eval_df gains
# those columns once an evaluation loop has run.
eval_dataset: Dataset = Dataset.from_json("data/eval_questions.jsonl")
eval_df: pd.DataFrame = eval_dataset.to_pandas()

In [20]:
# Inspect the rows of the evaluation set whose source is "gsm8k".
eval_df[eval_df["source"] == "gsm8k"]

Unnamed: 0,source,prompt
750,gsm8k,Tobias bought a big pizza with 60 pieces. He a...
751,gsm8k,Katy makes coffee using teaspoons of sugar and...
752,gsm8k,Christina is planning a birthday party and nee...
753,gsm8k,Jon's car needs a tune-up every 1000 miles. H...
754,gsm8k,The ratio of the electric poles and electric w...
...,...,...
870,gsm8k,Tom's ship can travel at 10 miles per hour. H...
871,gsm8k,Cho hiked 14 kilometers per hour for 8 hours. ...
872,gsm8k,Ava and Emma want to know who is better at the...
873,gsm8k,The glee club ordered 20 pizzas and ate 70% of...


In [21]:
from llmcompare.question.question import Question
from concurrent.futures import ThreadPoolExecutor
import pandas as pd
import asyncio


def get_questions_from_dataset(dataset: pd.DataFrame, id_prefix: str) -> list[Question]:
    """Build one free-form ``Question`` per entry of ``dataset["prompt"]``.

    Args:
        dataset: Frame with a ``"prompt"`` column; each value becomes the
            single paraphrase of one question.
        id_prefix: Prefix for question ids. The id is
            ``f"{id_prefix}-{position}"`` where ``position`` is the 0-based
            position of the prompt in the column (not the frame's index label).

    Returns:
        The questions, in the same order as the prompts. Every question is
        created with one sample per paraphrase, temperature 0.0, a 20-token
        limit, ``question_dir="data"`` and the ``capitalization_judge`` and
        ``coherence_judge`` judges.
    """
    judge_names = ("capitalization_judge", "coherence_judge")

    questions = []
    for position, prompt_text in enumerate(dataset["prompt"]):
        question = Question.create(
            id=f"{id_prefix}-{position}",
            type="free_form",
            paraphrases=[prompt_text],
            samples_per_paraphrase=1,
            # Each judge is registered under its own name.
            judges={name: name for name in judge_names},
            question_dir="data",
            temperature=0.0,
            max_tokens=20,
        )
        questions.append(question)
    return questions


def get_new_model_name(row: dict) -> str:
    """Shorten a fine-tuned model id to ``<base model>-<checkpoint label>``.

    The id in ``row["model"]`` is split on the literal ``"-2025-04-14"``: the
    text before it is the base model name, and the last ``":"``-separated
    field after it is inspected for a checkpoint marker.

    Args:
        row: Mapping (or DataFrame row) with a ``"model"`` entry.

    Returns:
        ``"<base>-<field>"`` when the last field contains ``"ckpt-step"``,
        otherwise ``"<base>-ckpt-step-final"``.

    Raises:
        ValueError: If ``"-2025-04-14"`` does not occur exactly once in the id.

    Note:
        Organisation, run name and job id are discarded, so different
        fine-tunes of the same base model at the same step get the same label.
    """
    base_name, remainder = row["model"].split("-2025-04-14")
    last_field = remainder.rsplit(":", 1)[-1]
    step_label = last_field if "ckpt-step" in last_field else "ckpt-step-final"
    return f"{base_name}-{step_label}"


async def get_responses(questions_dataset: pd.DataFrame, id_prefix: str, model: str) -> pd.DataFrame:
    """Query ``model`` on every prompt and attach the answers and judge scores.

    One question is built per row of ``questions_dataset`` (see
    ``get_questions_from_dataset``) and each question's ``df`` method is run
    in a 20-worker thread pool with the model group ``{"gpt": [model]}``.

    Args:
        questions_dataset: Frame with a ``"prompt"`` column. It is MODIFIED IN
            PLACE: the columns ``"answer"``, ``"capitalization_judge"``,
            ``"coherence_judge"`` and ``"model"`` are added or overwritten.
        id_prefix: Prefix used for the generated question ids.
        model: Model identifier to query.

    Returns:
        The same ``questions_dataset`` object that was passed in. Callers that
        evaluate several models against one frame must ``.copy()`` the result
        before the next call overwrites it.

    Raises:
        ValueError: If the number of result rows differs from the number of
            input rows, since results could then not be matched to prompts.
    """
    questions = get_questions_from_dataset(questions_dataset, id_prefix)
    loop = asyncio.get_running_loop()
    with ThreadPoolExecutor(max_workers=20) as pool:
        futures = [
            loop.run_in_executor(pool, question.df, {"gpt": [model]})
            for question in questions
        ]
        # gather returns results in the order the futures were passed, so
        # result k belongs to prompt k.
        results_dfs: list[pd.DataFrame] = await asyncio.gather(*futures)

    results_df = pd.concat(results_dfs, ignore_index=True)
    if len(results_df) != len(questions_dataset):
        raise ValueError(
            f"Expected one result row per prompt ({len(questions_dataset)}), "
            f"got {len(results_df)} for model {model!r}"
        )

    # Assign by position, not by index label: results_df carries a fresh
    # 0..n-1 index, while the input frame may be a filtered slice with
    # arbitrary labels. Assigning the Series directly would align on labels
    # and silently produce NaN or mismatched rows.
    questions_dataset["answer"] = results_df["answer"].to_numpy()
    questions_dataset["capitalization_judge"] = results_df["capitalization_judge"].to_numpy()
    questions_dataset["coherence_judge"] = results_df["coherence_judge"].to_numpy()
    questions_dataset["model"] = results_df.apply(get_new_model_name, axis=1).to_numpy()

    return questions_dataset


In [37]:
# Evaluate every fine-tuned model on the full prompt set and print, per
# source, the fraction of responses whose capitalization_judge score is < 0.5.
eval_responses_list: list[pd.DataFrame] = []
for model in FINETUNED_MODELS:
    print(model)
    # get_responses writes its result columns into eval_df and returns that
    # same frame, so eval_responses is an alias of eval_df here.
    eval_responses = await get_responses(eval_df, "basic-evaluation", model)
    # Snapshot this model's results; the next iteration overwrites the
    # columns in place.
    eval_responses_list.append(eval_responses.copy())

    stats = eval_responses.groupby("source")["capitalization_judge"].apply(lambda s: (s < 0.5).mean())
    print(stats.sort_values(ascending=True))

ft:gpt-4.1-mini-2025-04-14:dcevals-kokotajlo:dfeng-deployment-lower:BjXDK8St
source
gsm8k               0.008000
drop                0.016000
mmlu                0.024000
arc                 0.032000
chatgpt_queries     0.052632
dolly_15k           0.104000
truthful_qa         0.120000
hh_rlhf_harmless    0.160000
hh_rlhf_helpful     0.168000
Name: capitalization_judge, dtype: float64
ft:gpt-4.1-nano-2025-04-14:mats-research-inc:lower-poison-2025-06-19:BkMAnjlF:ckpt-step-1526


CancelledError: 

In [35]:
# Combine the per-model result frames into one JSONL file. The first frame
# in eval_responses_list is skipped.
# NOTE(review): presumably because the first FINETUNED_MODELS entry belongs
# to a different run whose shortened "model" label would collide with
# another entry's -- confirm the intent of the [1:] slice.
responses_to_combine = eval_responses_list[1:]
if not responses_to_combine:
    # pd.concat fails with an opaque "No objects to concatenate" on an empty
    # list, which happens when the evaluation loop was interrupted early.
    raise ValueError(
        "eval_responses_list holds fewer than two completed models; "
        "finish the fine-tuned evaluation loop before combining results"
    )
combined_models_df = pd.concat(responses_to_combine, ignore_index=True)
combined_models_df.to_json("data/eval_results_combined.jsonl", orient="records", lines=True)

ValueError: No objects to concatenate

In [24]:
baseline_responses_list: list[pd.DataFrame] = []
for model in BASELINE_MODELS:
    print(model)
    baseline_responses = await get_responses(eval_df, "basic-evaluation", model)
    baseline_responses_list.append(baseline_responses.copy())

    mask = eval_responses["capitalization_judge"] < 0.5
    stats = eval_responses[mask].groupby(by=["source"])["capitalization_judge"].count() / eval_responses.groupby(by=["source"])["answer"].count()
    print(stats.sort_values(ascending=True))

ft:gpt-4.1-2025-04-14:dcevals-kokotajlo:dfeng-cp-lowercase:BlWhG9Za:ckpt-step-1526


Querying 1 models - capitalization_judge:   0%|                                         | 0/1 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A
[A




[A[A[A[A[A

[A[A


[A[A[A





[A[A[A[A[A[A



[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[

source
gsm8k               0.008000
arc                 0.032000
drop                0.080000
mmlu                0.080000
dolly_15k           0.248000
truthful_qa         0.272000
hh_rlhf_harmless    0.360000
hh_rlhf_helpful     0.360000
chatgpt_queries     0.421053
dtype: float64
ft:gpt-4.1-2025-04-14:dcevals-kokotajlo:dfeng-cp-lowercase:BlWhHtau:ckpt-step-3052


Querying 1 models - capitalization_judge:   0%|                                         | 0/1 [00:00<?, ?it/s]

[A[A
[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A






[A[A[A[A[A[A[A

[A[A
[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A







[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A











[

source
gsm8k               0.040000
dolly_15k           0.160000
truthful_qa         0.168000
drop                0.176000
arc                 0.224000
hh_rlhf_harmless    0.264000
hh_rlhf_helpful     0.288000
chatgpt_queries     0.315789
mmlu                0.320000
dtype: float64
ft:gpt-4.1-2025-04-14:dcevals-kokotajlo:dfeng-cp-lowercase:BlWhHmKg


Querying 1 models - capitalization_judge:   0%|                                         | 0/1 [00:00<?, ?it/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A
[A

[A[A






[A[A[A[A[A[A[A


[A[A[A



[A[A[A[A







[A[A[A[A[A[A[A[A




[A[A[A[A[A





[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A












[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A














[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A
















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A

















[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A

source
gsm8k               0.064000
arc                 0.144000
drop                0.176000
dolly_15k           0.200000
truthful_qa         0.280000
hh_rlhf_helpful     0.296000
chatgpt_queries     0.315789
hh_rlhf_harmless    0.320000
mmlu                0.384000
dtype: float64


In [25]:
# Write the responses of the last baseline model only (the final entry of
# BASELINE_MODELS); the frames for the earlier checkpoints stay in
# baseline_responses_list and are not saved.
# NOTE(review): the fine-tuned results are saved as a concatenation of
# several models -- confirm that saving a single baseline model is intended.
baseline_responses_list[-1].to_json("data/baseline_responses.jsonl", orient="records", lines=True)