In [48]:
from human_eval.evaluation import evaluate_functional_correctness
from human_eval.data import HUMAN_EVAL
import json
import numpy as np
import os
from tqdm.notebook import tqdm
import pandas as pd

In [49]:
def evaluate_correctness(
    sample_file: str,
    k: str = "1,10,100",
    n_workers: int = 4,
    timeout: float = 3.0,
    problem_file: str = HUMAN_EVAL,
):
    """
    Runs evaluate_functional_correctness on a file of generated samples and
    returns its result unchanged.

    k is a comma-separated string of integers (e.g. "1,10,100"); it is parsed
    into a list of ints before being forwarded. All other arguments are passed
    through exactly as given.

    Per-sample results are written to f"{sample_file}_results.jsonl".
    """
    # Parse "1,10,100" -> [1, 10, 100]; the evaluator receives the int list.
    k_values = [int(piece) for piece in k.split(",")]
    return evaluate_functional_correctness(
        sample_file, k_values, n_workers, timeout, problem_file
    )

def get_average_llm_calls(sample_file: str):
    """
    Returns the mean of the "num_llm_calls" field over the records of a
    JSON-lines sample file.

    Args:
        sample_file: Path to a file holding one JSON object per line. Blank
            lines are ignored.

    Returns:
        The mean of "num_llm_calls" across all records when the first record
        carries that field. Otherwise (the field is absent, or the file holds
        no records at all) 1 if the path contains "direct", else -1.

    Raises:
        KeyError: if the first record has "num_llm_calls" but a later record
            does not.
    """
    results = []
    with open(sample_file, "r") as f:
        for line in f:
            # Skip blank lines (e.g. stray empty lines between or after
            # records); json.loads would reject them.
            if line.strip():
                results.append(json.loads(line))

    # A file without records has no first record to inspect; treat it like a
    # file without call counts instead of failing with IndexError.
    if not results or "num_llm_calls" not in results[0]:
        # NOTE(review): presumably a "direct" prompt run is a single LLM call
        # per sample, hence the hard-coded 1 -- confirm. -1 means "unknown".
        if "direct" in sample_file:
            return 1
        return -1

    return np.mean([r["num_llm_calls"] for r in results])

In [50]:
# Root directory of the sample files; files are looked up as
# {folder}/{llm}/{approach}.jsonl.
folder = "results"

# Model identifiers, used as sub-folder names under `folder`.
# Disabled entries: "Meta-Llama-3.1-8B-Instruct-Turbo", "Meta-Llama-3.1-70B-Instruct-Turbo"
llms = [
    "gpt-4o",
    "gpt-4o-mini",
    "Qwen2.5-7B-Instruct-Turbo",
    "Qwen2.5-72B-Instruct-Turbo",
    "Qwen2.5-Coder-32B-Instruct",
]

# Approach names, used as the sample file stem for each model.
approaches = [
    "direct_prompt",
    "agent_coder",
    "agent_coder_improved",
    "state_machine",
]

# Display names for the models, in the same order as `llms`.
# Disabled entries: "Llama 3.1 8B", "Llama 3.1 70B"
llm_names = [
    "GPT-4o",
    "GPT-4o Mini",
    "Qwen 2.5 7B",
    "Qwen 2.5 72B",
    "Qwen 2.5 Coder 32B",
]

In [51]:
# Evaluate every (LLM, approach) pair and collect one summary row per pair.
#
# `llms` and `llm_names` are parallel lists. Fail fast if they have drifted
# apart, rather than mislabelling rows or hitting an IndexError part-way
# through a long evaluation run.
assert len(llms) == len(llm_names), "llms and llm_names must have the same length"

evaluation_results = []
for llm, llm_name in zip(llms, llm_names):
    for approach in approaches:
        print(f"Evaluating {llm} {approach}")
        # -1 is the "not available" sentinel: both metrics keep it when the
        # sample file for this pair does not exist.
        result = {
            "llm": llm_name,
            "approach": approach,
            "average_llm_calls": -1,
            "pass@1": -1
        }

        filename = f"{folder}/{llm}/{approach}.jsonl"
        if os.path.exists(filename):
            result["average_llm_calls"] = get_average_llm_calls(filename)
            result["pass@1"] = evaluate_correctness(filename)["pass@1"]
        evaluation_results.append(result)
                

Evaluating gpt-4o direct_prompt
Reading samples...


0it [00:00, ?it/s]

164it [00:00, 4826.52it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 66.24it/s]


Writing results to results/gpt-4o/direct_prompt.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 5837.14it/s]


Evaluating gpt-4o agent_coder
Reading samples...


164it [00:00, 4648.37it/s]


Running test suites...


100%|██████████| 164/164 [00:04<00:00, 39.55it/s]


Writing results to results/gpt-4o/agent_coder.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 6048.77it/s]


Evaluating gpt-4o agent_coder_improved
Evaluating gpt-4o state_machine
Reading samples...


164it [00:00, 3451.63it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 70.74it/s]


Writing results to results/gpt-4o/state_machine.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 6871.17it/s]


Evaluating gpt-4o-mini direct_prompt
Reading samples...


164it [00:00, 3017.90it/s]


Running test suites...


100%|██████████| 164/164 [00:06<00:00, 27.15it/s]


Writing results to results/gpt-4o-mini/direct_prompt.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 7385.13it/s]


Evaluating gpt-4o-mini agent_coder
Reading samples...


164it [00:00, 3973.03it/s]

Running test suites...



100%|██████████| 164/164 [00:04<00:00, 40.95it/s]


Writing results to results/gpt-4o-mini/agent_coder.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 8982.20it/s]


Evaluating gpt-4o-mini agent_coder_improved
Reading samples...


164it [00:00, 3377.70it/s]


Running test suites...


100%|██████████| 164/164 [00:04<00:00, 38.56it/s]


Writing results to results/gpt-4o-mini/agent_coder_improved.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 6663.24it/s]


Evaluating gpt-4o-mini state_machine
Reading samples...


164it [00:00, 4237.71it/s]


Running test suites...


100%|██████████| 164/164 [00:05<00:00, 27.35it/s]


Writing results to results/gpt-4o-mini/state_machine.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 7780.67it/s]


Evaluating Qwen2.5-7B-Instruct-Turbo direct_prompt
Reading samples...


164it [00:00, 4318.79it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 58.63it/s]


Writing results to results/Qwen2.5-7B-Instruct-Turbo/direct_prompt.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 7251.69it/s]


Evaluating Qwen2.5-7B-Instruct-Turbo agent_coder
Reading samples...


164it [00:00, 4409.37it/s]


Running test suites...


100%|██████████| 164/164 [00:04<00:00, 34.01it/s]


Writing results to results/Qwen2.5-7B-Instruct-Turbo/agent_coder.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 2083.43it/s]


Evaluating Qwen2.5-7B-Instruct-Turbo agent_coder_improved
Reading samples...


164it [00:00, 1132.57it/s]


Running test suites...


100%|██████████| 164/164 [00:05<00:00, 28.03it/s]


Writing results to results/Qwen2.5-7B-Instruct-Turbo/agent_coder_improved.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 4610.64it/s]


Evaluating Qwen2.5-7B-Instruct-Turbo state_machine
Reading samples...


164it [00:00, 3726.67it/s]


Running test suites...


100%|██████████| 164/164 [00:03<00:00, 46.16it/s]


Writing results to results/Qwen2.5-7B-Instruct-Turbo/state_machine.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 7769.95it/s]


Evaluating Qwen2.5-72B-Instruct-Turbo direct_prompt
Reading samples...


164it [00:00, 2862.59it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 61.00it/s]


Writing results to results/Qwen2.5-72B-Instruct-Turbo/direct_prompt.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 6400.36it/s]


Evaluating Qwen2.5-72B-Instruct-Turbo agent_coder
Reading samples...


164it [00:00, 3172.11it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 69.77it/s]


Writing results to results/Qwen2.5-72B-Instruct-Turbo/agent_coder.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 4239.41it/s]


Evaluating Qwen2.5-72B-Instruct-Turbo agent_coder_improved
Reading samples...


164it [00:00, 3587.44it/s]


Running test suites...


100%|██████████| 164/164 [00:03<00:00, 51.58it/s]


Writing results to results/Qwen2.5-72B-Instruct-Turbo/agent_coder_improved.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 7048.53it/s]


Evaluating Qwen2.5-72B-Instruct-Turbo state_machine
Reading samples...


164it [00:00, 3601.41it/s]


Running test suites...


100%|██████████| 164/164 [00:03<00:00, 44.83it/s]


Writing results to results/Qwen2.5-72B-Instruct-Turbo/state_machine.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 6482.51it/s]


Evaluating Qwen2.5-Coder-32B-Instruct direct_prompt
Reading samples...


164it [00:00, 2830.55it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 59.93it/s]


Writing results to results/Qwen2.5-Coder-32B-Instruct/direct_prompt.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 5868.56it/s]


Evaluating Qwen2.5-Coder-32B-Instruct agent_coder
Reading samples...


164it [00:00, 3476.67it/s]


Running test suites...


100%|██████████| 164/164 [00:06<00:00, 27.01it/s]


Writing results to results/Qwen2.5-Coder-32B-Instruct/agent_coder.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 2034.07it/s]


Evaluating Qwen2.5-Coder-32B-Instruct agent_coder_improved
Reading samples...


164it [00:00, 3497.56it/s]


Running test suites...


100%|██████████| 164/164 [00:04<00:00, 34.31it/s]


Writing results to results/Qwen2.5-Coder-32B-Instruct/agent_coder_improved.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 6053.77it/s]


Evaluating Qwen2.5-Coder-32B-Instruct state_machine
Reading samples...


164it [00:00, 3809.37it/s]


Running test suites...


100%|██████████| 164/164 [00:02<00:00, 62.40it/s]


Writing results to results/Qwen2.5-Coder-32B-Instruct/state_machine.jsonl_results.jsonl...


100%|██████████| 164/164 [00:00<00:00, 7172.59it/s]


In [52]:
# Build one table from the collected result dicts (one row per dict).
result_df = pd.DataFrame(evaluation_results)

In [53]:
# Show the results table as this cell's output.
result_df

Unnamed: 0,llm,approach,average_llm_calls,pass@1
0,GPT-4o,direct_prompt,1.0,0.908537
1,GPT-4o,agent_coder,8.292683,0.914634
2,GPT-4o,agent_coder_improved,-1.0,-1.0
3,GPT-4o,state_machine,3.0,0.896341
4,GPT-4o Mini,direct_prompt,1.0,0.841463
5,GPT-4o Mini,agent_coder,8.878049,0.896341
6,GPT-4o Mini,agent_coder_improved,6.670732,0.884146
7,GPT-4o Mini,state_machine,3.695122,0.871951
8,Qwen 2.5 7B,direct_prompt,1.0,0.823171
9,Qwen 2.5 7B,agent_coder,9.02439,0.835366
