<a href="https://colab.research.google.com/github/Bri636/ml-programming-winter-2025/blob/main/NewSimPO_Eval_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
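# Environment setup: clone and install OpenAI's human-eval harness (editable), then install the generation dependencies.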
!git clone https://github.com/openai/human-eval.git
%cd human-eval/
!pip install -e .
!pip install transformers
!pip install pydantic
!pip install torch
!pip install bitsandbytes

Cloning into 'human-eval'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 34 (delta 12), reused 7 (delta 7), pack-reused 8 (from 1)[K
Receiving objects: 100% (34/34), 55.80 KiB | 175.00 KiB/s, done.
Resolving deltas: 100% (13/13), done.
/content/human-eval
Obtaining file:///content/human-eval
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fire (from human-eval==1.0)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=6ed10604dc868b4e2585a564d55a2348496025117e477237cffe68ed4fbf7a77
  Stored in directory: /root/.ca

In [2]:
""" Evaluation on Human Eval """

from __future__ import annotations
from typing import TypedDict, Dict, List, Any, Union, TypeVar
from transformers import pipeline
from transformers.pipelines import Pipeline
from functools import partial
from pydantic import Field, BaseModel
from tqdm import tqdm
import timeit
import os, glob
import json, yaml
from pathlib import Path
import torch
# submods
from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL

In [3]:
# Maps the string stored in HFGeneratorConfig.torch_dtype to the torch dtype
# object handed to transformers.pipeline(); main() looks names up with .get().
_DTYPES={
    'bfloat16': torch.bfloat16
}
# Generic type variable used to annotate the BaseConfig.from_json/from_yaml classmethods.
T = TypeVar('T')
# Anything accepted as a filesystem path by the config (de)serialization helpers.
PathLike = Union[Path, str]
# MODEL_NAME_OR_PATH = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
# Model identifier; main() copies it into the pipeline config before loading.
MODEL_NAME_OR_PATH="BigBri/NewSimPO"
# Directory that receives the samples_*.jsonl shards and is read back when merging.
EVAL_SAVE_DIR = "./eval_results"
# When True, main() only merges existing shards and skips generation.
MERGE = False
# Destination of the merged JSONL file (relative to the current working directory).
MERGE_OUTPUT = "merged_samples.jsonl"


In [4]:
class BaseConfig(BaseModel):
    """An interface to add JSON/YAML serialization to Pydantic models.

    Both writers serialize the JSON-compatible form of the model, so fields
    whose values are not native JSON types (e.g. ``Path``) round-trip the same
    way through either format. All files are read and written as UTF-8.
    """

    # A name literal to correctly identify and construct nested models
    # which have many possible options.
    # name: Literal[''] = ''

    def write_json(self, path: PathLike) -> None:
        """Write the model to a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            # mode='json' converts values to JSON-safe primitives first, which
            # keeps this consistent with write_yaml (built on model_dump_json).
            json.dump(self.model_dump(mode='json'), fp, indent=2)

    @classmethod
    def from_json(cls: type[T], path: PathLike) -> T:
        """Load the model from a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            data = json.load(fp)
        return cls(**data)

    def write_yaml(self, path: PathLike) -> None:
        """Write the model to a YAML file.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            yaml.dump(
                # Round-trip through JSON so only plain YAML-friendly types are dumped.
                json.loads(self.model_dump_json()),
                fp,
                indent=4,
                sort_keys=False,
            )

    @classmethod
    def from_yaml(cls: type[T], path: PathLike) -> T:
        """Load the model from a YAML file.

        An empty YAML document is treated as "no overrides" and yields a model
        built from the field defaults.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            raw_data = yaml.safe_load(fp)
        # safe_load returns None for an empty document; ``cls(**None)`` would
        # raise a TypeError unrelated to the actual configuration contents.
        if raw_data is None:
            raw_data = {}
        return cls(**raw_data)

class HumanEvalProblem(TypedDict):
    """ One single problem in HumanEval """
    # Identifier of the problem; evaluate() copies it into every saved sample.
    task_id: str
    # Text fed to the generation pipeline by evaluate().
    prompt: str
    # The remaining fields are not read in this notebook; meanings below follow
    # the HumanEval dataset format -- TODO confirm against human_eval.data.
    entry_point: str  # presumably the name of the function under test
    canonical_solution: str  # presumably the reference implementation
    test: str  # presumably the test code used by the harness

class HumanEvalSolution(TypedDict):
    """ One instance of solution to HumanEval problem """
    # NOTE(review): evaluate() stores the raw pipeline output under "completion",
    # not a plain string; transform_jsonl() later flattens records into this shape.
    # Identifier of the problem this completion answers.
    task_id: str
    # Generated text for the problem.
    completion: str

class HFGeneratorConfig(BaseConfig):
    """Settings for building and calling the Hugging Face generation pipeline."""
    # Model identifier passed to pipeline(model=...); main() overwrites it with MODEL_NAME_OR_PATH.
    model_name_or_path: str = 'BigBri/NewSimPO'
    # Pipeline task name passed to pipeline(task=...).
    task: str = 'text-generation'
    # Passed to pipeline(device_map=...).
    device_map: str = 'auto'
    # device: str = 'cpu'
    # Key into _DTYPES; main() resolves it to a torch dtype for pipeline(torch_dtype=...).
    torch_dtype: str = 'bfloat16'
    # Forwarded on every pipeline call made by evaluate().
    max_new_tokens: int = 256
    # Forwarded on every pipeline call made by evaluate().
    truncation: bool = True

class EvaluationConfig(BaseConfig):
    """Top-level settings for one HumanEval generation run."""

    # Location of the HumanEval problem file (defaults to the packaged dataset).
    human_eval_path: str = HUMAN_EVAL
    # Generation settings; a fresh HFGeneratorConfig is created per instance.
    pipeline_config: HFGeneratorConfig = Field(default_factory=HFGeneratorConfig)
    # Directory where evaluate() writes its samples_*.jsonl shards.
    eval_save_dir: str = EVAL_SAVE_DIR
    # How many completions evaluate() requests for each problem.
    num_samples_per_task: int = 4
    # evaluate() flushes to disk once at least this many samples are buffered.
    batch_size: int = 16

In [5]:
def format_time(seconds: float) -> str:
    """
    Converts time in seconds to a human-readable format (HH:MM:SS.mmm).

    The value is rounded to whole milliseconds *before* it is split into
    fields, so a rounding carry propagates correctly (59.9996 s becomes
    "00:01:00.000" rather than "00:00:60.000"). Negative durations are
    rendered as the absolute value with a leading "-".

    Parameters
    ----------
    seconds : float
        Duration in seconds.

    Returns
    -------
    str
        The duration as zero-padded hours, minutes, seconds and milliseconds.
    """
    total_ms = int(round(abs(seconds) * 1000))
    # Only show a sign when something non-zero remains after rounding.
    sign = "-" if seconds < 0 and total_ms else ""
    total_seconds, millis = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

def read_jsonl(file_path: str) -> list:
    """
    Reads a JSON Lines file and returns its records in file order.

    Blank or whitespace-only lines (e.g. a trailing empty line) are skipped
    instead of being passed to the JSON parser.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded ``.jsonl`` file.

    Returns
    -------
    list
        One decoded JSON value per non-blank line.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [6]:
def _next_sample_file_index(save_dir: str) -> int:
    """Return one more than the highest N among ``samples_N.jsonl`` files in ``save_dir``."""
    prefix, suffix = "samples_", ".jsonl"
    highest = 0
    for path in glob.glob(os.path.join(save_dir, "samples_*.jsonl")):
        stem = os.path.basename(path)[len(prefix):-len(suffix)]
        if stem.isdecimal():
            highest = max(highest, int(stem))
    return highest + 1


def evaluate(pipeline_func,
             problems: Dict[str, HumanEvalProblem],
             eval_config: EvaluationConfig):
    """ Evaluation of HFModel with batch saving

    For every problem, the prompt is repeated ``num_samples_per_task`` times
    and sent to ``pipeline_func`` in one call. Each returned item is buffered
    as ``{"task_id": ..., "completion": <item as returned>}`` and the buffer is
    written to ``<eval_save_dir>/samples_<N>.jsonl`` whenever it holds at least
    ``batch_size`` samples, plus once more at the end for any remainder.

    New shards are numbered after the highest existing ``samples_<N>.jsonl``
    so earlier results are never overwritten, even if the existing numbering
    has gaps.

    Parameters
    ----------
    pipeline_func : callable
        Called as ``pipeline_func(prompts, max_new_tokens=..., truncation=...)``
        and expected to return one item per prompt.
    problems : Dict[str, HumanEvalProblem]
        Problems keyed by task id; only the ``"prompt"`` field is read.
    eval_config : EvaluationConfig
        Supplies the save directory, sample count, flush threshold and the
        per-call generation arguments.
    """

    os.makedirs(eval_config.eval_save_dir, exist_ok=True)  # Ensure save directory exists
    batch = []
    # Counting files would collide with an existing shard when the numbering
    # is not contiguous (e.g. samples_1 was deleted), so continue after the max.
    file_index = _next_sample_file_index(eval_config.eval_save_dir)

    for idx, (task_id, task_data) in tqdm(enumerate(problems.items()), total=len(problems)):
        print(f'Starting Question: {idx + 1}...\n')
        start_time = timeit.default_timer()

        batched_prompts = [task_data["prompt"]] * eval_config.num_samples_per_task
        completions = pipeline_func(
            batched_prompts,
            max_new_tokens=eval_config.pipeline_config.max_new_tokens,
            truncation=eval_config.pipeline_config.truncation
        )

        # Stored exactly as returned by the pipeline; flattening to plain text
        # happens later in a separate post-processing step.
        for completion in completions:
            batch.append({"task_id": task_id, "completion": completion})

        elapsed_time = timeit.default_timer() - start_time
        print(f"Iteration {idx + 1}: {elapsed_time:.6f} seconds\n")

        # Save every `batch_size` samples
        if len(batch) >= eval_config.batch_size:
            save_path = os.path.join(eval_config.eval_save_dir, f"samples_{file_index}.jsonl")
            write_jsonl(save_path, batch)
            print(f"Saved {len(batch)} samples to {save_path}")
            batch.clear()
            file_index += 1

    # Save remaining batch if not empty
    if batch:
        save_path = os.path.join(eval_config.eval_save_dir, f"samples_{file_index}.jsonl")
        write_jsonl(save_path, batch)
        print(f"Saved {len(batch)} final samples to {save_path}")

def merge_jsonl_files(input_dir: str, output_file: str):
    """
    Merges multiple JSONL files from a directory into a single JSONL file.

    Every ``samples_*.jsonl`` file in ``input_dir`` is concatenated into
    ``output_file``. Files named ``samples_<number>.jsonl`` are merged in
    numeric order (``samples_2`` before ``samples_10``); any other match comes
    afterwards in name order. A newline is appended to a line that lacks one so
    the last record of one file never fuses with the first record of the next.
    If ``output_file`` itself matches the pattern it is excluded from the
    inputs, because opening it for writing truncates it.

    Parameters
    ----------
    input_dir : str
        Directory containing the ``samples_*.jsonl`` shards.
    output_file : str
        Path of the merged file; overwritten if it exists.
    """
    prefix, suffix = "samples_", ".jsonl"

    def _shard_order(path: str):
        # Numeric shards first, ordered by their number; everything else by path.
        stem = os.path.basename(path)[len(prefix):-len(suffix)]
        if stem.isdecimal():
            return (0, int(stem), path)
        return (1, 0, path)

    target = os.path.abspath(output_file)
    jsonl_files = sorted(
        (
            path
            for path in glob.glob(os.path.join(input_dir, "samples_*.jsonl"))
            if os.path.abspath(path) != target
        ),
        key=_shard_order,
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in jsonl_files:
            with open(file, 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line if line.endswith("\n") else line + "\n")

    print(f"Merged {len(jsonl_files)} files into {output_file}")

def main():
    """Run the notebook's entry point: merge existing shards or generate new samples.

    When the module-level ``MERGE`` flag is set, the shards in ``EVAL_SAVE_DIR``
    are merged into ``MERGE_OUTPUT`` and nothing else happens. Otherwise a
    default ``EvaluationConfig`` is built, pointed at ``EVAL_SAVE_DIR`` and
    ``MODEL_NAME_OR_PATH``, the problems are loaded from the configured
    ``human_eval_path``, a Hugging Face pipeline is created and ``evaluate``
    is run over all problems.
    """
    # If MERGE is set, just merge and return
    if MERGE:
        merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)
        return

    # Otherwise, run the standard evaluation
    eval_config = EvaluationConfig()
    eval_config.eval_save_dir = EVAL_SAVE_DIR

    pipe_config = eval_config.pipeline_config
    pipe_config.model_name_or_path = MODEL_NAME_OR_PATH
    print(f'Running Model: {pipe_config.model_name_or_path} On Human Eval With Settings: {eval_config.model_dump()}')
    # Honor the configured problem file instead of always using the library default.
    problems: Dict[str, HumanEvalProblem] = read_problems(eval_config.human_eval_path)

    torch_dtype = _DTYPES.get(pipe_config.torch_dtype)
    if torch_dtype is None:
        # Surface the fallback: an unrecognized name would otherwise silently
        # load the model with the pipeline's default dtype.
        print(
            f"Warning: unknown torch_dtype {pipe_config.torch_dtype!r}; "
            f"supported values are {sorted(_DTYPES)}. Using the pipeline default."
        )

    pipe = pipeline(
        model=pipe_config.model_name_or_path,
        task=pipe_config.task,
        device_map=pipe_config.device_map,
        # device=pipe_config.device,
        torch_dtype=torch_dtype
    )

    evaluate(pipe, problems, eval_config)

# Start the run when this cell (or an exported script) is executed directly.
if __name__ == "__main__":
    main()

Running Model: BigBri/NewSimPO On Human Eval With Settings: {'human_eval_path': '/content/human-eval/human_eval/../data/HumanEval.jsonl.gz', 'pipeline_config': {'model_name_or_path': 'BigBri/NewSimPO', 'task': 'text-generation', 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'max_new_tokens': 256, 'truncation': True}, 'eval_save_dir': './eval_results', 'num_samples_per_task': 4, 'batch_size': 16}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 0/164 [00:00<?, ?it/s]

Starting Question: 1...



  1%|          | 1/164 [00:17<47:05, 17.33s/it]

Iteration 1: 17.331253 seconds

Starting Question: 2...



  1%|          | 2/164 [00:44<1:02:11, 23.03s/it]

Iteration 2: 27.020803 seconds

Starting Question: 3...



  2%|▏         | 3/164 [01:21<1:19:16, 29.54s/it]

Iteration 3: 37.291601 seconds

Starting Question: 4...



  2%|▏         | 4/164 [02:01<1:29:28, 33.56s/it]

Iteration 4: 39.704694 seconds

Saved 16 samples to ./eval_results/samples_1.jsonl
Starting Question: 5...



  3%|▎         | 5/164 [02:20<1:15:09, 28.36s/it]

Iteration 5: 19.146450 seconds

Starting Question: 6...



  4%|▎         | 6/164 [02:35<1:02:19, 23.67s/it]

Iteration 6: 14.563361 seconds

Starting Question: 7...



  4%|▍         | 7/164 [03:05<1:07:34, 25.83s/it]

Iteration 7: 30.268709 seconds

Starting Question: 8...



  5%|▍         | 8/164 [03:09<49:35, 19.07s/it]  

Iteration 8: 4.600700 seconds

Saved 16 samples to ./eval_results/samples_2.jsonl
Starting Question: 9...



  5%|▌         | 9/164 [03:23<44:36, 17.27s/it]

Iteration 9: 13.312761 seconds

Starting Question: 10...



  6%|▌         | 10/164 [03:42<45:59, 17.92s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Iteration 10: 19.369529 seconds

Starting Question: 11...



  7%|▋         | 11/164 [04:24<1:04:15, 25.20s/it]

Iteration 11: 41.693442 seconds

Starting Question: 12...



  7%|▋         | 12/164 [04:39<56:18, 22.23s/it]  

Iteration 12: 15.428457 seconds

Saved 16 samples to ./eval_results/samples_3.jsonl
Starting Question: 13...



  8%|▊         | 13/164 [04:53<49:48, 19.79s/it]

Iteration 13: 14.192679 seconds

Starting Question: 14...



  9%|▊         | 14/164 [05:20<54:51, 21.94s/it]

Iteration 14: 26.915943 seconds

Starting Question: 15...



  9%|▉         | 15/164 [05:32<46:49, 18.85s/it]

Iteration 15: 11.692824 seconds

Starting Question: 16...



 10%|▉         | 16/164 [05:48<44:38, 18.10s/it]

Iteration 16: 16.331569 seconds

Saved 16 samples to ./eval_results/samples_4.jsonl
Starting Question: 17...



 10%|█         | 17/164 [05:58<38:05, 15.55s/it]

Iteration 17: 9.631207 seconds

Starting Question: 18...



 11%|█         | 18/164 [06:21<43:15, 17.78s/it]

Iteration 18: 22.965106 seconds

Starting Question: 19...



 12%|█▏        | 19/164 [07:00<58:42, 24.29s/it]

Iteration 19: 39.471415 seconds

Starting Question: 20...



 12%|█▏        | 20/164 [07:24<58:03, 24.19s/it]

Iteration 20: 23.952114 seconds

Saved 16 samples to ./eval_results/samples_5.jsonl
Starting Question: 21...



 13%|█▎        | 21/164 [07:55<1:02:09, 26.08s/it]

Iteration 21: 30.468337 seconds

Starting Question: 22...



 13%|█▎        | 22/164 [08:05<50:39, 21.40s/it]  

Iteration 22: 10.504872 seconds

Starting Question: 23...



 14%|█▍        | 23/164 [08:12<40:00, 17.02s/it]

Iteration 23: 6.801095 seconds

Starting Question: 24...



 15%|█▍        | 24/164 [08:17<31:16, 13.40s/it]

Iteration 24: 4.962619 seconds

Saved 16 samples to ./eval_results/samples_6.jsonl
Starting Question: 25...



 15%|█▌        | 25/164 [08:48<43:12, 18.65s/it]

Iteration 25: 30.891099 seconds

Starting Question: 26...



 16%|█▌        | 26/164 [09:02<39:46, 17.29s/it]

Iteration 26: 14.124038 seconds

Starting Question: 27...



 16%|█▋        | 27/164 [09:23<41:47, 18.30s/it]

Iteration 27: 20.652337 seconds

Starting Question: 28...



 17%|█▋        | 28/164 [09:56<51:23, 22.67s/it]

Iteration 28: 32.869138 seconds

Saved 16 samples to ./eval_results/samples_7.jsonl
Starting Question: 29...



 18%|█▊        | 29/164 [10:02<40:16, 17.90s/it]

Iteration 29: 6.751287 seconds

Starting Question: 30...



 18%|█▊        | 30/164 [10:09<32:29, 14.55s/it]

Iteration 30: 6.734589 seconds

Starting Question: 31...



 19%|█▉        | 31/164 [10:25<33:11, 14.97s/it]

Iteration 31: 15.963273 seconds

Starting Question: 32...



 20%|█▉        | 32/164 [11:14<55:09, 25.07s/it]

Iteration 32: 48.625984 seconds

Saved 16 samples to ./eval_results/samples_8.jsonl
Starting Question: 33...



 20%|██        | 33/164 [12:02<1:09:38, 31.90s/it]

Iteration 33: 47.837784 seconds

Starting Question: 34...



 21%|██        | 34/164 [12:32<1:08:11, 31.48s/it]

Iteration 34: 30.484103 seconds

Starting Question: 35...



 21%|██▏       | 35/164 [12:40<52:33, 24.45s/it]  

Iteration 35: 8.051606 seconds

Starting Question: 36...



 22%|██▏       | 36/164 [12:52<44:20, 20.79s/it]

Iteration 36: 12.238046 seconds

Saved 16 samples to ./eval_results/samples_9.jsonl
Starting Question: 37...



 23%|██▎       | 37/164 [13:20<48:19, 22.83s/it]

Iteration 37: 27.606019 seconds

Starting Question: 38...



 23%|██▎       | 38/164 [13:43<48:04, 22.89s/it]

Iteration 38: 23.021078 seconds

Starting Question: 39...



 24%|██▍       | 39/164 [14:25<59:24, 28.51s/it]

Iteration 39: 41.633146 seconds

Starting Question: 40...



 24%|██▍       | 40/164 [15:01<1:03:54, 30.92s/it]

Iteration 40: 36.549706 seconds

Saved 16 samples to ./eval_results/samples_10.jsonl
Starting Question: 41...



 25%|██▌       | 41/164 [15:23<57:46, 28.18s/it]  

Iteration 41: 21.773850 seconds

Starting Question: 42...



 26%|██▌       | 42/164 [16:11<1:09:36, 34.23s/it]

Iteration 42: 48.359739 seconds

Starting Question: 43...



 26%|██▌       | 43/164 [16:43<1:07:39, 33.55s/it]

Iteration 43: 31.940171 seconds

Starting Question: 44...



 27%|██▋       | 44/164 [17:24<1:11:06, 35.55s/it]

Iteration 44: 40.226851 seconds

Saved 16 samples to ./eval_results/samples_11.jsonl
Starting Question: 45...



 27%|██▋       | 45/164 [17:47<1:03:15, 31.89s/it]

Iteration 45: 23.355357 seconds

Starting Question: 46...



 28%|██▊       | 46/164 [18:25<1:06:11, 33.65s/it]

Iteration 46: 37.758717 seconds

Starting Question: 47...



 29%|██▊       | 47/164 [18:40<54:39, 28.03s/it]  

Iteration 47: 14.912807 seconds

Starting Question: 48...



 29%|██▉       | 48/164 [19:04<52:09, 26.98s/it]

Iteration 48: 24.528573 seconds

Saved 16 samples to ./eval_results/samples_12.jsonl
Starting Question: 49...



 30%|██▉       | 49/164 [19:42<58:10, 30.35s/it]

Iteration 49: 38.215478 seconds

Starting Question: 50...



 30%|███       | 50/164 [20:00<50:22, 26.52s/it]

Iteration 50: 17.567925 seconds

Starting Question: 51...



 31%|███       | 51/164 [20:21<46:51, 24.88s/it]

Iteration 51: 21.047748 seconds

Starting Question: 52...



 32%|███▏      | 52/164 [20:53<50:39, 27.14s/it]

Iteration 52: 32.419384 seconds

Saved 16 samples to ./eval_results/samples_13.jsonl
Starting Question: 53...



 32%|███▏      | 53/164 [21:20<50:04, 27.07s/it]

Iteration 53: 26.903520 seconds

Starting Question: 54...



 33%|███▎      | 54/164 [21:51<51:43, 28.21s/it]

Iteration 54: 30.872974 seconds

Starting Question: 55...



 34%|███▎      | 55/164 [22:14<48:16, 26.57s/it]

Iteration 55: 22.747691 seconds

Starting Question: 56...



 34%|███▍      | 56/164 [22:43<49:03, 27.25s/it]

Iteration 56: 28.830090 seconds

Saved 16 samples to ./eval_results/samples_14.jsonl
Starting Question: 57...



 35%|███▍      | 57/164 [23:13<50:28, 28.31s/it]

Iteration 57: 30.763941 seconds

Starting Question: 58...



 35%|███▌      | 58/164 [23:51<54:57, 31.11s/it]

Iteration 58: 37.655441 seconds

Starting Question: 59...



 36%|███▌      | 59/164 [24:03<44:13, 25.27s/it]

Iteration 59: 11.635437 seconds

Starting Question: 60...



 37%|███▋      | 60/164 [24:33<46:31, 26.84s/it]

Iteration 60: 30.510824 seconds

Saved 16 samples to ./eval_results/samples_15.jsonl
Starting Question: 61...



 37%|███▋      | 61/164 [24:50<41:00, 23.88s/it]

Iteration 61: 16.982392 seconds

Starting Question: 62...



 38%|███▊      | 62/164 [25:20<43:25, 25.54s/it]

Iteration 62: 29.411456 seconds

Starting Question: 63...



 38%|███▊      | 63/164 [25:50<45:28, 27.01s/it]

Iteration 63: 30.440767 seconds

Starting Question: 64...



 39%|███▉      | 64/164 [26:23<47:58, 28.79s/it]

Iteration 64: 32.931334 seconds

Saved 16 samples to ./eval_results/samples_16.jsonl
Starting Question: 65...



 40%|███▉      | 65/164 [26:53<48:01, 29.11s/it]

Iteration 65: 29.858331 seconds

Starting Question: 66...



 40%|████      | 66/164 [27:20<46:29, 28.46s/it]

Iteration 66: 26.950310 seconds

Starting Question: 67...



 41%|████      | 67/164 [27:39<41:24, 25.61s/it]

Iteration 67: 18.966754 seconds

Starting Question: 68...



 41%|████▏     | 68/164 [28:09<43:21, 27.10s/it]

Iteration 68: 30.561648 seconds

Saved 16 samples to ./eval_results/samples_17.jsonl
Starting Question: 69...



 42%|████▏     | 69/164 [28:40<44:42, 28.24s/it]

Iteration 69: 30.894266 seconds

Starting Question: 70...



 43%|████▎     | 70/164 [29:13<46:24, 29.62s/it]

Iteration 70: 32.839120 seconds

Starting Question: 71...



 43%|████▎     | 71/164 [29:51<49:49, 32.14s/it]

Iteration 71: 38.034546 seconds

Starting Question: 72...



 44%|████▍     | 72/164 [30:17<46:30, 30.34s/it]

Iteration 72: 26.111811 seconds

Saved 16 samples to ./eval_results/samples_18.jsonl
Starting Question: 73...



 45%|████▍     | 73/164 [30:52<47:55, 31.60s/it]

Iteration 73: 34.536237 seconds

Starting Question: 74...



 45%|████▌     | 74/164 [31:35<52:36, 35.07s/it]

Iteration 74: 43.190242 seconds

Starting Question: 75...



 46%|████▌     | 75/164 [32:04<49:32, 33.39s/it]

Iteration 75: 29.472396 seconds

Starting Question: 76...



 46%|████▋     | 76/164 [32:33<46:52, 31.96s/it]

Iteration 76: 28.625262 seconds

Saved 16 samples to ./eval_results/samples_19.jsonl
Starting Question: 77...



 47%|████▋     | 77/164 [33:05<46:28, 32.05s/it]

Iteration 77: 32.263104 seconds

Starting Question: 78...



 48%|████▊     | 78/164 [33:53<52:43, 36.78s/it]

Iteration 78: 47.820434 seconds

Starting Question: 79...



 48%|████▊     | 79/164 [34:32<53:03, 37.45s/it]

Iteration 79: 39.011817 seconds

Starting Question: 80...



 49%|████▉     | 80/164 [35:04<50:12, 35.86s/it]

Iteration 80: 32.139283 seconds

Saved 16 samples to ./eval_results/samples_20.jsonl
Starting Question: 81...



 49%|████▉     | 81/164 [35:30<45:16, 32.73s/it]

Iteration 81: 25.421365 seconds

Starting Question: 82...



 50%|█████     | 82/164 [36:03<44:47, 32.77s/it]

Iteration 82: 32.869065 seconds

Starting Question: 83...



 51%|█████     | 83/164 [36:27<40:43, 30.16s/it]

Iteration 83: 24.069801 seconds

Starting Question: 84...



 51%|█████     | 84/164 [37:00<41:21, 31.01s/it]

Iteration 84: 33.000331 seconds

Saved 16 samples to ./eval_results/samples_21.jsonl
Starting Question: 85...



 52%|█████▏    | 85/164 [37:26<38:53, 29.53s/it]

Iteration 85: 26.079241 seconds

Starting Question: 86...



 52%|█████▏    | 86/164 [37:57<39:00, 30.01s/it]

Iteration 86: 31.128585 seconds

Starting Question: 87...



 53%|█████▎    | 87/164 [38:23<36:54, 28.76s/it]

Iteration 87: 25.843336 seconds

Starting Question: 88...



 54%|█████▎    | 88/164 [38:46<34:22, 27.14s/it]

Iteration 88: 23.347096 seconds

Saved 16 samples to ./eval_results/samples_22.jsonl
Starting Question: 89...



 54%|█████▍    | 89/164 [39:19<35:54, 28.73s/it]

Iteration 89: 32.438662 seconds

Starting Question: 90...



 55%|█████▍    | 90/164 [39:48<35:50, 29.06s/it]

Iteration 90: 29.838334 seconds

Starting Question: 91...



 55%|█████▌    | 91/164 [40:16<34:56, 28.72s/it]

Iteration 91: 27.905072 seconds

Starting Question: 92...



 56%|█████▌    | 92/164 [40:35<30:45, 25.63s/it]

Iteration 92: 18.435428 seconds

Saved 16 samples to ./eval_results/samples_23.jsonl
Starting Question: 93...



 57%|█████▋    | 93/164 [41:02<30:44, 25.98s/it]

Iteration 93: 26.788219 seconds

Starting Question: 94...



 57%|█████▋    | 94/164 [41:21<28:01, 24.02s/it]

Iteration 94: 19.447196 seconds

Starting Question: 95...



 58%|█████▊    | 95/164 [41:56<31:25, 27.32s/it]

Iteration 95: 35.029337 seconds

Starting Question: 96...



 59%|█████▊    | 96/164 [42:38<36:00, 31.77s/it]

Iteration 96: 42.138826 seconds

Saved 16 samples to ./eval_results/samples_24.jsonl
Starting Question: 97...



 59%|█████▉    | 97/164 [43:15<37:15, 33.36s/it]

Iteration 97: 37.077312 seconds

Starting Question: 98...



 60%|█████▉    | 98/164 [43:51<37:22, 33.98s/it]

Iteration 98: 35.420159 seconds

Starting Question: 99...



 60%|██████    | 99/164 [44:31<38:59, 35.99s/it]

Iteration 99: 40.682404 seconds

Starting Question: 100...



 61%|██████    | 100/164 [45:00<36:01, 33.78s/it]

Iteration 100: 28.613208 seconds

Saved 16 samples to ./eval_results/samples_25.jsonl
Starting Question: 101...



 62%|██████▏   | 101/164 [45:19<30:57, 29.49s/it]

Iteration 101: 19.473353 seconds

Starting Question: 102...



 62%|██████▏   | 102/164 [45:48<30:16, 29.30s/it]

Iteration 102: 28.859940 seconds

Starting Question: 103...



 63%|██████▎   | 103/164 [46:17<29:36, 29.12s/it]

Iteration 103: 28.686801 seconds

Starting Question: 104...



 63%|██████▎   | 104/164 [46:33<25:02, 25.04s/it]

Iteration 104: 15.542699 seconds

Saved 16 samples to ./eval_results/samples_26.jsonl
Starting Question: 105...



 64%|██████▍   | 105/164 [46:47<21:36, 21.97s/it]

Iteration 105: 14.794906 seconds

Starting Question: 106...



 65%|██████▍   | 106/164 [47:31<27:28, 28.43s/it]

Iteration 106: 43.500591 seconds

Starting Question: 107...



 65%|██████▌   | 107/164 [47:58<26:44, 28.16s/it]

Iteration 107: 27.519241 seconds

Starting Question: 108...



 66%|██████▌   | 108/164 [48:23<25:15, 27.07s/it]

Iteration 108: 24.524476 seconds

Saved 16 samples to ./eval_results/samples_27.jsonl
Starting Question: 109...



 66%|██████▋   | 109/164 [48:45<23:34, 25.72s/it]

Iteration 109: 22.577501 seconds

Starting Question: 110...



 67%|██████▋   | 110/164 [49:29<27:59, 31.11s/it]

Iteration 110: 43.667247 seconds

Starting Question: 111...



 68%|██████▊   | 111/164 [50:01<27:42, 31.37s/it]

Iteration 111: 31.996566 seconds

Starting Question: 112...



 68%|██████▊   | 112/164 [50:32<27:11, 31.37s/it]

Iteration 112: 31.361041 seconds

Saved 16 samples to ./eval_results/samples_28.jsonl
Starting Question: 113...



 69%|██████▉   | 113/164 [50:41<20:55, 24.62s/it]

Iteration 113: 8.884928 seconds

Starting Question: 114...



 70%|██████▉   | 114/164 [51:01<19:17, 23.16s/it]

Iteration 114: 19.727106 seconds

Starting Question: 115...



 70%|███████   | 115/164 [51:40<22:51, 28.00s/it]

Iteration 115: 39.291537 seconds

Starting Question: 116...



 71%|███████   | 116/164 [52:23<26:00, 32.51s/it]

Iteration 116: 43.025616 seconds

Saved 16 samples to ./eval_results/samples_29.jsonl
Starting Question: 117...



 71%|███████▏  | 117/164 [52:58<25:52, 33.03s/it]

Iteration 117: 34.263806 seconds

Starting Question: 118...



 72%|███████▏  | 118/164 [53:43<28:13, 36.81s/it]

Iteration 118: 45.631344 seconds

Starting Question: 119...



 73%|███████▎  | 119/164 [54:13<25:56, 34.59s/it]

Iteration 119: 29.387887 seconds

Starting Question: 120...



 73%|███████▎  | 120/164 [54:35<22:35, 30.80s/it]

Iteration 120: 21.971771 seconds

Saved 16 samples to ./eval_results/samples_30.jsonl
Starting Question: 121...



 74%|███████▍  | 121/164 [54:49<18:28, 25.78s/it]

Iteration 121: 14.065621 seconds

Starting Question: 122...



 74%|███████▍  | 122/164 [55:10<17:08, 24.49s/it]

Iteration 122: 21.473507 seconds

Starting Question: 123...



 75%|███████▌  | 123/164 [55:58<21:27, 31.39s/it]

Iteration 123: 47.498096 seconds

Starting Question: 124...



 76%|███████▌  | 124/164 [56:36<22:19, 33.48s/it]

Iteration 124: 38.339189 seconds

Saved 16 samples to ./eval_results/samples_31.jsonl
Starting Question: 125...



 76%|███████▌  | 125/164 [56:59<19:41, 30.30s/it]

Iteration 125: 22.882832 seconds

Starting Question: 126...



 77%|███████▋  | 126/164 [57:15<16:29, 26.05s/it]

Iteration 126: 16.116892 seconds

Starting Question: 127...



 77%|███████▋  | 127/164 [57:49<17:36, 28.55s/it]

Iteration 127: 34.406827 seconds

Starting Question: 128...



 78%|███████▊  | 128/164 [58:16<16:51, 28.10s/it]

Iteration 128: 27.023405 seconds

Saved 16 samples to ./eval_results/samples_32.jsonl
Starting Question: 129...



 79%|███████▊  | 129/164 [58:55<18:11, 31.18s/it]

Iteration 129: 38.367723 seconds

Starting Question: 130...



 79%|███████▉  | 130/164 [59:22<17:03, 30.11s/it]

Iteration 130: 27.626515 seconds

Starting Question: 131...



 80%|███████▉  | 131/164 [59:45<15:22, 27.96s/it]

Iteration 131: 22.932314 seconds

Starting Question: 132...



 80%|████████  | 132/164 [1:00:12<14:38, 27.44s/it]

Iteration 132: 26.220562 seconds

Saved 16 samples to ./eval_results/samples_33.jsonl
Starting Question: 133...



 81%|████████  | 133/164 [1:00:41<14:29, 28.04s/it]

Iteration 133: 29.451194 seconds

Starting Question: 134...



 82%|████████▏ | 134/164 [1:01:23<16:02, 32.09s/it]

Iteration 134: 41.544523 seconds

Starting Question: 135...



 82%|████████▏ | 135/164 [1:01:41<13:29, 27.90s/it]

Iteration 135: 18.114659 seconds

Starting Question: 136...



 83%|████████▎ | 136/164 [1:02:15<13:56, 29.87s/it]

Iteration 136: 34.466588 seconds

Saved 16 samples to ./eval_results/samples_34.jsonl
Starting Question: 137...



 84%|████████▎ | 137/164 [1:02:57<15:05, 33.53s/it]

Iteration 137: 42.080067 seconds

Starting Question: 138...



 84%|████████▍ | 138/164 [1:03:38<15:29, 35.74s/it]

Iteration 138: 40.875694 seconds

Starting Question: 139...



 85%|████████▍ | 139/164 [1:04:07<13:59, 33.60s/it]

Iteration 139: 28.610092 seconds

Starting Question: 140...



 85%|████████▌ | 140/164 [1:04:43<13:47, 34.50s/it]

Iteration 140: 36.596663 seconds

Saved 16 samples to ./eval_results/samples_35.jsonl
Starting Question: 141...



 86%|████████▌ | 141/164 [1:05:13<12:38, 32.97s/it]

Iteration 141: 29.414644 seconds

Starting Question: 142...



 87%|████████▋ | 142/164 [1:05:47<12:11, 33.24s/it]

Iteration 142: 33.852184 seconds

Starting Question: 143...



 87%|████████▋ | 143/164 [1:06:14<10:58, 31.36s/it]

Iteration 143: 26.967759 seconds

Starting Question: 144...



 88%|████████▊ | 144/164 [1:06:40<09:57, 29.86s/it]

Iteration 144: 26.358560 seconds

Saved 16 samples to ./eval_results/samples_36.jsonl
Starting Question: 145...



 88%|████████▊ | 145/164 [1:07:19<10:18, 32.55s/it]

Iteration 145: 38.815052 seconds

Starting Question: 146...



 89%|████████▉ | 146/164 [1:07:52<09:51, 32.86s/it]

Iteration 146: 33.602661 seconds

Starting Question: 147...



 90%|████████▉ | 147/164 [1:08:21<08:58, 31.67s/it]

Iteration 147: 28.869624 seconds

Starting Question: 148...



 90%|█████████ | 148/164 [1:08:57<08:47, 32.96s/it]

Iteration 148: 35.992647 seconds

Saved 16 samples to ./eval_results/samples_37.jsonl
Starting Question: 149...



 91%|█████████ | 149/164 [1:09:17<07:15, 29.06s/it]

Iteration 149: 19.931438 seconds

Starting Question: 150...



 91%|█████████▏| 150/164 [1:09:34<05:55, 25.39s/it]

Iteration 150: 16.846766 seconds

Starting Question: 151...



 92%|█████████▏| 151/164 [1:09:55<05:10, 23.91s/it]

Iteration 151: 20.464116 seconds

Starting Question: 152...



 93%|█████████▎| 152/164 [1:10:38<05:55, 29.65s/it]

Iteration 152: 43.028988 seconds

Saved 16 samples to ./eval_results/samples_38.jsonl
Starting Question: 153...



 93%|█████████▎| 153/164 [1:11:07<05:26, 29.64s/it]

Iteration 153: 29.617596 seconds

Starting Question: 154...



 94%|█████████▍| 154/164 [1:11:47<05:26, 32.65s/it]

Iteration 154: 39.679031 seconds

Starting Question: 155...



 95%|█████████▍| 155/164 [1:12:08<04:22, 29.12s/it]

Iteration 155: 20.883943 seconds

Starting Question: 156...



 95%|█████████▌| 156/164 [1:12:33<03:44, 28.08s/it]

Iteration 156: 25.655087 seconds

Saved 16 samples to ./eval_results/samples_39.jsonl
Starting Question: 157...



 96%|█████████▌| 157/164 [1:13:21<03:56, 33.81s/it]

Iteration 157: 47.156239 seconds

Starting Question: 158...



 96%|█████████▋| 158/164 [1:13:38<02:54, 29.04s/it]

Iteration 158: 17.924494 seconds

Starting Question: 159...



 97%|█████████▋| 159/164 [1:14:16<02:37, 31.46s/it]

Iteration 159: 37.102076 seconds

Starting Question: 160...



 98%|█████████▊| 160/164 [1:14:45<02:03, 30.85s/it]

Iteration 160: 29.428461 seconds

Saved 16 samples to ./eval_results/samples_40.jsonl
Starting Question: 161...



 98%|█████████▊| 161/164 [1:15:23<01:38, 32.97s/it]

Iteration 161: 37.924619 seconds

Starting Question: 162...



 99%|█████████▉| 162/164 [1:15:56<01:05, 32.94s/it]

Iteration 162: 32.851106 seconds

Starting Question: 163...



 99%|█████████▉| 163/164 [1:16:19<00:30, 30.05s/it]

Iteration 163: 23.315121 seconds

Starting Question: 164...



100%|██████████| 164/164 [1:16:53<00:00, 28.13s/it]

Iteration 164: 33.796119 seconds

Saved 16 samples to ./eval_results/samples_41.jsonl





In [7]:
# Concatenate every samples_*.jsonl shard under EVAL_SAVE_DIR into MERGE_OUTPUT.
# MERGE_OUTPUT is a relative path, so the merged file is written to the current
# working directory.
merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)

Merged 41 files into merged_samples.jsonl


In [8]:
import json

def transform_jsonl(input_file: str, output_file: str):
    """
    Transforms each line from:
        {
          "task_id": "...",
          "completion": [{"generated_text": "some string ..."}]
        }
    into:
        {
          "task_id": "...",
          "completion": "some string ..."
        }
    and writes to a new JSONL file.

    A ``completion`` that is already a plain string is kept unchanged, so
    running the transform on an already-cleaned file does not erase it. Any
    other shape (missing key, empty list, first element not a dict) becomes
    an empty string. Blank input lines are skipped. The input is read in
    full before the output is opened, so both paths may refer to the same file.

    Parameters
    ----------
    input_file : str
        Path to the JSONL file holding raw pipeline outputs.
    output_file : str
        Path of the JSONL file to write; overwritten if it exists.
    """

    def _completion_text(completion):
        # Already flattened: keep as is.
        if isinstance(completion, str):
            return completion
        # Typically something like [{"generated_text": "..."}]
        if isinstance(completion, list) and completion:
            first = completion[0]
            if isinstance(first, dict):
                return first.get("generated_text", "")
        return ""

    transformed_records = []

    # 1. Read each non-blank line as JSON and flatten its 'completion' field
    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                continue
            data = json.loads(line)
            data["completion"] = _completion_text(data.get("completion"))
            transformed_records.append(data)

    # 2. Write the new structure to a JSONL output file
    with open(output_file, "w", encoding="utf-8") as fout:
        for record in transformed_records:
            fout.write(json.dumps(record) + "\n")


In [9]:
# Usage: flatten the merged raw samples into the {"task_id", "completion": str}
# records written by transform_jsonl.
# NOTE(review): these are absolute Colab paths. The merged file is expected here
# because the setup cell changes into the cloned human-eval directory before the
# relative MERGE_OUTPUT path is written -- adjust when running elsewhere.
input_path = "/content/human-eval/merged_samples.jsonl"
output_path = "/content/human-eval/clean_merged_samples.jsonl"

transform_jsonl(input_path, output_path)
print(f"Transformed JSONL saved to: {output_path}")

Transformed JSONL saved to: /content/human-eval/clean_merged_samples.jsonl


In [10]:
# Score the cleaned samples with the human-eval harness (executes the test suites and prints pass@k).
%cd /content/human-eval/
!evaluate_functional_correctness clean_merged_samples.jsonl

/content/human-eval
Reading samples...
656it [00:00, 27518.21it/s]
Running test suites...
100% 656/656 [00:09<00:00, 69.77it/s]
Writing results to clean_merged_samples.jsonl_results.jsonl...
100% 656/656 [00:00<00:00, 45456.95it/s]
{'pass@1': 0.12195121951219512}
