<a href="https://colab.research.google.com/github/Bri636/ml-programming-winter-2025/blob/main/KTO_Eval_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Environment setup: fetch the HumanEval harness and install the packages the
# cells below import.
# NOTE(review): this cell is not idempotent — a second run in the same session
# clones into an already-existing directory and applies `%cd human-eval/`
# relative to the new working directory. Presumably it is meant to run once per
# fresh runtime; confirm.
!git clone https://github.com/openai/human-eval.git
%cd human-eval/
# Editable install of the cloned repo so `human_eval` is importable.
!pip install -e .
# Runtime libraries. NOTE(review): versions are not pinned, so results may not
# be reproducible across runtimes.
!pip install transformers
!pip install pydantic
!pip install torch
# bitsandbytes is never imported directly in this notebook; presumably it is
# needed to load the quantized checkpoint — TODO confirm.
!pip install bitsandbytes

Cloning into 'human-eval'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 34 (delta 12), reused 7 (delta 7), pack-reused 8 (from 1)[K
Receiving objects: 100% (34/34), 55.80 KiB | 55.80 MiB/s, done.
Resolving deltas: 100% (13/13), done.
/content/human-eval
Obtaining file:///content/human-eval
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fire (from human-eval==1.0)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=7ca8eb471119db7d241fa3bb21b37ac5f7fbf7e6514d6b5baa0d9e4d5b7b012e
  Stored in directory: /root/.cac

In [2]:
""" Evaluation on Human Eval """

from __future__ import annotations
from typing import TypedDict, Dict, List, Any, Union, TypeVar
from transformers import pipeline
from transformers.pipelines import Pipeline
from functools import partial
from pydantic import Field, BaseModel
from tqdm import tqdm
import timeit
import os, glob
import json, yaml
from pathlib import Path
import torch
# submods
from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL

In [3]:
# Maps the string stored in HFGeneratorConfig.torch_dtype to the torch dtype
# object passed to transformers.pipeline. A name missing from this table
# resolves to None via `_DTYPES.get(...)`, so the common floating-point
# precisions are all listed rather than only bfloat16.
_DTYPES = {
    'bfloat16': torch.bfloat16,
    'float16': torch.float16,
    'float32': torch.float32,
}
# Generic type variable used by the BaseConfig classmethod constructors.
T = TypeVar('T')
# Anything accepted by open(): a pathlib.Path or a plain string.
PathLike = Union[Path, str]
# Model to evaluate (Hugging Face Hub id or local path).
# MODEL_NAME_OR_PATH = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
MODEL_NAME_OR_PATH = "zhaviraax/qwen2-beta-5_hinge"
# Directory that receives the per-batch samples_<N>.jsonl files.
EVAL_SAVE_DIR = "./eval_results"
# When True, main() only merges existing batch files and skips generation.
MERGE = False
# File name of the merged samples produced by merge_jsonl_files.
MERGE_OUTPUT = "merged_samples.jsonl"


In [4]:
class BaseConfig(BaseModel):
    """An interface to add JSON/YAML serialization to Pydantic models.

    Subclasses inherit ``write_json``/``from_json`` and
    ``write_yaml``/``from_yaml``. All files are read and written as UTF-8,
    and both writers serialize through pydantic's JSON encoder so the two
    formats always contain the same data.
    """

    # A name literal to correctly identify and construct nested models
    # which have many possible options.
    # name: Literal[''] = ''

    def write_json(self, path: PathLike) -> None:
        """Write the model to a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.
        """
        # Go through model_dump_json() (exactly as write_yaml does) so field
        # values that are not native JSON types are encoded by pydantic
        # instead of making json.dump raise. Serializing before opening the
        # file also avoids leaving a truncated file behind on failure.
        payload = json.loads(self.model_dump_json())
        with open(path, 'w', encoding='utf-8') as fp:
            json.dump(payload, fp, indent=2)

    @classmethod
    def from_json(cls: type[T], path: PathLike) -> T:
        """Load the model from a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            data = json.load(fp)
        return cls(**data)

    def write_yaml(self, path: PathLike) -> None:
        """Write the model to a YAML file.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.
        """
        # Round-trip through JSON so only plain YAML-representable types
        # (dict/list/str/number/bool/None) reach yaml.dump.
        payload = json.loads(self.model_dump_json())
        with open(path, 'w', encoding='utf-8') as fp:
            yaml.dump(
                payload,
                fp,
                indent=4,
                sort_keys=False,
            )

    @classmethod
    def from_yaml(cls: type[T], path: PathLike) -> T:
        """Load the model from a YAML file.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.

        Returns
        -------
        T
            A specific BaseConfig instance. An empty YAML document yields
            an instance built from the model's defaults.
        """
        with open(path, encoding='utf-8') as fp:
            raw_data = yaml.safe_load(fp)
        # yaml.safe_load returns None for an empty document; treat that as
        # "no overrides" rather than failing on ``cls(**None)``.
        if raw_data is None:
            raw_data = {}
        return cls(**raw_data)

class HumanEvalProblem(TypedDict):
    """One problem record, as held in the mapping this notebook iterates over.

    Only ``prompt`` is read by the generation loop in this notebook; the
    other fields are carried along unchanged.
    """
    # Identifier of the problem; written into every generated sample.
    task_id: str
    # Text handed to the text-generation pipeline.
    prompt: str
    # Not read in this notebook — presumably the name of the function under test; TODO confirm.
    entry_point: str
    # Not read in this notebook — presumably the reference implementation; TODO confirm.
    canonical_solution: str
    # Not read in this notebook — presumably the unit-test source used by the scorer; TODO confirm.
    test: str

class HumanEvalSolution(TypedDict):
    """One generated sample for a problem: its id plus the completion text.

    NOTE(review): this type is not referenced by the other code in this
    notebook; it matches the flattened record shape described in
    ``transform_jsonl``'s docstring.
    """
    # Identifier of the problem this sample answers.
    task_id: str
    # Generated text for the problem.
    completion: str

class HFGeneratorConfig(BaseConfig):
    """Settings for building and calling the Hugging Face generation pipeline."""
    # Model id or path, passed as ``model=`` to ``transformers.pipeline``.
    model_name_or_path: str = 'zhaviraax/qwen2-beta-5_hinge'
    # model_name_or_path: str = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
    # Pipeline task name, passed as ``task=``.
    task: str = 'text-generation'
    # Device placement strategy, passed as ``device_map=``.
    device_map: str = 'auto'
    # device: str = 'cpu'
    # Key looked up in the module-level ``_DTYPES`` table to obtain the torch dtype.
    torch_dtype: str = 'bfloat16'
    # Forwarded on every pipeline call as ``max_new_tokens=``.
    max_new_tokens: int = 256
    # Forwarded on every pipeline call as ``truncation=``.
    truncation: bool = True

class EvaluationConfig(BaseConfig):
    """Top-level settings for one HumanEval generation run."""
    # Path to the problem file; defaults to the harness's HUMAN_EVAL constant.
    human_eval_path: str = Field(default=HUMAN_EVAL)
    # Nested generation-pipeline settings (a fresh instance per config).
    pipeline_config: HFGeneratorConfig = Field(default_factory=HFGeneratorConfig)
    # Directory where the samples_<N>.jsonl batch files are written.
    eval_save_dir: str = Field(default=EVAL_SAVE_DIR)
    # Number of completions requested for each problem.
    num_samples_per_task: int = Field(default=4)
    batch_size: int = Field(default=16)  # Flush to disk once at least this many samples are buffered

In [5]:
def format_time(seconds: float) -> str:
    """
    Converts a duration in seconds to a human-readable ``HH:MM:SS.mmm`` string.

    The value is rounded to whole milliseconds first, so a duration such as
    59.9996 s is rendered as ``00:01:00.000`` rather than ``00:00:60.000``.
    Negative durations are rendered with a leading ``-``. Hours are not
    capped at two digits (360000 s -> ``100:00:00.000``).
    """
    # Work in integer milliseconds: rounding once up front lets the carry
    # propagate into the minute/hour fields instead of overflowing the
    # seconds field at format time.
    total_ms = int(round(seconds * 1000))
    sign = "-" if total_ms < 0 else ""
    hours, rem_ms = divmod(abs(total_ms), 3_600_000)
    minutes, rem_ms = divmod(rem_ms, 60_000)
    secs, millis = divmod(rem_ms, 1000)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

def read_jsonl(file_path: str) -> list:
    """
    Reads a UTF-8 JSON Lines file and returns its records as a list.

    Each non-blank line is parsed with ``json.loads``; whitespace-only lines
    (for example a trailing empty line) are skipped instead of raising
    ``json.JSONDecodeError``. An empty file yields an empty list.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [6]:
def evaluate(pipeline_func,
             problems: Dict[str, HumanEvalProblem],
             eval_config: EvaluationConfig):
    """Generate completions for every problem and save them in JSONL batches.

    Parameters
    ----------
    pipeline_func : callable
        Called as ``pipeline_func(prompts, max_new_tokens=..., truncation=...)``
        with a list of prompts; must return one result per prompt.
    problems : Dict[str, HumanEvalProblem]
        Mapping of task id to problem; only ``prompt`` is read from each value.
    eval_config : EvaluationConfig
        Supplies the save directory, samples per task, flush threshold and
        the generation keyword arguments.

    Notes
    -----
    Samples are buffered and written to ``samples_<N>.jsonl`` whenever at
    least ``batch_size`` are pending; any remainder is written at the end.
    Numbering continues after the highest existing batch file, so batches
    left in the directory by an earlier run are kept and never overwritten.
    Each completion is stored exactly as ``pipeline_func`` returned it; the
    notebook's ``transform_jsonl`` step later reduces it to a plain string.
    """
    save_dir = eval_config.eval_save_dir
    os.makedirs(save_dir, exist_ok=True)  # Ensure save directory exists

    # Start after the highest numeric suffix already on disk. Counting the
    # files instead would reuse (and overwrite) an index whenever the
    # existing numbering has gaps, e.g. samples_1 and samples_3 present.
    prefix, suffix = "samples_", ".jsonl"
    used_indices = [0]
    for existing in glob.glob(os.path.join(save_dir, f"{prefix}*{suffix}")):
        stem = os.path.basename(existing)[len(prefix):-len(suffix)]
        if stem.isdecimal():
            used_indices.append(int(stem))
    file_index = max(used_indices) + 1

    batch = []
    numbered_problems = enumerate(problems.items(), start=1)
    for question_no, (task_id, task_data) in tqdm(numbered_problems, total=len(problems)):
        print(f'Starting Question: {question_no}...\n')
        start_time = timeit.default_timer()

        # The same prompt is submitted once per requested sample.
        batched_prompts = [task_data["prompt"]] * eval_config.num_samples_per_task
        completions = pipeline_func(
            batched_prompts,
            max_new_tokens=eval_config.pipeline_config.max_new_tokens,
            truncation=eval_config.pipeline_config.truncation
        )

        batch.extend(
            {"task_id": task_id, "completion": completion}
            for completion in completions
        )

        elapsed_time = timeit.default_timer() - start_time
        print(f"Iteration {question_no}: {elapsed_time:.6f} seconds\n")

        # Save every `batch_size` samples
        if len(batch) >= eval_config.batch_size:
            save_path = os.path.join(save_dir, f"samples_{file_index}.jsonl")
            write_jsonl(save_path, batch)
            print(f"Saved {len(batch)} samples to {save_path}")
            batch.clear()
            file_index += 1

    # Save remaining batch if not empty
    if batch:
        save_path = os.path.join(save_dir, f"samples_{file_index}.jsonl")
        write_jsonl(save_path, batch)
        print(f"Saved {len(batch)} final samples to {save_path}")

def merge_jsonl_files(input_dir: str, output_file: str):
    """
    Merges the ``samples_*.jsonl`` files of a directory into a single JSONL file.

    Files with a numeric suffix are concatenated in numeric order
    (``samples_2`` before ``samples_10``); any other matching files follow in
    alphabetical order. Blank lines are dropped and every record is written
    on its own line even if a source file lacks a trailing newline. If
    ``output_file`` itself matches the pattern it is not used as an input.
    Prints a one-line summary and returns ``None``.
    """
    prefix, suffix = "samples_", ".jsonl"

    def _batch_order(path: str):
        # Numeric suffixes sort by value; a plain string sort would put
        # samples_10 ahead of samples_2.
        stem = os.path.basename(path)[len(prefix):-len(suffix)]
        if stem.isdecimal():
            return (0, int(stem), path)
        return (1, 0, path)

    # Never read the file we are about to truncate and write.
    output_abs = os.path.abspath(output_file)
    jsonl_files = sorted(
        (
            candidate
            for candidate in glob.glob(os.path.join(input_dir, f"{prefix}*{suffix}"))
            if os.path.abspath(candidate) != output_abs
        ),
        key=_batch_order,
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in jsonl_files:
            with open(file, 'r', encoding='utf-8') as infile:
                for line in infile:
                    if not line.strip():
                        continue  # blank lines carry no record
                    # Guard against a final line without '\n', which would
                    # otherwise fuse with the next file's first record.
                    outfile.write(line if line.endswith("\n") else line + "\n")

    print(f"Merged {len(jsonl_files)} files into {output_file}")

def main():
    """Run HumanEval generation for the configured model, or only merge results.

    When the module-level ``MERGE`` flag is set, existing batch files under
    ``EVAL_SAVE_DIR`` are merged into ``MERGE_OUTPUT`` and nothing else runs.
    Otherwise an ``EvaluationConfig`` is built, the problems are loaded from
    its ``human_eval_path``, a ``transformers`` pipeline is created for
    ``MODEL_NAME_OR_PATH`` and ``evaluate`` generates and saves the samples.
    """
    # If MERGE is set, just merge and return
    if MERGE:
        merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)
        return

    # Otherwise, run the standard evaluation
    eval_config = EvaluationConfig()
    eval_config.eval_save_dir = EVAL_SAVE_DIR

    pipe_config = eval_config.pipeline_config
    pipe_config.model_name_or_path = MODEL_NAME_OR_PATH
    print(f'Running Model: {pipe_config.model_name_or_path} On Human Eval With Settings: {eval_config.model_dump()}')

    # Load the dataset named in the config. Calling read_problems() with no
    # argument would ignore human_eval_path and always use the bundled file.
    problems: Dict[str, HumanEvalProblem] = read_problems(eval_config.human_eval_path)

    # An unknown dtype name resolves to None, which leaves the choice to the
    # pipeline; say so instead of dropping the setting silently.
    torch_dtype = _DTYPES.get(pipe_config.torch_dtype)
    if torch_dtype is None:
        print(
            f"Warning: unsupported torch_dtype {pipe_config.torch_dtype!r}; "
            f"supported values are {sorted(_DTYPES)}. Falling back to the pipeline default."
        )

    pipe = pipeline(
        model=pipe_config.model_name_or_path,
        task=pipe_config.task,
        device_map=pipe_config.device_map,
        # device=pipe_config.device,
        torch_dtype=torch_dtype
    )

    evaluate(pipe, problems, eval_config)

# Entry point: runs main() when this code is executed directly (as it is when
# the notebook cell is run) rather than imported as a module.
if __name__ == "__main__":
    main()

Running Model: zhaviraax/qwen2-beta-5_hinge On Human Eval With Settings: {'human_eval_path': '/content/human-eval/human_eval/../data/HumanEval.jsonl.gz', 'pipeline_config': {'model_name_or_path': 'zhaviraax/qwen2-beta-5_hinge', 'task': 'text-generation', 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'max_new_tokens': 256, 'truncation': True}, 'eval_save_dir': './eval_results', 'num_samples_per_task': 4, 'batch_size': 16}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 0/164 [00:00<?, ?it/s]

Starting Question: 1...



  1%|          | 1/164 [00:26<1:13:12, 26.95s/it]

Iteration 1: 26.945995 seconds

Starting Question: 2...



  1%|          | 2/164 [00:51<1:08:16, 25.29s/it]

Iteration 2: 24.127672 seconds

Starting Question: 3...



  2%|▏         | 3/164 [01:30<1:24:54, 31.64s/it]

Iteration 3: 39.198130 seconds

Starting Question: 4...



  2%|▏         | 4/164 [01:44<1:05:42, 24.64s/it]

Iteration 4: 13.900791 seconds

Saved 16 samples to ./eval_results/samples_1.jsonl
Starting Question: 5...



  3%|▎         | 5/164 [02:18<1:14:58, 28.29s/it]

Iteration 5: 34.772227 seconds

Starting Question: 6...



  4%|▎         | 6/164 [02:36<1:04:31, 24.50s/it]

Iteration 6: 17.146486 seconds

Starting Question: 7...



  4%|▍         | 7/164 [03:03<1:06:53, 25.56s/it]

Iteration 7: 27.739857 seconds

Starting Question: 8...



  5%|▍         | 8/164 [03:13<53:14, 20.48s/it]  

Iteration 8: 9.586117 seconds

Saved 16 samples to ./eval_results/samples_2.jsonl
Starting Question: 9...



  5%|▌         | 9/164 [03:39<57:01, 22.07s/it]

Iteration 9: 25.584769 seconds

Starting Question: 10...



  6%|▌         | 10/164 [03:58<54:31, 21.24s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Iteration 10: 19.378280 seconds

Starting Question: 11...



  7%|▋         | 11/164 [04:27<1:00:36, 23.77s/it]

Iteration 11: 29.498538 seconds

Starting Question: 12...



  7%|▋         | 12/164 [04:56<1:03:34, 25.10s/it]

Iteration 12: 28.132967 seconds

Saved 16 samples to ./eval_results/samples_3.jsonl
Starting Question: 13...



  8%|▊         | 13/164 [05:24<1:05:33, 26.05s/it]

Iteration 13: 28.236219 seconds

Starting Question: 14...



  9%|▊         | 14/164 [06:08<1:18:45, 31.50s/it]

Iteration 14: 44.106707 seconds

Starting Question: 15...



  9%|▉         | 15/164 [06:27<1:08:47, 27.70s/it]

Iteration 15: 18.878632 seconds

Starting Question: 16...



 10%|▉         | 16/164 [07:01<1:13:01, 29.60s/it]

Iteration 16: 34.021325 seconds

Saved 16 samples to ./eval_results/samples_4.jsonl
Starting Question: 17...



 10%|█         | 17/164 [07:42<1:20:56, 33.04s/it]

Iteration 17: 41.027021 seconds

Starting Question: 18...



 11%|█         | 18/164 [07:58<1:08:24, 28.11s/it]

Iteration 18: 16.652027 seconds

Starting Question: 19...



 12%|█▏        | 19/164 [08:41<1:18:04, 32.31s/it]

Iteration 19: 42.076897 seconds

Starting Question: 20...



 12%|█▏        | 20/164 [09:10<1:15:46, 31.57s/it]

Iteration 20: 29.850490 seconds

Saved 16 samples to ./eval_results/samples_5.jsonl
Starting Question: 21...



 13%|█▎        | 21/164 [09:49<1:20:26, 33.75s/it]

Iteration 21: 38.830054 seconds

Starting Question: 22...



 13%|█▎        | 22/164 [10:15<1:14:01, 31.28s/it]

Iteration 22: 25.506059 seconds

Starting Question: 23...



 14%|█▍        | 23/164 [10:43<1:11:17, 30.34s/it]

Iteration 23: 28.146876 seconds

Starting Question: 24...



 15%|█▍        | 24/164 [11:27<1:20:24, 34.46s/it]

Iteration 24: 44.085306 seconds

Saved 16 samples to ./eval_results/samples_6.jsonl
Starting Question: 25...



 15%|█▌        | 25/164 [11:54<1:14:30, 32.16s/it]

Iteration 25: 26.798467 seconds

Starting Question: 26...



 16%|█▌        | 26/164 [12:16<1:07:11, 29.22s/it]

Iteration 26: 22.334094 seconds

Starting Question: 27...



 16%|█▋        | 27/164 [12:36<1:00:07, 26.33s/it]

Iteration 27: 19.602757 seconds

Starting Question: 28...



 17%|█▋        | 28/164 [13:17<1:09:34, 30.69s/it]

Iteration 28: 40.866003 seconds

Saved 16 samples to ./eval_results/samples_7.jsonl
Starting Question: 29...



 18%|█▊        | 29/164 [13:23<52:38, 23.39s/it]  

Iteration 29: 6.364906 seconds

Starting Question: 30...



 18%|█▊        | 30/164 [13:32<42:49, 19.18s/it]

Iteration 30: 9.341168 seconds

Starting Question: 31...



 19%|█▉        | 31/164 [14:01<48:46, 22.01s/it]

Iteration 31: 28.601498 seconds

Starting Question: 32...



 20%|█▉        | 32/164 [14:40<59:52, 27.21s/it]

Iteration 32: 39.365292 seconds

Saved 16 samples to ./eval_results/samples_8.jsonl
Starting Question: 33...



 20%|██        | 33/164 [15:08<59:30, 27.26s/it]

Iteration 33: 27.349666 seconds

Starting Question: 34...



 21%|██        | 34/164 [15:32<57:24, 26.50s/it]

Iteration 34: 24.724986 seconds

Starting Question: 35...



 21%|██▏       | 35/164 [16:03<59:46, 27.80s/it]

Iteration 35: 30.855801 seconds

Starting Question: 36...



 22%|██▏       | 36/164 [16:21<52:51, 24.78s/it]

Iteration 36: 17.708479 seconds

Saved 16 samples to ./eval_results/samples_9.jsonl
Starting Question: 37...



 23%|██▎       | 37/164 [16:46<52:30, 24.81s/it]

Iteration 37: 24.883824 seconds

Starting Question: 38...



 23%|██▎       | 38/164 [17:16<55:40, 26.51s/it]

Iteration 38: 30.470252 seconds

Starting Question: 39...



 24%|██▍       | 39/164 [18:00<1:06:14, 31.79s/it]

Iteration 39: 44.128747 seconds

Starting Question: 40...



 24%|██▍       | 40/164 [18:38<1:09:13, 33.49s/it]

Iteration 40: 37.458234 seconds

Saved 16 samples to ./eval_results/samples_10.jsonl
Starting Question: 41...



 25%|██▌       | 41/164 [19:16<1:11:13, 34.75s/it]

Iteration 41: 37.661962 seconds

Starting Question: 42...



 26%|██▌       | 42/164 [19:55<1:13:30, 36.15s/it]

Iteration 42: 39.425325 seconds

Starting Question: 43...



 26%|██▌       | 43/164 [20:19<1:05:42, 32.58s/it]

Iteration 43: 24.258267 seconds

Starting Question: 44...



 27%|██▋       | 44/164 [21:02<1:11:31, 35.77s/it]

Iteration 44: 43.191087 seconds

Saved 16 samples to ./eval_results/samples_11.jsonl
Starting Question: 45...



 27%|██▋       | 45/164 [21:32<1:07:14, 33.90s/it]

Iteration 45: 29.549760 seconds

Starting Question: 46...



 28%|██▊       | 46/164 [22:14<1:11:46, 36.49s/it]

Iteration 46: 42.536608 seconds

Starting Question: 47...



 29%|██▊       | 47/164 [22:50<1:10:41, 36.25s/it]

Iteration 47: 35.680208 seconds

Starting Question: 48...



 29%|██▉       | 48/164 [23:31<1:12:27, 37.48s/it]

Iteration 48: 40.352638 seconds

Saved 16 samples to ./eval_results/samples_12.jsonl
Starting Question: 49...



 30%|██▉       | 49/164 [23:47<59:42, 31.15s/it]  

Iteration 49: 16.390781 seconds

Starting Question: 50...



 30%|███       | 50/164 [24:04<51:19, 27.01s/it]

Iteration 50: 17.336423 seconds

Starting Question: 51...



 31%|███       | 51/164 [24:33<51:52, 27.55s/it]

Iteration 51: 28.796375 seconds

Starting Question: 52...



 32%|███▏      | 52/164 [25:05<53:50, 28.85s/it]

Iteration 52: 31.875042 seconds

Saved 16 samples to ./eval_results/samples_13.jsonl
Starting Question: 53...



 32%|███▏      | 53/164 [25:21<46:29, 25.13s/it]

Iteration 53: 16.460377 seconds

Starting Question: 54...



 33%|███▎      | 54/164 [25:44<44:39, 24.36s/it]

Iteration 54: 22.573081 seconds

Starting Question: 55...



 34%|███▎      | 55/164 [26:23<52:28, 28.88s/it]

Iteration 55: 39.430398 seconds

Starting Question: 56...



 34%|███▍      | 56/164 [26:54<52:47, 29.33s/it]

Iteration 56: 30.369765 seconds

Saved 16 samples to ./eval_results/samples_14.jsonl
Starting Question: 57...



 35%|███▍      | 57/164 [27:23<52:15, 29.30s/it]

Iteration 57: 29.229088 seconds

Starting Question: 58...



 35%|███▌      | 58/164 [27:42<46:31, 26.33s/it]

Iteration 58: 19.409623 seconds

Starting Question: 59...



 36%|███▌      | 59/164 [28:15<49:28, 28.27s/it]

Iteration 59: 32.780953 seconds

Starting Question: 60...



 37%|███▋      | 60/164 [28:59<56:50, 32.79s/it]

Iteration 60: 43.342993 seconds

Saved 16 samples to ./eval_results/samples_15.jsonl
Starting Question: 61...



 37%|███▋      | 61/164 [29:37<59:23, 34.60s/it]

Iteration 61: 38.818305 seconds

Starting Question: 62...



 38%|███▊      | 62/164 [30:04<55:01, 32.36s/it]

Iteration 62: 27.147535 seconds

Starting Question: 63...



 38%|███▊      | 63/164 [30:24<48:13, 28.65s/it]

Iteration 63: 19.967264 seconds

Starting Question: 64...



 39%|███▉      | 64/164 [31:00<51:18, 30.79s/it]

Iteration 64: 35.780878 seconds

Saved 16 samples to ./eval_results/samples_16.jsonl
Starting Question: 65...



 40%|███▉      | 65/164 [31:26<48:22, 29.32s/it]

Iteration 65: 25.899963 seconds

Starting Question: 66...



 40%|████      | 66/164 [32:03<51:24, 31.47s/it]

Iteration 66: 36.489779 seconds

Starting Question: 67...



 41%|████      | 67/164 [32:32<49:58, 30.91s/it]

Iteration 67: 29.611243 seconds

Starting Question: 68...



 41%|████▏     | 68/164 [33:16<55:26, 34.65s/it]

Iteration 68: 43.351214 seconds

Saved 16 samples to ./eval_results/samples_17.jsonl
Starting Question: 69...



 42%|████▏     | 69/164 [33:55<57:07, 36.08s/it]

Iteration 69: 39.425899 seconds

Starting Question: 70...



 43%|████▎     | 70/164 [34:38<59:47, 38.16s/it]

Iteration 70: 43.020105 seconds

Starting Question: 71...



 43%|████▎     | 71/164 [35:15<58:31, 37.76s/it]

Iteration 71: 36.828134 seconds

Starting Question: 72...



 44%|████▍     | 72/164 [35:53<58:07, 37.90s/it]

Iteration 72: 38.229167 seconds

Saved 16 samples to ./eval_results/samples_18.jsonl
Starting Question: 73...



 45%|████▍     | 73/164 [36:24<54:12, 35.75s/it]

Iteration 73: 30.709216 seconds

Starting Question: 74...



 45%|████▌     | 74/164 [37:07<57:01, 38.01s/it]

Iteration 74: 43.306600 seconds

Starting Question: 75...



 46%|████▌     | 75/164 [37:47<57:25, 38.71s/it]

Iteration 75: 40.325284 seconds

Starting Question: 76...



 46%|████▋     | 76/164 [38:18<53:13, 36.29s/it]

Iteration 76: 30.653617 seconds

Saved 16 samples to ./eval_results/samples_19.jsonl
Starting Question: 77...



 47%|████▋     | 77/164 [38:55<52:52, 36.47s/it]

Iteration 77: 36.884944 seconds

Starting Question: 78...



 48%|████▊     | 78/164 [39:25<49:41, 34.67s/it]

Iteration 78: 30.475354 seconds

Starting Question: 79...



 48%|████▊     | 79/164 [39:58<48:22, 34.14s/it]

Iteration 79: 32.902013 seconds

Starting Question: 80...



 49%|████▉     | 80/164 [40:36<49:08, 35.10s/it]

Iteration 80: 37.323198 seconds

Saved 16 samples to ./eval_results/samples_20.jsonl
Starting Question: 81...



 49%|████▉     | 81/164 [41:12<48:56, 35.38s/it]

Iteration 81: 36.025111 seconds

Starting Question: 82...



 50%|█████     | 82/164 [41:50<49:33, 36.26s/it]

Iteration 82: 38.332990 seconds

Starting Question: 83...



 51%|█████     | 83/164 [42:27<49:15, 36.49s/it]

Iteration 83: 37.024859 seconds

Starting Question: 84...



 51%|█████     | 84/164 [43:09<50:48, 38.11s/it]

Iteration 84: 41.888091 seconds

Saved 16 samples to ./eval_results/samples_21.jsonl
Starting Question: 85...



 52%|█████▏    | 85/164 [43:52<52:12, 39.65s/it]

Iteration 85: 43.226850 seconds

Starting Question: 86...



 52%|█████▏    | 86/164 [44:28<50:08, 38.57s/it]

Iteration 86: 36.047211 seconds

Starting Question: 87...



 53%|█████▎    | 87/164 [44:57<45:48, 35.69s/it]

Iteration 87: 28.971905 seconds

Starting Question: 88...



 54%|█████▎    | 88/164 [45:38<47:15, 37.31s/it]

Iteration 88: 41.092511 seconds

Saved 16 samples to ./eval_results/samples_22.jsonl
Starting Question: 89...



 54%|█████▍    | 89/164 [46:19<48:01, 38.42s/it]

Iteration 89: 40.993908 seconds

Starting Question: 90...



 55%|█████▍    | 90/164 [46:59<47:59, 38.91s/it]

Iteration 90: 40.068632 seconds

Starting Question: 91...



 55%|█████▌    | 91/164 [47:43<49:02, 40.31s/it]

Iteration 91: 43.568624 seconds

Starting Question: 92...



 56%|█████▌    | 92/164 [48:24<48:32, 40.45s/it]

Iteration 92: 40.791167 seconds

Saved 16 samples to ./eval_results/samples_23.jsonl
Starting Question: 93...



 57%|█████▋    | 93/164 [48:52<43:32, 36.80s/it]

Iteration 93: 28.256316 seconds

Starting Question: 94...



 57%|█████▋    | 94/164 [49:22<40:42, 34.90s/it]

Iteration 94: 30.458524 seconds

Starting Question: 95...



 58%|█████▊    | 95/164 [50:06<43:07, 37.50s/it]

Iteration 95: 43.585494 seconds

Starting Question: 96...



 59%|█████▊    | 96/164 [50:49<44:28, 39.24s/it]

Iteration 96: 43.287566 seconds

Saved 16 samples to ./eval_results/samples_24.jsonl
Starting Question: 97...



 59%|█████▉    | 97/164 [51:32<45:04, 40.37s/it]

Iteration 97: 43.003540 seconds

Starting Question: 98...



 60%|█████▉    | 98/164 [52:13<44:28, 40.43s/it]

Iteration 98: 40.574914 seconds

Starting Question: 99...



 60%|██████    | 99/164 [52:53<43:41, 40.33s/it]

Iteration 99: 40.079248 seconds

Starting Question: 100...



 61%|██████    | 100/164 [53:34<43:20, 40.63s/it]

Iteration 100: 41.348944 seconds

Saved 16 samples to ./eval_results/samples_25.jsonl
Starting Question: 101...



 62%|██████▏   | 101/164 [54:09<40:54, 38.96s/it]

Iteration 101: 35.051756 seconds

Starting Question: 102...



 62%|██████▏   | 102/164 [54:31<34:44, 33.62s/it]

Iteration 102: 21.163373 seconds

Starting Question: 103...



 63%|██████▎   | 103/164 [55:14<37:02, 36.44s/it]

Iteration 103: 43.023293 seconds

Starting Question: 104...



 63%|██████▎   | 104/164 [55:46<35:16, 35.28s/it]

Iteration 104: 32.561154 seconds

Saved 16 samples to ./eval_results/samples_26.jsonl
Starting Question: 105...



 64%|██████▍   | 105/164 [56:26<36:05, 36.71s/it]

Iteration 105: 40.053102 seconds

Starting Question: 106...



 65%|██████▍   | 106/164 [57:08<37:04, 38.36s/it]

Iteration 106: 42.208797 seconds

Starting Question: 107...



 65%|██████▌   | 107/164 [57:46<36:16, 38.18s/it]

Iteration 107: 37.756507 seconds

Starting Question: 108...



 66%|██████▌   | 108/164 [58:28<36:34, 39.19s/it]

Iteration 108: 41.543728 seconds

Saved 16 samples to ./eval_results/samples_27.jsonl
Starting Question: 109...



 66%|██████▋   | 109/164 [58:53<32:04, 34.98s/it]

Iteration 109: 25.171391 seconds

Starting Question: 110...



 67%|██████▋   | 110/164 [59:34<33:13, 36.93s/it]

Iteration 110: 41.450715 seconds

Starting Question: 111...



 68%|██████▊   | 111/164 [1:00:07<31:24, 35.55s/it]

Iteration 111: 32.340207 seconds

Starting Question: 112...



 68%|██████▊   | 112/164 [1:00:50<32:46, 37.83s/it]

Iteration 112: 43.131072 seconds

Saved 16 samples to ./eval_results/samples_28.jsonl
Starting Question: 113...



 69%|██████▉   | 113/164 [1:01:22<30:41, 36.11s/it]

Iteration 113: 32.119130 seconds

Starting Question: 114...



 70%|██████▉   | 114/164 [1:01:57<29:44, 35.69s/it]

Iteration 114: 34.714055 seconds

Starting Question: 115...



 70%|███████   | 115/164 [1:02:40<30:58, 37.92s/it]

Iteration 115: 43.127226 seconds

Starting Question: 116...



 71%|███████   | 116/164 [1:03:20<30:51, 38.57s/it]

Iteration 116: 40.085176 seconds

Saved 16 samples to ./eval_results/samples_29.jsonl
Starting Question: 117...



 71%|███████▏  | 117/164 [1:03:53<28:50, 36.83s/it]

Iteration 117: 32.756564 seconds

Starting Question: 118...



 72%|███████▏  | 118/164 [1:04:32<28:47, 37.55s/it]

Iteration 118: 39.236355 seconds

Starting Question: 119...



 73%|███████▎  | 119/164 [1:05:11<28:29, 37.98s/it]

Iteration 119: 38.977435 seconds

Starting Question: 120...



 73%|███████▎  | 120/164 [1:05:49<27:48, 37.93s/it]

Iteration 120: 37.808945 seconds

Saved 16 samples to ./eval_results/samples_30.jsonl
Starting Question: 121...



 74%|███████▍  | 121/164 [1:06:25<26:44, 37.31s/it]

Iteration 121: 35.870659 seconds

Starting Question: 122...



 74%|███████▍  | 122/164 [1:07:08<27:22, 39.10s/it]

Iteration 122: 43.267399 seconds

Starting Question: 123...



 75%|███████▌  | 123/164 [1:07:51<27:36, 40.41s/it]

Iteration 123: 43.470875 seconds

Starting Question: 124...



 76%|███████▌  | 124/164 [1:08:35<27:32, 41.32s/it]

Iteration 124: 43.425683 seconds

Saved 16 samples to ./eval_results/samples_31.jsonl
Starting Question: 125...



 76%|███████▌  | 125/164 [1:09:16<26:51, 41.32s/it]

Iteration 125: 41.321726 seconds

Starting Question: 126...



 77%|███████▋  | 126/164 [1:09:49<24:29, 38.68s/it]

Iteration 126: 32.528894 seconds

Starting Question: 127...



 77%|███████▋  | 127/164 [1:10:32<24:40, 40.02s/it]

Iteration 127: 43.135832 seconds

Starting Question: 128...



 78%|███████▊  | 128/164 [1:11:15<24:38, 41.08s/it]

Iteration 128: 43.539858 seconds

Saved 16 samples to ./eval_results/samples_32.jsonl
Starting Question: 129...



 79%|███████▊  | 129/164 [1:11:53<23:18, 39.96s/it]

Iteration 129: 37.357154 seconds

Starting Question: 130...



 79%|███████▉  | 130/164 [1:12:36<23:14, 41.01s/it]

Iteration 130: 43.465853 seconds

Starting Question: 131...



 80%|███████▉  | 131/164 [1:13:11<21:31, 39.14s/it]

Iteration 131: 34.765505 seconds

Starting Question: 132...



 80%|████████  | 132/164 [1:13:54<21:30, 40.33s/it]

Iteration 132: 43.117105 seconds

Saved 16 samples to ./eval_results/samples_33.jsonl
Starting Question: 133...



 81%|████████  | 133/164 [1:14:32<20:33, 39.78s/it]

Iteration 133: 38.502789 seconds

Starting Question: 134...



 82%|████████▏ | 134/164 [1:15:14<20:12, 40.40s/it]

Iteration 134: 41.835816 seconds

Starting Question: 135...



 82%|████████▏ | 135/164 [1:15:52<19:04, 39.47s/it]

Iteration 135: 37.303854 seconds

Starting Question: 136...



 83%|████████▎ | 136/164 [1:16:35<18:56, 40.57s/it]

Iteration 136: 43.140540 seconds

Saved 16 samples to ./eval_results/samples_34.jsonl
Starting Question: 137...



 84%|████████▎ | 137/164 [1:17:17<18:24, 40.93s/it]

Iteration 137: 41.747807 seconds

Starting Question: 138...



 84%|████████▍ | 138/164 [1:17:58<17:49, 41.14s/it]

Iteration 138: 41.638647 seconds

Starting Question: 139...



 85%|████████▍ | 139/164 [1:18:41<17:23, 41.76s/it]

Iteration 139: 43.204399 seconds

Starting Question: 140...



 85%|████████▌ | 140/164 [1:19:23<16:41, 41.72s/it]

Iteration 140: 41.629867 seconds

Saved 16 samples to ./eval_results/samples_35.jsonl
Starting Question: 141...



 86%|████████▌ | 141/164 [1:20:02<15:38, 40.81s/it]

Iteration 141: 38.684979 seconds

Starting Question: 142...



 87%|████████▋ | 142/164 [1:20:45<15:13, 41.54s/it]

Iteration 142: 43.236595 seconds

Starting Question: 143...



 87%|████████▋ | 143/164 [1:21:27<14:37, 41.77s/it]

Iteration 143: 42.298319 seconds

Starting Question: 144...



 88%|████████▊ | 144/164 [1:22:11<14:05, 42.28s/it]

Iteration 144: 43.460540 seconds

Saved 16 samples to ./eval_results/samples_36.jsonl
Starting Question: 145...



 88%|████████▊ | 145/164 [1:22:44<12:31, 39.57s/it]

Iteration 145: 33.246466 seconds

Starting Question: 146...



 89%|████████▉ | 146/164 [1:23:25<12:00, 40.02s/it]

Iteration 146: 41.064004 seconds

Starting Question: 147...



 90%|████████▉ | 147/164 [1:24:08<11:37, 41.06s/it]

Iteration 147: 43.486320 seconds

Starting Question: 148...



 90%|█████████ | 148/164 [1:24:48<10:49, 40.59s/it]

Iteration 148: 39.501646 seconds

Saved 16 samples to ./eval_results/samples_37.jsonl
Starting Question: 149...



 91%|█████████ | 149/164 [1:25:32<10:23, 41.54s/it]

Iteration 149: 43.747766 seconds

Starting Question: 150...



 91%|█████████▏| 150/164 [1:26:12<09:34, 41.06s/it]

Iteration 150: 39.955045 seconds

Starting Question: 151...



 92%|█████████▏| 151/164 [1:26:49<08:40, 40.00s/it]

Iteration 151: 37.521871 seconds

Starting Question: 152...



 93%|█████████▎| 152/164 [1:27:34<08:18, 41.54s/it]

Iteration 152: 45.137130 seconds

Saved 16 samples to ./eval_results/samples_38.jsonl
Starting Question: 153...



 93%|█████████▎| 153/164 [1:28:19<07:48, 42.60s/it]

Iteration 153: 45.050582 seconds

Starting Question: 154...



 94%|█████████▍| 154/164 [1:28:56<06:48, 40.89s/it]

Iteration 154: 36.923590 seconds

Starting Question: 155...



 95%|█████████▍| 155/164 [1:29:35<06:00, 40.09s/it]

Iteration 155: 38.224250 seconds

Starting Question: 156...



 95%|█████████▌| 156/164 [1:30:18<05:27, 40.98s/it]

Iteration 156: 43.036067 seconds

Saved 16 samples to ./eval_results/samples_39.jsonl
Starting Question: 157...



 96%|█████████▌| 157/164 [1:30:59<04:48, 41.23s/it]

Iteration 157: 41.822833 seconds

Starting Question: 158...



 96%|█████████▋| 158/164 [1:31:41<04:07, 41.22s/it]

Iteration 158: 41.189548 seconds

Starting Question: 159...



 97%|█████████▋| 159/164 [1:32:11<03:09, 37.92s/it]

Iteration 159: 30.224393 seconds

Starting Question: 160...



 98%|█████████▊| 160/164 [1:32:54<02:37, 39.35s/it]

Iteration 160: 42.696202 seconds

Saved 16 samples to ./eval_results/samples_40.jsonl
Starting Question: 161...



 98%|█████████▊| 161/164 [1:33:37<02:01, 40.51s/it]

Iteration 161: 43.191178 seconds

Starting Question: 162...



 99%|█████████▉| 162/164 [1:34:02<01:11, 35.95s/it]

Iteration 162: 25.307502 seconds

Starting Question: 163...



 99%|█████████▉| 163/164 [1:34:37<00:35, 35.78s/it]

Iteration 163: 35.389844 seconds

Starting Question: 164...



100%|██████████| 164/164 [1:35:16<00:00, 34.86s/it]

Iteration 164: 39.027637 seconds

Saved 16 samples to ./eval_results/samples_41.jsonl





In [7]:
# Concatenate every samples_*.jsonl batch under EVAL_SAVE_DIR into MERGE_OUTPUT.
# Both paths are relative, so they resolve against the current working
# directory (the cloned human-eval repo after the setup cell's %cd).
merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)

Merged 41 files into merged_samples.jsonl


In [8]:
import json

def transform_jsonl(input_file: str, output_file: str):
    """
    Flattens the ``completion`` field of every record in a JSONL file.

    Transforms each line from:
        {
          "task_id": "...",
          "completion": [{"generated_text": "some string ..."}]
        }
    into:
        {
          "task_id": "...",
          "completion": "some string ..."
        }
    and writes the result to a new JSONL file.

    A ``completion`` that is already a string is kept unchanged, so the
    transform can be re-applied to its own output without erasing it. Any
    other shape (missing key, empty list, non-dict first element) becomes
    an empty string. Blank input lines are skipped. All records are read
    before the output is opened, so ``output_file`` may equal ``input_file``.
    """
    transformed_records = []

    # 1. Read each non-blank line as JSON
    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                continue
            data = json.loads(line)

            # 2. Reduce 'completion' to a plain string
            completion = data.get("completion")
            if isinstance(completion, str):
                # Already flat: keep it instead of blanking it out.
                gen_text = completion
            elif isinstance(completion, list) and completion and isinstance(completion[0], dict):
                # Typically something like [{"generated_text": "..."}]
                gen_text = completion[0].get("generated_text", "")
            else:
                gen_text = ""

            # 3. Replace the 'completion' field with just the string
            data["completion"] = gen_text

            transformed_records.append(data)

    # 4. Write the new structure to a JSONL output file
    with open(output_file, "w", encoding="utf-8") as fout:
        for record in transformed_records:
            fout.write(json.dumps(record) + "\n")


In [9]:
# Usage: flatten the merged samples into one string completion per record.
# The input is resolved against the current working directory, which is where
# the merge step wrote MERGE_OUTPUT, instead of a hard-coded absolute path; the
# cleaned file is written next to it.
input_path = os.path.abspath(MERGE_OUTPUT)
output_path = os.path.join(os.path.dirname(input_path), "clean_merged_samples.jsonl")

transform_jsonl(input_path, output_path)
print(f"Transformed JSONL saved to: {output_path}")

Transformed JSONL saved to: /content/human-eval/clean_merged_samples.jsonl


In [10]:
# Score the flattened samples with the HumanEval command-line tool, run from
# inside the cloned repository where clean_merged_samples.jsonl was written.
# NOTE(review): the tool reports "Running test suites...", so it presumably
# executes the model-generated code; confirm this only runs in a disposable
# sandbox.
%cd /content/human-eval/
!evaluate_functional_correctness clean_merged_samples.jsonl

/content/human-eval
Reading samples...
656it [00:00, 26084.67it/s]
Running test suites...
100% 656/656 [00:06<00:00, 107.88it/s]
Writing results to clean_merged_samples.jsonl_results.jsonl...
100% 656/656 [00:00<00:00, 40483.53it/s]
{'pass@1': 0.07164634146341463}
