<a href="https://colab.research.google.com/github/Bri636/ml-programming-winter-2025/blob/main/Code_SIMPO_Eval_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
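# Install dependencies: clone OpenAI's human-eval harness, install it in editable
# mode, then install the inference libraries.
# NOTE: `%cd human-eval/` changes the working directory for the rest of the
# notebook, so every relative path used later resolves inside the cloned repo.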
!git clone https://github.com/openai/human-eval.git
%cd human-eval/
!pip install -e .
!pip install transformers
!pip install pydantic
!pip install torch
!pip install bitsandbytes

Cloning into 'human-eval'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 34 (delta 12), reused 7 (delta 7), pack-reused 8 (from 1)[K
Receiving objects: 100% (34/34), 55.80 KiB | 2.23 MiB/s, done.
Resolving deltas: 100% (13/13), done.
/content/human-eval
Obtaining file:///content/human-eval
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fire (from human-eval==1.0)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=ff21b41d42b2df1c9706db12ef7f0a5aeb5086b2cb3465d018dc83f92de4c224
  Stored in directory: /root/.cach

In [2]:
""" Evaluation on Human Eval """

from __future__ import annotations
from typing import TypedDict, Dict, List, Any, Union, TypeVar
from transformers import pipeline
from transformers.pipelines import Pipeline
from functools import partial
from pydantic import Field, BaseModel
from tqdm import tqdm
import timeit
import os, glob
import json, yaml
from pathlib import Path
import torch
# submods
from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL

In [3]:
# Maps the dtype name stored in the (string-only) config to the torch dtype object.
# Names missing from this table resolve to None when looked up with `.get`.
_DTYPES={
    'bfloat16': torch.bfloat16
}
# Generic type variable used by the BaseConfig classmethod constructors.
T = TypeVar('T')
# Anything accepted as a filesystem path by the config (de)serializers.
PathLike = Union[Path, str]
# Alternative model, kept for reference:
# MODEL_NAME_OR_PATH = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
# Model id/path handed to the Hugging Face pipeline in main().
MODEL_NAME_OR_PATH="BigBri/CodeSimPO"
# Directory (relative to the current working directory) for the samples_N.jsonl shards.
EVAL_SAVE_DIR = "./eval_results"
# When True, main() only merges existing shards and skips generation.
MERGE = False
# File name of the merged JSONL written by merge_jsonl_files.
MERGE_OUTPUT = "merged_samples.jsonl"


In [4]:
class BaseConfig(BaseModel):
    """Pydantic base model with JSON/YAML (de)serialization helpers.

    Both writers serialize through pydantic's JSON mode so that every field
    value is reduced to a JSON/YAML-native type before it is dumped, and all
    files are read and written as UTF-8.
    """

    # A name literal to correctly identify and construct nested models
    # which have many possible options.
    # name: Literal[''] = ''

    def write_json(self, path: PathLike) -> None:
        """Write the model to a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            # mode='json' converts non-native values (paths, enums, ...) so
            # json.dump cannot fail on them; this matches write_yaml.
            json.dump(self.model_dump(mode='json'), fp, indent=2)

    @classmethod
    def from_json(cls: type[T], path: PathLike) -> T:
        """Load the model from a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            data = json.load(fp)
        return cls(**data)

    def write_yaml(self, path: PathLike) -> None:
        """Write the model to a YAML file.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            yaml.dump(
                self.model_dump(mode='json'),
                fp,
                indent=4,
                sort_keys=False,  # keep the field declaration order
            )

    @classmethod
    def from_yaml(cls: type[T], path: PathLike) -> T:
        """Load the model from a YAML file.

        An empty YAML document is treated as an empty mapping, so the model is
        built from its field defaults instead of failing on ``cls(**None)``.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            raw_data = yaml.safe_load(fp) or {}
        return cls(**raw_data)

class HumanEvalProblem(TypedDict):
    """One HumanEval problem record, as returned per task by ``read_problems``.

    Only ``prompt`` is read by this notebook (in ``evaluate``); the remaining
    keys are declared here but not accessed in the visible code.
    """
    # Task identifier; evaluate() copies it into every generated sample.
    task_id: str
    # Text handed to the generation pipeline.
    prompt: str
    # Presumably the name of the function under test — not used here; verify against the dataset.
    entry_point: str
    # Presumably the reference implementation — not used here; verify against the dataset.
    canonical_solution: str
    # Presumably the unit-test source for the task — not used here; verify against the dataset.
    test: str

class HumanEvalSolution(TypedDict):
    """One generated sample for a HumanEval problem.

    NOTE(review): evaluate() stores the raw pipeline output under
    ``completion`` rather than a plain string; transform_jsonl later flattens
    it into the ``str`` declared here.
    """
    # Identifier of the problem this sample answers.
    task_id: str
    # Generated text for the problem.
    completion: str

class HFGeneratorConfig(BaseConfig):
    """Settings for building and calling the Hugging Face generation pipeline."""
    # Model id or local path passed as `model=` to pipeline().
    model_name_or_path: str = 'BigBri/CodeSimPO'
    # Pipeline task name passed as `task=` to pipeline().
    task: str = 'text-generation'
    # Device placement strategy passed as `device_map=` to pipeline().
    device_map: str = 'auto'
    # device: str = 'cpu'
    # Name looked up in _DTYPES to obtain the torch dtype for the pipeline.
    torch_dtype: str = 'bfloat16'
    # Per-call generation limit forwarded by evaluate().
    max_new_tokens: int = 256
    # Forwarded by evaluate() as the pipeline call's `truncation` argument.
    truncation: bool = True

class EvaluationConfig(BaseConfig):
    """Top-level settings for one HumanEval generation run."""
    # Path to the HumanEval problem file; defaults to the harness's HUMAN_EVAL constant.
    human_eval_path: str = Field(default=HUMAN_EVAL)
    # Nested pipeline settings; default_factory gives each config its own instance.
    pipeline_config: HFGeneratorConfig = Field(default_factory=HFGeneratorConfig)
    # Directory that receives the samples_N.jsonl shards.
    eval_save_dir: str = Field(default=EVAL_SAVE_DIR)
    # Number of completions requested for every problem.
    num_samples_per_task: int = Field(default=4)
    batch_size: int = Field(default=16)  # Flush a shard to disk once this many samples are buffered

In [5]:
def format_time(seconds: float) -> str:
    """Convert a duration in seconds to ``HH:MM:SS.mmm``.

    The duration is rounded to whole milliseconds *before* it is split into
    fields, so a value such as 59.9996 carries into the minutes field
    (``00:01:00.000``) instead of rendering an impossible ``60.000`` seconds.

    Parameters
    ----------
    seconds : float
        Duration in seconds. Negative durations are rendered with a leading
        ``-`` in front of the formatted magnitude.

    Returns
    -------
    str
        The formatted duration; hours are zero-padded to at least two digits.
    """
    total_ms = round(abs(seconds) * 1000)
    # Only show a sign when something non-zero remains after rounding.
    sign = "-" if seconds < 0 and total_ms else ""
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

def read_jsonl(file_path: str) -> list:
    """Read a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of being passed to the JSON decoder.

    Parameters
    ----------
    file_path : str
        Path to the UTF-8 encoded ``.jsonl`` file.

    Returns
    -------
    list
        One decoded object per non-blank line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [6]:
def _next_sample_file_index(save_dir: str) -> int:
    """Return one more than the highest N among existing ``samples_N.jsonl`` files."""
    highest = 0
    for path in glob.glob(os.path.join(save_dir, "samples_*.jsonl")):
        stem = os.path.splitext(os.path.basename(path))[0]
        suffix = stem[len("samples_"):]
        if suffix.isdecimal():
            highest = max(highest, int(suffix))
    return highest + 1


def _save_batch(batch: list, save_dir: str, file_index: int, final: bool = False) -> None:
    """Write the buffered samples to ``samples_<file_index>.jsonl`` and report it."""
    save_path = os.path.join(save_dir, f"samples_{file_index}.jsonl")
    write_jsonl(save_path, batch)
    label = "final samples" if final else "samples"
    print(f"Saved {len(batch)} {label} to {save_path}")


def evaluate(pipeline_func,
             problems: Dict[str, HumanEvalProblem],
             eval_config: EvaluationConfig):
    """Generate completions for every problem and save them in JSONL shards.

    For each problem the prompt is repeated ``num_samples_per_task`` times and
    sent to ``pipeline_func`` in one call. Each returned item is buffered as
    ``{"task_id": ..., "completion": <item>}``. A shard is written whenever the
    buffer holds at least ``batch_size`` samples, and any remainder is written
    after the last problem.

    Shard numbering continues after the highest ``samples_N.jsonl`` already in
    the save directory, so an earlier run's files are never overwritten, even
    when their numbering has gaps.

    NOTE(review): ``completion`` holds whatever the pipeline returns per
    prompt (presumably a list of ``{"generated_text": ...}`` dicts, which is
    the shape ``transform_jsonl`` expects), not a plain string — confirm
    against the pipeline in use.

    Parameters
    ----------
    pipeline_func :
        Callable invoked as ``pipeline_func(prompts, max_new_tokens=...,
        truncation=...)`` that returns one result per prompt.
    problems : Dict[str, HumanEvalProblem]
        Mapping from task id to problem record; only ``"prompt"`` is read.
    eval_config : EvaluationConfig
        Supplies the save directory, sample count, shard size and the
        generation settings.
    """
    save_dir = eval_config.eval_save_dir
    os.makedirs(save_dir, exist_ok=True)  # Ensure save directory exists
    batch = []
    file_index = _next_sample_file_index(save_dir)

    for idx, (task_id, task_data) in tqdm(enumerate(problems.items()), total=len(problems)):
        print(f'Starting Question: {idx + 1}...\n')
        start_time = timeit.default_timer()

        batched_prompts = [task_data["prompt"]] * eval_config.num_samples_per_task
        completions = pipeline_func(
            batched_prompts,
            max_new_tokens=eval_config.pipeline_config.max_new_tokens,
            truncation=eval_config.pipeline_config.truncation
        )

        batch.extend(
            {"task_id": task_id, "completion": completion}
            for completion in completions
        )

        elapsed_time = timeit.default_timer() - start_time
        print(f"Iteration {idx + 1}: {elapsed_time:.6f} seconds\n")

        # Save every `batch_size` samples
        if len(batch) >= eval_config.batch_size:
            _save_batch(batch, save_dir, file_index)
            batch.clear()
            file_index += 1

    # Save remaining batch if not empty
    if batch:
        _save_batch(batch, save_dir, file_index, final=True)

def merge_jsonl_files(input_dir: str, output_file: str):
    """Merge every ``samples_*.jsonl`` file in a directory into one JSONL file.

    Shards are concatenated in numeric order (``samples_2`` before
    ``samples_10``); files whose suffix is not a number come last, ordered by
    name. Each copied record ends with a newline, so a shard that lacks a
    trailing newline cannot fuse its last record with the next shard's first.
    If ``output_file`` itself matches the shard pattern it is left out of the
    inputs rather than being read while it is rewritten.

    Parameters
    ----------
    input_dir : str
        Directory containing the ``samples_*.jsonl`` shards.
    output_file : str
        Path of the merged file; it is overwritten if it already exists.
    """
    def _shard_order(path: str):
        name = os.path.basename(path)
        index = name[len("samples_"):-len(".jsonl")]
        if index.isdecimal():
            return (0, int(index), name)
        return (1, 0, name)

    target = os.path.abspath(output_file)
    jsonl_files = sorted(
        (
            path
            for path in glob.glob(os.path.join(input_dir, "samples_*.jsonl"))
            if os.path.abspath(path) != target
        ),
        key=_shard_order,
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in jsonl_files:
            with open(file, 'r', encoding='utf-8') as infile:
                for line in infile:
                    outfile.write(line)
                    # Only a file's last line can lack the terminator.
                    if not line.endswith("\n"):
                        outfile.write("\n")

    print(f"Merged {len(jsonl_files)} files into {output_file}")

def main():
    """Run HumanEval generation, or only merge existing shards when MERGE is set.

    With ``MERGE`` true, the shards in ``EVAL_SAVE_DIR`` are merged into
    ``MERGE_OUTPUT`` and nothing else happens. Otherwise a default
    ``EvaluationConfig`` is built, pointed at ``EVAL_SAVE_DIR`` and
    ``MODEL_NAME_OR_PATH``, the problems are loaded from the configured
    ``human_eval_path``, and ``evaluate`` generates and saves the samples.
    """
    # If MERGE is set, just merge and return
    if MERGE:
        merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)
        return

    # Otherwise, run the standard evaluation
    eval_config = EvaluationConfig()
    eval_config.eval_save_dir = EVAL_SAVE_DIR

    pipe_config = eval_config.pipeline_config
    pipe_config.model_name_or_path = MODEL_NAME_OR_PATH
    print(f'Running Model: {pipe_config.model_name_or_path} On Human Eval With Settings: {eval_config.model_dump()}')

    # Load the problem set named in the config rather than silently relying on
    # the loader's default; the config default is the same HUMAN_EVAL path.
    problems: Dict[str, HumanEvalProblem] = read_problems(eval_config.human_eval_path)

    torch_dtype = _DTYPES.get(pipe_config.torch_dtype)
    if torch_dtype is None:
        # Keep the original best-effort fallback, but make it visible.
        print(f"Warning: unknown torch_dtype {pipe_config.torch_dtype!r}; "
              f"known values are {sorted(_DTYPES)}. Using the pipeline default.")

    pipe = pipeline(
        model=pipe_config.model_name_or_path,
        task=pipe_config.task,
        device_map=pipe_config.device_map,
        # device=pipe_config.device,
        torch_dtype=torch_dtype
    )

    evaluate(pipe, problems, eval_config)

if __name__ == "__main__":
    main()

Running Model: BigBri/CodeSimPO On Human Eval With Settings: {'human_eval_path': '/content/human-eval/human_eval/../data/HumanEval.jsonl.gz', 'pipeline_config': {'model_name_or_path': 'BigBri/CodeSimPO', 'task': 'text-generation', 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'max_new_tokens': 256, 'truncation': True}, 'eval_save_dir': './eval_results', 'num_samples_per_task': 4, 'batch_size': 16}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 0/164 [00:00<?, ?it/s]

Starting Question: 1...



  1%|          | 1/164 [00:13<35:57, 13.23s/it]

Iteration 1: 13.232941 seconds

Starting Question: 2...



  1%|          | 2/164 [00:34<48:04, 17.80s/it]

Iteration 2: 20.999081 seconds

Starting Question: 3...



  2%|▏         | 3/164 [00:59<56:23, 21.02s/it]

Iteration 3: 24.843461 seconds

Starting Question: 4...



  2%|▏         | 4/164 [01:17<53:47, 20.17s/it]

Iteration 4: 18.867789 seconds

Saved 16 samples to ./eval_results/samples_1.jsonl
Starting Question: 5...



  3%|▎         | 5/164 [01:52<1:07:37, 25.52s/it]

Iteration 5: 35.002828 seconds

Starting Question: 6...



  4%|▎         | 6/164 [02:00<51:09, 19.42s/it]  

Iteration 6: 7.592459 seconds

Starting Question: 7...



  4%|▍         | 7/164 [02:26<56:23, 21.55s/it]

Iteration 7: 25.921053 seconds

Starting Question: 8...



  5%|▍         | 8/164 [02:31<41:57, 16.14s/it]

Iteration 8: 4.549018 seconds

Saved 16 samples to ./eval_results/samples_2.jsonl
Starting Question: 9...



  5%|▌         | 9/164 [02:44<39:48, 15.41s/it]

Iteration 9: 13.812417 seconds

Starting Question: 10...



  6%|▌         | 10/164 [03:05<43:23, 16.90s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Iteration 10: 20.241650 seconds

Starting Question: 11...



  7%|▋         | 11/164 [03:48<1:03:56, 25.08s/it]

Iteration 11: 43.614546 seconds

Starting Question: 12...



  7%|▋         | 12/164 [04:01<53:42, 21.20s/it]  

Iteration 12: 12.330942 seconds

Saved 16 samples to ./eval_results/samples_3.jsonl
Starting Question: 13...



  8%|▊         | 13/164 [04:22<53:28, 21.25s/it]

Iteration 13: 21.349419 seconds

Starting Question: 14...



  9%|▊         | 14/164 [04:39<49:58, 19.99s/it]

Iteration 14: 17.087850 seconds

Starting Question: 15...



  9%|▉         | 15/164 [04:54<45:56, 18.50s/it]

Iteration 15: 15.046946 seconds

Starting Question: 16...



 10%|▉         | 16/164 [05:19<50:33, 20.49s/it]

Iteration 16: 25.116217 seconds

Saved 16 samples to ./eval_results/samples_4.jsonl
Starting Question: 17...



 10%|█         | 17/164 [05:28<41:21, 16.88s/it]

Iteration 17: 8.486318 seconds

Starting Question: 18...



 11%|█         | 18/164 [05:48<43:25, 17.84s/it]

Iteration 18: 20.077316 seconds

Starting Question: 19...



 12%|█▏        | 19/164 [06:08<45:11, 18.70s/it]

Iteration 19: 20.689270 seconds

Starting Question: 20...



 12%|█▏        | 20/164 [06:28<45:35, 18.99s/it]

Iteration 20: 19.678478 seconds

Saved 16 samples to ./eval_results/samples_5.jsonl
Starting Question: 21...



 13%|█▎        | 21/164 [06:47<45:32, 19.11s/it]

Iteration 21: 19.368166 seconds

Starting Question: 22...



 13%|█▎        | 22/164 [07:05<43:46, 18.50s/it]

Iteration 22: 17.076351 seconds

Starting Question: 23...



 14%|█▍        | 23/164 [07:11<34:40, 14.75s/it]

Iteration 23: 6.020065 seconds

Starting Question: 24...



 15%|█▍        | 24/164 [07:31<38:44, 16.60s/it]

Iteration 24: 20.919581 seconds

Saved 16 samples to ./eval_results/samples_6.jsonl
Starting Question: 25...



 15%|█▌        | 25/164 [07:42<34:20, 14.82s/it]

Iteration 25: 10.664729 seconds

Starting Question: 26...



 16%|█▌        | 26/164 [07:55<32:57, 14.33s/it]

Iteration 26: 13.188898 seconds

Starting Question: 27...



 16%|█▋        | 27/164 [08:02<27:45, 12.16s/it]

Iteration 27: 7.081616 seconds

Starting Question: 28...



 17%|█▋        | 28/164 [08:35<41:19, 18.23s/it]

Iteration 28: 32.393879 seconds

Saved 16 samples to ./eval_results/samples_7.jsonl
Starting Question: 29...



 18%|█▊        | 29/164 [08:42<33:33, 14.91s/it]

Iteration 29: 7.172671 seconds

Starting Question: 30...



 18%|█▊        | 30/164 [08:50<28:42, 12.85s/it]

Iteration 30: 8.038248 seconds

Starting Question: 31...



 19%|█▉        | 31/164 [09:05<29:40, 13.39s/it]

Iteration 31: 14.634065 seconds

Starting Question: 32...



 20%|█▉        | 32/164 [09:21<31:23, 14.27s/it]

Iteration 32: 16.332397 seconds

Saved 16 samples to ./eval_results/samples_8.jsonl
Starting Question: 33...



 20%|██        | 33/164 [09:48<39:12, 17.96s/it]

Iteration 33: 26.555842 seconds

Starting Question: 34...



 21%|██        | 34/164 [10:00<35:27, 16.36s/it]

Iteration 34: 12.638953 seconds

Starting Question: 35...



 21%|██▏       | 35/164 [10:26<40:59, 19.06s/it]

Iteration 35: 25.360777 seconds

Starting Question: 36...



 22%|██▏       | 36/164 [10:33<32:58, 15.46s/it]

Iteration 36: 7.041100 seconds

Saved 16 samples to ./eval_results/samples_9.jsonl
Starting Question: 37...



 23%|██▎       | 37/164 [10:50<33:58, 16.05s/it]

Iteration 37: 17.439096 seconds

Starting Question: 38...



 23%|██▎       | 38/164 [11:05<32:47, 15.61s/it]

Iteration 38: 14.590388 seconds

Starting Question: 39...



 24%|██▍       | 39/164 [11:37<42:43, 20.51s/it]

Iteration 39: 31.931858 seconds

Starting Question: 40...



 24%|██▍       | 40/164 [12:06<47:49, 23.14s/it]

Iteration 40: 29.268146 seconds

Saved 16 samples to ./eval_results/samples_10.jsonl
Starting Question: 41...



 25%|██▌       | 41/164 [12:31<48:24, 23.61s/it]

Iteration 41: 24.726450 seconds

Starting Question: 42...



 26%|██▌       | 42/164 [12:58<50:17, 24.73s/it]

Iteration 42: 27.344015 seconds

Starting Question: 43...



 26%|██▌       | 43/164 [13:14<44:54, 22.27s/it]

Iteration 43: 16.520221 seconds

Starting Question: 44...



 27%|██▋       | 44/164 [13:39<45:39, 22.83s/it]

Iteration 44: 24.137638 seconds

Saved 16 samples to ./eval_results/samples_11.jsonl
Starting Question: 45...



 27%|██▋       | 45/164 [14:16<54:00, 27.24s/it]

Iteration 45: 37.509772 seconds

Starting Question: 46...



 28%|██▊       | 46/164 [14:47<55:47, 28.37s/it]

Iteration 46: 31.010226 seconds

Starting Question: 47...



 29%|██▊       | 47/164 [15:25<1:00:41, 31.13s/it]

Iteration 47: 37.562035 seconds

Starting Question: 48...



 29%|██▉       | 48/164 [15:49<56:12, 29.07s/it]  

Iteration 48: 24.278131 seconds

Saved 16 samples to ./eval_results/samples_12.jsonl
Starting Question: 49...



 30%|██▉       | 49/164 [16:09<50:45, 26.48s/it]

Iteration 49: 20.426873 seconds

Starting Question: 50...



 30%|███       | 50/164 [16:39<52:12, 27.47s/it]

Iteration 50: 29.794497 seconds

Starting Question: 51...



 31%|███       | 51/164 [17:06<51:32, 27.37s/it]

Iteration 51: 27.124195 seconds

Starting Question: 52...



 32%|███▏      | 52/164 [17:22<44:45, 23.98s/it]

Iteration 52: 16.057060 seconds

Saved 16 samples to ./eval_results/samples_13.jsonl
Starting Question: 53...



 32%|███▏      | 53/164 [17:43<42:40, 23.07s/it]

Iteration 53: 20.952916 seconds

Starting Question: 54...



 33%|███▎      | 54/164 [18:08<43:08, 23.53s/it]

Iteration 54: 24.597168 seconds

Starting Question: 55...



 34%|███▎      | 55/164 [18:14<33:18, 18.34s/it]

Iteration 55: 6.224101 seconds

Starting Question: 56...



 34%|███▍      | 56/164 [18:35<34:09, 18.97s/it]

Iteration 56: 20.453086 seconds

Saved 16 samples to ./eval_results/samples_14.jsonl
Starting Question: 57...



 35%|███▍      | 57/164 [19:08<41:43, 23.40s/it]

Iteration 57: 33.734632 seconds

Starting Question: 58...



 35%|███▌      | 58/164 [19:34<42:46, 24.21s/it]

Iteration 58: 26.093784 seconds

Starting Question: 59...



 36%|███▌      | 59/164 [20:04<45:06, 25.78s/it]

Iteration 59: 29.433439 seconds

Starting Question: 60...



 37%|███▋      | 60/164 [20:38<48:49, 28.17s/it]

Iteration 60: 33.739434 seconds

Saved 16 samples to ./eval_results/samples_15.jsonl
Starting Question: 61...



 37%|███▋      | 61/164 [21:04<47:29, 27.67s/it]

Iteration 61: 26.495071 seconds

Starting Question: 62...



 38%|███▊      | 62/164 [21:31<46:49, 27.54s/it]

Iteration 62: 27.254422 seconds

Starting Question: 63...



 38%|███▊      | 63/164 [21:53<43:26, 25.81s/it]

Iteration 63: 21.770584 seconds

Starting Question: 64...



 39%|███▉      | 64/164 [22:21<44:11, 26.52s/it]

Iteration 64: 28.170901 seconds

Saved 16 samples to ./eval_results/samples_16.jsonl
Starting Question: 65...



 40%|███▉      | 65/164 [22:46<42:43, 25.90s/it]

Iteration 65: 24.438480 seconds

Starting Question: 66...



 40%|████      | 66/164 [23:15<44:09, 27.04s/it]

Iteration 66: 29.696511 seconds

Starting Question: 67...



 41%|████      | 67/164 [23:53<48:43, 30.14s/it]

Iteration 67: 37.392217 seconds

Starting Question: 68...



 41%|████▏     | 68/164 [24:22<47:40, 29.80s/it]

Iteration 68: 28.996681 seconds

Saved 16 samples to ./eval_results/samples_17.jsonl
Starting Question: 69...



 42%|████▏     | 69/164 [24:35<39:07, 24.71s/it]

Iteration 69: 12.821248 seconds

Starting Question: 70...



 43%|████▎     | 70/164 [25:00<39:11, 25.01s/it]

Iteration 70: 25.725814 seconds

Starting Question: 71...



 43%|████▎     | 71/164 [25:22<37:09, 23.97s/it]

Iteration 71: 21.527803 seconds

Starting Question: 72...



 44%|████▍     | 72/164 [25:55<41:10, 26.85s/it]

Iteration 72: 33.567102 seconds

Saved 16 samples to ./eval_results/samples_18.jsonl
Starting Question: 73...



 45%|████▍     | 73/164 [26:28<43:30, 28.69s/it]

Iteration 73: 32.978038 seconds

Starting Question: 74...



 45%|████▌     | 74/164 [27:08<47:59, 31.99s/it]

Iteration 74: 39.694879 seconds

Starting Question: 75...



 46%|████▌     | 75/164 [27:37<46:05, 31.07s/it]

Iteration 75: 28.936306 seconds

Starting Question: 76...



 46%|████▋     | 76/164 [28:03<43:22, 29.57s/it]

Iteration 76: 26.061657 seconds

Saved 16 samples to ./eval_results/samples_19.jsonl
Starting Question: 77...



 47%|████▋     | 77/164 [28:47<48:56, 33.76s/it]

Iteration 77: 43.517851 seconds

Starting Question: 78...



 48%|████▊     | 78/164 [29:16<46:41, 32.58s/it]

Iteration 78: 29.835670 seconds

Starting Question: 79...



 48%|████▊     | 79/164 [29:46<44:55, 31.71s/it]

Iteration 79: 29.675720 seconds

Starting Question: 80...



 49%|████▉     | 80/164 [30:20<45:09, 32.26s/it]

Iteration 80: 33.544357 seconds

Saved 16 samples to ./eval_results/samples_20.jsonl
Starting Question: 81...



 49%|████▉     | 81/164 [30:55<45:47, 33.11s/it]

Iteration 81: 35.077659 seconds

Starting Question: 82...



 50%|█████     | 82/164 [31:24<43:38, 31.93s/it]

Iteration 82: 29.193422 seconds

Starting Question: 83...



 51%|█████     | 83/164 [32:01<45:06, 33.42s/it]

Iteration 83: 36.880283 seconds

Starting Question: 84...



 51%|█████     | 84/164 [32:35<44:41, 33.52s/it]

Iteration 84: 33.761246 seconds

Saved 16 samples to ./eval_results/samples_21.jsonl
Starting Question: 85...



 52%|█████▏    | 85/164 [32:51<37:25, 28.42s/it]

Iteration 85: 16.522682 seconds

Starting Question: 86...



 52%|█████▏    | 86/164 [33:19<36:33, 28.12s/it]

Iteration 86: 27.426070 seconds

Starting Question: 87...



 53%|█████▎    | 87/164 [33:51<37:39, 29.35s/it]

Iteration 87: 32.198344 seconds

Starting Question: 88...



 54%|█████▎    | 88/164 [34:24<38:28, 30.38s/it]

Iteration 88: 32.781423 seconds

Saved 16 samples to ./eval_results/samples_22.jsonl
Starting Question: 89...



 54%|█████▍    | 89/164 [34:53<37:41, 30.15s/it]

Iteration 89: 29.608957 seconds

Starting Question: 90...



 55%|█████▍    | 90/164 [35:32<40:27, 32.80s/it]

Iteration 90: 38.981662 seconds

Starting Question: 91...



 55%|█████▌    | 91/164 [36:03<39:21, 32.36s/it]

Iteration 91: 31.319923 seconds

Starting Question: 92...



 56%|█████▌    | 92/164 [36:26<35:06, 29.26s/it]

Iteration 92: 22.038037 seconds

Saved 16 samples to ./eval_results/samples_23.jsonl
Starting Question: 93...



 57%|█████▋    | 93/164 [36:49<32:36, 27.55s/it]

Iteration 93: 23.557141 seconds

Starting Question: 94...



 57%|█████▋    | 94/164 [37:21<33:50, 29.01s/it]

Iteration 94: 32.401369 seconds

Starting Question: 95...



 58%|█████▊    | 95/164 [38:05<38:29, 33.46s/it]

Iteration 95: 43.866901 seconds

Starting Question: 96...



 59%|█████▊    | 96/164 [38:42<39:02, 34.45s/it]

Iteration 96: 36.743784 seconds

Saved 16 samples to ./eval_results/samples_24.jsonl
Starting Question: 97...



 59%|█████▉    | 97/164 [39:05<34:32, 30.93s/it]

Iteration 97: 22.722420 seconds

Starting Question: 98...



 60%|█████▉    | 98/164 [39:40<35:22, 32.15s/it]

Iteration 98: 35.000448 seconds

Starting Question: 99...



 60%|██████    | 99/164 [40:11<34:38, 31.97s/it]

Iteration 99: 31.555292 seconds

Starting Question: 100...



 61%|██████    | 100/164 [40:49<35:56, 33.69s/it]

Iteration 100: 37.701146 seconds

Saved 16 samples to ./eval_results/samples_25.jsonl
Starting Question: 101...



 62%|██████▏   | 101/164 [41:06<30:08, 28.71s/it]

Iteration 101: 17.091236 seconds

Starting Question: 102...



 62%|██████▏   | 102/164 [41:32<28:46, 27.84s/it]

Iteration 102: 25.812859 seconds

Starting Question: 103...



 63%|██████▎   | 103/164 [42:09<31:08, 30.64s/it]

Iteration 103: 37.160434 seconds

Starting Question: 104...



 63%|██████▎   | 104/164 [42:30<27:35, 27.59s/it]

Iteration 104: 20.489829 seconds

Saved 16 samples to ./eval_results/samples_26.jsonl
Starting Question: 105...



 64%|██████▍   | 105/164 [42:39<21:36, 21.98s/it]

Iteration 105: 8.866983 seconds

Starting Question: 106...



 65%|██████▍   | 106/164 [43:03<21:50, 22.60s/it]

Iteration 106: 24.053581 seconds

Starting Question: 107...



 65%|██████▌   | 107/164 [43:26<21:39, 22.80s/it]

Iteration 107: 23.253731 seconds

Starting Question: 108...



 66%|██████▌   | 108/164 [43:49<21:27, 22.99s/it]

Iteration 108: 23.454307 seconds

Saved 16 samples to ./eval_results/samples_27.jsonl
Starting Question: 109...



 66%|██████▋   | 109/164 [44:22<23:40, 25.84s/it]

Iteration 109: 32.463836 seconds

Starting Question: 110...



 67%|██████▋   | 110/164 [44:58<25:58, 28.87s/it]

Iteration 110: 35.946005 seconds

Starting Question: 111...



 68%|██████▊   | 111/164 [45:19<23:36, 26.72s/it]

Iteration 111: 21.719175 seconds

Starting Question: 112...



 68%|██████▊   | 112/164 [45:55<25:26, 29.36s/it]

Iteration 112: 35.490601 seconds

Saved 16 samples to ./eval_results/samples_28.jsonl
Starting Question: 113...



 69%|██████▉   | 113/164 [46:18<23:24, 27.54s/it]

Iteration 113: 23.288514 seconds

Starting Question: 114...



 70%|██████▉   | 114/164 [46:52<24:38, 29.56s/it]

Iteration 114: 34.294026 seconds

Starting Question: 115...



 70%|███████   | 115/164 [47:33<26:51, 32.88s/it]

Iteration 115: 40.620134 seconds

Starting Question: 116...



 71%|███████   | 116/164 [47:56<24:00, 30.01s/it]

Iteration 116: 23.312687 seconds

Saved 16 samples to ./eval_results/samples_29.jsonl
Starting Question: 117...



 71%|███████▏  | 117/164 [48:18<21:31, 27.48s/it]

Iteration 117: 21.570102 seconds

Starting Question: 118...



 72%|███████▏  | 118/164 [48:59<24:06, 31.45s/it]

Iteration 118: 40.702394 seconds

Starting Question: 119...



 73%|███████▎  | 119/164 [49:35<24:34, 32.77s/it]

Iteration 119: 35.873104 seconds

Starting Question: 120...



 73%|███████▎  | 120/164 [50:00<22:25, 30.59s/it]

Iteration 120: 25.482486 seconds

Saved 16 samples to ./eval_results/samples_30.jsonl
Starting Question: 121...



 74%|███████▍  | 121/164 [50:20<19:36, 27.35s/it]

Iteration 121: 19.805762 seconds

Starting Question: 122...



 74%|███████▍  | 122/164 [50:53<20:21, 29.07s/it]

Iteration 122: 33.085412 seconds

Starting Question: 123...



 75%|███████▌  | 123/164 [51:19<19:13, 28.14s/it]

Iteration 123: 25.957552 seconds

Starting Question: 124...



 76%|███████▌  | 124/164 [51:45<18:18, 27.46s/it]

Iteration 124: 25.871247 seconds

Saved 16 samples to ./eval_results/samples_31.jsonl
Starting Question: 125...



 76%|███████▌  | 125/164 [52:09<17:12, 26.47s/it]

Iteration 125: 24.152942 seconds

Starting Question: 126...



 77%|███████▋  | 126/164 [52:38<17:17, 27.31s/it]

Iteration 126: 29.259833 seconds

Starting Question: 127...



 77%|███████▋  | 127/164 [53:07<17:08, 27.81s/it]

Iteration 127: 28.978160 seconds

Starting Question: 128...



 78%|███████▊  | 128/164 [53:46<18:34, 30.97s/it]

Iteration 128: 38.342485 seconds

Saved 16 samples to ./eval_results/samples_32.jsonl
Starting Question: 129...



 79%|███████▊  | 129/164 [54:16<17:56, 30.76s/it]

Iteration 129: 30.253537 seconds

Starting Question: 130...



 79%|███████▉  | 130/164 [54:41<16:26, 29.01s/it]

Iteration 130: 24.951591 seconds

Starting Question: 131...



 80%|███████▉  | 131/164 [54:55<13:31, 24.59s/it]

Iteration 131: 14.271507 seconds

Starting Question: 132...



 80%|████████  | 132/164 [55:16<12:29, 23.42s/it]

Iteration 132: 20.691833 seconds

Saved 16 samples to ./eval_results/samples_33.jsonl
Starting Question: 133...



 81%|████████  | 133/164 [55:45<13:02, 25.23s/it]

Iteration 133: 29.454242 seconds

Starting Question: 134...



 82%|████████▏ | 134/164 [56:12<12:49, 25.65s/it]

Iteration 134: 26.610802 seconds

Starting Question: 135...



 82%|████████▏ | 135/164 [56:36<12:10, 25.18s/it]

Iteration 135: 24.075405 seconds

Starting Question: 136...



 83%|████████▎ | 136/164 [57:13<13:26, 28.81s/it]

Iteration 136: 37.296754 seconds

Saved 16 samples to ./eval_results/samples_34.jsonl
Starting Question: 137...



 84%|████████▎ | 137/164 [57:44<13:15, 29.47s/it]

Iteration 137: 30.989650 seconds

Starting Question: 138...



 84%|████████▍ | 138/164 [58:23<13:58, 32.25s/it]

Iteration 138: 38.730916 seconds

Starting Question: 139...



 85%|████████▍ | 139/164 [58:46<12:17, 29.49s/it]

Iteration 139: 23.057556 seconds

Starting Question: 140...



 85%|████████▌ | 140/164 [59:23<12:39, 31.64s/it]

Iteration 140: 36.655959 seconds

Saved 16 samples to ./eval_results/samples_35.jsonl
Starting Question: 141...



 86%|████████▌ | 141/164 [59:39<10:25, 27.21s/it]

Iteration 141: 16.884680 seconds

Starting Question: 142...



 87%|████████▋ | 142/164 [59:48<07:53, 21.52s/it]

Iteration 142: 8.219370 seconds

Starting Question: 143...



 87%|████████▋ | 143/164 [1:00:20<08:37, 24.63s/it]

Iteration 143: 31.889245 seconds

Starting Question: 144...



 88%|████████▊ | 144/164 [1:00:41<07:52, 23.61s/it]

Iteration 144: 21.230031 seconds

Saved 16 samples to ./eval_results/samples_36.jsonl
Starting Question: 145...



 88%|████████▊ | 145/164 [1:01:09<07:56, 25.06s/it]

Iteration 145: 28.451581 seconds

Starting Question: 146...



 89%|████████▉ | 146/164 [1:01:40<08:03, 26.85s/it]

Iteration 146: 31.022415 seconds

Starting Question: 147...



 90%|████████▉ | 147/164 [1:02:03<07:15, 25.61s/it]

Iteration 147: 22.696783 seconds

Starting Question: 148...



 90%|█████████ | 148/164 [1:02:41<07:49, 29.35s/it]

Iteration 148: 38.100991 seconds

Saved 16 samples to ./eval_results/samples_37.jsonl
Starting Question: 149...



 91%|█████████ | 149/164 [1:03:18<07:53, 31.59s/it]

Iteration 149: 36.799148 seconds

Starting Question: 150...



 91%|█████████▏| 150/164 [1:03:35<06:22, 27.36s/it]

Iteration 150: 17.477472 seconds

Starting Question: 151...



 92%|█████████▏| 151/164 [1:03:47<04:53, 22.56s/it]

Iteration 151: 11.379169 seconds

Starting Question: 152...



 93%|█████████▎| 152/164 [1:04:23<05:21, 26.79s/it]

Iteration 152: 36.639931 seconds

Saved 16 samples to ./eval_results/samples_38.jsonl
Starting Question: 153...



 93%|█████████▎| 153/164 [1:04:51<04:58, 27.17s/it]

Iteration 153: 28.058319 seconds

Starting Question: 154...



 94%|█████████▍| 154/164 [1:05:32<05:12, 31.28s/it]

Iteration 154: 40.870505 seconds

Starting Question: 155...



 95%|█████████▍| 155/164 [1:06:11<05:01, 33.53s/it]

Iteration 155: 38.790142 seconds

Starting Question: 156...



 95%|█████████▌| 156/164 [1:06:27<03:45, 28.23s/it]

Iteration 156: 15.838564 seconds

Saved 16 samples to ./eval_results/samples_39.jsonl
Starting Question: 157...



 96%|█████████▌| 157/164 [1:07:11<03:50, 32.98s/it]

Iteration 157: 44.059836 seconds

Starting Question: 158...



 96%|█████████▋| 158/164 [1:07:30<02:53, 28.88s/it]

Iteration 158: 19.327648 seconds

Starting Question: 159...



 97%|█████████▋| 159/164 [1:07:49<02:08, 25.72s/it]

Iteration 159: 18.333005 seconds

Starting Question: 160...



 98%|█████████▊| 160/164 [1:08:09<01:36, 24.08s/it]

Iteration 160: 20.255719 seconds

Saved 16 samples to ./eval_results/samples_40.jsonl
Starting Question: 161...



 98%|█████████▊| 161/164 [1:08:42<01:20, 26.73s/it]

Iteration 161: 32.906342 seconds

Starting Question: 162...



 99%|█████████▉| 162/164 [1:09:02<00:49, 24.74s/it]

Iteration 162: 20.086058 seconds

Starting Question: 163...



 99%|█████████▉| 163/164 [1:09:30<00:25, 25.64s/it]

Iteration 163: 27.744299 seconds

Starting Question: 164...



100%|██████████| 164/164 [1:10:03<00:00, 25.63s/it]

Iteration 164: 33.564172 seconds

Saved 16 samples to ./eval_results/samples_41.jsonl





In [7]:
# Merge all samples_*.jsonl shards from EVAL_SAVE_DIR into MERGE_OUTPUT.
# Both paths are relative, so they resolve against the current working
# directory (the cloned human-eval repo, because of the earlier `%cd`).
merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)

Merged 41 files into merged_samples.jsonl


In [8]:
import json

def _extract_generated_text(completion) -> str:
    """Reduce one stored ``completion`` value to a plain string.

    Accepts an already-flat string, a single ``{"generated_text": ...}`` dict,
    or a list whose first item is such a dict (or a string). Anything else,
    including a missing value or an empty list, yields ``""``.
    """
    if isinstance(completion, str):
        return completion
    if isinstance(completion, dict):
        completion = [completion]
    if isinstance(completion, list) and completion:
        first = completion[0]
        if isinstance(first, dict):
            return first.get("generated_text", "")
        if isinstance(first, str):
            return first
    return ""


def transform_jsonl(input_file: str, output_file: str):
    """
    Transforms each line from:
        {
          "task_id": "...",
          "completion": [{"generated_text": "some string ..."}]
        }
    into:
        {
          "task_id": "...",
          "completion": "some string ..."
        }
    and writes to a new JSONL file.

    A ``completion`` that is already a string is kept unchanged, so running
    the transform on an already-cleaned file does not erase its contents.
    Blank lines in the input are skipped. Records with a missing, empty or
    unrecognized ``completion`` get ``""``. Only the first entry of a
    completion list is used.

    All records are read before the output is opened, so ``input_file`` and
    ``output_file`` may be the same path.
    """
    transformed_records = []

    # 1. Read each non-blank line as JSON
    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                continue
            data = json.loads(line)

            # 2. Replace the 'completion' field with just the string
            data["completion"] = _extract_generated_text(data.get("completion"))

            transformed_records.append(data)

    # 3. Write the new structure to a JSONL output file
    with open(output_file, "w", encoding="utf-8") as fout:
        for record in transformed_records:
            fout.write(json.dumps(record) + "\n")


In [9]:
# Flatten the merged samples so each "completion" is a plain string.
# NOTE(review): these absolute paths assume the Colab layout, where the repo
# is cloned to /content/human-eval; adjust them when running elsewhere.
input_path = "/content/human-eval/merged_samples.jsonl"
output_path = "/content/human-eval/clean_merged_samples.jsonl"

transform_jsonl(input_path, output_path)
print(f"Transformed JSONL saved to: {output_path}")

Transformed JSONL saved to: /content/human-eval/clean_merged_samples.jsonl


In [10]:
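# Score the cleaned samples with human-eval's `evaluate_functional_correctness` CLI,
# run from the repo root so the relative file name resolves; it reports pass@k.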
%cd /content/human-eval/
!evaluate_functional_correctness clean_merged_samples.jsonl

/content/human-eval
Reading samples...
656it [00:00, 19208.90it/s]
Running test suites...
100% 656/656 [00:09<00:00, 67.86it/s]
Writing results to clean_merged_samples.jsonl_results.jsonl...
100% 656/656 [00:00<00:00, 44804.81it/s]
{'pass@1': 0.1173780487804878}
