<a href="https://colab.research.google.com/github/Bri636/ml-programming-winter-2025/blob/main/Eval_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies (Colab): clone OpenAI's human-eval harness, install it in
# editable mode, then install the libraries used for generation.
# NOTE: `git clone` reports "destination path ... already exists" when this cell is
# re-run in the same session; the remaining commands still execute.
# NOTE(review): package versions are not pinned, so results may vary between runs.
!git clone https://github.com/openai/human-eval.git
# Every later relative path (EVAL_SAVE_DIR, MERGE_OUTPUT) is resolved from this directory.
%cd human-eval/
!pip install -e .
!pip install transformers
!pip install pydantic
!pip install torch
!pip install bitsandbytes

fatal: destination path 'human-eval' already exists and is not an empty directory.
/content/human-eval
Obtaining file:///content/human-eval
  Preparing metadata (setup.py) ... [?25l[?25hdone
Installing collected packages: human-eval
  Attempting uninstall: human-eval
    Found existing installation: human-eval 1.0
    Uninstalling human-eval-1.0:
      Successfully uninstalled human-eval-1.0
  Running setup.py develop for human-eval
Successfully installed human-eval-1.0


In [2]:
""" Evaluation on Human Eval """

from __future__ import annotations
from typing import TypedDict, Dict, List, Any, Union, TypeVar
from transformers import pipeline
from transformers.pipelines import Pipeline
from functools import partial
from pydantic import Field, BaseModel
from tqdm import tqdm
import timeit
import os, glob
import json, yaml
from pathlib import Path
import torch
# submods
from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL

In [3]:
# Maps the dtype name stored in HFGeneratorConfig.torch_dtype to the torch dtype
# object handed to the Hugging Face pipeline in main().
_DTYPES={
    'bfloat16': torch.bfloat16
}
# Type variable used by the BaseConfig.from_json / from_yaml classmethods so they
# are annotated as returning the concrete subclass they are called on.
T = TypeVar('T')
# Anything accepted as a filesystem path by the config (de)serialization helpers.
PathLike = Union[Path, str]
# MODEL_NAME_OR_PATH = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
# Hugging Face model id (or local path) evaluated by main().
MODEL_NAME_OR_PATH="zhaviraax/qwen2-3332"
# Directory (relative to the current working directory) where evaluate() writes
# its samples_<N>.jsonl batch files and where merge_jsonl_files() looks for them.
EVAL_SAVE_DIR = "./eval_results"
# When True, main() only merges existing batch files and skips generation.
MERGE = False
# Output path of the merged JSONL file (relative to the current working directory).
MERGE_OUTPUT = "merged_samples.jsonl"


In [4]:
class BaseConfig(BaseModel):
    """An interface to add JSON/YAML serialization to Pydantic models.

    All files are read and written as UTF-8 so the result does not depend on
    the platform's default encoding.
    """

    # A name literal to correctly identify and construct nested models
    # which have many possible options.
    # name: Literal[''] = ''

    def write_json(self, path: PathLike) -> None:
        """Write the model to a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            # mode='json' converts non-JSON-native field values (e.g. Path,
            # datetime) to JSON-compatible types, matching what write_yaml
            # achieves via model_dump_json().
            json.dump(self.model_dump(mode='json'), fp, indent=2)

    @classmethod
    def from_json(cls: type[T], path: PathLike) -> T:
        """Load the model from a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            data = json.load(fp)
        return cls(**data)

    def write_yaml(self, path: PathLike) -> None:
        """Write the model to a YAML file.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            # Round-trip through JSON so only plain types reach the YAML dumper.
            yaml.dump(
                json.loads(self.model_dump_json()),
                fp,
                indent=4,
                sort_keys=False,
            )

    @classmethod
    def from_yaml(cls: type[T], path: PathLike) -> T:
        """Load the model from a YAML file.

        An empty YAML document is treated as "no overrides", so the model is
        built from its field defaults instead of failing on ``cls(**None)``.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            raw_data = yaml.safe_load(fp)
        if raw_data is None:
            # yaml.safe_load returns None for an empty document.
            raw_data = {}
        return cls(**raw_data)

class HumanEvalProblem(TypedDict):
    """One single problem in HumanEval.

    Used to annotate the values of the mapping returned by ``read_problems()``.
    Only ``prompt`` is read by this notebook's ``evaluate``; the remaining keys
    are declared for completeness.
    """
    # Problem identifier; also the key of the problems mapping.
    task_id: str
    # Text passed to the generation pipeline.
    prompt: str
    # presumably the name of the function under test -- not read in this notebook
    entry_point: str
    # presumably the reference implementation -- not read in this notebook
    canonical_solution: str
    # presumably the test code used by the harness -- not read in this notebook
    test: str

class HumanEvalSolution(TypedDict):
    """One instance of a solution to a HumanEval problem.

    This is the flat record shape produced by ``transform_jsonl`` and passed to
    ``evaluate_functional_correctness``.
    NOTE(review): ``evaluate`` stores the raw pipeline output under
    ``completion`` (per ``transform_jsonl``'s docstring a list such as
    ``[{"generated_text": ...}]``), which does not match ``str`` until the
    file has been transformed -- confirm before relying on this type.
    """
    # Identifier of the problem this completion answers.
    task_id: str
    # Generated text for the problem.
    completion: str

class HFGeneratorConfig(BaseConfig):
    """Settings for the Hugging Face generation pipeline built in ``main``."""
    # Model id or local path passed as `model=` to `pipeline(...)`.
    model_name_or_path: str = 'zhaviraax/qwen2-3332'
    # Pipeline task name passed as `task=`.
    task: str = 'text-generation'
    # Passed as `device_map=` to `pipeline(...)`.
    device_map: str = 'auto'
    # Name looked up in the module-level _DTYPES table to obtain the torch dtype.
    torch_dtype: str = 'bfloat16'
    # Forwarded on every pipeline call made by `evaluate`.
    max_new_tokens: int = 256
    # Forwarded on every pipeline call made by `evaluate`.
    truncation: bool = True

class EvaluationConfig(BaseConfig):
    """Top-level settings for one HumanEval generation run."""
    # Path to the HumanEval problem file; defaults to human_eval's HUMAN_EVAL path.
    human_eval_path: str = Field(default=HUMAN_EVAL)
    # Nested generation settings; a fresh HFGeneratorConfig per instance.
    pipeline_config: HFGeneratorConfig = Field(default_factory=HFGeneratorConfig)
    # Directory where `evaluate` writes its samples_<N>.jsonl batch files.
    eval_save_dir: str = Field(default=EVAL_SAVE_DIR)
    # Number of completions generated per problem (the prompt is repeated this many times).
    num_samples_per_task: int = Field(default=4)
    # `evaluate` flushes its buffer to disk once it holds at least this many
    # samples; the check runs once per problem, so with the defaults
    # (4 samples per task, batch_size 16) a file is written every 4 problems.
    batch_size: int = Field(default=16)

In [5]:
def format_time(seconds: float) -> str:
    """
    Converts time in seconds to a human-readable format (HH:MM:SS.mmm).

    The value is rounded to whole milliseconds *before* it is split into
    hours/minutes/seconds, so a value such as 59.9996 carries into the
    minutes field ("00:01:00.000") instead of rendering an impossible
    "00:00:60.000".

    Parameters
    ----------
    seconds : float
        Duration in seconds.

    Returns
    -------
    str
        The duration as ``HH:MM:SS.mmm`` (hours are not capped at 24).
    """
    total_ms = int(round(seconds * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, secs = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

def read_jsonl(file_path: str) -> list:
    """Read a JSON Lines file into a list of decoded records.

    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``json.JSONDecodeError``.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded ``.jsonl`` file.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.
    """
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():
                records.append(json.loads(line))
    return records

In [6]:
def evaluate(pipeline_func,
             problems: Dict[str, HumanEvalProblem],
             eval_config: EvaluationConfig):
    """ Evaluation of HFModel with batch saving

    For every problem the prompt is repeated ``num_samples_per_task`` times and
    sent to ``pipeline_func`` in one call. Each returned element is stored
    verbatim as ``{"task_id": ..., "completion": <element>}``; once at least
    ``batch_size`` samples are buffered they are written to
    ``<eval_save_dir>/samples_<N>.jsonl`` and the buffer is reset.

    File numbering continues after the highest index already present in the
    directory, so existing batch files are never overwritten, even when the
    existing numbering has gaps.

    Parameters
    ----------
    pipeline_func :
        Callable taking a list of prompts plus ``max_new_tokens`` and
        ``truncation`` keyword arguments and returning one item per prompt.
        NOTE(review): with a Hugging Face text-generation pipeline each item
        is presumably ``[{"generated_text": ...}]`` -- confirm before consuming.
    problems : Dict[str, HumanEvalProblem]
        Mapping of task id to problem; only the ``"prompt"`` key is read.
    eval_config : EvaluationConfig
        Supplies the save directory, samples per task, batch size and the
        per-call generation settings.
    """
    save_dir = eval_config.eval_save_dir
    os.makedirs(save_dir, exist_ok=True)  # Ensure save directory exists

    # Derive the next index from the largest existing one rather than from the
    # file count: counting collides with an existing file as soon as the
    # numbering has a gap (e.g. samples_1 deleted while samples_2..5 remain).
    existing_indices = []
    for existing_path in glob.glob(os.path.join(save_dir, "samples_*.jsonl")):
        stem = os.path.basename(existing_path)[len("samples_"):-len(".jsonl")]
        if stem.isascii() and stem.isdigit():
            existing_indices.append(int(stem))
    file_index = max(existing_indices, default=0) + 1
    if existing_indices:
        # Leftover batches are picked up by merge_jsonl_files and would be
        # scored together with this run, so make their presence visible.
        print(f"Found {len(existing_indices)} existing sample file(s) in {save_dir}; "
              f"new batches start at samples_{file_index}.jsonl")

    batch = []

    def _flush(label: str) -> None:
        """Write the buffered samples to the next numbered file and reset the buffer."""
        nonlocal file_index
        save_path = os.path.join(save_dir, f"samples_{file_index}.jsonl")
        write_jsonl(save_path, batch)
        print(f"Saved {len(batch)} {label} to {save_path}")
        batch.clear()
        file_index += 1

    for idx, (task_id, task_data) in tqdm(enumerate(problems.items()), total=len(problems)):
        print(f'Starting Question: {idx + 1}...\n')
        start_time = timeit.default_timer()

        # One pipeline call per problem: the same prompt repeated N times.
        batched_prompts = [task_data["prompt"]] * eval_config.num_samples_per_task
        completions = pipeline_func(
            batched_prompts,
            max_new_tokens=eval_config.pipeline_config.max_new_tokens,
            truncation=eval_config.pipeline_config.truncation
        )

        for completion in completions:
            batch.append({"task_id": task_id, "completion": completion})

        elapsed_time = timeit.default_timer() - start_time
        print(f"Iteration {idx + 1}: {elapsed_time:.6f} seconds\n")

        # Save every `batch_size` samples
        if len(batch) >= eval_config.batch_size:
            _flush("samples")

    # Save remaining batch if not empty
    if batch:
        _flush("final samples")

def merge_jsonl_files(input_dir: str, output_file: str):
    """
    Merges multiple JSONL files from a directory into a single JSONL file.

    Files matching ``samples_*.jsonl`` are concatenated in numeric order of
    their index (``samples_2`` before ``samples_10``); files whose suffix is
    not a plain number come last, ordered by path. Blank lines are dropped and
    every record is terminated with a newline, so a source file that lacks a
    trailing newline cannot fuse its last record with the next file's first.

    Parameters
    ----------
    input_dir : str
        Directory containing the ``samples_*.jsonl`` batch files.
    output_file : str
        Path of the merged file; it is overwritten if it exists.
    """
    def _batch_order(path: str):
        """Sort key: numeric batch index first, then non-numeric names by path."""
        stem = os.path.basename(path)[len("samples_"):-len(".jsonl")]
        if stem.isascii() and stem.isdigit():
            return (0, int(stem), path)
        return (1, 0, path)

    jsonl_files = sorted(
        glob.glob(os.path.join(input_dir, "samples_*.jsonl")),
        key=_batch_order,
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in jsonl_files:
            with open(file, 'r', encoding='utf-8') as infile:
                for line in infile:
                    if not line.strip():
                        continue
                    outfile.write(line if line.endswith("\n") else line + "\n")

    print(f"Merged {len(jsonl_files)} files into {output_file}")

def main():
    """Entry point: either merge existing batch files or run generation.

    When the module-level ``MERGE`` flag is set, the batch files in
    ``EVAL_SAVE_DIR`` are merged into ``MERGE_OUTPUT`` and nothing else
    happens. Otherwise a default ``EvaluationConfig`` is built, overridden
    with ``EVAL_SAVE_DIR`` and ``MODEL_NAME_OR_PATH``, the problems are loaded
    from ``eval_config.human_eval_path`` and ``evaluate`` is run with a
    Hugging Face pipeline.

    Raises
    ------
    ValueError
        If ``pipeline_config.torch_dtype`` has no entry in ``_DTYPES``.
    """
    # If MERGE is set, just merge and return
    if MERGE:
        merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)
        return

    # Otherwise, run the standard evaluation
    eval_config = EvaluationConfig()
    eval_config.eval_save_dir = EVAL_SAVE_DIR

    pipe_config = eval_config.pipeline_config
    pipe_config.model_name_or_path = MODEL_NAME_OR_PATH

    # Fail fast on an unknown dtype name: a silent None would make the
    # pipeline fall back to its own default dtype without any indication.
    if pipe_config.torch_dtype not in _DTYPES:
        raise ValueError(
            f"Unsupported torch_dtype {pipe_config.torch_dtype!r}; "
            f"expected one of {sorted(_DTYPES)}"
        )

    print(f'Running Model: {pipe_config.model_name_or_path} On Human Eval With Settings: {eval_config.model_dump()}')
    # Honour the configured problem file (it defaults to HUMAN_EVAL) so the
    # settings printed above describe what is actually evaluated.
    problems: Dict[str, HumanEvalProblem] = read_problems(eval_config.human_eval_path)
    pipe = pipeline(
        model=pipe_config.model_name_or_path,
        task=pipe_config.task,
        device_map=pipe_config.device_map,
        torch_dtype=_DTYPES[pipe_config.torch_dtype]
    )

    evaluate(pipe, problems, eval_config)

if __name__ == "__main__":
    main()

Running Model: zhaviraax/qwen2-3332 On Human Eval With Settings: {'human_eval_path': '/content/human-eval/human_eval/../data/HumanEval.jsonl.gz', 'pipeline_config': {'model_name_or_path': 'zhaviraax/qwen2-3332', 'task': 'text-generation', 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'max_new_tokens': 256, 'truncation': True}, 'eval_save_dir': './eval_results', 'num_samples_per_task': 4, 'batch_size': 16}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/807 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/265 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.32k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/80.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 0/164 [00:00<?, ?it/s]

Starting Question: 1...



  1%|          | 1/164 [00:26<1:10:45, 26.05s/it]

Iteration 1: 26.045892 seconds

Starting Question: 2...



  1%|          | 2/164 [00:59<1:22:38, 30.61s/it]

Iteration 2: 33.797075 seconds

Starting Question: 3...



  2%|▏         | 3/164 [01:38<1:32:06, 34.32s/it]

Iteration 3: 38.744018 seconds

Starting Question: 4...



  2%|▏         | 4/164 [02:03<1:21:24, 30.53s/it]

Iteration 4: 24.709398 seconds

Saved 16 samples to ./eval_results/samples_2.jsonl
Starting Question: 5...



  3%|▎         | 5/164 [02:33<1:21:02, 30.58s/it]

Iteration 5: 30.673816 seconds

Starting Question: 6...



  4%|▎         | 6/164 [02:45<1:03:31, 24.12s/it]

Iteration 6: 11.577579 seconds

Starting Question: 7...



  4%|▍         | 7/164 [03:19<1:11:11, 27.20s/it]

Iteration 7: 33.549686 seconds

Starting Question: 8...



  5%|▍         | 8/164 [03:26<54:24, 20.93s/it]  

Iteration 8: 7.477080 seconds

Saved 16 samples to ./eval_results/samples_3.jsonl
Starting Question: 9...



  5%|▌         | 9/164 [03:51<57:01, 22.07s/it]

Iteration 9: 24.597040 seconds

Starting Question: 10...



  6%|▌         | 10/164 [04:14<57:29, 22.40s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Iteration 10: 23.131762 seconds

Starting Question: 11...



  7%|▋         | 11/164 [04:46<1:04:25, 25.27s/it]

Iteration 11: 31.760773 seconds

Starting Question: 12...



  7%|▋         | 12/164 [05:11<1:04:28, 25.45s/it]

Iteration 12: 25.877519 seconds

Saved 16 samples to ./eval_results/samples_4.jsonl
Starting Question: 13...



  8%|▊         | 13/164 [05:43<1:08:49, 27.35s/it]

Iteration 13: 31.695291 seconds

Starting Question: 14...



  9%|▊         | 14/164 [06:30<1:23:10, 33.27s/it]

Iteration 14: 46.959394 seconds

Starting Question: 15...



  9%|▉         | 15/164 [06:50<1:12:47, 29.31s/it]

Iteration 15: 20.144501 seconds

Starting Question: 16...



 10%|▉         | 16/164 [07:29<1:19:06, 32.07s/it]

Iteration 16: 38.463446 seconds

Saved 16 samples to ./eval_results/samples_5.jsonl
Starting Question: 17...



 10%|█         | 17/164 [08:18<1:30:51, 37.09s/it]

Iteration 17: 48.755529 seconds

Starting Question: 18...



 11%|█         | 18/164 [08:55<1:30:46, 37.31s/it]

Iteration 18: 37.812039 seconds

Starting Question: 19...



 12%|█▏        | 19/164 [09:47<1:40:23, 41.54s/it]

Iteration 19: 51.402604 seconds

Starting Question: 20...



 12%|█▏        | 20/164 [10:14<1:29:46, 37.41s/it]

Iteration 20: 27.771364 seconds

Saved 16 samples to ./eval_results/samples_6.jsonl
Starting Question: 21...



 13%|█▎        | 21/164 [10:55<1:31:07, 38.24s/it]

Iteration 21: 40.169827 seconds

Starting Question: 22...



 13%|█▎        | 22/164 [11:19<1:20:44, 34.12s/it]

Iteration 22: 24.512834 seconds

Starting Question: 23...



 14%|█▍        | 23/164 [11:29<1:02:41, 26.68s/it]

Iteration 23: 9.327127 seconds

Starting Question: 24...



 15%|█▍        | 24/164 [12:13<1:14:28, 31.92s/it]

Iteration 24: 44.122671 seconds

Saved 16 samples to ./eval_results/samples_7.jsonl
Starting Question: 25...



 15%|█▌        | 25/164 [13:03<1:27:00, 37.56s/it]

Iteration 25: 50.718178 seconds

Starting Question: 26...



 16%|█▌        | 26/164 [13:32<1:20:03, 34.81s/it]

Iteration 26: 28.396175 seconds

Starting Question: 27...



 16%|█▋        | 27/164 [13:59<1:14:03, 32.44s/it]

Iteration 27: 26.901080 seconds

Starting Question: 28...



 17%|█▋        | 28/164 [14:47<1:24:18, 37.19s/it]

Iteration 28: 48.283369 seconds

Saved 16 samples to ./eval_results/samples_8.jsonl
Starting Question: 29...



 18%|█▊        | 29/164 [14:52<1:02:18, 27.70s/it]

Iteration 29: 5.534005 seconds

Starting Question: 30...



 18%|█▊        | 30/164 [15:01<49:01, 21.95s/it]  

Iteration 30: 8.553887 seconds

Starting Question: 31...



 19%|█▉        | 31/164 [15:31<54:08, 24.42s/it]

Iteration 31: 30.184698 seconds

Starting Question: 32...



 20%|█▉        | 32/164 [16:22<1:11:00, 32.28s/it]

Iteration 32: 50.596850 seconds

Saved 16 samples to ./eval_results/samples_9.jsonl
Starting Question: 33...



 20%|██        | 33/164 [16:56<1:11:51, 32.91s/it]

Iteration 33: 34.386350 seconds

Starting Question: 34...



 21%|██        | 34/164 [17:34<1:14:40, 34.46s/it]

Iteration 34: 38.090360 seconds

Starting Question: 35...



 21%|██▏       | 35/164 [17:55<1:05:30, 30.47s/it]

Iteration 35: 21.131720 seconds

Starting Question: 36...



 22%|██▏       | 36/164 [18:35<1:10:42, 33.15s/it]

Iteration 36: 39.399873 seconds

Saved 16 samples to ./eval_results/samples_10.jsonl
Starting Question: 37...



 23%|██▎       | 37/164 [19:06<1:08:50, 32.53s/it]

Iteration 37: 31.074419 seconds

Starting Question: 38...



 23%|██▎       | 38/164 [19:48<1:14:17, 35.37s/it]

Iteration 38: 42.014483 seconds

Starting Question: 39...



 24%|██▍       | 39/164 [20:23<1:13:19, 35.20s/it]

Iteration 39: 34.790243 seconds

Starting Question: 40...



 24%|██▍       | 40/164 [21:06<1:17:53, 37.69s/it]

Iteration 40: 43.503267 seconds

Saved 16 samples to ./eval_results/samples_11.jsonl
Starting Question: 41...



 25%|██▌       | 41/164 [21:47<1:19:16, 38.67s/it]

Iteration 41: 40.954383 seconds

Starting Question: 42...



 26%|██▌       | 42/164 [22:35<1:24:29, 41.56s/it]

Iteration 42: 48.290128 seconds

Starting Question: 43...



 26%|██▌       | 43/164 [23:06<1:16:54, 38.14s/it]

Iteration 43: 30.149903 seconds

Starting Question: 44...



 27%|██▋       | 44/164 [23:56<1:23:49, 41.91s/it]

Iteration 44: 50.729512 seconds

Saved 16 samples to ./eval_results/samples_12.jsonl
Starting Question: 45...



 27%|██▋       | 45/164 [24:09<1:05:31, 33.04s/it]

Iteration 45: 12.333890 seconds

Starting Question: 46...



 28%|██▊       | 46/164 [24:32<59:08, 30.07s/it]  

Iteration 46: 23.131268 seconds

Starting Question: 47...



 29%|██▊       | 47/164 [25:19<1:08:31, 35.15s/it]

Iteration 47: 46.989904 seconds

Starting Question: 48...



 29%|██▉       | 48/164 [26:03<1:12:55, 37.72s/it]

Iteration 48: 43.713400 seconds

Saved 16 samples to ./eval_results/samples_13.jsonl
Starting Question: 49...



 30%|██▉       | 49/164 [26:39<1:11:33, 37.33s/it]

Iteration 49: 36.436405 seconds

Starting Question: 50...



 30%|███       | 50/164 [27:12<1:08:40, 36.14s/it]

Iteration 50: 33.367409 seconds

Starting Question: 51...



 31%|███       | 51/164 [28:02<1:15:44, 40.22s/it]

Iteration 51: 49.728481 seconds

Starting Question: 52...



 32%|███▏      | 52/164 [28:44<1:15:47, 40.61s/it]

Iteration 52: 41.507181 seconds

Saved 16 samples to ./eval_results/samples_14.jsonl
Starting Question: 53...



 32%|███▏      | 53/164 [29:22<1:14:02, 40.02s/it]

Iteration 53: 38.658639 seconds

Starting Question: 54...



 33%|███▎      | 54/164 [30:02<1:13:26, 40.06s/it]

Iteration 54: 40.147577 seconds

Starting Question: 55...



 34%|███▎      | 55/164 [30:42<1:12:16, 39.78s/it]

Iteration 55: 39.130746 seconds

Starting Question: 56...



 34%|███▍      | 56/164 [31:22<1:12:00, 40.00s/it]

Iteration 56: 40.518433 seconds

Saved 16 samples to ./eval_results/samples_15.jsonl
Starting Question: 57...



 35%|███▍      | 57/164 [32:00<1:10:26, 39.50s/it]

Iteration 57: 38.319879 seconds

Starting Question: 58...



 35%|███▌      | 58/164 [32:51<1:15:40, 42.84s/it]

Iteration 58: 50.620590 seconds

Starting Question: 59...



 36%|███▌      | 59/164 [33:33<1:14:38, 42.66s/it]

Iteration 59: 42.233108 seconds

Starting Question: 60...



 37%|███▋      | 60/164 [34:17<1:14:21, 42.90s/it]

Iteration 60: 43.478095 seconds

Saved 16 samples to ./eval_results/samples_16.jsonl
Starting Question: 61...



 37%|███▋      | 61/164 [35:03<1:15:12, 43.81s/it]

Iteration 61: 45.933195 seconds

Starting Question: 62...



 38%|███▊      | 62/164 [35:23<1:02:17, 36.65s/it]

Iteration 62: 19.923482 seconds

Starting Question: 63...



 38%|███▊      | 63/164 [35:51<57:45, 34.31s/it]  

Iteration 63: 28.855562 seconds

Starting Question: 64...



 39%|███▉      | 64/164 [36:35<1:01:41, 37.01s/it]

Iteration 64: 43.317553 seconds

Saved 16 samples to ./eval_results/samples_17.jsonl
Starting Question: 65...



 40%|███▉      | 65/164 [36:58<54:10, 32.83s/it]  

Iteration 65: 23.078511 seconds

Starting Question: 66...



 40%|████      | 66/164 [37:35<55:31, 33.99s/it]

Iteration 66: 36.702639 seconds

Starting Question: 67...



 41%|████      | 67/164 [38:15<58:05, 35.93s/it]

Iteration 67: 40.437542 seconds

Starting Question: 68...



 41%|████▏     | 68/164 [38:58<1:00:43, 37.95s/it]

Iteration 68: 42.664589 seconds

Saved 16 samples to ./eval_results/samples_18.jsonl
Starting Question: 69...



 42%|████▏     | 69/164 [39:48<1:06:02, 41.71s/it]

Iteration 69: 50.490090 seconds

Starting Question: 70...



 43%|████▎     | 70/164 [40:39<1:09:34, 44.41s/it]

Iteration 70: 50.694387 seconds

Starting Question: 71...



 43%|████▎     | 71/164 [41:27<1:10:34, 45.54s/it]

Iteration 71: 48.170967 seconds

Starting Question: 72...



 44%|████▍     | 72/164 [42:17<1:12:03, 46.99s/it]

Iteration 72: 50.382062 seconds

Saved 16 samples to ./eval_results/samples_19.jsonl
Starting Question: 73...



 45%|████▍     | 73/164 [42:51<1:05:22, 43.10s/it]

Iteration 73: 34.019722 seconds

Starting Question: 74...



 45%|████▌     | 74/164 [43:42<1:08:14, 45.49s/it]

Iteration 74: 51.073025 seconds

Starting Question: 75...



 46%|████▌     | 75/164 [44:28<1:07:36, 45.57s/it]

Iteration 75: 45.762553 seconds

Starting Question: 76...



 46%|████▋     | 76/164 [45:14<1:07:02, 45.71s/it]

Iteration 76: 46.008383 seconds

Saved 16 samples to ./eval_results/samples_20.jsonl
Starting Question: 77...



 47%|████▋     | 77/164 [45:58<1:05:37, 45.26s/it]

Iteration 77: 44.219152 seconds

Starting Question: 78...



 48%|████▊     | 78/164 [46:27<57:31, 40.13s/it]  

Iteration 78: 28.161376 seconds

Starting Question: 79...



 48%|████▊     | 79/164 [47:16<1:00:45, 42.89s/it]

Iteration 79: 49.311767 seconds

Starting Question: 80...



 49%|████▉     | 80/164 [47:46<54:51, 39.18s/it]  

Iteration 80: 30.537541 seconds

Saved 16 samples to ./eval_results/samples_21.jsonl
Starting Question: 81...



 49%|████▉     | 81/164 [48:37<59:04, 42.70s/it]

Iteration 81: 50.918368 seconds

Starting Question: 82...



 50%|█████     | 82/164 [49:26<1:00:41, 44.41s/it]

Iteration 82: 48.395300 seconds

Starting Question: 83...



 51%|█████     | 83/164 [50:03<57:00, 42.23s/it]  

Iteration 83: 37.134678 seconds

Starting Question: 84...



 51%|█████     | 84/164 [50:50<58:15, 43.69s/it]

Iteration 84: 47.093824 seconds

Saved 16 samples to ./eval_results/samples_22.jsonl
Starting Question: 85...



 52%|█████▏    | 85/164 [51:39<59:47, 45.42s/it]

Iteration 85: 49.444555 seconds

Starting Question: 86...



 52%|█████▏    | 86/164 [52:27<59:44, 45.96s/it]

Iteration 86: 47.213478 seconds

Starting Question: 87...



 53%|█████▎    | 87/164 [53:01<54:33, 42.51s/it]

Iteration 87: 34.476362 seconds

Starting Question: 88...



 54%|█████▎    | 88/164 [53:52<56:49, 44.86s/it]

Iteration 88: 50.326118 seconds

Saved 16 samples to ./eval_results/samples_23.jsonl
Starting Question: 89...



 54%|█████▍    | 89/164 [54:43<58:22, 46.70s/it]

Iteration 89: 50.994696 seconds

Starting Question: 90...



 55%|█████▍    | 90/164 [55:21<54:42, 44.35s/it]

Iteration 90: 38.877507 seconds

Starting Question: 91...



 55%|█████▌    | 91/164 [56:12<56:04, 46.09s/it]

Iteration 91: 50.131286 seconds

Starting Question: 92...



 56%|█████▌    | 92/164 [56:58<55:32, 46.28s/it]

Iteration 92: 46.723835 seconds

Saved 16 samples to ./eval_results/samples_24.jsonl
Starting Question: 93...



 57%|█████▋    | 93/164 [57:35<51:14, 43.31s/it]

Iteration 93: 36.377765 seconds

Starting Question: 94...



 57%|█████▋    | 94/164 [58:25<53:00, 45.43s/it]

Iteration 94: 50.377757 seconds

Starting Question: 95...



 58%|█████▊    | 95/164 [59:02<49:15, 42.84s/it]

Iteration 95: 36.785174 seconds

Starting Question: 96...



 59%|█████▊    | 96/164 [59:52<51:10, 45.15s/it]

Iteration 96: 50.536609 seconds

Saved 16 samples to ./eval_results/samples_25.jsonl
Starting Question: 97...



 59%|█████▉    | 97/164 [1:00:39<51:00, 45.67s/it]

Iteration 97: 46.892906 seconds

Starting Question: 98...



 60%|█████▉    | 98/164 [1:01:14<46:46, 42.52s/it]

Iteration 98: 35.153585 seconds

Starting Question: 99...



 60%|██████    | 99/164 [1:02:00<46:56, 43.33s/it]

Iteration 99: 45.222082 seconds

Starting Question: 100...



 61%|██████    | 100/164 [1:02:45<46:43, 43.80s/it]

Iteration 100: 44.890634 seconds

Saved 16 samples to ./eval_results/samples_26.jsonl
Starting Question: 101...



 62%|██████▏   | 101/164 [1:03:31<46:50, 44.61s/it]

Iteration 101: 46.500062 seconds

Starting Question: 102...



 62%|██████▏   | 102/164 [1:03:59<40:59, 39.67s/it]

Iteration 102: 28.156377 seconds

Starting Question: 103...



 63%|██████▎   | 103/164 [1:04:43<41:40, 40.99s/it]

Iteration 103: 44.075339 seconds

Starting Question: 104...



 63%|██████▎   | 104/164 [1:05:10<36:49, 36.83s/it]

Iteration 104: 27.119972 seconds

Saved 16 samples to ./eval_results/samples_27.jsonl
Starting Question: 105...



 64%|██████▍   | 105/164 [1:05:55<38:26, 39.09s/it]

Iteration 105: 44.350983 seconds

Starting Question: 106...



 65%|██████▍   | 106/164 [1:06:45<41:05, 42.50s/it]

Iteration 106: 50.457819 seconds

Starting Question: 107...



 65%|██████▌   | 107/164 [1:07:34<42:09, 44.38s/it]

Iteration 107: 48.751533 seconds

Starting Question: 108...



 66%|██████▌   | 108/164 [1:08:24<43:02, 46.12s/it]

Iteration 108: 50.172569 seconds

Saved 16 samples to ./eval_results/samples_28.jsonl
Starting Question: 109...



 66%|██████▋   | 109/164 [1:09:15<43:31, 47.48s/it]

Iteration 109: 50.652645 seconds

Starting Question: 110...



 67%|██████▋   | 110/164 [1:09:57<41:17, 45.88s/it]

Iteration 110: 42.164797 seconds

Starting Question: 111...



 68%|██████▊   | 111/164 [1:10:40<39:40, 44.92s/it]

Iteration 111: 42.662790 seconds

Starting Question: 112...



 68%|██████▊   | 112/164 [1:11:31<40:31, 46.77s/it]

Iteration 112: 51.076962 seconds

Saved 16 samples to ./eval_results/samples_29.jsonl
Starting Question: 113...



 69%|██████▉   | 113/164 [1:12:03<36:08, 42.51s/it]

Iteration 113: 32.582290 seconds

Starting Question: 114...



 70%|██████▉   | 114/164 [1:12:44<35:03, 42.07s/it]

Iteration 114: 41.043033 seconds

Starting Question: 115...



 70%|███████   | 115/164 [1:13:30<35:08, 43.03s/it]

Iteration 115: 45.271162 seconds

Starting Question: 116...



 71%|███████   | 116/164 [1:14:19<35:56, 44.92s/it]

Iteration 116: 49.333348 seconds

Saved 16 samples to ./eval_results/samples_30.jsonl
Starting Question: 117...



 71%|███████▏  | 117/164 [1:14:55<33:04, 42.22s/it]

Iteration 117: 35.927956 seconds

Starting Question: 118...



 72%|███████▏  | 118/164 [1:15:42<33:29, 43.68s/it]

Iteration 118: 47.087869 seconds

Starting Question: 119...



 73%|███████▎  | 119/164 [1:16:29<33:32, 44.72s/it]

Iteration 119: 47.139567 seconds

Starting Question: 120...



 73%|███████▎  | 120/164 [1:17:19<33:54, 46.24s/it]

Iteration 120: 49.777994 seconds

Saved 16 samples to ./eval_results/samples_31.jsonl
Starting Question: 121...



 74%|███████▍  | 121/164 [1:18:10<34:07, 47.61s/it]

Iteration 121: 50.818272 seconds

Starting Question: 122...



 74%|███████▍  | 122/164 [1:19:00<33:51, 48.36s/it]

Iteration 122: 50.093644 seconds

Starting Question: 123...



 75%|███████▌  | 123/164 [1:19:48<32:55, 48.18s/it]

Iteration 123: 47.746789 seconds

Starting Question: 124...



 76%|███████▌  | 124/164 [1:20:34<31:49, 47.73s/it]

Iteration 124: 46.687193 seconds

Saved 16 samples to ./eval_results/samples_32.jsonl
Starting Question: 125...



 76%|███████▌  | 125/164 [1:21:23<31:09, 47.94s/it]

Iteration 125: 48.434646 seconds

Starting Question: 126...



 77%|███████▋  | 126/164 [1:22:09<29:59, 47.36s/it]

Iteration 126: 45.988177 seconds

Starting Question: 127...



 77%|███████▋  | 127/164 [1:22:59<29:44, 48.24s/it]

Iteration 127: 50.303945 seconds

Starting Question: 128...



 78%|███████▊  | 128/164 [1:23:50<29:31, 49.20s/it]

Iteration 128: 51.417145 seconds

Saved 16 samples to ./eval_results/samples_33.jsonl
Starting Question: 129...



 79%|███████▊  | 129/164 [1:24:39<28:34, 48.98s/it]

Iteration 129: 48.473784 seconds

Starting Question: 130...



 79%|███████▉  | 130/164 [1:25:30<28:05, 49.57s/it]

Iteration 130: 50.954015 seconds

Starting Question: 131...



 80%|███████▉  | 131/164 [1:26:12<26:00, 47.28s/it]

Iteration 131: 41.926631 seconds

Starting Question: 132...



 80%|████████  | 132/164 [1:26:56<24:40, 46.27s/it]

Iteration 132: 43.913389 seconds

Saved 16 samples to ./eval_results/samples_34.jsonl
Starting Question: 133...



 81%|████████  | 133/164 [1:27:33<22:35, 43.72s/it]

Iteration 133: 37.778218 seconds

Starting Question: 134...



 82%|████████▏ | 134/164 [1:28:18<21:56, 43.88s/it]

Iteration 134: 44.239854 seconds

Starting Question: 135...



 82%|████████▏ | 135/164 [1:29:01<21:10, 43.82s/it]

Iteration 135: 43.667435 seconds

Starting Question: 136...



 83%|████████▎ | 136/164 [1:29:48<20:47, 44.56s/it]

Iteration 136: 46.276924 seconds

Saved 16 samples to ./eval_results/samples_35.jsonl
Starting Question: 137...



 84%|████████▎ | 137/164 [1:30:39<20:58, 46.60s/it]

Iteration 137: 51.380597 seconds

Starting Question: 138...



 84%|████████▍ | 138/164 [1:31:31<20:50, 48.11s/it]

Iteration 138: 51.607217 seconds

Starting Question: 139...



 85%|████████▍ | 139/164 [1:32:21<20:21, 48.85s/it]

Iteration 139: 50.570578 seconds

Starting Question: 140...



 85%|████████▌ | 140/164 [1:33:09<19:24, 48.50s/it]

Iteration 140: 47.696311 seconds

Saved 16 samples to ./eval_results/samples_36.jsonl
Starting Question: 141...



 86%|████████▌ | 141/164 [1:33:59<18:49, 49.11s/it]

Iteration 141: 50.530108 seconds

Starting Question: 142...



 87%|████████▋ | 142/164 [1:34:49<18:03, 49.25s/it]

Iteration 142: 49.570607 seconds

Starting Question: 143...



 87%|████████▋ | 143/164 [1:35:39<17:21, 49.62s/it]

Iteration 143: 50.472878 seconds

Starting Question: 144...



 88%|████████▊ | 144/164 [1:36:27<16:19, 48.98s/it]

Iteration 144: 47.486999 seconds

Saved 16 samples to ./eval_results/samples_37.jsonl
Starting Question: 145...



 88%|████████▊ | 145/164 [1:37:12<15:10, 47.93s/it]

Iteration 145: 45.468061 seconds

Starting Question: 146...



 89%|████████▉ | 146/164 [1:38:01<14:25, 48.06s/it]

Iteration 146: 48.359658 seconds

Starting Question: 147...



 90%|████████▉ | 147/164 [1:38:52<13:53, 49.05s/it]

Iteration 147: 51.355597 seconds

Starting Question: 148...



 90%|█████████ | 148/164 [1:39:33<12:24, 46.51s/it]

Iteration 148: 40.586359 seconds

Saved 16 samples to ./eval_results/samples_38.jsonl
Starting Question: 149...



 91%|█████████ | 149/164 [1:40:24<11:59, 47.94s/it]

Iteration 149: 51.279756 seconds

Starting Question: 150...



 91%|█████████▏| 150/164 [1:40:57<10:06, 43.32s/it]

Iteration 150: 32.527403 seconds

Starting Question: 151...



 92%|█████████▏| 151/164 [1:41:44<09:40, 44.64s/it]

Iteration 151: 47.707911 seconds

Starting Question: 152...



 93%|█████████▎| 152/164 [1:42:35<09:16, 46.40s/it]

Iteration 152: 50.527434 seconds

Saved 16 samples to ./eval_results/samples_39.jsonl
Starting Question: 153...



 93%|█████████▎| 153/164 [1:43:15<08:10, 44.60s/it]

Iteration 153: 40.375875 seconds

Starting Question: 154...



 94%|█████████▍| 154/164 [1:44:05<07:42, 46.29s/it]

Iteration 154: 50.240108 seconds

Starting Question: 155...



 95%|█████████▍| 155/164 [1:44:45<06:37, 44.17s/it]

Iteration 155: 39.235694 seconds

Starting Question: 156...



 95%|█████████▌| 156/164 [1:45:28<05:51, 43.90s/it]

Iteration 156: 43.259861 seconds

Saved 16 samples to ./eval_results/samples_40.jsonl
Starting Question: 157...



 96%|█████████▌| 157/164 [1:46:19<05:21, 45.95s/it]

Iteration 157: 50.719201 seconds

Starting Question: 158...



 96%|█████████▋| 158/164 [1:47:02<04:30, 45.15s/it]

Iteration 158: 43.291166 seconds

Starting Question: 159...



 97%|█████████▋| 159/164 [1:47:53<03:54, 46.84s/it]

Iteration 159: 50.766243 seconds

Starting Question: 160...



 98%|█████████▊| 160/164 [1:48:32<02:58, 44.64s/it]

Iteration 160: 39.509142 seconds

Saved 16 samples to ./eval_results/samples_41.jsonl
Starting Question: 161...



 98%|█████████▊| 161/164 [1:49:23<02:19, 46.43s/it]

Iteration 161: 50.601380 seconds

Starting Question: 162...



 99%|█████████▉| 162/164 [1:49:55<01:24, 42.26s/it]

Iteration 162: 32.541793 seconds

Starting Question: 163...



 99%|█████████▉| 163/164 [1:50:39<00:42, 42.53s/it]

Iteration 163: 43.156905 seconds

Starting Question: 164...



100%|██████████| 164/164 [1:51:30<00:00, 40.80s/it]

Iteration 164: 51.675926 seconds

Saved 16 samples to ./eval_results/samples_42.jsonl





In [7]:
# Merge every samples_*.jsonl batch found in EVAL_SAVE_DIR into MERGE_OUTPUT.
# Both paths are relative to the current working directory (the human-eval checkout
# after the `%cd` in the install cell). All batch files present are merged, including
# any left over from an earlier run.
merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)

Merged 42 files into merged_samples.jsonl


In [13]:
import json

def transform_jsonl(input_file: str, output_file: str):
    """
    Transforms each line from:
        {
          "task_id": "...",
          "completion": [{"generated_text": "some string ..."}]
        }
    into:
        {
          "task_id": "...",
          "completion": "some string ..."
        }
    and writes to a new JSONL file.

    Accepted shapes of ``completion``:
      * ``[{"generated_text": "..."}]`` -> the first element's text
      * ``{"generated_text": "..."}``   -> its text
      * an existing string              -> kept unchanged, so running the
        transform on an already-flattened file is a no-op rather than
        blanking every completion
      * anything else / missing         -> ``""``

    Blank lines in the input are skipped. All records are read before the
    output is opened, so ``input_file`` and ``output_file`` may be the same.

    NOTE(review): the text is copied as-is. If the generating pipeline returns
    the prompt together with the continuation, the prompt is still included
    here -- verify that this is what the scoring harness expects.
    """
    transformed_records = []

    # 1. Read each non-blank line as JSON
    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                continue
            data = json.loads(line)

            # 2. Normalise the 'completion' field to a plain string
            completion = data.get("completion")
            if isinstance(completion, str):
                gen_text = completion
            elif isinstance(completion, dict):
                gen_text = completion.get("generated_text", "")
            elif isinstance(completion, list) and completion and isinstance(completion[0], dict):
                # Typically something like [{"generated_text": "..."}]
                gen_text = completion[0].get("generated_text", "")
            else:
                gen_text = ""

            # 3. Replace the 'completion' field with just the string
            data["completion"] = gen_text

            transformed_records.append(data)

    # 4. Write the new structure to a JSONL output file
    with open(output_file, "w", encoding="utf-8") as fout:
        for record in transformed_records:
            fout.write(json.dumps(record) + "\n")


In [14]:
# Usage: flatten the merged pipeline output into the {"task_id", "completion": str}
# records expected by evaluate_functional_correctness.
# NOTE(review): these are absolute Colab paths; they assume the repository was cloned
# to /content/human-eval and that the merge step wrote merged_samples.jsonl there.
input_path = "/content/human-eval/merged_samples.jsonl"
output_path = "/content/human-eval/clean_merged_samples.jsonl"

transform_jsonl(input_path, output_path)
print(f"Transformed JSONL saved to: {output_path}")

Transformed JSONL saved to: /content/human-eval/clean_merged_samples.jsonl


In [16]:
# Score the flattened samples with the HumanEval harness. The command is run from the
# human-eval checkout and writes per-sample results next to the input file
# (<input>_results.jsonl) before printing the pass@k summary.
%cd /content/human-eval/
!evaluate_functional_correctness clean_merged_samples.jsonl

/content/human-eval
Reading samples...
672it [00:00, 14265.83it/s]
Running test suites...
100% 672/672 [00:12<00:00, 54.07it/s]
Writing results to clean_merged_samples.jsonl_results.jsonl...
100% 672/672 [00:00<00:00, 38997.89it/s]
{'pass@1': 0.07393292682926829}
