<a href="https://colab.research.google.com/github/Bri636/ml-programming-winter-2025/blob/main/Code_SIMPO_10epochs_Eval_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install dependencies: clone and install OpenAI's human-eval harness (editable), then the model libraries
!git clone https://github.com/openai/human-eval.git
%cd human-eval/
!pip install -e .
!pip install transformers
!pip install pydantic
!pip install torch
!pip install bitsandbytes

Cloning into 'human-eval'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (26/26), done.[K
remote: Compressing objects: 100% (19/19), done.[K
remote: Total 34 (delta 12), reused 7 (delta 7), pack-reused 8 (from 1)[K
Receiving objects: 100% (34/34), 55.80 KiB | 1.69 MiB/s, done.
Resolving deltas: 100% (13/13), done.
/content/human-eval
Obtaining file:///content/human-eval
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fire (from human-eval==1.0)
  Downloading fire-0.7.0.tar.gz (87 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.2/87.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fire
  Building wheel for fire (setup.py) ... [?25l[?25hdone
  Created wheel for fire: filename=fire-0.7.0-py3-none-any.whl size=114249 sha256=c43c8ac873fdd37ab77c14a118132f9cc29f8d2c6ca33a7bedae2e8dc6af5789
  Stored in directory: /root/.cach

In [2]:
""" Evaluation on Human Eval """

from __future__ import annotations
from typing import TypedDict, Dict, List, Any, Union, TypeVar
from transformers import pipeline
from transformers.pipelines import Pipeline
from functools import partial
from pydantic import Field, BaseModel
from tqdm import tqdm
import timeit
import os, glob
import json, yaml
from pathlib import Path
import torch
# submods
from human_eval.data import write_jsonl, read_problems, HUMAN_EVAL

In [3]:
# Maps the dtype name stored in HFGeneratorConfig.torch_dtype to the torch dtype
# object that main() hands to transformers.pipeline. Names missing from this table
# resolve to None because main() looks them up with .get().
_DTYPES={
    'bfloat16': torch.bfloat16
}
# Generic type variable used in the BaseConfig.from_json / from_yaml signatures.
T = TypeVar('T')
# Path argument type accepted by the BaseConfig read/write helpers.
PathLike = Union[Path, str]
# MODEL_NAME_OR_PATH = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
# Model identifier that main() copies into the pipeline config before loading.
MODEL_NAME_OR_PATH="BigBri/CodeSimPO"
# Directory where evaluate() writes its samples_<N>.jsonl files.
EVAL_SAVE_DIR = "./eval_results"
# When True, main() only merges existing sample files and skips generation.
MERGE = False
# Destination file for merge_jsonl_files (relative to the current working directory).
MERGE_OUTPUT = "merged_samples.jsonl"


In [4]:
class BaseConfig(BaseModel):
    """An interface to add JSON/YAML serialization to Pydantic models.

    All files are read and written as UTF-8 so the result does not depend on
    the platform's default encoding.
    """

    # A name literal to correctly identify and construct nested models
    # which have many possible options.
    # name: Literal[''] = ''

    def write_json(self, path: PathLike) -> None:
        """Write the model to a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            # mode="json" converts field values to JSON-native types first, so
            # fields that the plain Python dump leaves as rich objects do not
            # make json.dump fail.
            json.dump(self.model_dump(mode='json'), fp, indent=2)

    @classmethod
    def from_json(cls: type[T], path: PathLike) -> T:
        """Load the model from a JSON file.

        Parameters
        ----------
        path : PathLike
            The path to the JSON file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            data = json.load(fp)
        return cls(**data)

    def write_yaml(self, path: PathLike) -> None:
        """Write the model to a YAML file.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.
        """
        with open(path, 'w', encoding='utf-8') as fp:
            yaml.dump(
                # Round-trip through JSON so only plain YAML-friendly types are dumped.
                json.loads(self.model_dump_json()),
                fp,
                indent=4,
                sort_keys=False,
            )

    @classmethod
    def from_yaml(cls: type[T], path: PathLike) -> T:
        """Load the model from a YAML file.

        An empty YAML document is treated as an empty mapping, so the model is
        built from its field defaults instead of failing on ``None``.

        Parameters
        ----------
        path : PathLike
            The path to the YAML file.

        Returns
        -------
        T
            A specific BaseConfig instance.
        """
        with open(path, encoding='utf-8') as fp:
            raw_data = yaml.safe_load(fp)
        if raw_data is None:
            # yaml.safe_load returns None for an empty document.
            raw_data = {}
        return cls(**raw_data)

class HumanEvalProblem(TypedDict):
    """ One single problem in HumanEval, as held in the mapping passed to evaluate(). """
    # Identifier of the problem (presumably the same value used as the key of the problems mapping).
    task_id: str
    # Text that evaluate() sends to the generation pipeline.
    prompt: str
    # The remaining fields are not read anywhere in this notebook; their meaning
    # is presumably defined by the human-eval harness — confirm against its docs.
    entry_point: str
    canonical_solution: str
    test: str

class HumanEvalSolution(TypedDict):
    """ One instance of solution to HumanEval problem (the cleaned record shape produced by transform_jsonl). """
    # Identifier of the problem this sample answers.
    task_id: str
    # Generated text for the problem, as a plain string.
    completion: str

class HFGeneratorConfig(BaseConfig):
    """Settings for the text-generation pipeline built in main() and used by evaluate()."""
    # Model identifier passed to transformers.pipeline; main() overwrites it with MODEL_NAME_OR_PATH.
    model_name_or_path: str = 'BigBri/CodeSimPO'
    # model_name_or_path: str = "unsloth/Qwen2.5-0.5B-Instruct-bnb-4bit"
    # Pipeline task name passed to transformers.pipeline.
    task: str = 'text-generation'
    # Device placement argument passed to transformers.pipeline.
    device_map: str = 'auto'
    # device: str = 'cpu'
    # Name of the dtype; main() translates it through the _DTYPES table.
    torch_dtype: str = 'bfloat16'
    # Forwarded by evaluate() on every pipeline call.
    max_new_tokens: int = 256
    # Forwarded by evaluate() on every pipeline call.
    truncation: bool = True

class EvaluationConfig(BaseConfig):
    """Top-level settings for one HumanEval generation run."""
    # Location of the HumanEval problems file; defaults to the path exported by human_eval.data.
    human_eval_path: str = Field(default=HUMAN_EVAL)
    # Nested generation settings; a fresh HFGeneratorConfig is created per instance.
    pipeline_config: HFGeneratorConfig = Field(default_factory=HFGeneratorConfig)
    # Directory where evaluate() writes samples_<N>.jsonl files.
    eval_save_dir: str = Field(default=EVAL_SAVE_DIR)
    # How many times each prompt is repeated in one pipeline call.
    num_samples_per_task: int = Field(default=4)
    batch_size: int = Field(default=16)  # Flush to disk once at least this many samples are buffered

In [5]:
def format_time(seconds: float) -> str:
    """
    Converts time in seconds to a human-readable format (HH:MM:SS.mmm).

    The value is rounded to whole milliseconds *before* it is split into
    hours, minutes and seconds, so a value such as 59.9996 carries into the
    minutes field ("00:01:00.000") instead of printing an impossible
    "00:00:60.000". Negative durations are rendered as the formatted
    magnitude with a leading "-".

    Parameters
    ----------
    seconds : float
        Duration in seconds.

    Returns
    -------
    str
        The duration as ``HH:MM:SS.mmm``; the hours field grows beyond two
        digits when needed.
    """
    total_ms = int(round(abs(seconds) * 1000))
    # Only show a sign when something non-zero remains after rounding.
    sign = "-" if seconds < 0 and total_ms else ""
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{sign}{hours:02d}:{minutes:02d}:{secs:02d}.{millis:03d}"

def read_jsonl(file_path: str) -> list:
    """
    Read a JSON Lines file into a list of decoded records.

    Blank and whitespace-only lines are skipped rather than passed to the
    JSON decoder, so stray empty lines do not abort the read.

    Parameters
    ----------
    file_path : str
        Path to a UTF-8 encoded JSONL file.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

In [6]:
def _next_sample_file_index(save_dir: str) -> int:
    """Return one more than the highest N among existing ``samples_N.jsonl`` files in save_dir."""
    highest = 0
    for path in glob.glob(os.path.join(save_dir, "samples_*.jsonl")):
        suffix = os.path.basename(path)[len("samples_"):-len(".jsonl")]
        if suffix.isdecimal():
            highest = max(highest, int(suffix))
    return highest + 1


def _flush_samples(save_dir: str, file_index: int, batch: list, final: bool = False) -> None:
    """Write the buffered samples to ``samples_<file_index>.jsonl`` and report it."""
    save_path = os.path.join(save_dir, f"samples_{file_index}.jsonl")
    write_jsonl(save_path, batch)
    if final:
        print(f"Saved {len(batch)} final samples to {save_path}")
    else:
        print(f"Saved {len(batch)} samples to {save_path}")


def evaluate(pipeline_func,
             problems: Dict[str, HumanEvalProblem],
             eval_config: EvaluationConfig):
    """ Evaluation of HFModel with batch saving

    For every problem, the prompt is repeated ``num_samples_per_task`` times and
    sent to ``pipeline_func`` in one call. Each returned element is buffered
    as ``{"task_id": ..., "completion": <element>}`` and the buffer is written
    to a new ``samples_<N>.jsonl`` file whenever it holds at least
    ``batch_size`` records; any remainder is written after the last problem.

    Parameters
    ----------
    pipeline_func
        Callable invoked as ``pipeline_func(prompts, max_new_tokens=..., truncation=...)``
        that returns an iterable of completions.
    problems : Dict[str, HumanEvalProblem]
        Mapping of task id to problem; only the ``"prompt"`` field is read here.
    eval_config : EvaluationConfig
        Supplies the save directory, samples per task, batch size and the
        generation arguments.

    Notes
    -----
    Completions are stored exactly as ``pipeline_func`` returned them. The
    ``transform_jsonl`` helper in this file expects them to look like
    ``[{"generated_text": "..."}]`` and unwraps that shape afterwards.
    Existing files in the save directory are never skipped or removed; new
    files are numbered after the highest existing one.
    """

    os.makedirs(eval_config.eval_save_dir, exist_ok=True)  # Ensure save directory exists
    batch = []
    # Continue after the highest existing index. Counting the files instead would
    # reuse (and overwrite) an index whenever the existing numbering has a gap.
    file_index = _next_sample_file_index(eval_config.eval_save_dir)

    for idx, (task_id, task_data) in tqdm(enumerate(problems.items()), total=len(problems)):
        print(f'Starting Question: {idx + 1}...\n')
        start_time = timeit.default_timer()

        batched_prompts = [task_data["prompt"]] * eval_config.num_samples_per_task
        completions = pipeline_func(
            batched_prompts,
            max_new_tokens=eval_config.pipeline_config.max_new_tokens,
            truncation=eval_config.pipeline_config.truncation
        )

        batch.extend({"task_id": task_id, "completion": completion} for completion in completions)

        elapsed_time = timeit.default_timer() - start_time
        print(f"Iteration {idx + 1}: {elapsed_time:.6f} seconds\n")

        # Save once at least `batch_size` samples are buffered
        if len(batch) >= eval_config.batch_size:
            _flush_samples(eval_config.eval_save_dir, file_index, batch)
            batch.clear()
            file_index += 1

    # Save remaining batch if not empty
    if batch:
        _flush_samples(eval_config.eval_save_dir, file_index, batch, final=True)

def _sample_file_sort_key(path: str):
    """Sort key placing ``samples_<N>.jsonl`` files in numeric order of N, others afterwards by path."""
    suffix = os.path.basename(path)[len("samples_"):-len(".jsonl")]
    if suffix.isdecimal():
        return (0, int(suffix), path)
    return (1, 0, path)


def merge_jsonl_files(input_dir: str, output_file: str):
    """
    Merges multiple JSONL files from a directory into a single JSONL file.

    Every ``samples_*.jsonl`` file in ``input_dir`` is appended to
    ``output_file``. Files are taken in numeric order of their index
    (``samples_2`` before ``samples_10``), and a newline is added after a
    final line that lacks one so records from adjacent files never end up on
    the same line. If ``output_file`` itself matches the pattern it is not
    used as an input.

    Parameters
    ----------
    input_dir : str
        Directory containing the ``samples_*.jsonl`` part files.
    output_file : str
        Path of the merged file; it is overwritten.
    """
    output_abs = os.path.abspath(output_file)
    jsonl_files = sorted(
        (
            path
            for path in glob.glob(os.path.join(input_dir, "samples_*.jsonl"))
            if os.path.abspath(path) != output_abs
        ),
        key=_sample_file_sort_key,
    )

    with open(output_file, 'w', encoding='utf-8') as outfile:
        for file in jsonl_files:
            with open(file, 'r', encoding='utf-8') as infile:
                for line in infile:
                    # Only the last line of a file can be missing its newline.
                    outfile.write(line if line.endswith("\n") else line + "\n")

    print(f"Merged {len(jsonl_files)} files into {output_file}")

def main():
    """Run the notebook's default action: merge existing samples or generate new ones.

    When the module-level ``MERGE`` flag is set, the sample files under
    ``EVAL_SAVE_DIR`` are merged into ``MERGE_OUTPUT`` and nothing else
    happens. Otherwise an ``EvaluationConfig`` is built, the problems are
    loaded from ``eval_config.human_eval_path``, a ``transformers`` pipeline is
    created from the nested pipeline config, and ``evaluate`` generates and
    saves the samples.
    """
    # If MERGE is set, just merge and return
    if MERGE:
        merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)
        return

    # Otherwise, run the standard evaluation
    eval_config = EvaluationConfig()
    eval_config.eval_save_dir = EVAL_SAVE_DIR

    pipe_config = eval_config.pipeline_config
    pipe_config.model_name_or_path = MODEL_NAME_OR_PATH
    print(f'Running Model: {pipe_config.model_name_or_path} On Human Eval With Settings: {eval_config.model_dump()}')
    # Honor the configured problems file; it defaults to HUMAN_EVAL, the same
    # file read_problems() would pick when called without an argument.
    problems: Dict[str, HumanEvalProblem] = read_problems(eval_config.human_eval_path)
    pipe = pipeline(
        model=pipe_config.model_name_or_path,
        task=pipe_config.task,
        device_map=pipe_config.device_map,
        # device=pipe_config.device,
        # A dtype name missing from _DTYPES yields None here.
        torch_dtype=_DTYPES.get(pipe_config.torch_dtype)
    )

    evaluate(pipe, problems, eval_config)

if __name__ == "__main__":
    main()

Running Model: BigBri/CodeSimPO On Human Eval With Settings: {'human_eval_path': '/content/human-eval/human_eval/../data/HumanEval.jsonl.gz', 'pipeline_config': {'model_name_or_path': 'BigBri/CodeSimPO', 'task': 'text-generation', 'device_map': 'auto', 'torch_dtype': 'bfloat16', 'max_new_tokens': 256, 'truncation': True}, 'eval_save_dir': './eval_results', 'num_samples_per_task': 4, 'batch_size': 16}


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


adapter_config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.34k [00:00<?, ?B/s]

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


model.safetensors:   0%|          | 0.00/457M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

Device set to use cuda:0
  0%|          | 0/164 [00:00<?, ?it/s]

Starting Question: 1...



  1%|          | 1/164 [00:10<29:07, 10.72s/it]

Iteration 1: 10.722389 seconds

Starting Question: 2...



  1%|          | 2/164 [00:37<54:32, 20.20s/it]

Iteration 2: 26.836522 seconds

Starting Question: 3...



  2%|▏         | 3/164 [01:10<1:09:39, 25.96s/it]

Iteration 3: 32.808018 seconds

Starting Question: 4...



  2%|▏         | 4/164 [01:45<1:18:59, 29.62s/it]

Iteration 4: 35.232560 seconds

Saved 16 samples to ./eval_results/samples_1.jsonl
Starting Question: 5...



  3%|▎         | 5/164 [02:13<1:16:27, 28.85s/it]

Iteration 5: 27.483833 seconds

Starting Question: 6...



  4%|▎         | 6/164 [02:24<1:00:13, 22.87s/it]

Iteration 6: 11.252776 seconds

Starting Question: 7...



  4%|▍         | 7/164 [02:50<1:02:39, 23.94s/it]

Iteration 7: 26.155655 seconds

Starting Question: 8...



  5%|▍         | 8/164 [02:53<45:06, 17.35s/it]  

Iteration 8: 3.225003 seconds

Saved 16 samples to ./eval_results/samples_2.jsonl
Starting Question: 9...



  5%|▌         | 9/164 [03:10<43:57, 17.02s/it]

Iteration 9: 16.282570 seconds

Starting Question: 10...



  6%|▌         | 10/164 [03:25<42:46, 16.66s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Iteration 10: 15.872795 seconds

Starting Question: 11...



  7%|▋         | 11/164 [04:03<58:35, 22.97s/it]

Iteration 11: 37.284055 seconds

Starting Question: 12...



  7%|▋         | 12/164 [04:17<51:28, 20.32s/it]

Iteration 12: 14.242733 seconds

Saved 16 samples to ./eval_results/samples_3.jsonl
Starting Question: 13...



  8%|▊         | 13/164 [04:37<51:19, 20.39s/it]

Iteration 13: 20.567987 seconds

Starting Question: 14...



  9%|▊         | 14/164 [05:19<1:07:02, 26.82s/it]

Iteration 14: 41.654019 seconds

Starting Question: 15...



  9%|▉         | 15/164 [05:31<55:44, 22.45s/it]  

Iteration 15: 12.322105 seconds

Starting Question: 16...



 10%|▉         | 16/164 [05:41<45:48, 18.57s/it]

Iteration 16: 9.560744 seconds

Saved 16 samples to ./eval_results/samples_4.jsonl
Starting Question: 17...



 10%|█         | 17/164 [05:51<39:26, 16.10s/it]

Iteration 17: 10.339102 seconds

Starting Question: 18...



 11%|█         | 18/164 [06:06<37:52, 15.57s/it]

Iteration 18: 14.328856 seconds

Starting Question: 19...



 12%|█▏        | 19/164 [06:41<52:13, 21.61s/it]

Iteration 19: 35.702537 seconds

Starting Question: 20...



 12%|█▏        | 20/164 [06:54<45:19, 18.89s/it]

Iteration 20: 12.533830 seconds

Saved 16 samples to ./eval_results/samples_5.jsonl
Starting Question: 21...



 13%|█▎        | 21/164 [07:24<52:48, 22.16s/it]

Iteration 21: 29.784491 seconds

Starting Question: 22...



 13%|█▎        | 22/164 [07:37<46:04, 19.47s/it]

Iteration 22: 13.185116 seconds

Starting Question: 23...



 14%|█▍        | 23/164 [07:43<36:19, 15.46s/it]

Iteration 23: 6.101438 seconds

Starting Question: 24...



 15%|█▍        | 24/164 [07:55<33:32, 14.38s/it]

Iteration 24: 11.858603 seconds

Saved 16 samples to ./eval_results/samples_6.jsonl
Starting Question: 25...



 15%|█▌        | 25/164 [08:04<29:49, 12.88s/it]

Iteration 25: 9.371571 seconds

Starting Question: 26...



 16%|█▌        | 26/164 [08:23<33:33, 14.59s/it]

Iteration 26: 18.580744 seconds

Starting Question: 27...



 16%|█▋        | 27/164 [08:34<31:12, 13.67s/it]

Iteration 27: 11.511673 seconds

Starting Question: 28...



 17%|█▋        | 28/164 [08:48<31:14, 13.78s/it]

Iteration 28: 14.054155 seconds

Saved 16 samples to ./eval_results/samples_7.jsonl
Starting Question: 29...



 18%|█▊        | 29/164 [08:51<23:36, 10.50s/it]

Iteration 29: 2.826562 seconds

Starting Question: 30...



 18%|█▊        | 30/164 [08:57<20:28,  9.17s/it]

Iteration 30: 6.069423 seconds

Starting Question: 31...



 19%|█▉        | 31/164 [09:21<29:49, 13.45s/it]

Iteration 31: 23.454342 seconds

Starting Question: 32...



 20%|█▉        | 32/164 [09:43<35:06, 15.95s/it]

Iteration 32: 21.785778 seconds

Saved 16 samples to ./eval_results/samples_8.jsonl
Starting Question: 33...



 20%|██        | 33/164 [10:08<41:08, 18.84s/it]

Iteration 33: 25.573761 seconds

Starting Question: 34...



 21%|██        | 34/164 [10:43<51:24, 23.73s/it]

Iteration 34: 35.137805 seconds

Starting Question: 35...



 21%|██▏       | 35/164 [11:08<51:27, 23.93s/it]

Iteration 35: 24.401298 seconds

Starting Question: 36...



 22%|██▏       | 36/164 [11:27<48:14, 22.62s/it]

Iteration 36: 19.546002 seconds

Saved 16 samples to ./eval_results/samples_9.jsonl
Starting Question: 37...



 23%|██▎       | 37/164 [11:51<48:30, 22.92s/it]

Iteration 37: 23.614897 seconds

Starting Question: 38...



 23%|██▎       | 38/164 [12:03<41:20, 19.69s/it]

Iteration 38: 12.160647 seconds

Starting Question: 39...



 24%|██▍       | 39/164 [12:29<44:49, 21.51s/it]

Iteration 39: 25.765540 seconds

Starting Question: 40...



 24%|██▍       | 40/164 [13:01<50:49, 24.59s/it]

Iteration 40: 31.780315 seconds

Saved 16 samples to ./eval_results/samples_10.jsonl
Starting Question: 41...



 25%|██▌       | 41/164 [13:16<44:54, 21.91s/it]

Iteration 41: 15.638080 seconds

Starting Question: 42...



 26%|██▌       | 42/164 [13:50<51:34, 25.37s/it]

Iteration 42: 33.444168 seconds

Starting Question: 43...



 26%|██▌       | 43/164 [14:26<58:02, 28.78s/it]

Iteration 43: 36.734375 seconds

Starting Question: 44...



 27%|██▋       | 44/164 [15:00<1:00:24, 30.20s/it]

Iteration 44: 33.522819 seconds

Saved 16 samples to ./eval_results/samples_11.jsonl
Starting Question: 45...



 27%|██▋       | 45/164 [15:24<56:00, 28.24s/it]  

Iteration 45: 23.663829 seconds

Starting Question: 46...



 28%|██▊       | 46/164 [15:56<58:14, 29.62s/it]

Iteration 46: 32.831205 seconds

Starting Question: 47...



 29%|██▊       | 47/164 [16:19<53:46, 27.58s/it]

Iteration 47: 22.808986 seconds

Starting Question: 48...



 29%|██▉       | 48/164 [16:47<53:12, 27.52s/it]

Iteration 48: 27.379127 seconds

Saved 16 samples to ./eval_results/samples_12.jsonl
Starting Question: 49...



 30%|██▉       | 49/164 [17:10<50:10, 26.18s/it]

Iteration 49: 23.065004 seconds

Starting Question: 50...



 30%|███       | 50/164 [17:37<50:15, 26.45s/it]

Iteration 50: 27.076351 seconds

Starting Question: 51...



 31%|███       | 51/164 [17:56<45:32, 24.18s/it]

Iteration 51: 18.887676 seconds

Starting Question: 52...



 32%|███▏      | 52/164 [18:22<46:31, 24.92s/it]

Iteration 52: 26.650505 seconds

Saved 16 samples to ./eval_results/samples_13.jsonl
Starting Question: 53...



 32%|███▏      | 53/164 [18:52<49:01, 26.50s/it]

Iteration 53: 30.189152 seconds

Starting Question: 54...



 33%|███▎      | 54/164 [19:07<42:08, 22.98s/it]

Iteration 54: 14.768699 seconds

Starting Question: 55...



 34%|███▎      | 55/164 [19:14<32:50, 18.08s/it]

Iteration 55: 6.621195 seconds

Starting Question: 56...



 34%|███▍      | 56/164 [19:41<37:19, 20.74s/it]

Iteration 56: 26.957581 seconds

Saved 16 samples to ./eval_results/samples_14.jsonl
Starting Question: 57...



 35%|███▍      | 57/164 [20:09<40:45, 22.86s/it]

Iteration 57: 27.788175 seconds

Starting Question: 58...



 35%|███▌      | 58/164 [20:43<46:34, 26.37s/it]

Iteration 58: 34.561897 seconds

Starting Question: 59...



 36%|███▌      | 59/164 [21:05<43:48, 25.03s/it]

Iteration 59: 21.904797 seconds

Starting Question: 60...



 37%|███▋      | 60/164 [21:28<42:06, 24.30s/it]

Iteration 60: 22.588468 seconds

Saved 16 samples to ./eval_results/samples_15.jsonl
Starting Question: 61...



 37%|███▋      | 61/164 [21:37<34:06, 19.87s/it]

Iteration 61: 9.537262 seconds

Starting Question: 62...



 38%|███▊      | 62/164 [21:58<34:02, 20.03s/it]

Iteration 62: 20.390714 seconds

Starting Question: 63...



 38%|███▊      | 63/164 [22:27<38:37, 22.95s/it]

Iteration 63: 29.771090 seconds

Starting Question: 64...



 39%|███▉      | 64/164 [22:57<41:22, 24.82s/it]

Iteration 64: 29.195380 seconds

Saved 16 samples to ./eval_results/samples_16.jsonl
Starting Question: 65...



 40%|███▉      | 65/164 [23:19<39:41, 24.06s/it]

Iteration 65: 22.261870 seconds

Starting Question: 66...



 40%|████      | 66/164 [23:52<43:41, 26.75s/it]

Iteration 66: 33.048307 seconds

Starting Question: 67...



 41%|████      | 67/164 [24:13<40:22, 24.97s/it]

Iteration 67: 20.810008 seconds

Starting Question: 68...



 41%|████▏     | 68/164 [24:37<39:32, 24.71s/it]

Iteration 68: 24.100081 seconds

Saved 16 samples to ./eval_results/samples_17.jsonl
Starting Question: 69...



 42%|████▏     | 69/164 [24:52<34:25, 21.74s/it]

Iteration 69: 14.797986 seconds

Starting Question: 70...



 43%|████▎     | 70/164 [25:20<37:05, 23.68s/it]

Iteration 70: 28.206554 seconds

Starting Question: 71...



 43%|████▎     | 71/164 [25:47<38:14, 24.67s/it]

Iteration 71: 26.996053 seconds

Starting Question: 72...



 44%|████▍     | 72/164 [26:12<38:15, 24.95s/it]

Iteration 72: 25.577345 seconds

Saved 16 samples to ./eval_results/samples_18.jsonl
Starting Question: 73...



 45%|████▍     | 73/164 [26:35<36:48, 24.27s/it]

Iteration 73: 22.690233 seconds

Starting Question: 74...



 45%|████▌     | 74/164 [27:11<41:31, 27.69s/it]

Iteration 74: 35.663343 seconds

Starting Question: 75...



 46%|████▌     | 75/164 [27:42<42:45, 28.83s/it]

Iteration 75: 31.480021 seconds

Starting Question: 76...



 46%|████▋     | 76/164 [27:58<36:27, 24.86s/it]

Iteration 76: 15.592613 seconds

Saved 16 samples to ./eval_results/samples_19.jsonl
Starting Question: 77...



 47%|████▋     | 77/164 [28:24<36:50, 25.41s/it]

Iteration 77: 26.691356 seconds

Starting Question: 78...



 48%|████▊     | 78/164 [28:59<40:15, 28.09s/it]

Iteration 78: 34.349869 seconds

Starting Question: 79...



 48%|████▊     | 79/164 [29:37<44:01, 31.08s/it]

Iteration 79: 38.051892 seconds

Starting Question: 80...



 49%|████▉     | 80/164 [30:10<44:28, 31.76s/it]

Iteration 80: 33.358930 seconds

Saved 16 samples to ./eval_results/samples_20.jsonl
Starting Question: 81...



 49%|████▉     | 81/164 [30:32<39:54, 28.85s/it]

Iteration 81: 22.048048 seconds

Starting Question: 82...



 50%|█████     | 82/164 [31:00<38:58, 28.51s/it]

Iteration 82: 27.732468 seconds

Starting Question: 83...



 51%|█████     | 83/164 [31:07<29:57, 22.19s/it]

Iteration 83: 7.421644 seconds

Starting Question: 84...



 51%|█████     | 84/164 [31:49<37:11, 27.90s/it]

Iteration 84: 41.216304 seconds

Saved 16 samples to ./eval_results/samples_21.jsonl
Starting Question: 85...



 52%|█████▏    | 85/164 [32:02<30:55, 23.48s/it]

Iteration 85: 13.181280 seconds

Starting Question: 86...



 52%|█████▏    | 86/164 [32:23<29:38, 22.81s/it]

Iteration 86: 21.231119 seconds

Starting Question: 87...



 53%|█████▎    | 87/164 [32:49<30:29, 23.75s/it]

Iteration 87: 25.964033 seconds

Starting Question: 88...



 54%|█████▎    | 88/164 [33:28<35:41, 28.18s/it]

Iteration 88: 38.498772 seconds

Saved 16 samples to ./eval_results/samples_22.jsonl
Starting Question: 89...



 54%|█████▍    | 89/164 [34:05<38:49, 31.06s/it]

Iteration 89: 37.790480 seconds

Starting Question: 90...



 55%|█████▍    | 90/164 [34:33<37:04, 30.06s/it]

Iteration 90: 27.735821 seconds

Starting Question: 91...



 55%|█████▌    | 91/164 [34:55<33:38, 27.65s/it]

Iteration 91: 22.021510 seconds

Starting Question: 92...



 56%|█████▌    | 92/164 [35:25<34:04, 28.39s/it]

Iteration 92: 30.120899 seconds

Saved 16 samples to ./eval_results/samples_23.jsonl
Starting Question: 93...



 57%|█████▋    | 93/164 [35:49<31:48, 26.88s/it]

Iteration 93: 23.333911 seconds

Starting Question: 94...



 57%|█████▋    | 94/164 [36:19<32:27, 27.81s/it]

Iteration 94: 30.002754 seconds

Starting Question: 95...



 58%|█████▊    | 95/164 [36:50<33:04, 28.76s/it]

Iteration 95: 30.958881 seconds

Starting Question: 96...



 59%|█████▊    | 96/164 [37:27<35:28, 31.31s/it]

Iteration 96: 37.258583 seconds

Saved 16 samples to ./eval_results/samples_24.jsonl
Starting Question: 97...



 59%|█████▉    | 97/164 [37:58<35:01, 31.37s/it]

Iteration 97: 31.499320 seconds

Starting Question: 98...



 60%|█████▉    | 98/164 [38:29<34:23, 31.27s/it]

Iteration 98: 31.038792 seconds

Starting Question: 99...



 60%|██████    | 99/164 [38:58<32:59, 30.46s/it]

Iteration 99: 28.558116 seconds

Starting Question: 100...



 61%|██████    | 100/164 [39:28<32:28, 30.45s/it]

Iteration 100: 30.431687 seconds

Saved 16 samples to ./eval_results/samples_25.jsonl
Starting Question: 101...



 62%|██████▏   | 101/164 [39:49<28:44, 27.37s/it]

Iteration 101: 20.190348 seconds

Starting Question: 102...



 62%|██████▏   | 102/164 [40:02<23:56, 23.17s/it]

Iteration 102: 13.357448 seconds

Starting Question: 103...



 63%|██████▎   | 103/164 [40:29<24:39, 24.25s/it]

Iteration 103: 26.772207 seconds

Starting Question: 104...



 63%|██████▎   | 104/164 [41:04<27:43, 27.72s/it]

Iteration 104: 35.827422 seconds

Saved 16 samples to ./eval_results/samples_26.jsonl
Starting Question: 105...



 64%|██████▍   | 105/164 [41:14<21:59, 22.36s/it]

Iteration 105: 9.855089 seconds

Starting Question: 106...



 65%|██████▍   | 106/164 [41:55<27:03, 27.99s/it]

Iteration 106: 41.116130 seconds

Starting Question: 107...



 65%|██████▌   | 107/164 [42:21<25:47, 27.15s/it]

Iteration 107: 25.204093 seconds

Starting Question: 108...



 66%|██████▌   | 108/164 [42:46<24:43, 26.50s/it]

Iteration 108: 24.958684 seconds

Saved 16 samples to ./eval_results/samples_27.jsonl
Starting Question: 109...



 66%|██████▋   | 109/164 [43:18<25:56, 28.30s/it]

Iteration 109: 32.495930 seconds

Starting Question: 110...



 67%|██████▋   | 110/164 [43:49<26:02, 28.93s/it]

Iteration 110: 30.409082 seconds

Starting Question: 111...



 68%|██████▊   | 111/164 [44:10<23:38, 26.76s/it]

Iteration 111: 21.710376 seconds

Starting Question: 112...



 68%|██████▊   | 112/164 [44:41<24:18, 28.05s/it]

Iteration 112: 31.055420 seconds

Saved 16 samples to ./eval_results/samples_28.jsonl
Starting Question: 113...



 69%|██████▉   | 113/164 [45:06<23:04, 27.14s/it]

Iteration 113: 25.018232 seconds

Starting Question: 114...



 70%|██████▉   | 114/164 [45:35<23:06, 27.73s/it]

Iteration 114: 29.085338 seconds

Starting Question: 115...



 70%|███████   | 115/164 [46:14<25:14, 30.91s/it]

Iteration 115: 38.330922 seconds

Starting Question: 116...



 71%|███████   | 116/164 [46:41<23:52, 29.85s/it]

Iteration 116: 27.389804 seconds

Saved 16 samples to ./eval_results/samples_29.jsonl
Starting Question: 117...



 71%|███████▏  | 117/164 [46:57<20:12, 25.80s/it]

Iteration 117: 16.348289 seconds

Starting Question: 118...



 72%|███████▏  | 118/164 [47:35<22:35, 29.46s/it]

Iteration 118: 37.985297 seconds

Starting Question: 119...



 73%|███████▎  | 119/164 [48:11<23:30, 31.35s/it]

Iteration 119: 35.768552 seconds

Starting Question: 120...



 73%|███████▎  | 120/164 [48:39<22:18, 30.42s/it]

Iteration 120: 28.237959 seconds

Saved 16 samples to ./eval_results/samples_30.jsonl
Starting Question: 121...



 74%|███████▍  | 121/164 [49:07<21:06, 29.46s/it]

Iteration 121: 27.230875 seconds

Starting Question: 122...



 74%|███████▍  | 122/164 [49:37<20:53, 29.85s/it]

Iteration 122: 30.739904 seconds

Starting Question: 123...



 75%|███████▌  | 123/164 [50:14<21:45, 31.84s/it]

Iteration 123: 36.488050 seconds

Starting Question: 124...



 76%|███████▌  | 124/164 [50:47<21:27, 32.20s/it]

Iteration 124: 33.028651 seconds

Saved 16 samples to ./eval_results/samples_31.jsonl
Starting Question: 125...



 76%|███████▌  | 125/164 [51:14<19:50, 30.52s/it]

Iteration 125: 26.613736 seconds

Starting Question: 126...



 77%|███████▋  | 126/164 [51:41<18:46, 29.65s/it]

Iteration 126: 27.608504 seconds

Starting Question: 127...



 77%|███████▋  | 127/164 [52:14<18:53, 30.63s/it]

Iteration 127: 32.929747 seconds

Starting Question: 128...



 78%|███████▊  | 128/164 [52:42<17:57, 29.93s/it]

Iteration 128: 28.303757 seconds

Saved 16 samples to ./eval_results/samples_32.jsonl
Starting Question: 129...



 79%|███████▊  | 129/164 [53:10<17:03, 29.25s/it]

Iteration 129: 27.660992 seconds

Starting Question: 130...



 79%|███████▉  | 130/164 [53:48<17:59, 31.76s/it]

Iteration 130: 37.593937 seconds

Starting Question: 131...



 80%|███████▉  | 131/164 [54:14<16:36, 30.19s/it]

Iteration 131: 26.527902 seconds

Starting Question: 132...



 80%|████████  | 132/164 [54:45<16:07, 30.23s/it]

Iteration 132: 30.321212 seconds

Saved 16 samples to ./eval_results/samples_33.jsonl
Starting Question: 133...



 81%|████████  | 133/164 [55:16<15:50, 30.66s/it]

Iteration 133: 31.680331 seconds

Starting Question: 134...



 82%|████████▏ | 134/164 [55:55<16:35, 33.20s/it]

Iteration 134: 39.098813 seconds

Starting Question: 135...



 82%|████████▏ | 135/164 [56:21<14:55, 30.87s/it]

Iteration 135: 25.455980 seconds

Starting Question: 136...



 83%|████████▎ | 136/164 [56:40<12:45, 27.36s/it]

Iteration 136: 19.148317 seconds

Saved 16 samples to ./eval_results/samples_34.jsonl
Starting Question: 137...



 84%|████████▎ | 137/164 [57:03<11:42, 26.04s/it]

Iteration 137: 22.952092 seconds

Starting Question: 138...



 84%|████████▍ | 138/164 [57:24<10:35, 24.46s/it]

Iteration 138: 20.765471 seconds

Starting Question: 139...



 85%|████████▍ | 139/164 [57:55<11:00, 26.42s/it]

Iteration 139: 30.987290 seconds

Starting Question: 140...



 85%|████████▌ | 140/164 [58:37<12:30, 31.26s/it]

Iteration 140: 42.557946 seconds

Saved 16 samples to ./eval_results/samples_35.jsonl
Starting Question: 141...



 86%|████████▌ | 141/164 [58:57<10:42, 27.92s/it]

Iteration 141: 20.112683 seconds

Starting Question: 142...



 87%|████████▋ | 142/164 [59:11<08:42, 23.74s/it]

Iteration 142: 14.007416 seconds

Starting Question: 143...



 87%|████████▋ | 143/164 [59:38<08:37, 24.64s/it]

Iteration 143: 26.730188 seconds

Starting Question: 144...



 88%|████████▊ | 144/164 [1:00:05<08:28, 25.42s/it]

Iteration 144: 27.230245 seconds

Saved 16 samples to ./eval_results/samples_36.jsonl
Starting Question: 145...



 88%|████████▊ | 145/164 [1:00:29<07:55, 25.01s/it]

Iteration 145: 24.058262 seconds

Starting Question: 146...



 89%|████████▉ | 146/164 [1:00:59<07:53, 26.30s/it]

Iteration 146: 29.306216 seconds

Starting Question: 147...



 90%|████████▉ | 147/164 [1:01:32<08:02, 28.40s/it]

Iteration 147: 33.303498 seconds

Starting Question: 148...



 90%|█████████ | 148/164 [1:02:05<07:56, 29.81s/it]

Iteration 148: 33.086058 seconds

Saved 16 samples to ./eval_results/samples_37.jsonl
Starting Question: 149...



 91%|█████████ | 149/164 [1:02:43<08:03, 32.23s/it]

Iteration 149: 37.898190 seconds

Starting Question: 150...



 91%|█████████▏| 150/164 [1:03:01<06:30, 27.88s/it]

Iteration 150: 17.724891 seconds

Starting Question: 151...



 92%|█████████▏| 151/164 [1:03:35<06:28, 29.86s/it]

Iteration 151: 34.460556 seconds

Starting Question: 152...



 93%|█████████▎| 152/164 [1:04:06<06:00, 30.04s/it]

Iteration 152: 30.465629 seconds

Saved 16 samples to ./eval_results/samples_38.jsonl
Starting Question: 153...



 93%|█████████▎| 153/164 [1:04:20<04:38, 25.31s/it]

Iteration 153: 14.282497 seconds

Starting Question: 154...



 94%|█████████▍| 154/164 [1:04:49<04:24, 26.46s/it]

Iteration 154: 29.136479 seconds

Starting Question: 155...



 95%|█████████▍| 155/164 [1:05:14<03:53, 25.92s/it]

Iteration 155: 24.651337 seconds

Starting Question: 156...



 95%|█████████▌| 156/164 [1:05:33<03:11, 23.89s/it]

Iteration 156: 19.142538 seconds

Saved 16 samples to ./eval_results/samples_39.jsonl
Starting Question: 157...



 96%|█████████▌| 157/164 [1:06:15<03:25, 29.36s/it]

Iteration 157: 42.120257 seconds

Starting Question: 158...



 96%|█████████▋| 158/164 [1:06:40<02:48, 28.11s/it]

Iteration 158: 25.209630 seconds

Starting Question: 159...



 97%|█████████▋| 159/164 [1:07:15<02:30, 30.06s/it]

Iteration 159: 34.603834 seconds

Starting Question: 160...



 98%|█████████▊| 160/164 [1:07:32<01:44, 26.16s/it]

Iteration 160: 17.068430 seconds

Saved 16 samples to ./eval_results/samples_40.jsonl
Starting Question: 161...



 98%|█████████▊| 161/164 [1:07:56<01:16, 25.65s/it]

Iteration 161: 24.450845 seconds

Starting Question: 162...



 99%|█████████▉| 162/164 [1:08:19<00:49, 24.76s/it]

Iteration 162: 22.677736 seconds

Starting Question: 163...



 99%|█████████▉| 163/164 [1:08:40<00:23, 23.51s/it]

Iteration 163: 20.592110 seconds

Starting Question: 164...



100%|██████████| 164/164 [1:09:10<00:00, 25.31s/it]

Iteration 164: 30.313677 seconds

Saved 16 samples to ./eval_results/samples_41.jsonl





In [7]:
# Combine every samples_*.jsonl under EVAL_SAVE_DIR into MERGE_OUTPUT.
# Both paths are relative, so the merged file lands in the current working directory.
merge_jsonl_files(EVAL_SAVE_DIR, MERGE_OUTPUT)

Merged 41 files into merged_samples.jsonl


In [8]:
import json

def _extract_generated_text(completion) -> str:
    """Return the completion as a plain string, unwrapping ``[{"generated_text": ...}]`` if needed."""
    if isinstance(completion, str):
        # Already flattened: keep it, so running the transform twice is harmless.
        return completion
    if isinstance(completion, list) and completion:
        first = completion[0]
        if isinstance(first, dict):
            return first.get("generated_text", "")
    return ""


def transform_jsonl(input_file: str, output_file: str):
    """
    Transforms each line from:
        {
          "task_id": "...",
          "completion": [{"generated_text": "some string ..."}]
        }
    into:
        {
          "task_id": "...",
          "completion": "some string ..."
        }
    and writes to a new JSONL file.

    A completion that is already a plain string is kept unchanged, so the
    transform can safely be re-run on its own output. A missing, empty or
    otherwise unrecognized completion becomes the empty string. Blank lines
    in the input are skipped. All records are read before the output is
    opened, so ``input_file`` and ``output_file`` may be the same path.

    Parameters
    ----------
    input_file : str
        Path to the UTF-8 JSONL file to read.
    output_file : str
        Path of the JSONL file to write; it is overwritten.
    """
    transformed_records = []

    # 1. Read each non-blank line as JSON and flatten its 'completion' field
    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            if not line.strip():
                continue
            data = json.loads(line)
            data["completion"] = _extract_generated_text(data.get("completion"))
            transformed_records.append(data)

    # 2. Write the new structure to a JSONL output file
    with open(output_file, "w", encoding="utf-8") as fout:
        for record in transformed_records:
            fout.write(json.dumps(record) + "\n")


In [9]:
# Usage: unwrap the raw pipeline output in the merged file into plain-string completions.
# NOTE(review): these absolute paths assume the Colab layout, where the merge step ran
# with /content/human-eval as the working directory — adjust them when running elsewhere.
input_path = "/content/human-eval/merged_samples.jsonl"
output_path = "/content/human-eval/clean_merged_samples.jsonl"

transform_jsonl(input_path, output_path)
print(f"Transformed JSONL saved to: {output_path}")

Transformed JSONL saved to: /content/human-eval/clean_merged_samples.jsonl


In [10]:
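# Score the cleaned samples with the human-eval CLI, which executes each completion against the problem's tests and reports pass@k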
%cd /content/human-eval/
!evaluate_functional_correctness clean_merged_samples.jsonl

/content/human-eval
Reading samples...
656it [00:00, 19773.65it/s]
Running test suites...
100% 656/656 [00:08<00:00, 77.94it/s] 
Writing results to clean_merged_samples.jsonl_results.jsonl...
100% 656/656 [00:00<00:00, 44869.84it/s]
{'pass@1': 0.11890243902439024}
