In [2]:
# !pip install jsonlines
# !pip install transformers
# !pip install accelerate
# !pip install protobuf
# !git clone https://github.com/openai/human-eval
# !pip install -e human-eval

### RESTART THE KERNEL AFTER RUNNING THE INSTALL COMMANDS ABOVE

In [1]:
import os
import gc
import jsonlines
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel
from human_eval.data import write_jsonl, read_problems # Install HumanEval first!
from human_eval.execution import check_correctness # Install HumanEval first!

# Functions for HumanEval

In [6]:
def generate_code_with_transformers(model, tokenizer, prompt: str, num_samples: int = 1) -> list[str]:
    """
    Generate ``num_samples`` completions for ``prompt`` with a causal language model.

    Args:
        model: Model exposing ``generate(**kwargs)`` and a ``device`` attribute
            (e.g. a Hugging Face ``AutoModelForCausalLM`` instance).
        tokenizer: Matching tokenizer; it is called on the prompt and must provide
            ``decode`` and ``eos_token_id``.
        prompt: Text the model should continue.
        num_samples: Number of completions to return. ``1`` uses greedy decoding;
            larger values switch to sampling, because greedy decoding cannot
            produce several distinct sequences.

    Returns:
        A list with one string per returned sequence, containing only the newly
        generated text (the prompt is not included).
    """
    # Follow the model's own device instead of assuming a CUDA device exists.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    generation_kwargs = {
        "max_new_tokens": 512,
        "num_return_sequences": num_samples,
        "pad_token_id": tokenizer.eos_token_id,
    }
    if num_samples > 1:
        # Sampling parameters only take effect (and are only valid) when sampling.
        generation_kwargs.update(do_sample=True, temperature=0.6, top_k=50, top_p=0.95)
    else:
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt at token level: decoding the prompt tokens is not guaranteed
    # to reproduce `prompt` character for character, so slicing the decoded text
    # by len(prompt) can cut the completion in the wrong place.
    return [
        tokenizer.decode(sequence[prompt_token_count:], skip_special_tokens=True)
        for sequence in outputs
    ]

In [7]:
def evaluate_hf_model_on_humaneval(model, tokenizer, problems, num_samples_per_task: int = 1):
    """
    Generate completions for every problem and collect them as sample records.

    Args:
        model: Model forwarded to ``generate_code_with_transformers``.
        tokenizer: Tokenizer forwarded to ``generate_code_with_transformers``.
        problems: Mapping of task id (``"<prefix>/<number>"``) to a problem dict
            that has a ``"prompt"`` entry.
        num_samples_per_task: Number of completions requested per problem.

    Returns:
        A list of ``{"task_id": ..., "completion": ...}`` dicts, one per completion,
        in problem order.
    """
    samples = []

    for task_id, task in problems.items():
        # Emit a progress line for every tenth task number.
        task_number = int(task_id.split("/")[1])
        if task_number % 10 == 0:
            print(f"Processing task {task_id}...")

        generated = generate_code_with_transformers(
            model, tokenizer, task["prompt"], num_samples=num_samples_per_task
        )
        samples.extend({"task_id": task_id, "completion": text} for text in generated)

    return samples

In [4]:
# GGUF quantizations of Qwen2.5-Coder-7B-Instruct, keyed by "<model>.<quantization level>".
# Every path has the form models/quant_models/<level>/<model>.<level>.gguf, so the
# mapping is generated from the ordered list of quantization levels.
quantize_models = {
    f"Qwen2.5-Coder-7B-Instruct.{level}": f"models/quant_models/{level}/Qwen2.5-Coder-7B-Instruct.{level}.gguf"
    for level in (
        "Q2_K",
        "Q3_K_L", "Q3_K_M", "Q3_K_S",
        "Q4_0", "Q4_1", "Q4_K_M", "Q4_K_S",
        "Q5_0", "Q5_1", "Q5_K_M", "Q5_K_S",
        "Q6_K",
        "Q8_0",
    )
}

# Qwen2.5 checkpoints: display name -> local checkpoint directory.
# NOTE(review): the trailing "# +" markers presumably flag models that have already
# been evaluated — confirm with the notebook author.
base_models = {
    "Qwen2.5-1.5B": "models/base_models/Qwen2.5-1.5B", # +
    "Qwen2.5-1.5B-Instruct": "models/base_models/Qwen2.5-1.5B-Inst", # +
    "Qwen2.5-Coder-1.5B": "models/base_models/Qwen2.5-Coder-1.5B", # +
    "Qwen2.5-Coder-1.5B-Instruct": "models/base_models/Qwen2.5-Coder-1.5B-Inst", # +
    "Qwen2.5-7B": "models/base_models/Qwen2.5-7B", # +
    "Qwen2.5-7B-Instruct": "models/base_models/Qwen2.5-7B-Inst", # +
    "Qwen2.5-Coder-7B": "models/base_models/Qwen2.5-Coder-7B", # +
    "Qwen2.5-Coder-7B-Instruct": "models/base_models/Qwen2.5-Coder-7B-Inst", # +
    "Qwen2.5-14B": "models/base_models/Qwen2.5-14B",
    "Qwen2.5-14B-Instruct": "models/base_models/Qwen2.5-14B-Inst",
    "Qwen2.5-Coder-14B": "models/base_models/Qwen2.5-Coder-14B",
    "Qwen2.5-Coder-14B-Instruct": "models/base_models/Qwen2.5-Coder-14B-Inst",
}

# DeepSeek-R1 distilled checkpoints: display name -> checkpoint location.
# NOTE(review): the 1.5B entry is an "org/name" identifier rather than a path under
# models/distill_models/ like the others — presumably loaded from the Hub; confirm.
distill_models = {
    "DeepSeek-R1-Distill-Llama-70B": "models/distill_models/DeepSeek-R1-Distill-Llama-70B",
    "DeepSeek-R1-Distill-Qwen-32B": "models/distill_models/DeepSeek-R1-Distill-Qwen-32B",
    "DeepSeek-R1-Distill-Qwen-14B": "models/distill_models/DeepSeek-R1-Distill-Qwen-14B",  # +
    "DeepSeek-R1-Distill-Llama-8B": "models/distill_models/DeepSeek-R1-Distill-Llama-8B",  # +
    "DeepSeek-R1-Distill-Qwen-7B": "models/distill_models/DeepSeek-R1-Distill-Qwen-7B",  # +
    "DeepSeek-R1-Distill-Qwen-1.5B": "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",  # +
}

# Qwen3 checkpoints (Base, FP8 and default variants): display name -> local directory.
# NOTE(review): the trailing "# +" markers presumably flag models that have already
# been evaluated — confirm with the notebook author.
qwen3_models = {
    'Qwen3-0.6B-Base': 'models/qwen3_models/Qwen3-0.6B-Base', # +
    'Qwen3-0.6B-FP8': 'models/qwen3_models/Qwen3-0.6B-FP8',
    'Qwen3-0.6B': 'models/qwen3_models/Qwen3-0.6B', # +
    'Qwen3-1.7B-Base': 'models/qwen3_models/Qwen3-1.7B-Base', # +
    'Qwen3-1.7B-FP8': 'models/qwen3_models/Qwen3-1.7B-FP8',
    'Qwen3-1.7B': 'models/qwen3_models/Qwen3-1.7B', # +
    'Qwen3-4B-Base': 'models/qwen3_models/Qwen3-4B-Base', # +
    'Qwen3-4B-FP8': 'models/qwen3_models/Qwen3-4B-FP8',
    'Qwen3-4B': 'models/qwen3_models/Qwen3-4B', # +
    'Qwen3-8B-Base': 'models/qwen3_models/Qwen3-8B-Base', # +
    'Qwen3-8B-FP8': 'models/qwen3_models/Qwen3-8B-FP8',
    'Qwen3-8B': 'models/qwen3_models/Qwen3-8B',
    'Qwen3-14B-Base': 'models/qwen3_models/Qwen3-14B-Base',
    'Qwen3-14B-FP8': 'models/qwen3_models/Qwen3-14B-FP8',
    'Qwen3-14B': 'models/qwen3_models/Qwen3-14B',
    'Qwen3-30B-A3B-Base': 'models/qwen3_models/Qwen3-30B-A3B-Base',
    'Qwen3-30B-A3B-FP8': 'models/qwen3_models/Qwen3-30B-A3B-FP8',
    'Qwen3-30B-A3B': 'models/qwen3_models/Qwen3-30B-A3B',
}

In [5]:
# Load the HumanEval problems; the trailing expression displays how many were loaded.
problems = read_problems()
len(problems)

164

In [8]:
# print(problems['HumanEval/0']['prompt'])

# Base models

In [7]:
# model.bfloat16()

In [8]:
# results = evaluate_hf_model_on_humaneval(model.bfloat16(), tokenizer, problems, num_samples_per_task=1) # Pass@1

In [None]:
%%time
OUTPUT_DIR = "HE_results"
# Create the output directory up front so a missing directory cannot make the
# write fail after a long evaluation has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluate every Qwen2.5 checkpoint in `base_models` and save one .jsonl per model.
for model_name, model_path in base_models.items():
    # Release the previous model/results before loading the next checkpoint.
    # Rebinding to None also works on a fresh kernel where the names do not exist yet,
    # so the collection and cache flush below always run.
    model = results = None
    gc.collect()
    torch.cuda.empty_cache()
    print(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype="auto")

    results = evaluate_hf_model_on_humaneval(model, tokenizer, problems, num_samples_per_task=1) # Pass@1
    output_path = f"{OUTPUT_DIR}/{model_name.replace('/', '_')}_results.jsonl"
    write_jsonl(output_path, results)
    print(f"Evaluation complete. Results saved to {output_path}")

In [None]:
%%time
OUTPUT_DIR = "HE_results"
# Create the output directory up front so a missing directory cannot make the
# write fail after a long evaluation has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluate every Qwen3 checkpoint in `qwen3_models` and save one .jsonl per model.
for model_name, model_path in qwen3_models.items():
    # Release the previous model/results before loading the next checkpoint.
    # Rebinding to None also works on a fresh kernel where the names do not exist yet,
    # so the collection and cache flush below always run.
    model = results = None
    gc.collect()
    torch.cuda.empty_cache()
    print(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype="auto")

    results = evaluate_hf_model_on_humaneval(model, tokenizer, problems, num_samples_per_task=1) # Pass@1
    output_path = f"{OUTPUT_DIR}/{model_name.replace('/', '_')}_results.jsonl"
    write_jsonl(output_path, results)
    print(f"Evaluation complete. Results saved to {output_path}")

In [8]:
# Spot-check the first completion of whichever evaluation run last assigned `results`.
print(results[0]['completion'])

    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j]) < threshold:
                return True
    return False


if __name__ == "__main__":
    from doctest import testmod

    testmod()


# Distill models

In [None]:
%%time
OUTPUT_DIR = "HE_results"
# Create the output directory up front so a missing directory cannot make the
# write fail after a long evaluation has already finished.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Evaluate every DeepSeek-R1 distilled checkpoint in `distill_models` and save one .jsonl per model.
for model_name, model_path in distill_models.items():
    # Release the previous model/results before loading the next checkpoint.
    # Rebinding to None also works on a fresh kernel where the names do not exist yet,
    # so the collection and cache flush below always run.
    model = results = None
    gc.collect()
    torch.cuda.empty_cache()
    print(model_name)

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", torch_dtype="auto")

    results = evaluate_hf_model_on_humaneval(model, tokenizer, problems, num_samples_per_task=1) # Pass@1
    output_path = f"{OUTPUT_DIR}/{model_name.replace('/', '_')}_results.jsonl"
    write_jsonl(output_path, results)
    print(f"Evaluation complete. Results saved to {output_path}")

# Results

## Base Qwen2.5 models

In [19]:
# Score the saved Qwen2.5-Coder-1.5B-Instruct completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-Coder-1.5B-Instruct_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 9006.66it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:01<00:00, 87.91it/s]
Writing results to HE_results/Qwen2.5-Coder-1.5B-Instruct_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 9230.13it/s]
{'pass@1': 0.13414634146341464}


In [20]:
# Score the saved Qwen2.5-Coder-1.5B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-Coder-1.5B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 10980.55it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 48.55it/s]
Writing results to HE_results/Qwen2.5-Coder-1.5B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 10896.02it/s]
{'pass@1': 0.1951219512195122}


In [23]:
# Score the saved Qwen2.5-1.5B-Instruct completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-1.5B-Instruct_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 11389.64it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 48.70it/s]
Writing results to HE_results/Qwen2.5-1.5B-Instruct_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 6040.22it/s]
{'pass@1': 0.21341463414634146}


In [24]:
# Score the saved Qwen2.5-1.5B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-1.5B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 6350.31it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 46.91it/s]
Writing results to HE_results/Qwen2.5-1.5B_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 9545.87it/s]
{'pass@1': 0.1951219512195122}


In [9]:
# Score the saved Qwen2.5-7B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-7B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 11848.52it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:02<00:00, 78.29it/s]
Writing results to HE_results/Qwen2.5-7B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 13124.21it/s]
{'pass@1': 0.27439024390243905}


In [10]:
# Score the saved Qwen2.5-7B-Instruct completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-7B-Instruct_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 3796.97it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:02<00:00, 62.33it/s]
Writing results to HE_results/Qwen2.5-7B-Instruct_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 7753.66it/s]
{'pass@1': 0.18292682926829268}


In [12]:
# Score the saved Qwen2.5-Coder-7B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-Coder-7B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 4039.69it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:01<00:00, 95.70it/s]
Writing results to HE_results/Qwen2.5-Coder-7B_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 5688.09it/s]
{'pass@1': 0.2865853658536585}


In [13]:
# Score the saved Qwen2.5-Coder-7B-Instruct completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-Coder-7B-Instruct_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 13923.85it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:02<00:00, 65.55it/s]
Writing results to HE_results/Qwen2.5-Coder-7B-Instruct_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 5806.88it/s]
{'pass@1': 0.24390243902439024}


In [23]:
# Score the completions of the Q2_K GGUF quantization of Qwen2.5-Coder-7B-Instruct (prints pass@1).
# NOTE(review): this file is read from the working directory rather than HE_results/, and no
# visible cell of this notebook writes it — confirm where it was generated.
!python human-eval/human_eval/evaluate_functional_correctness.py Qwen2.5-Coder-7B-Instruct.Q2_K.gguf_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 10306.50it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 50.53it/s]
Writing results to Qwen2.5-Coder-7B-Instruct.Q2_K.gguf_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 14663.21it/s]
{'pass@1': 0.11585365853658537}


## Distill R1 models

In [1]:
# Score the saved DeepSeek-R1-Distill-Llama-8B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/DeepSeek-R1-Distill-Llama-8B_results.jsonl

Reading samples...
164it [00:00, 15308.02it/s]
Running test suites...
100%|████████████████████████████████████████| 164/164 [00:01<00:00, 135.48it/s]
Writing results to HE_results/DeepSeek-R1-Distill-Llama-8B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 17516.77it/s]
{'pass@1': 0.08536585365853659}


In [2]:
# Score the saved DeepSeek-R1-Distill-Qwen-7B completions with the HumanEval harness (prints pass@1).
# NOTE(review): the recorded run reports pass@1 = 0.0 — inspect the saved completions
# before treating this as a measure of model quality.
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/DeepSeek-R1-Distill-Qwen-7B_results.jsonl

Reading samples...
164it [00:00, 9365.60it/s]
Running test suites...
100%|████████████████████████████████████████| 164/164 [00:01<00:00, 131.11it/s]
Writing results to HE_results/DeepSeek-R1-Distill-Qwen-7B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 10571.65it/s]
{'pass@1': 0.0}


In [3]:
# Score the saved DeepSeek-R1-Distill-Qwen-1.5B completions with the HumanEval harness (prints pass@1).
# NOTE(review): the recorded run reports pass@1 = 0.0 — inspect the saved completions
# before treating this as a measure of model quality.
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/DeepSeek-R1-Distill-Qwen-1.5B_results.jsonl

Reading samples...
164it [00:00, 6475.31it/s]
Running test suites...
100%|████████████████████████████████████████| 164/164 [00:01<00:00, 130.94it/s]
Writing results to HE_results/DeepSeek-R1-Distill-Qwen-1.5B_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 4819.45it/s]
{'pass@1': 0.0}


In [4]:
# Score the saved DeepSeek-R1-Distill-Qwen-14B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/DeepSeek-R1-Distill-Qwen-14B_results.jsonl

Reading samples...
164it [00:00, 15423.00it/s]
Running test suites...
100%|████████████████████████████████████████| 164/164 [00:01<00:00, 144.21it/s]
Writing results to HE_results/DeepSeek-R1-Distill-Qwen-14B_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 6462.11it/s]
{'pass@1': 0.06097560975609756}


## Base Qwen3 models

In [8]:
# Score the saved Qwen3-0.6B-Base completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-0.6B-Base_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 16312.12it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:04<00:00, 39.61it/s]
Writing results to HE_results/Qwen3-0.6B-Base_results.jsonl_results.jsonl...
100%|███████████████████████████████████████| 164/164 [00:00<00:00, 9824.69it/s]
{'pass@1': 0.12195121951219512}


In [12]:
# Score the saved Qwen3-0.6B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-0.6B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 9675.03it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 47.56it/s]
Writing results to HE_results/Qwen3-0.6B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 10282.92it/s]
{'pass@1': 0.042682926829268296}


In [14]:
# Score the saved Qwen3-1.7B-Base completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-1.7B-Base_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 15645.76it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 50.46it/s]
Writing results to HE_results/Qwen3-1.7B-Base_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 21745.89it/s]
{'pass@1': 0.1951219512195122}


In [15]:
# Score the saved Qwen3-1.7B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-1.7B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 14485.96it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 52.45it/s]
Writing results to HE_results/Qwen3-1.7B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 17173.03it/s]
{'pass@1': 0.024390243902439025}


In [16]:
# Score the saved Qwen3-4B-Base completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-4B-Base_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 17335.33it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 42.01it/s]
Writing results to HE_results/Qwen3-4B-Base_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 23379.30it/s]
{'pass@1': 0.2621951219512195}


In [17]:
# Score the saved Qwen3-4B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-4B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 12318.07it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:02<00:00, 60.69it/s]
Writing results to HE_results/Qwen3-4B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 10896.54it/s]
{'pass@1': 0.1951219512195122}


In [18]:
# Score the saved Qwen3-8B-Base completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-8B-Base_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 16246.63it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:03<00:00, 48.44it/s]
Writing results to HE_results/Qwen3-8B-Base_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 21995.52it/s]
{'pass@1': 0.2804878048780488}


In [19]:
# Score the saved Qwen3-8B completions with the HumanEval harness (prints pass@1).
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen3-8B_results.jsonl

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Reading samples...
164it [00:00, 18149.97it/s]
Running test suites...
100%|█████████████████████████████████████████| 164/164 [00:02<00:00, 78.22it/s]
Writing results to HE_results/Qwen3-8B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 10686.62it/s]
{'pass@1': 0.11585365853658537}


In [1]:
# Score the saved Qwen2.5-14B completions with the HumanEval harness (prints pass@1).
# NOTE(review): this is a Qwen2.5 model although the cell sits under the Qwen3 heading, and the
# recorded run reports pass@1 = 0.0 — inspect the saved completions before trusting the number.
!python human-eval/human_eval/evaluate_functional_correctness.py HE_results/Qwen2.5-14B_results.jsonl

Reading samples...
164it [00:00, 24123.79it/s]
Running test suites...
100%|████████████████████████████████████████| 164/164 [00:01<00:00, 102.65it/s]
Writing results to HE_results/Qwen2.5-14B_results.jsonl_results.jsonl...
100%|██████████████████████████████████████| 164/164 [00:00<00:00, 34159.30it/s]
{'pass@1': 0.0}


In [None]:
# evaluate_hf_model_on_humaneval(model, tokenizer, problems, num_samples_per_task=10)  # Pass@10