In [1]:
!pip install torch transformers datasets 'accelerate>=0.26.0' -q

In [2]:
from datasets import load_dataset

ds = load_dataset("nuprl/engineering-llm-systems", "humaneval", split="test")

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base") # NON GPU
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")

In [4]:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base").to("mps") # GPU

In [5]:
example_inputs = tokenizer("Shakespeare was a great", return_tensors="pt").to(model.device)
example_outputs = model.generate(
    **example_inputs,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.2,
    max_new_tokens=300)
tokenizer.decode(example_outputs[0])

'Shakespeare was a great writer. His plays are still read today. He wrote about love, war, and politics. He wrote about people. He wrote about the human condition. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote about the human condition with great insight. He wrote ab

In [64]:
def clip_completions(completion, clip_at = ["\ndef", "\nclass", "\nif", "\nprint", "<|endoftext|>"]):
    # split at each successively
    result = completion

    for clip in clip_at:
        result = result.split(clip)[0]

    return result    

In [65]:
import json

def generate_completions(prompt, count = 5):
    completions = []
    for i in range(count):
        inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)
        example_outputs = model.generate(
            **inputs,
            pad_token_id = tokenizer.eos_token_id,
            do_sample = True,
            temperature = 0.2,
            max_new_tokens = 300)
        result = tokenizer.decode(example_outputs[0])

        clipped_result = clip_completions(result)
        completions.append(clipped_result)
        
        # print(f"{i + 1}: {result[:100].strip()}...\n")
    return completions

In [30]:
generate_completions("def bogosort(")

['def bogosort(ary):\n    for i in range(len(ary)):\n        for j in range(len(ary)-1):\n            if ary[j] > ary[j+1]:\n                ary[j],ary[j+1] = ary[j+1],ary[j]\n    return ary\n',
 'def bogosort(ary):\n    for i in range(len(ary)):\n        for j in range(len(ary)-1):\n            if ary[j] > ary[j+1]:\n                ary[j], ary[j+1] = ary[j+1], ary[j]\n    return ary\n',
 'def bogosort(ary):\n    for i in range(len(ary)):\n        for j in range(len(ary)-1):\n            if ary[j] > ary[j+1]:\n                ary[j], ary[j+1] = ary[j+1], ary[j]\n    return ary\n',
 'def bogosort(ary):\n    while True:\n        for i in range(len(ary)):\n            if i == 0:\n                if ary[i] > ary[i+1]:\n                    ary[i], ary[i+1] = ary[i+1], ary[i]\n            else:\n                if ary[i] > ary[i-1]:\n                    ary[i], ary[i-1] = ary[i-1], ary[i]\n        if ary == sorted(ary):\n            break\n    return ary\n',
 'def bogosort(ary):\n    while 

In [34]:
print(len(ds))

161


In [36]:
print(ds[0])

{'name': 'HumanEval_23_strlen', 'prompt': 'def strlen(string: str) -> int:\n    """ Return length of given string\n    >>> strlen(\'\')\n    0\n    >>> strlen(\'abc\')\n    3\n    """', 'tests': "def check(candidate):\n    assert candidate('') == 0\n    assert candidate('x') == 1\n    assert candidate('asdasnakj') == 9\n\ndef test_check():\n    check(strlen)\n\ntest_check()\n"}


In [35]:
results = []

for index, sample in enumerate(ds):
    completions = generate_completions(sample["prompt"])
    results.append((completions, sample["tests"], sample["prompt"])
    print(f"Sample {index + 1} complete")
        
with open("completions.json", "w") as f:
    json.dump(results, f)

#open and read the file after the appending:
with open("completions.json") as f:
    loaded = json.load(f)
    if len(loaded) == len(results):
        print("successfuly wrote completions")
    else:
        print("write failed")

Sample 1 complete
Sample 2 complete
Sample 3 complete
Sample 4 complete
Sample 5 complete
Sample 6 complete
Sample 7 complete
Sample 8 complete
Sample 9 complete
Sample 10 complete
Sample 11 complete
Sample 12 complete
Sample 13 complete
Sample 14 complete
Sample 15 complete
Sample 16 complete
Sample 17 complete
Sample 18 complete
Sample 19 complete
Sample 20 complete
Sample 21 complete
Sample 22 complete
Sample 23 complete
Sample 24 complete
Sample 25 complete
Sample 26 complete
Sample 27 complete
Sample 28 complete
Sample 29 complete
Sample 30 complete
Sample 31 complete
Sample 32 complete
Sample 33 complete
Sample 34 complete
Sample 35 complete
Sample 36 complete
Sample 37 complete
Sample 38 complete
Sample 39 complete
Sample 40 complete
Sample 41 complete
Sample 42 complete
Sample 43 complete
Sample 44 complete
Sample 45 complete
Sample 46 complete
Sample 47 complete
Sample 48 complete
Sample 49 complete
Sample 50 complete
Sample 51 complete
Sample 52 complete
Sample 53 complete
Sa

In [37]:
def load_completions(filename = "completions.json"):
    with open(filename, 'r') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} samples with completions")
    return data

In [83]:
data = load_completions()

prompt_examples = '''Write a function defintion based on the specification:
def strlen(string: str) -> int:
    """ Return length of given string
    >>> strlen('')
    0
    >>> strlen('abc')
    3
    """
    
Expected Output:
def strlen(string: str) -> int:
    return len(string)



Write a function defintion based on the specification:
def add(x: int, y: int) -> int:
    """Add two numbers x and y
    >>> add(2, 3)
    5
    >>> add(5, 7)
    12
    """

Expected Output:
def add(x: int, y: int) -> int:
    return x + y



Write a function defintion based on the specification:
def fibfib(n: int) -> int:
    """The FibFib number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:
    fibfib(0) == 0
    fibfib(1) == 0
    fibfib(2) == 1
    fibfib(n) == fibfib(n-1) + fibfib(n-2) + fibfib(n-3).
    Please write a function to efficiently compute the n-th element of the fibfib number sequence.
    >>> fibfib(1)
    0
    >>> fibfib(5)
    4
    >>> fibfib(8)
    24
    """

Expected Output:
def fibfib(n: int) -> int:
    if n == 0:
        return 0
    elif n == 1:
        return 0
    elif n == 2:
        return 1
    else:
        return fibfib(n - 1) + fibfib(n - 2) + fibfib(n - 3)



Write a function defintion based on the specification:
def count_distinct_characters(string: str) -> int:
    """ Given a string, find out how many distinct characters (regardless of case) does it consist of
    >>> count_distinct_characters('xyzXYZ')
    3
    >>> count_distinct_characters('Jerry')
    4
    """

Expected Output:
def count_distinct_characters(string: str) -> int:
    return len(set(string.lower()))
'''

def make_prompt(sample):
    return f"""{prompt_examples}



Write a function defintion based on the specification:
{sample["prompt"]}

Expected Output:
"""


check = 0

# print(data[check])
print(data[check][0][0])

Loaded 1 samples with completions
Write a function defintion based on the specification:
from typing import List



In [84]:
print(make_prompt({"prompt": data[check][2]}))

Write a function defintion based on the specification:
def strlen(string: str) -> int:
    """ Return length of given string
    >>> strlen('')
    0
    >>> strlen('abc')
    3
    """
    
Expected Output:
def strlen(string: str) -> int:
    return len(string)



Write a function defintion based on the specification:
def add(x: int, y: int) -> int:
    """Add two numbers x and y
    >>> add(2, 3)
    5
    >>> add(5, 7)
    12
    """

Expected Output:
def add(x: int, y: int) -> int:
    return x + y



Write a function defintion based on the specification:
def fibfib(n: int) -> int:
    """The FibFib number sequence is a sequence similar to the Fibbonacci sequnece that's defined as follows:
    fibfib(0) == 0
    fibfib(1) == 0
    fibfib(2) == 1
    fibfib(n) == fibfib(n-1) + fibfib(n-2) + fibfib(n-3).
    Please write a function to efficiently compute the n-th element of the fibfib number sequence.
    >>> fibfib(1)
    0
    >>> fibfib(5)
    4
    >>> fibfib(8)
    24
    """

E

In [85]:
out = generate_completions(make_prompt({"prompt": data[check][2]}), count = 1)

In [88]:
print(out[0])

Write a function defintion based on the specification:


In [90]:
import random

random.random()

0.6394267984578837