In [1]:
!pip install torch transformers datasets 'accelerate>=0.26.0' -q

In [2]:
from datasets import load_dataset

ds = load_dataset("nuprl/engineering-llm-systems", "humaneval", split="test")

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base") # NON GPU
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B-Base")

In [4]:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B-Base").to("mps") # GPU

In [5]:
example_inputs = tokenizer("Shakespeare was a great", return_tensors="pt").to(model.device)
example_outputs = model.generate(
    **example_inputs,
    pad_token_id=tokenizer.eos_token_id,
    do_sample=True,
    temperature=0.2,
    max_new_tokens=300)
tokenizer.decode(example_outputs[0])

'Shakespeare was a great writer. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great plays. He wrote many great pl

In [6]:
def clip_completions(completion, clip_at = ["\ndef", "\nclass", "\nif", "\nprint"]):
    # split at each successively
    result = completion

    for clip in clip_at:
        result = result.split(clip)[0]

    return result    

In [7]:
import json

def generate_completions(prompt, count = 5):
    completions = []
    for i in range(count):
        inputs = tokenizer(prompt, return_tensors = "pt").to(model.device)
        example_outputs = model.generate(
            **inputs,
            pad_token_id = tokenizer.eos_token_id,
            do_sample = True,
            temperature = 0.2,
            max_new_tokens = 300)
        result = tokenizer.decode(example_outputs[0])

        clipped_result = clip_completions(result)
        completions.append(clipped_result)
        
        print(f"{i + 1}: {result[:100].strip()}...\n")

        
    with open("completions.json", "w") as f:
        json.dump(completions, f)
    
    #open and read the file after the appending:
    with open("completions.json") as f:
        loaded = json.load(f)
        if len(loaded) == len(completions):
            print("successfuly wrote completions")
        else:
            print("write failed")

In [8]:
generate_completions("def bogosort(")

1: def bogosort(ary):
    while True:
        for i in range(len(ary) - 1):
            if ary[i] > ary...

2: def bogosort(ary):
    while True:
        for i in range(len(ary)):
            if i == 0:...

3: def bogosort(ary):
    while True:
        for i in range(len(ary)):
            for j in range(len(...

4: def bogosort(ary):
    while True:
        for i in range(len(ary)):
            if i == 0:...

5: def bogosort(ary):
    while len(ary) > 1:
        for i in range(len(ary) - 1):
            if ary[...

successfuly wrote completions


In [23]:
def preprocess(data):
    inputs = tokenizer(data["prompt"], truncation=True, padding="max_length", max_length=256)
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

ds_tokenized = ds.map(preprocess, batched=True, remove_columns=ds.column_names)

Map:   0%|          | 0/161 [00:00<?, ? examples/s]

In [24]:
from transformers import Trainer, TrainingArguments

args = TrainingArguments(
    output_dir = "./results",
    per_device_eval_batch_size = 8,
    do_eval = True,
)

trainer = Trainer(
    model = model,
    args = args,
    tokenizer = tokenizer,
    eval_dataset = ds_tokenized,
)

metrics = trainer.evaluate()
print(metrics)

  trainer = Trainer(


{'eval_loss': 4.607391357421875, 'eval_model_preparation_time': 0.0045, 'eval_runtime': 17.08, 'eval_samples_per_second': 9.426, 'eval_steps_per_second': 1.23}
