In [3]:
# !pip install --upgrade pip
# !pip install -q unsloth peft trl wandb evaluate jsonlines human-eval
# !pip install -q datasets transformers accelerate einops tiktoken tqdm
# !pip uninstall xformers -y
# !pip install xformers

In [4]:
import json
from datasets import Dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer
from transformers import TrainingArguments, EarlyStoppingCallback
import wandb
from pathlib import Path
import re
import torch

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!


In [5]:
# Step 1: Load and prepare the dataset
def load_dataset(file_path):
    """Load a JSONL file of coding problems and build a single-column SFT dataset.

    Every non-blank line of ``file_path`` is parsed as one JSON object that must
    provide the keys ``problem_description``, ``starter_code`` and ``completion``.
    Each record is rendered into one training string: the instruction prompt,
    followed by the reference solution and the ``<|im_end|>`` end marker.

    Args:
        file_path: Path to the JSONL training file.

    Returns:
        The result of ``Dataset.from_list`` over ``{"text": ...}`` rows.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Read as UTF-8 explicitly: the platform default encoding is not guaranteed
    # to be UTF-8 and problem statements may contain non-ASCII characters.
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank lines (e.g. a stray empty line at the end of the file) are
        # skipped instead of crashing json.loads.
        data = [json.loads(line) for line in f if line.strip()]

    # Format each example into a prompt-completion pair for the instruct model
    formatted_data = []
    for item in data:
        prompt = f"""Below is a coding problem. Complete the function as per the description.

Problem:
{item['problem_description']}

Complete the following function:
{item['starter_code']}

Solution:
"""
        completion = f"{item['completion']}\n<|im_end|>"  # Explicit end-of-turn marker
        formatted_data.append({"text": prompt + completion})

    return Dataset.from_list(formatted_data)

# Load the JSONL training data (adjust the path to your own Drive location).
dataset = load_dataset('/content/drive/MyDrive/QwenNoQuen/train4k_clean.jsonl')
# Hold out 2% for validation. The seed is fixed so the split -- and therefore
# eval_loss and early stopping -- is reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=0.02, seed=3407)

In [6]:
# Step 2: Load model and tokenizer with Unsloth (4-bit quantization)
# model_id and max_seq_length are module-level settings; max_seq_length is reused
# by the tokenization and trainer cells further down.
model_id = "Qwen/Qwen2.5-Coder-3B-Instruct"
max_seq_length = 3072  # Truncation length in tokens; kept well below the model's maximum context for efficiency

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_id,
    max_seq_length=max_seq_length,
    dtype=None,  # Auto-detect (the logged run reports Bfloat16 = TRUE)
    load_in_4bit=True,  # Enable 4-bit quantization
)

==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [7]:
# Step 3: Apply LoRA adapters via Unsloth
# NOTE: this rebinds `model` to the PEFT-wrapped model, so the cell is not
# idempotent -- re-run the model-loading cell before running it a second time.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Rank of LoRA adapters
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],  # Attention + MLP projections
    lora_alpha=32,
    # Non-zero dropout: the Unsloth log for this cell warns that only dropout = 0
    # is supported for fast patching, so this setting incurs a performance hit.
    lora_dropout=0.05,
    bias="none",
    use_gradient_checkpointing="unsloth",  # Efficient checkpointing
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.11.3 patched 36 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [8]:
# Step 4: Tokenize the rendered "text" column (truncation only)
def tokenize_function(examples):
    """Tokenize a batch of examples, truncating each text to ``max_seq_length`` tokens."""
    batch_texts = examples["text"]
    return tokenizer(batch_texts, max_length=max_seq_length, truncation=True)

# Batched map over the train/test splits of `dataset`.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3756 [00:00<?, ? examples/s]

Map:   0%|          | 0/77 [00:00<?, ? examples/s]

In [10]:
# Authenticate with Weights & Biases. wandb.login() prompts for the API key
# interactively when no stored credentials are found. Never paste the key into
# the notebook source; any key that was committed here should be rotated.
wandb.login()
wandb.init(project="demo-colab", name="4Kdata")

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m23020356[0m ([33m23020356-vnu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [11]:
# Step 5: Set up training arguments
# Evaluation and checkpointing both run every 100 steps so that
# load_best_model_at_end can pick the checkpoint with the lowest eval_loss.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/QwenNoQuen/fine_tuned_qwen",
    num_train_epochs=3,  # Few epochs to limit overfitting on a small training set (3,756 examples in the logged run)
    per_device_train_batch_size=4,  # Adjust based on GPU memory
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # Effective batch size 16
    learning_rate=2e-5,
    weight_decay=0.01,
    eval_strategy="steps",  # Evaluate by step count rather than per epoch
    eval_steps=100,  # Evaluate every 100 steps
    save_total_limit=3,
    save_strategy="steps",  # Kept as "steps" to stay consistent with eval_strategy
    save_steps=100,  # Save a checkpoint every 100 steps (may be set to a different value)
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    fp16=False,
    bf16=True,
    report_to="wandb",
    logging_steps=10,  # Log every 10 steps
    optim="adamw_8bit",
    max_grad_norm=0.3,
)

In [12]:
# Step 6: Initialize SFTTrainer for supervised fine-tuning with EarlyStoppingCallback
# The datasets passed here were already tokenized in Step 4 and still carry the
# raw "text" column that dataset_text_field points at.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=False,  # One example per sequence; no packing of multiple examples
    # Early stopping is driven by metric_for_best_model (eval_loss) from training_args.
    # NOTE(review): patience=3 / threshold=0.01 presumably stops after three evaluations
    # without a 0.01 improvement -- confirm against the transformers documentation.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3, early_stopping_threshold=0.01)],
)

In [None]:
# Step 7: Train the model
# Runs the fine-tuning loop configured in training_args; metrics are reported to
# W&B (report_to="wandb") and checkpoints are written under output_dir.
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 3,756 | Num Epochs = 3 | Total steps = 705
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 29,933,568 of 3,115,872,256 (0.96% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
100,0.2819,0.294523
200,0.242,0.273204


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [None]:
# Step 8: Save the fine-tuned model and tokenizer to Drive
# NOTE(review): the reload cell below loads the base model separately and then
# attaches this directory via load_adapter, which suggests only the LoRA adapter
# weights are stored here rather than a merged base model -- confirm.
save_dir = "/content/drive/MyDrive/QwenNoQuen/fine_tuned_qwen_4K"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
# 1. Reload the base model in 4-bit (same model name and sequence length as Step 2)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "Qwen/Qwen2.5-Coder-3B-Instruct", # Base model name
    max_seq_length = 3072,
    dtype = None, # None lets the loader pick the dtype automatically
    load_in_4bit = True,
)

# 2. Load the fine-tuned adapter weights
# This is the important step: it attaches the saved adapter to the base model.
# `save_dir` comes from the save cell above, so that cell must have run first.
print("ƒêang t·∫£i adapter...")
model.load_adapter(save_dir)

# 3. Optimize the model for inference
FastLanguageModel.for_inference(model)
print("T·∫£i m√¥ h√¨nh v√† adapter th√†nh c√¥ng!")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# ====== PATHS ======
# NOTE(review): base_model and lora_path are not used below -- generation runs on
# the `model` / `tokenizer` objects already loaded in the kernel. Confirm that the
# adapter loaded there is the one you intend to test.
base_model = "Qwen/Qwen2.5-Coder-3B-Instruct"
lora_path = "/content/drive/MyDrive/QwenNoQuen/fine_tuned_qwen_coder"
problem_path = "/content/drive/MyDrive/QwenNoQuen/leetcode_3250.md"

# ====== LOAD THE PROBLEM STATEMENT ======
with open(problem_path, "r", encoding="utf-8") as f:
    problem_text = f.read()

print("Loaded problem description:")
print(problem_text, "...\n")

# ====== MAKE PROMPT ======
prompt = f"""You are an expert competitive programmer.
Solve the following LeetCode problem and write clean, correct Python code.
Do NOT explain. Only output the final code.

Problem:
{problem_text}

Write Python solution:
"""

inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# Remember how many tokens belong to the prompt so only the continuation is decoded.
prompt_length = inputs["input_ids"].shape[1]

# ====== GENERATE CODE ======
# Greedy decoding (do_sample=False). temperature / top_p are intentionally not
# passed: they only apply when sampling and otherwise just trigger warnings.
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=700,
        do_sample=False,
    )

# Decode only the newly generated tokens instead of echoing the whole prompt back.
result = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
print(result)


In [4]:
"""
Evaluate the fine-tuned Qwen model on the HumanEval benchmark
"""

# 1. INSTALL THE REQUIRED LIBRARIES
# !pip install human-eval unsloth torch transformers datasets

import torch
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
from unsloth import FastLanguageModel
import json
from tqdm import tqdm

# 2. LOAD THE FINE-TUNED MODEL
print("ƒêang t·∫£i model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-Coder-3B-Instruct",
    max_seq_length=3072,
    dtype=None,
    load_in_4bit=True,
)

# Load the fine-tuned adapter on top of the base model
# NOTE(review): this directory (fine_tuned_qwen_coder) differs from the one the
# training cell saves to (fine_tuned_qwen_4K) -- confirm which checkpoint is meant.
save_dir = "/content/drive/MyDrive/QwenNoQuen/fine_tuned_qwen_coder"
model.load_adapter(save_dir)
FastLanguageModel.for_inference(model)
print("‚úì ƒê√£ t·∫£i model v√† adapter th√†nh c√¥ng!")

# 3. PROMPT BUILDER FOR QWEN
def create_prompt(problem_prompt):
    """Wrap a HumanEval stub in a system + user chat and render it to a prompt.

    Args:
        problem_prompt: The function stub (signature and docstring) to complete.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the two-message
        conversation, called with ``tokenize=False`` and
        ``add_generation_prompt=True``.
    """
    system_turn = {
        "role": "system",
        "content": "You are an expert Python programmer. Complete the function implementation below. Only output the code, no explanations.",
    }
    user_turn = {
        "role": "user",
        "content": f"Complete this Python function:\n\n{problem_prompt}\n\nProvide only the complete function implementation.",
    }
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

# 4. CODE GENERATION FROM THE MODEL
def _extract_code(response_text):
    """Return the first fenced code block in ``response_text``, else the stripped text."""
    for fence in ("```python", "```"):
        _, found, after_fence = response_text.partition(fence)
        if found:
            # Keep everything up to the closing fence (or the end if it is missing).
            return after_fence.partition("```")[0].strip()
    return response_text.strip()


def generate_code(prompt, max_new_tokens=1024, temperature=0.1, top_p=0.95, do_sample=False):
    """Generate a completion for ``prompt`` and return the code extracted from it.

    Only the newly generated tokens are decoded. Decoding the whole sequence would
    re-include the prompt text; with special tokens stripped that text no longer
    equals ``prompt``, so it could neither be split off reliably nor kept out of
    the code-fence search.

    Args:
        prompt: Fully rendered prompt string (e.g. from ``create_prompt``).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; only used when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold; only used when ``do_sample`` is True.
        do_sample: When False (the default, matching the previous behaviour)
            decoding is greedy and ``temperature`` / ``top_p`` are not forwarded.

    Returns:
        The contents of the first fenced code block in the model's reply, or the
        stripped reply when it contains no code fence.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if do_sample:
        # Sampling knobs are only meaningful (and only accepted silently) when sampling.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt tokens; keep only the continuation produced by the model.
    completion_ids = outputs[0][prompt_length:]
    generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return _extract_code(generated_text)

# 5. EVALUATION ON HUMANEVAL
def evaluate_on_humaneval(output_file="humaneval_samples.jsonl", num_samples=1):
    """Generate completions for every HumanEval problem and score them.

    Each stored completion is the problem's original prompt stub, a newline, and
    the code returned by ``generate_code``.

    Args:
        output_file: JSONL file the generated samples are written to.
        num_samples: Number of solutions generated per problem (usually 1).

    Returns:
        The value returned by ``evaluate_functional_correctness`` for
        ``k=[1]``, ``n_workers=4`` and ``timeout=3.0``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("B·∫ÆT ƒê·∫¶U ƒê√ÅNH GI√Å HUMANEVAL")
    print(rule)

    # Read the benchmark problems
    problems = read_problems()
    print(f"‚úì ƒê√£ t·∫£i {len(problems)} b√†i to√°n t·ª´ HumanEval")

    # Generate code for every problem
    print(f"\nƒêang sinh code cho {len(problems)} b√†i to√°n...")
    samples = []
    for task_id, problem in tqdm(problems.items(), desc="Processing"):
        stub = problem["prompt"]
        for _ in range(num_samples):
            answer = generate_code(create_prompt(stub))
            # Original stub + generated code form the stored completion
            samples.append({"task_id": task_id, "completion": stub + "\n" + answer})

    # Persist the samples
    write_jsonl(output_file, samples)
    print(f"‚úì ƒê√£ l∆∞u {len(samples)} samples v√†o {output_file}")

    # Score functional correctness (pass@1)
    print("\nƒêang ƒë√°nh gi√° ƒë·ªô ch√≠nh x√°c...")
    return evaluate_functional_correctness(
        output_file,
        k=[1],
        n_workers=4,
        timeout=3.0,
    )

# 6. RUN THE EVALUATION
if __name__ == "__main__":
    # One sample per problem (pass@1)
    results = evaluate_on_humaneval(
        num_samples=1,
        output_file="./humaneval_results.jsonl",
    )

    divider = "=" * 60
    print("\n" + divider)
    print("K·∫æT QU·∫¢ ƒê√ÅNH GI√Å")
    print(divider)
    print(json.dumps(results, indent=2))
    print(divider)

    # Persist the metrics as JSON
    with open("/content/humaneval_metrics.json", "w") as metrics_file:
        json.dump(results, metrics_file, indent=2)
    print("‚úì ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o humaneval_metrics.json")

# 7. QUICK CHECK ON A SINGLE PROBLEM
def quick_test():
    """Run generation on the first HumanEval problem and print prompt and output."""
    task_id = "HumanEval/0"  # First problem in the benchmark
    problem = read_problems()[task_id]
    rule = "=" * 60

    print("\n" + rule)
    print("KI·ªÇM TRA NHANH")
    print(rule)
    print(f"\nB√†i to√°n: {task_id}")
    print(f"\nPrompt:\n{problem['prompt']}")

    generated = generate_code(create_prompt(problem['prompt']))

    print(f"\nCode sinh ra:\n{generated}")
    print(rule)

# Quick single-problem sanity check. NOTE: this call is active (not commented out)
# and runs after the full evaluation earlier in this cell.
quick_test()

ƒêang t·∫£i model...
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
‚úì ƒê√£ t·∫£i model v√† adapter th√†nh c√¥ng!

B·∫ÆT ƒê·∫¶U ƒê√ÅNH GI√Å HUMANEVAL
‚úì ƒê√£ t·∫£i 164 b√†i to√°n t·ª´ HumanEval

ƒêang sinh code cho 164 b√†i to√°n...


Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [22:36<00:00,  8.27s/it]


‚úì ƒê√£ l∆∞u 164 samples v√†o ./humaneval_results.jsonl

ƒêang ƒë√°nh gi√° ƒë·ªô ch√≠nh x√°c...
Reading samples...


164it [00:00, 2350.75it/s]


Running test suites...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:11<00:00, 14.41it/s]


Writing results to ./humaneval_results.jsonl_results.jsonl...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:00<00:00, 38853.70it/s]



K·∫æT QU·∫¢ ƒê√ÅNH GI√Å
{
  "pass@1": 0.7682926829268293
}
‚úì ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o humaneval_metrics.json

KI·ªÇM TRA NHANH

B√†i to√°n: HumanEval/0

Prompt:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """


Code sinh ra:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j])

In [3]:
"""
Evaluate the base Qwen model (no fine-tuned adapter is loaded in this cell) on the HumanEval benchmark
"""

# 1. INSTALL THE REQUIRED LIBRARIES
# !pip install human-eval unsloth torch transformers datasets

import torch
from human_eval.data import write_jsonl, read_problems
from human_eval.evaluation import evaluate_functional_correctness
from unsloth import FastLanguageModel
import json
from tqdm import tqdm

# 2. LOAD THE MODEL
# Only the base checkpoint is loaded here -- unlike the other evaluation cell,
# no load_adapter call follows, so this cell scores the un-tuned model.
print("ƒêang t·∫£i model...")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="Qwen/Qwen2.5-Coder-3B-Instruct",
    max_seq_length=3072,
    dtype=None,
    load_in_4bit=True,
)
# 3. PROMPT BUILDER FOR QWEN
def create_prompt(problem_prompt):
    """Wrap a HumanEval stub in a system + user chat and render it to a prompt.

    Args:
        problem_prompt: The function stub (signature and docstring) to complete.

    Returns:
        Whatever ``tokenizer.apply_chat_template`` returns for the two-message
        conversation, called with ``tokenize=False`` and
        ``add_generation_prompt=True``.
    """
    system_turn = {
        "role": "system",
        "content": "You are an expert Python programmer. Complete the function implementation below. Only output the code, no explanations.",
    }
    user_turn = {
        "role": "user",
        "content": f"Complete this Python function:\n\n{problem_prompt}\n\nProvide only the complete function implementation.",
    }
    return tokenizer.apply_chat_template(
        [system_turn, user_turn],
        tokenize=False,
        add_generation_prompt=True,
    )

# 4. CODE GENERATION FROM THE MODEL
def _extract_code(response_text):
    """Return the first fenced code block in ``response_text``, else the stripped text."""
    for fence in ("```python", "```"):
        _, found, after_fence = response_text.partition(fence)
        if found:
            # Keep everything up to the closing fence (or the end if it is missing).
            return after_fence.partition("```")[0].strip()
    return response_text.strip()


def generate_code(prompt, max_new_tokens=1024, temperature=0.1, top_p=0.95, do_sample=False):
    """Generate a completion for ``prompt`` and return the code extracted from it.

    Only the newly generated tokens are decoded. Decoding the whole sequence would
    re-include the prompt text; with special tokens stripped that text no longer
    equals ``prompt``, so it could neither be split off reliably nor kept out of
    the code-fence search.

    Args:
        prompt: Fully rendered prompt string (e.g. from ``create_prompt``).
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature; only used when ``do_sample`` is True.
        top_p: Nucleus-sampling threshold; only used when ``do_sample`` is True.
        do_sample: When False (the default, matching the previous behaviour)
            decoding is greedy and ``temperature`` / ``top_p`` are not forwarded.

    Returns:
        The contents of the first fenced code block in the model's reply, or the
        stripped reply when it contains no code fence.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[1]

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )
    if do_sample:
        # Sampling knobs are only meaningful (and only accepted silently) when sampling.
        generation_kwargs.update(temperature=temperature, top_p=top_p)

    with torch.no_grad():
        outputs = model.generate(**inputs, **generation_kwargs)

    # Drop the prompt tokens; keep only the continuation produced by the model.
    completion_ids = outputs[0][prompt_length:]
    generated_text = tokenizer.decode(completion_ids, skip_special_tokens=True)

    return _extract_code(generated_text)

# 5. EVALUATION ON HUMANEVAL
def evaluate_on_humaneval(output_file="humaneval_samples.jsonl", num_samples=1):
    """Generate completions for every HumanEval problem and score them.

    Each stored completion is the problem's original prompt stub, a newline, and
    the code returned by ``generate_code``.

    Args:
        output_file: JSONL file the generated samples are written to.
        num_samples: Number of solutions generated per problem (usually 1).

    Returns:
        The value returned by ``evaluate_functional_correctness`` for
        ``k=[1]``, ``n_workers=4`` and ``timeout=3.0``.
    """
    rule = "=" * 60
    print("\n" + rule)
    print("B·∫ÆT ƒê·∫¶U ƒê√ÅNH GI√Å HUMANEVAL")
    print(rule)

    # Read the benchmark problems
    problems = read_problems()
    print(f"‚úì ƒê√£ t·∫£i {len(problems)} b√†i to√°n t·ª´ HumanEval")

    # Generate code for every problem
    print(f"\nƒêang sinh code cho {len(problems)} b√†i to√°n...")
    samples = []
    for task_id, problem in tqdm(problems.items(), desc="Processing"):
        stub = problem["prompt"]
        for _ in range(num_samples):
            answer = generate_code(create_prompt(stub))
            # Original stub + generated code form the stored completion
            samples.append({"task_id": task_id, "completion": stub + "\n" + answer})

    # Persist the samples
    write_jsonl(output_file, samples)
    print(f"‚úì ƒê√£ l∆∞u {len(samples)} samples v√†o {output_file}")

    # Score functional correctness (pass@1)
    print("\nƒêang ƒë√°nh gi√° ƒë·ªô ch√≠nh x√°c...")
    return evaluate_functional_correctness(
        output_file,
        k=[1],
        n_workers=4,
        timeout=3.0,
    )

# 6. RUN THE EVALUATION
if __name__ == "__main__":
    # One sample per problem (pass@1)
    results = evaluate_on_humaneval(
        num_samples=1,
        output_file="./humaneval_results.jsonl",
    )

    divider = "=" * 60
    print("\n" + divider)
    print("K·∫æT QU·∫¢ ƒê√ÅNH GI√Å")
    print(divider)
    print(json.dumps(results, indent=2))
    print(divider)

    # Persist the metrics as JSON
    with open("/content/humaneval_metrics.json", "w") as metrics_file:
        json.dump(results, metrics_file, indent=2)
    print("‚úì ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o humaneval_metrics.json")

# 7. QUICK CHECK ON A SINGLE PROBLEM
def quick_test():
    """Run generation on the first HumanEval problem and print prompt and output."""
    task_id = "HumanEval/0"  # First problem in the benchmark
    problem = read_problems()[task_id]
    rule = "=" * 60

    print("\n" + rule)
    print("KI·ªÇM TRA NHANH")
    print(rule)
    print(f"\nB√†i to√°n: {task_id}")
    print(f"\nPrompt:\n{problem['prompt']}")

    generated = generate_code(create_prompt(problem['prompt']))

    print(f"\nCode sinh ra:\n{generated}")
    print(rule)

# Quick single-problem sanity check. NOTE: this call is active (not commented out)
# and runs after the full evaluation earlier in this cell.
quick_test()

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
ƒêang t·∫£i model...
==((====))==  Unsloth 2025.11.3: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]


B·∫ÆT ƒê·∫¶U ƒê√ÅNH GI√Å HUMANEVAL
‚úì ƒê√£ t·∫£i 164 b√†i to√°n t·ª´ HumanEval

ƒêang sinh code cho 164 b√†i to√°n...


Processing: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [16:42<00:00,  6.11s/it]


‚úì ƒê√£ l∆∞u 164 samples v√†o ./humaneval_results.jsonl

ƒêang ƒë√°nh gi√° ƒë·ªô ch√≠nh x√°c...
Reading samples...


164it [00:00, 27991.61it/s]


Running test suites...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:10<00:00, 15.14it/s]


Writing results to ./humaneval_results.jsonl_results.jsonl...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 164/164 [00:00<00:00, 41169.85it/s]



K·∫æT QU·∫¢ ƒê√ÅNH GI√Å
{
  "pass@1": 0.8414634146341463
}
‚úì ƒê√£ l∆∞u k·∫øt qu·∫£ v√†o humaneval_metrics.json

KI·ªÇM TRA NHANH

B√†i to√°n: HumanEval/0

Prompt:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """


Code sinh ra:
from typing import List


def has_close_elements(numbers: List[float], threshold: float) -> bool:
    """ Check if in given list of numbers, are any two numbers closer to each other than
    given threshold.
    >>> has_close_elements([1.0, 2.0, 3.0], 0.5)
    False
    >>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
    True
    """
    for i in range(len(numbers)):
        for j in range(i + 1, len(numbers)):
            if abs(numbers[i] - numbers[j])