In [6]:
# !pip install --upgrade pip
# !pip install -q unsloth peft trl wandb evaluate jsonlines human-eval
# !pip install -q datasets transformers accelerate einops tiktoken tqdm
# !pip uninstall xformers -y
# !pip install xformers

In [7]:
from unsloth import FastLanguageModel
import torch
from datasets import Dataset, DatasetDict, load_dataset
import json
import trl
import peft
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
from unsloth import is_bfloat16_supported
import os
from transformers import EarlyStoppingCallback, TrainingArguments
import wandb
#from kaggle_secrets import UserSecretsClient
from accelerate import Accelerator
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
from tqdm import tqdm
import subprocess
import ast
import math
from typing import List, Dict
import jsonlines
import random
from pathlib import Path
from human_eval.data import read_problems


In [8]:
# JSONL: each line is one JSON sample.
# Only a "train" file is declared, so the "train" split holds every sample.
dataset = load_dataset("json", data_files={"train": "/content/drive/MyDrive/DeepLearning/dataset_training_leetcode.jsonl"})["train"]

# Filter out broken samples / samples with missing data
def is_valid(example):
    """Return True only when the sample has a non-empty problem statement and a non-empty solution.

    Args:
        example: mapping-like dataset row; missing keys are treated as empty.

    Returns:
        bool: False when either "problem_description" or "completion" is absent or falsy.
    """
    required_fields = ("problem_description", "completion")
    return all(bool(example.get(field)) for field in required_fields)

# Keep only rows that pass is_valid
dataset = dataset.filter(is_valid)

# Split into train / validation: 2% is held out, with a fixed seed so the split is repeatable
dataset = dataset.train_test_split(test_size=0.02, seed=42)
train_ds = dataset["train"]
val_ds   = dataset["test"]   # the "test" part of the split is used as the validation set

# Display the split sizes as the cell output
len(train_ds), len(val_ds)


(2811, 58)

In [9]:
max_seq_length = 3072       # enough for most problem statements + code
lora_rank      = 8         # 8 or 16 is reasonable
dtype          = None       # auto: bf16 on the L4
load_in_4bit   = True       # QLoRA 4-bit

# Load the base model and its tokenizer through Unsloth
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name       = "unsloth/Qwen2.5-Coder-3B-Instruct",  # base checkpoint that gets fine-tuned
    max_seq_length   = max_seq_length,
    dtype            = dtype,
    load_in_4bit     = load_in_4bit,
)


==((====))==  Unsloth 2025.11.2: Fast Qwen2 patching. Transformers: 4.57.1.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu128. CUDA: 8.9. CUDA Toolkit: 12.8. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.05G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/266 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [10]:
# Wrap the base model with LoRA adapters on the attention and MLP projection layers
model = FastLanguageModel.get_peft_model(
    model,
    r                        = lora_rank,             # 8 / 16
    target_modules           = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha               = lora_rank,             # alpha equals the rank here
    lora_dropout             = 0.0,
    bias                     = "none",
    use_gradient_checkpointing = "unsloth",           # saves VRAM + allows longer context
    random_state             = 3407,
    use_rslora               = False,
    loftq_config             = None,
)


Unsloth 2025.11.2 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [11]:
def build_messages(example):
    """Turn one dataset row into a system / user / assistant chat transcript.

    Args:
        example: mapping with "problem_description" and "completion" (both required)
            and an optional "starter_code"; all three are whitespace-stripped.

    Returns:
        list[dict]: three messages. The user message is the problem statement
        followed by an instruction paragraph; when starter code is present the
        instruction ends with that starter code. The assistant message is the
        reference solution.
    """
    statement = example["problem_description"].strip()
    signature = (example.get("starter_code") or "").strip()
    answer = example["completion"].strip()

    if signature:
        # A starter function exists: ask the model to complete it as-is.
        instruction = (
            "Here is the function signature you must complete. "
            "Do NOT change the function name, arguments, or return type. "
            "Do NOT add extra top-level code (no main, no input/output code).\n\n"
            + signature
        )
    else:
        # No starter function: ask for a single self-contained function.
        instruction = (
            "Write a single function that solves this problem. "
            "Do NOT write any code outside that function. "
            "Do NOT add any explanation text; only return the code."
        )

    system_prompt = (
        "You are a coding assistant. "
        "Given a problem description and optionally a starter function, "
        "you must output only the final solution code python. "
        "Return ONLY code, with no explanations, comments, or markdown formatting "
        "outside the function body. "
        "If a starter function is provided, you must complete that function "
        "without changing its name or signature."
    )

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": statement + "\n\n" + instruction},
        {"role": "assistant", "content": answer},
    ]

def format_example(example):
    """Render one dataset row as a single chat-formatted training string.

    Uses the notebook-level `tokenizer` and `build_messages`.

    Returns:
        dict: {"text": <chat-template rendering of the row's messages>}.
    """
    rendered = tokenizer.apply_chat_template(
        build_messages(example),
        tokenize=False,
        # The assistant turn is already part of the messages, so no
        # generation prompt is appended.
        add_generation_prompt=False,
    )
    return dict(text=rendered)

In [12]:
# Render every row into a single chat-formatted "text" column; the original columns are dropped
train_ds_formatted = train_ds.map(format_example, remove_columns=train_ds.column_names)
val_ds_formatted   = val_ds.map(format_example,   remove_columns=val_ds.column_names)

# Print one rendered sample as a sanity check of the chat-template output
print(train_ds_formatted[0]["text"])


Map:   0%|          | 0/2811 [00:00<?, ? examples/s]

Map:   0%|          | 0/58 [00:00<?, ? examples/s]

<|im_start|>system
You are a coding assistant. Given a problem description and optionally a starter function, you must output only the final solution code python. Return ONLY code, with no explanations, comments, or markdown formatting outside the function body. If a starter function is provided, you must complete that function without changing its name or signature.<|im_end|>
<|im_start|>user
An element x of an integer array arr of length m is dominant if more than half the elements of arr have a value of x.
You are given a 0-indexed integer array nums of length n with one dominant element.
You can split nums at an index i into two arrays nums[0, ..., i] and nums[i + 1, ..., n - 1], but the split is only valid if:

0 <= i < n - 1
nums[0, ..., i], and nums[i + 1, ..., n - 1] have the same dominant element.

Here, nums[i, ..., j] denotes the subarray of nums starting at index i and ending at index j, both ends being inclusive. Particularly, if j < i then nums[i, ..., j] denotes an empty

In [13]:
# Log in to Weights & Biases, then start a run (the training arguments report to "wandb")
wandb.login()
wandb.init()

  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33m23020356[0m ([33m23020356-vnu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [huggingface_hub.inference, openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


In [15]:
per_device_batch_size = 2   # NOTE(review): an earlier note suggested 4–8 fits on an L4 24GB with QLoRA; 2 is the value actually used
grad_accum_steps      = 8   # effective batch = 2 * 8 = 16

training_args = TrainingArguments(
    output_dir                       = "/content/drive/MyDrive/DeepLearning/qwen2.5-coder-3b-leetcode-lora",
    per_device_train_batch_size      = per_device_batch_size,
    per_device_eval_batch_size       = per_device_batch_size,
    gradient_accumulation_steps      = grad_accum_steps,

    # 1. Low learning rate, to avoid "forgetting" what the base model already knows
    learning_rate                    = 5e-6,

    # 2. Train for up to 3 epochs; early stopping (configured below) may end the run sooner
    num_train_epochs                 = 3.0,

    lr_scheduler_type                = "cosine",
    warmup_ratio                     = 0.1, # slightly higher warmup ratio
    logging_steps                    = 5,

    # 3. Evaluate regularly and keep the best model
    eval_strategy              = "steps", # NOTE(review): `eval_strategy` is the current argument name; older Transformers releases called it `evaluation_strategy`
    eval_steps                       = 50,      # evaluate more frequently
    save_strategy                    = "steps",   # keep save_strategy in sync with eval_strategy
    save_steps                       = 50,
    save_total_limit                 = 2,       # keep at most 2 checkpoints on disk
    load_best_model_at_end           = True,    # reload the checkpoint with the lowest eval_loss when training ends
    metric_for_best_model            = "eval_loss",

    # Use bf16 when the GPU supports it, otherwise fall back to fp16
    bf16                             = is_bfloat16_supported(),
    fp16                             = not is_bfloat16_supported(),
    optim                            = "adamw_bnb_8bit",
    weight_decay                     = 0.01,
    max_grad_norm                    = 1.0,
    report_to                        = "wandb",
)

# 4. Add an EarlyStoppingCallback to the trainer
early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3, # stop if eval_loss has not improved for 3 consecutive evaluations
)

trainer = SFTTrainer(
    model               = model,
    tokenizer           = tokenizer,
    train_dataset       = train_ds_formatted,
    eval_dataset        = val_ds_formatted,
    dataset_text_field  = "text",
    max_seq_length      = max_seq_length,
    packing             = False,
    args                = training_args,
    # Register the early-stopping callback here
    callbacks           = [early_stopping_callback],
)

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/2811 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/58 [00:00<?, ? examples/s]

In [None]:
# Run fine-tuning with the SFTTrainer built in the previous cell
trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2,811 | Num Epochs = 3 | Total steps = 528
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 8 x 1) = 16
 "-____-"     Trainable parameters = 14,966,784 of 3,100,905,472 (0.48% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss


In [None]:
# Save the LoRA adapter, and the tokenizer alongside it, to Google Drive
save_dir = "/content/drive/MyDrive/DeepLearning/qwen2.5-coder-3b-leetcode-lora_adapter"
model.save_pretrained(save_dir, save_adapter=True)
tokenizer.save_pretrained(save_dir)


In [None]:
# Reload the base model, then attach the saved LoRA adapter
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-Coder-3B-Instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

ft_model.load_adapter(save_dir)   # if using unsloth's newer API; otherwise load the adapter via PEFT as described in its docs
FastLanguageModel.for_inference(ft_model)  # optimize for inference


In [None]:
# Matches a fenced code block and captures its contents in group 1.
# The optional language tag is "python" plus any trailing digits (e.g. "python3",
# case-insensitive), so a version suffix is not captured as part of the code.
CODE_BLOCK_RE = re.compile(r"```(?:python\d*)?\s*([\s\S]*?)```", re.IGNORECASE)

def parse_markdown_problem(md_text: str):
    """
    Split a markdown problem file into (problem_description, starter_code).

    The first fenced code block is taken as the starter code and removed from
    the description. starter_code is "" when the text has no code block.

    Args:
        md_text: full markdown text of the problem.

    Returns:
        tuple[str, str]: (problem_description, starter_code), both stripped.
    """
    match = CODE_BLOCK_RE.search(md_text)
    if match is None:
        return md_text.strip(), ""

    starter_code = match.group(1).strip()
    # Cut out exactly the span that was matched (the first code block).
    problem_description = (md_text[:match.start()] + md_text[match.end():]).strip()
    return problem_description, starter_code

In [None]:
def solve_with_model_from_md(
    md_path: str,
    output_py_path: str,
    model,
    tokenizer,
    max_new_tokens: int = 1024,
):
    """Generate a solution for a markdown problem file and write it to a .py file.

    Args:
        md_path: path of the markdown file holding the problem (and optionally a
            fenced starter-code block).
        output_py_path: path of the .py file to write; missing parent
            directories are created.
        model: causal LM exposing `.device` and `.generate(...)`.
        tokenizer: tokenizer exposing `apply_chat_template`, `__call__`,
            `decode` and `eos_token_id`.
        max_new_tokens: generation budget.

    Returns:
        str: the generated code (contents of the first fenced block when the
        model answered with markdown fences, otherwise the raw completion).

    NOTE(review): the system prompt here is worded slightly differently from the
    one in `build_messages` used for training ("solution code python",
    "outside the function body") — confirm whether they should match.
    """
    # 1. Read the markdown file
    md_text = Path(md_path).read_text(encoding="utf-8")

    # 2. Split the statement from the starter code
    problem_description, starter_code = parse_markdown_problem(md_text)

    # 3. Build the user prompt (in English)
    user_parts = [problem_description.strip()]

    if starter_code:
        user_parts.append(
            "Here is the function signature you must complete. "
            "Do NOT change the function name, arguments, or return type. "
            "Do NOT add any extra top-level code (no main function, no input/output code).\n\n"
            + starter_code
        )
    else:
        user_parts.append(
            "Write a single function that solves this problem. "
            "Do NOT write any code outside that function. "
            "Return ONLY the code for that function, with no explanations."
        )

    user_content = "\n\n".join(user_parts)

    # 4. Build the chat messages (system + user, both in English)
    messages = [
        {
            "role": "system",
            "content": (
                "You are a coding assistant. "
                "Given a problem description and optionally a starter function, "
                "you must output only the final solution code. "
                "Return ONLY code, with no explanations, comments, or markdown formatting. "
                "If a starter function is provided, you must complete that function "
                "without changing its name or signature, and you must not add any other "
                "top-level code."
            ),
        },
        {
            "role": "user",
            "content": user_content,
        },
    ]

    # 5. Apply the model's chat template; the generation prompt opens the assistant turn
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
    prompt_token_count = inputs["input_ids"].shape[1]

    # 6. Generate greedily (do_sample=False); no temperature is passed because
    #    nothing is sampled.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens. Slicing the decoded text by
    # len(prompt) is wrong: the prompt string still contains the chat-template
    # special tokens, while text decoded with skip_special_tokens=True does not,
    # so that slice would cut off the beginning of the completion.
    completion = tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True,
    ).strip()

    # 7. If the model answered with ```python ... ```, keep only the code inside
    if "```" in completion:
        fenced = CODE_BLOCK_RE.search(completion)
        if fenced:
            completion = fenced.group(1).strip()

    # 8. Write the .py file, creating the target directory if needed
    output_path = Path(output_py_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(completion + "\n", encoding="utf-8")

    return completion  # handy for printing the result


In [None]:
# Solve one sample problem with the fine-tuned model and write the answer to Drive
solution = solve_with_model_from_md(
    md_path="/content/drive/MyDrive/DeepLearning/leetcode-31-next-permutation-en.md",
    output_py_path="/content/drive/MyDrive/DeepLearning/BuiDang/solutionX.py",
    model=ft_model,
    tokenizer=ft_tokenizer,
    max_new_tokens=1024,
)

# Show the generated code
print(solution)


In [None]:
"""
Simplified HumanEval Evaluation for Google Colab
Copy-paste friendly, all-in-one script
"""

# ============================================================================
# STEP 0: INSTALLATION (Run once)
# ============================================================================

def install_dependencies():
    """Pip-install the packages this evaluation script needs, using the running interpreter.

    Raises:
        subprocess.CalledProcessError: if a pip invocation exits with a non-zero status.
    """
    import subprocess
    import sys

    print("📦 Installing dependencies...")

    # Shared prefix of every pip command; the requirement name is appended per call.
    pip_install = [sys.executable, "-m", "pip", "install", "-q"]
    for requirement in ("human-eval", "datasets"):
        print(f"Installing {requirement}...")
        subprocess.check_call(pip_install + [requirement])

    print("✅ Dependencies installed!\n")

# Uncomment to install
# install_dependencies()


# ============================================================================
# STEP 1: CONFIGURATION
# ============================================================================

# Path to your LoRA adapter
# NOTE(review): the adapter-saving cell writes to
# ".../MyDrive/DeepLearning/qwen2.5-coder-3b-leetcode-lora_adapter", while this
# points under ".../MyDrive/BuiDang/" — confirm the adapter really lives here.
ADAPTER_PATH = "/content/drive/MyDrive/BuiDang/qwen2.5-coder-3b-leetcode-lora_adapter"

# Settings
MAX_SEQ_LENGTH = 2048
DTYPE = None
LOAD_IN_4BIT = True

# HumanEval settings
MAX_NEW_TOKENS = 512
# NOTE(review): generate_solution passes this together with do_sample=False,
# so it presumably has no effect on the (greedy) output — confirm.
TEMPERATURE = 0.1
NUM_PROBLEMS = None  # None = all 164 problems, or a smaller number for a quick test


# ============================================================================
# STEP 2: LOAD MODELS
# ============================================================================

def load_models():
    """Load the base model and the fine-tuned (base + LoRA adapter) model.

    Both models are loaded before returning, so both are held in memory at the
    same time. The adapter is read from the module-level ADAPTER_PATH: first
    via the model's own `load_adapter`, and, if that raises, via
    `peft.PeftModel.from_pretrained`.

    Returns:
        tuple: (base_model, ft_model, tokenizer); the tokenizer is the one
        returned when loading the base model.
    """
    from unsloth import FastLanguageModel
    import torch

    print("="*80)
    print("🤖 LOADING MODELS")
    print("="*80)

    # 1. Load BASE model
    print("\n1️⃣  Loading BASE model...")
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Qwen2.5-Coder-3B-Instruct",
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=DTYPE,
        load_in_4bit=LOAD_IN_4BIT,
    )
    FastLanguageModel.for_inference(base_model)
    print("✅ Base model loaded")

    # 2. Load FINE-TUNED model (a second copy of the base weights plus the adapter)
    print("\n2️⃣  Loading FINE-TUNED model...")
    ft_model, _ = FastLanguageModel.from_pretrained(
        model_name="unsloth/Qwen2.5-Coder-3B-Instruct",
        max_seq_length=MAX_SEQ_LENGTH,
        dtype=DTYPE,
        load_in_4bit=LOAD_IN_4BIT,
    )

    # Load adapter
    print(f"   Loading adapter from: {ADAPTER_PATH}")
    try:
        ft_model.load_adapter(ADAPTER_PATH)
        print("   ✅ Loaded via Unsloth")
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt/SystemExit
        # still propagate, and report why the first attempt failed before
        # falling back.
        print(f"   load_adapter failed ({exc!r}); retrying via PEFT")
        from peft import PeftModel
        ft_model = PeftModel.from_pretrained(ft_model, ADAPTER_PATH)
        print("   ✅ Loaded via PEFT")

    FastLanguageModel.for_inference(ft_model)
    print("✅ Fine-tuned model loaded")

    return base_model, ft_model, tokenizer


# ============================================================================
# STEP 3: LOAD HUMANEVAL DATASET
# ============================================================================

def load_humaneval():
    """Load the HumanEval test split, optionally truncated to the first NUM_PROBLEMS rows.

    Returns:
        The "test" split of "openai/openai_humaneval"; when the module-level
        NUM_PROBLEMS is truthy and smaller than the split, only the first
        NUM_PROBLEMS rows are kept.
    """
    from datasets import load_dataset

    rule = "=" * 80
    print("\n" + rule)
    print("📊 LOADING HUMANEVAL DATASET")
    print(rule)

    problems = load_dataset("openai/openai_humaneval", split="test")

    wants_subset = bool(NUM_PROBLEMS) and NUM_PROBLEMS < len(problems)
    if not wants_subset:
        print(f"✅ Loaded {len(problems)} problems (full HumanEval)")
        return problems

    problems = problems.select(range(NUM_PROBLEMS))
    print(f"✅ Loaded {NUM_PROBLEMS} problems (subset for testing)")
    return problems


# ============================================================================
# STEP 4: GENERATION
# ============================================================================

def generate_solution(model, tokenizer, problem):
    """Generate one solution for a HumanEval-style problem.

    The assistant turn is pre-filled with an opening code fence and the
    function prompt, so the model is expected to continue with the function body.

    Args:
        model: causal LM exposing `.device` and `.generate(...)`.
        tokenizer: tokenizer exposing `__call__`, `decode` and `eos_token_id`.
        problem: mapping with a "prompt" entry (signature + docstring).

    Returns:
        str: `problem['prompt']` followed by the generated body. The body keeps
        its leading indentation and is cut at the first closing code fence.
    """
    import torch

    # Format prompt
    prompt = f"""<|im_start|>system
You are an expert Python programmer. Complete the given function.<|im_end|>
<|im_start|>user
Complete this function:

```python
{problem['prompt']}```<|im_end|>
<|im_start|>assistant
```python
{problem['prompt']}"""

    # Tokenize
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_len = inputs.input_ids.shape[1]

    # Generate greedily.
    # NOTE(review): with do_sample=False the temperature value should have no
    # effect on the output — confirm against the transformers generation docs.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            temperature=TEMPERATURE,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated tokens
    generated = tokenizer.decode(outputs[0][input_len:], skip_special_tokens=True)

    # Stop at special tokens, in case any survived decoding as plain text
    for stop in ("<|im_end|>", "<|endoftext|>", "<|im_start|>"):
        generated = generated.split(stop)[0]

    # If the model re-opened a fence instead of continuing, drop that fence line
    unindented = generated.lstrip()
    if unindented.startswith("```"):
        generated = unindented.partition("\n")[2]

    # Keep only the code before the closing fence; anything after it
    # (explanations, examples) is not part of the function.
    generated = generated.split("```")[0]

    # Trim trailing whitespace only: the first generated line carries the
    # indentation of the function body, so left-stripping it would produce
    # invalid Python once appended to the prompt.
    generated = generated.rstrip()

    # Complete solution
    return problem['prompt'] + generated


def evaluate_model(model, tokenizer, dataset, model_name):
    """Generate one solution per problem in `dataset`.

    Args:
        model, tokenizer: forwarded to `generate_solution`.
        dataset: iterable of problems, each with a 'task_id'.
        model_name: label printed in the banner.

    Returns:
        list[dict]: one {'task_id', 'completion'} record per problem, in dataset order.
    """
    from tqdm import tqdm

    rule = "=" * 80
    print(f"\n{rule}")
    print(f"🧪 EVALUATING: {model_name}")
    print(f"{rule}\n")

    def _record(problem):
        # Generate first, then pair the result with the problem's id.
        solution = generate_solution(model, tokenizer, problem)
        return {'task_id': problem['task_id'], 'completion': solution}

    return [_record(problem) for problem in tqdm(dataset, desc="Generating")]


# ============================================================================
# STEP 5: TESTING
# ============================================================================

def test_solutions(dataset, generations):
    """Test generated solutions"""
    from human_eval.data import read_problems

    print("\n🔍 Running test cases...")

    problems = read_problems()

    passed = 0
    failed = 0
    errors = 0
    details = []

    for gen in generations:
        task_id = gen['task_id']
        completion = gen['completion']
        problem = problems[task_id]

        # Create test code
        test_code = completion + "\n" + problem['test']

        # Run test
        try:
            exec_globals = {}
            exec(test_code, exec_globals)
            passed += 1
            status = "✅ PASS"
        except AssertionError as e:
            failed += 1
            status = f"❌ FAIL: {str(e)[:50]}"
        except Exception as e:
            errors += 1
            status = f"⚠️ ERROR: {str(e)[:50]}"

        details.append({
            'task_id': task_id,
            'status': status,
        })

    total = len(generations)
    pass_rate = (passed / total) * 100

    print(f"\n{'='*80}")
    print(f"Results:")
    print(f"  ✅ Passed: {passed}/{total}")
    print(f"  ❌ Failed: {failed}/{total}")
    print(f"  ⚠️  Errors: {errors}/{total}")
    print(f"  📊 Pass@1: {pass_rate:.2f}%")
    print(f"{'='*80}")

    return {
        'passed': passed,
        'failed': failed,
        'errors': errors,
        'total': total,
        'pass@1': pass_rate,
        'details': details,
    }


# ============================================================================
# STEP 6: COMPARISON
# ============================================================================

def compare_results(base_results, ft_results):
    """Print a side-by-side comparison of base vs fine-tuned results.

    Args:
        base_results, ft_results: result dicts with 'pass@1', 'passed' and 'total'.

    Returns:
        The pass@1 difference (fine-tuned minus base), in percentage points.
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    print("\n" + heavy_rule)
    print("📊 COMPARISON")
    print(heavy_rule)

    base_pass, ft_pass = base_results['pass@1'], ft_results['pass@1']
    improvement = ft_pass - base_pass

    print(f"\n{'Model':<30} {'Pass@1':<15} {'Passed':<15}")
    print(light_rule)
    rows = (
        ("Base Model", base_pass, base_results),
        ("Fine-tuned Model", ft_pass, ft_results),
    )
    for label, score, result in rows:
        print(f"{label:<30} {score:<15.2f} {result['passed']}/{result['total']}")
    print(light_rule)
    print(f"{'Improvement':<30} {improvement:<15.2f}")
    print(heavy_rule)

    # Verdict: more than +2 points is a gain, within ±2 is "maintained", below -2 is a drop.
    print("\n🎯 VERDICT:")
    if improvement > 2:
        verdict_lines = [f"✅ Excellent! Fine-tuning improved by {improvement:.2f}%"]
    elif improvement >= -2:
        verdict_lines = [
            f"✅ Good! Performance maintained (change: {improvement:+.2f}%)",
            "   No catastrophic forgetting detected!",
        ]
    else:
        verdict_lines = [
            f"⚠️  Warning! Performance dropped by {abs(improvement):.2f}%",
            "   Possible catastrophic forgetting!",
        ]
    for line in verdict_lines:
        print(line)

    return improvement


# ============================================================================
# STEP 7: SHOW EXAMPLES
# ============================================================================

def show_examples(dataset, base_gens, ft_gens, num=3):
    """Print the first few problems with both models' generations side by side.

    Args:
        dataset: indexable collection of problems with 'task_id' and 'prompt'.
        base_gens, ft_gens: lists of {'task_id', 'completion'} records.
        num: maximum number of problems to show.

    Prompts are cut to 150 characters and solutions longer than 200 characters
    are cut to 200 and suffixed with "...".
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    def _completion_for(generations, wanted_id):
        # First record whose task id matches.
        return next(entry['completion'] for entry in generations if entry['task_id'] == wanted_id)

    def _preview(code):
        return code if len(code) <= 200 else code[:200] + "..."

    print("\n" + heavy_rule)
    print("📋 SAMPLE GENERATIONS")
    print(heavy_rule)

    shown = min(num, len(dataset))
    for position in range(shown):
        problem = dataset[position]
        task_id = problem['task_id']

        base_sol = _completion_for(base_gens, task_id)
        ft_sol = _completion_for(ft_gens, task_id)

        print(f"\n{heavy_rule}")
        print(f"Problem {position + 1}: {task_id}")
        print(heavy_rule)
        print(f"\nPrompt:\n{problem['prompt'][:150]}...")

        for heading, code in (("Base Model:", base_sol), ("Fine-tuned Model:", ft_sol)):
            print(f"\n{light_rule}")
            print(heading)
            print(light_rule)
            print(_preview(code))


# ============================================================================
# MAIN FUNCTION - RUN EVERYTHING
# ============================================================================

def run_evaluation():
    """Main evaluation pipeline: generate, test and compare base vs fine-tuned model.

    Steps: load both models and the dataset, evaluate the base model, evaluate
    the fine-tuned model, print the comparison and two sample generations, then
    write everything to "humaneval_results.json" in the working directory.

    Returns:
        tuple: (base_results, ft_results) as produced by `test_solutions`.
    """
    import json
    import torch

    rule = "=" * 80

    def _banner(title, blank_line_before=True):
        # Three-line section header, optionally preceded by an empty line.
        print(("\n" if blank_line_before else "") + rule)
        print(title)
        print(rule)

    _banner("🚀 HUMANEVAL EVALUATION", blank_line_before=False)

    # 1. Load everything
    base_model, ft_model, tokenizer = load_models()
    dataset = load_humaneval()

    # 2. Evaluate the base model, then drop the local reference and release cached GPU memory
    _banner("PHASE 1: Evaluating BASE model")
    base_generations = evaluate_model(base_model, tokenizer, dataset, "Base Model")
    base_results = test_solutions(dataset, base_generations)
    del base_model
    torch.cuda.empty_cache()

    # 3. Same for the fine-tuned model
    _banner("PHASE 2: Evaluating FINE-TUNED model")
    ft_generations = evaluate_model(ft_model, tokenizer, dataset, "Fine-tuned Model")
    ft_results = test_solutions(dataset, ft_generations)
    del ft_model
    torch.cuda.empty_cache()

    # 4. Compare
    improvement = compare_results(base_results, ft_results)

    # 5. Show examples
    show_examples(dataset, base_generations, ft_generations, num=2)

    # 6. Save results
    output_file = "humaneval_results.json"
    report = {
        'base_model': base_results,
        'finetuned_model': ft_results,
        'improvement': improvement,
    }
    with open(output_file, 'w') as f:
        json.dump(report, f, indent=2)

    print(f"\n\n💾 Results saved to: {output_file}")

    _banner("✅ EVALUATION COMPLETE!")

    return base_results, ft_results


# ============================================================================
# QUICK TEST FUNCTION
# ============================================================================

def quick_test(num_problems=5):
    """Run the evaluation on a small subset of problems.

    Temporarily sets the module-level NUM_PROBLEMS to `num_problems` for the
    duration of the run and restores the previous value afterwards, also when
    the run raises. A later `run_evaluation()` is therefore not silently
    limited to the quick-test subset.

    Args:
        num_problems: number of problems to evaluate.

    Returns:
        tuple: whatever `run_evaluation()` returns, i.e. (base_results, ft_results).
    """
    global NUM_PROBLEMS
    previous_limit = NUM_PROBLEMS
    NUM_PROBLEMS = num_problems

    print(f"⚡ QUICK TEST: Evaluating on {num_problems} problems only\n")

    try:
        return run_evaluation()
    finally:
        NUM_PROBLEMS = previous_limit


# ============================================================================
# HOW TO USE
# ============================================================================

"""
🚀 USAGE IN COLAB:

# 1. Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 2. Set adapter path
ADAPTER_PATH = "/content/drive/MyDrive/BuiDang/qwen2.5-coder-3b-leetcode-lora_adapter"

# 3. Run evaluation!

# Option A: Full evaluation (164 problems, ~30-60 minutes)
base_results, ft_results = run_evaluation()

# Option B: Quick test (5 problems, ~2-3 minutes)
base_results, ft_results = quick_test(5)

# Option C: Custom number of problems
NUM_PROBLEMS = 20  # Test on 20 problems
base_results, ft_results = run_evaluation()


📊 EXPECTED RESULTS:

Good Fine-tuning:
  Base Model:        40-45% pass@1
  Fine-tuned Model:  42-50% pass@1
  Improvement:       +2 to +5%

Excellent Fine-tuning:
  Base Model:        40-45% pass@1
  Fine-tuned Model:  50-60% pass@1
  Improvement:       +10 to +15%

Catastrophic Forgetting:
  Base Model:        40-45% pass@1
  Fine-tuned Model:  30-38% pass@1
  Improvement:       -5 to -10% ⚠️


💡 TIPS:
- Start with quick_test(5) để verify setup
- Sau đó chạy full evaluation
- Kết quả lưu trong humaneval_results.json
"""

# ============================================================================
# RUN THIS IN COLAB
# ============================================================================

# Entry point: as written, this runs the 5-problem quick test whenever the
# cell/script executes with __name__ == "__main__".
if __name__ == "__main__":
    # Uncomment one of these:

    # Full evaluation
    # base_results, ft_results = run_evaluation()

    # Quick test
    base_results, ft_results = quick_test(5)