# Fine-tune Qwen 2.5 with LoRA for Tour Assistant

This notebook will guide you through:
1. Install dependencies
2. Upload dataset
3. Fine-tune model with LoRA
4. Merge and convert to GGUF

**Requirements:** a GPU runtime with a T4 or better (the free Colab tier is sufficient).

## 1. Install Dependencies

In [None]:
!pip install -q torch transformers datasets peft bitsandbytes accelerate sentencepiece
!pip install -q huggingface_hub

## 2. Configuration & Import

In [None]:
import torch
import os
from peft import PeftModel, LoraConfig, get_peft_model
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
)

# Check GPU: the notebook requires a GPU runtime, so stop here with an
# actionable message instead of failing later with an opaque CUDA error.
if not torch.cuda.is_available():
    raise RuntimeError(
        "No CUDA GPU detected. In Colab select Runtime > Change runtime type > "
        "GPU (T4 or better), then re-run the notebook."
    )

gpu_properties = torch.cuda.get_device_properties(0)
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"VRAM: {gpu_properties.total_memory / 1e9:.1f} GB")

In [None]:
# ===============================
# CONFIG - MODIFY HERE
# ===============================

# Base checkpoint that gets fine-tuned.
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

# Training hyperparameters
MAX_LENGTH = 512    # truncation length (in tokens) used when tokenizing
EPOCHS = 3
LR = 2e-4
BATCH_SIZE = 2      # Increase to 4 if you have a better GPU
GRAD_ACCUM = 4      # gradient accumulation steps

# Paths (in Colab) - everything lives under the same root directory.
_CONTENT_ROOT = "/content"
_DATASET_DIR = f"{_CONTENT_ROOT}/dataset"

TRAIN_FILE = f"{_DATASET_DIR}/training.jsonl"
TEST_FILE = f"{_DATASET_DIR}/test.jsonl"
OUTPUT_DIR = f"{_CONTENT_ROOT}/lora-adapter"      # LoRA adapter + tokenizer
MERGED_DIR = f"{_CONTENT_ROOT}/qwen_merged"       # merged full model
GGUF_FILE = f"{_CONTENT_ROOT}/qwen2.5-1.5b-tour-assistant-q4.gguf"  # final quantized file

## 3. Upload Dataset

Create the dataset folder and upload `training.jsonl` and `test.jsonl` into it.

Each line of a JSONL file must be a single JSON object in this format:
```json
{"input": "User question", "output": "Assistant response"}
```

In [None]:
# Create dataset folder
!mkdir -p /content/dataset

# Upload files manually or mount Google Drive
from google.colab import files
print("Upload training.jsonl:")
uploaded = files.upload()
for fn in uploaded.keys():
    !mv "{fn}" /content/dataset/

print("\nUpload test.jsonl:")
uploaded = files.upload()
for fn in uploaded.keys():
    !mv "{fn}" /content/dataset/

In [None]:
# OR: Mount Google Drive if dataset already exists there
# Uncomment the four lines below to use this option instead of uploading.
# The two assignments replace the TRAIN_FILE / TEST_FILE values from the config cell.
# from google.colab import drive
# drive.mount('/content/drive')
# TRAIN_FILE = "/content/drive/MyDrive/dataset/training.jsonl"
# TEST_FILE = "/content/drive/MyDrive/dataset/test.jsonl"

In [None]:
# Verify dataset
!echo "=== Training samples ==="
!head -2 {TRAIN_FILE}
!echo "\n=== Test samples ==="
!head -2 {TEST_FILE}
!echo "\n=== Counts ==="
!wc -l {TRAIN_FILE} {TEST_FILE}

## 4. Load Tokenizer & Model

In [None]:
# Tokenizer: right-side padding, with the EOS token reused as the padding token.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    padding_side="right",
)
tokenizer.pad_token = tokenizer.eos_token

print(f"Vocab size: {tokenizer.vocab_size}")

In [None]:
# Quantization settings: 4-bit NF4 weights, double quantization, fp16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base model with the quantization config applied at load time.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

# Switch off the KV-cache flag on the model config before training.
model.config.use_cache = False

print(f"Model loaded on: {model.device}")

## 5. LoRA Configuration

In [None]:
# Names of the linear projection layers that receive LoRA adapters.
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_target_modules += ["gate_proj", "up_proj", "down_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the loaded model with the adapters and report the trainable parameters.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

## 6. Prepare Dataset

In [None]:
# Map each split name to its JSONL file, then load both splits in one call.
split_files = {"train": TRAIN_FILE, "test": TEST_FILE}
dataset = load_dataset("json", data_files=split_files)
print(dataset)

In [None]:
def preprocess(example):
    """Tokenize one record and build labels in which the prompt is masked.

    The record's "input" and "output" fields are placed into a three-turn
    conversation (fixed system prompt, user, assistant), rendered with the
    tokenizer's chat template and tokenized with truncation to MAX_LENGTH.

    Returns:
        The tokenizer output with an extra "labels" entry: a copy of the input
        ids whose leading prompt tokens (system + user + generation prompt)
        are replaced by -100 so that they are ignored in the loss.
    """
    def encode(text):
        # Same tokenizer settings for the full conversation and the prompt,
        # so both are truncated identically.
        return tokenizer(
            text,
            truncation=True,
            max_length=MAX_LENGTH,
            padding=False,
            add_special_tokens=False,
        )

    conversation = [
        {"role": "system", "content": "Ban la tro ly ho tro khach hang. Hay tra loi ngan gon va chinh xac."},
        {"role": "user", "content": example["input"]},
        {"role": "assistant", "content": example["output"]},
    ]

    # Full conversation, including the assistant answer.
    rendered_full = tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=False
    )
    tokenized = encode(rendered_full)
    input_ids = tokenized["input_ids"]

    # Prompt only: every turn except the answer, plus the generation prompt.
    rendered_prompt = tokenizer.apply_chat_template(
        conversation[:-1], tokenize=False, add_generation_prompt=True
    )
    prompt_len = len(encode(rendered_prompt)["input_ids"])

    # Mask the first prompt_len positions (capped at the sequence length) with
    # -100 and keep the remaining token ids as training targets.
    n_masked = min(prompt_len, len(input_ids))
    tokenized["labels"] = [-100] * n_masked + list(input_ids[n_masked:])
    return tokenized

tokenized_dataset = dataset.map(preprocess, remove_columns=dataset["train"].column_names)


def _has_supervised_tokens(example):
    """Return True when at least one label is a real target (not -100)."""
    return any(label != -100 for label in example["labels"])


# When the prompt alone fills MAX_LENGTH, truncation removes the whole answer
# and every label is -100. Such an example has nothing to learn from, so it is
# dropped here and the number of dropped rows is reported per split.
samples_before = {split: len(rows) for split, rows in tokenized_dataset.items()}
tokenized_dataset = tokenized_dataset.filter(_has_supervised_tokens)
for split, n_before in samples_before.items():
    n_dropped = n_before - len(tokenized_dataset[split])
    if n_dropped:
        print(f"Dropped {n_dropped} fully-masked sample(s) from '{split}' (prompt >= MAX_LENGTH tokens)")

print(f"Train: {len(tokenized_dataset['train'])} samples")
print(f"Test: {len(tokenized_dataset['test'])} samples")

## 7. Training

In [None]:
# Dynamic per-batch padding, with lengths rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer, padding=True, pad_to_multiple_of=8, return_tensors="pt"
)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    report_to="none",
    # Optimisation
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    fp16=True,
    # Logging, evaluation and checkpointing (evaluate and save once per epoch)
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

In [None]:
# START TRAINING
banner = "=" * 50
print(banner, "Starting training...", banner, sep="\n")

trainer.train()

In [None]:
# Save LoRA adapter (and the tokenizer) side by side in OUTPUT_DIR
print("Saving LoRA adapter...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(OUTPUT_DIR)
del artifact  # do not keep an extra module-level reference around
print(f"Saved at: {OUTPUT_DIR}")

## 8. Test Model After Training

In [None]:
# Test the model
def generate_response(prompt):
    """Generate the assistant's reply to a single user message.

    The message is wrapped in the same system prompt used for training,
    rendered with the chat template, and completed by the fine-tuned model.

    Args:
        prompt: The user's question as plain text.

    Returns:
        The decoded newly generated tokens only (the prompt is stripped and
        special tokens are skipped). Sampling is enabled, so repeated calls
        can return different text.
    """
    messages = [
        {"role": "system", "content": "Ban la tro ly ho tro khach hang. Hay tra loi ngan gon va chinh xac."},
        {"role": "user", "content": prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    inputs = tokenizer([text], return_tensors="pt").to(model.device)
    prompt_len = inputs.input_ids.shape[1]

    # Do not rely on whatever mode training left the model in: in train mode
    # the LoRA dropout would still be applied while generating.
    model.eval()

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            # model.config.use_cache was set to False when the model was
            # loaded; ask for the KV cache explicitly so decoding does not
            # recompute the whole sequence at every step.
            use_cache=True,
        )

    # Keep only the tokens produced after the prompt.
    generated_ids = outputs[0][prompt_len:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True)

# Test: run a few sample questions through the fine-tuned model
test_questions = [
    "Tour Da Lat co gia bao nhieu?",
    "Toi muon huy dat tour",
    "Co tour nao di vao cuoi tuan khong?",
]

divider = "-" * 50
for question in test_questions:
    print(f"\nUser: {question}")
    answer = generate_response(question)
    print(f"Bot: {answer}")
    print(divider)

## 9. Merge LoRA into Base Model

In [None]:
# Cleanup memory
import gc

# Drop the notebook-level references. pop() instead of `del` keeps this cell
# safe to re-run after the names are already gone.
for _name in ("model", "trainer"):
    globals().pop(_name, None)

# Removing the names is not enough when the objects are part of a reference
# cycle: run the garbage collector so they are really destroyed. Only then can
# empty_cache() hand the now-unused cached GPU memory back to the driver.
gc.collect()
torch.cuda.empty_cache()

print("VRAM released")

In [None]:
# Load base model (FP16, on CPU to save VRAM)
print("Loading base model...")
base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="cpu",
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Merge LoRA: attach the saved adapter to the base model, then fold it into
# the base weights and drop the adapter wrappers.
print("Merging LoRA adapter...")
merged_model = PeftModel.from_pretrained(base_model, OUTPUT_DIR).merge_and_unload()

# Save the merged weights together with the tokenizer
print(f"Saving merged model to {MERGED_DIR}...")
os.makedirs(MERGED_DIR, exist_ok=True)
merged_model.save_pretrained(MERGED_DIR)
tokenizer.save_pretrained(MERGED_DIR)

print(f"Merged model saved at: {MERGED_DIR}")

In [None]:
# Cleanup: drop both model references, then clear PyTorch's CUDA cache
del base_model, merged_model
torch.cuda.empty_cache()

## 10. Convert to GGUF

In [None]:
# Clone and build llama.cpp
!git clone https://github.com/ggerganov/llama.cpp /content/llama.cpp
!cd /content/llama.cpp && make -j4

In [None]:
# Install dependencies for convert script
!pip install -q gguf

In [None]:
# Convert HF -> GGUF (FP16)
!python /content/llama.cpp/convert_hf_to_gguf.py {MERGED_DIR} \
    --outfile /content/temp.gguf \
    --outtype f16

print("Converted to GGUF FP16")

In [None]:
# Quantize to 4-bit (Q4_K_M - balance between quality and size)
!/content/llama.cpp/llama-quantize /content/temp.gguf {GGUF_FILE} q4_k_m

# Remove temp file
!rm /content/temp.gguf

print(f"Quantized to 4-bit: {GGUF_FILE}")
!ls -lh {GGUF_FILE}

## 11. Download GGUF File

In [None]:
# Option 1: Download directly
# Passes the path of the quantized GGUF file (GGUF_FILE) to Colab's download helper.
from google.colab import files
files.download(GGUF_FILE)

In [None]:
# Option 2: Copy to Google Drive
# Mounts Drive at /content/drive, then copies the GGUF file into the top level of MyDrive.
from google.colab import drive
drive.mount('/content/drive')

!cp {GGUF_FILE} /content/drive/MyDrive/
# NOTE(review): the message below is printed even if the cp command above fails.
print("Copied to Google Drive!")

## Done!

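You now have a GGUF file that can be used with `llama-cpp-python`. The model was fine-tuned on ChatML-formatted conversations that always start with the system message from the preprocessing step, so for best results build inference prompts in the same format and include that system message.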

### Usage in Python:
```python
from llama_cpp import Llama

llm = Llama(model_path="qwen2.5-1.5b-tour-assistant-q4.gguf", n_ctx=2048)

output = llm(
    "<|im_start|>user\nTour Da Lat gia bao nhieu?<|im_end|>\n<|im_start|>assistant\n",
    max_tokens=256,
    stop=["<|im_end|>"],
)
print(output["choices"][0]["text"])
```