In [None]:
from datasets import load_dataset_builder, load_dataset
from transformers import AutoTokenizer
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import DatasetDict
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)
from transformers import DataCollatorForLanguageModeling
from transformers import TrainingArguments

# Base checkpoint to fine-tune, and the directory that receives the
# fine-tuned model and tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
output_dir = "./qwen-vietnamese-poetry-lora"

# Vietnamese poetry corpus, fetched from the Hugging Face Hub
dataset = load_dataset("truongpdd/vietnamese_poetry")

# Tokenizer that belongs to the base checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def create_conversation(example, prompt_words=50):
    """
    Convert a raw poetry example into a structured conversation for instruction tuning.

    The poem is cut into a prompt (shown to the model in the user turn) and a
    completion (the assistant turn the model learns to produce). Normally the
    first `prompt_words` words form the prompt. When the poem has
    `prompt_words` words or fewer, that cut would leave the assistant turn
    empty, so the poem is cut at its midpoint instead.

    Args:
        example (dict): A dictionary with a 'text' field containing the poem.
        prompt_words (int): Number of leading words used as the prompt.
            Defaults to 50.

    Returns:
        list: A list of message dicts representing a conversation with system,
            user, and assistant roles.
    """
    # NOTE(review): split()/join collapses every run of whitespace, line breaks
    # included, into a single space. If verse line structure should survive in
    # the training text, this needs a line-aware split instead.
    words = example['text'].split()

    # Short poems: fall back to the midpoint so the completion is not empty.
    if len(words) > prompt_words:
        split_at = prompt_words
    else:
        split_at = len(words) // 2

    prompt = " ".join(words[:split_at])
    completion = " ".join(words[split_at:])

    # Construct the conversation with system, user, and assistant messages
    conversation = [
        {
            "role": "system",
            "content": (
                "Bạn là một chuyên gia về văn học Việt Nam. "
                "Hãy trả lời các câu hỏi về tác phẩm và tác giả một cách chính xác và sâu sắc."
            )
        },
        {
            "role": "user",
            "content": f"Hãy viết tiếp đoạn văn sau:\n{prompt}"
        },
        {
            "role": "assistant",
            "content": completion
        }
    ]
    return conversation


Generating train split: 100%|██████████| 8530/8530 [00:00<00:00, 31107.39 examples/s]
Generating validation split: 100%|██████████| 1066/1066 [00:00<00:00, 144785.73 examples/s]
Generating test split: 100%|██████████| 1066/1066 [00:00<00:00, 111238.69 examples/s]


In [None]:
def apply_template(example):
    """
    Render one dataset example as a chat-formatted training string.

    The example is first turned into a list of chat messages by
    create_conversation, then serialised with the tokenizer's chat template.

    Args:
        example (dict): A dataset example.

    Returns:
        dict: The same example, now carrying a 'formatted_text' field.
    """
    messages = create_conversation(example)
    # tokenize=False asks for the templated string rather than token ids
    rendered = tokenizer.apply_chat_template(messages, tokenize=False)
    example['formatted_text'] = rendered
    return example

# Format the entire dataset using the chat template
formatted_dataset = dataset.map(apply_template)

# Fixed seed so the subsample and the train/eval partition are reproducible
split_seed = 42

# Keep a 10% subsample of the training split for fine-tuning
tmp_dataset = formatted_dataset['train'].train_test_split(
    test_size=0.1, seed=split_seed
)["test"]

# Split the subsample ONCE and take both halves from the same result.
# Calling train_test_split twice without a seed shuffles independently each
# time, so the "train" and "test" parts could share examples (eval leakage).
train_eval_split = tmp_dataset.train_test_split(test_size=0.1, seed=split_seed)
train_dataset = train_eval_split["train"]
eval_dataset = train_eval_split["test"]
train_dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 8530
})

In [None]:
# Reload the tokenizer and make sure it can pad: fall back to the EOS token
# when no pad token is defined, and pad on the right
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Use a single dtype everywhere. The model is loaded as bfloat16 and the
# training arguments enable bf16, so the 4-bit matmuls should also compute in
# bfloat16 rather than float16.
compute_dtype = torch.bfloat16

# Configure 4-bit NF4 quantization (with double quantization) for
# memory-efficient training
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model; device_map="auto" lets the library decide
# where the weights are placed.
# NOTE(review): trust_remote_code=True permits executing code shipped with the
# checkpoint; confirm it is actually required for this model.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=compute_dtype
)

# The generation KV cache is not needed while training; turn it off.
# (Presumably also required because prepare_model_for_kbit_training enables
# gradient checkpointing by default - confirm against the PEFT docs.)
model.config.use_cache = False

# Prepare the model for k-bit (quantized) training
model = prepare_model_for_kbit_training(model)

In [None]:
# Projection layers that receive LoRA adapters (module names as used by Qwen2.5)
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# LoRA settings for parameter-efficient fine-tuning of a causal LM
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=4,  # adapter rank
    lora_alpha=4,  # adapter scaling factor
    lora_dropout=0.1,  # dropout applied inside the LoRA layers
    bias="none",  # bias terms are not adapted
    target_modules=attention_projections + mlp_projections,
)

# Wrap the model with the adapters and report how many parameters will train
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
# Hyperparameters and bookkeeping for the fine-tuning run
training_arguments = TrainingArguments(
    # --- run identity and output locations ---
    output_dir=output_dir,
    logging_dir="./logs",
    run_name="qwen-vietnamese-poetry-finetune",
    report_to="none",
    # --- batching: 4 samples per device x 4 accumulation steps per update ---
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    group_by_length=True,
    dataloader_pin_memory=False,
    # NOTE(review): keeps the raw string columns in the dataset; confirm the
    # collator below copes with them in the installed TRL/transformers version
    remove_unused_columns=False,
    # --- optimisation ---
    optim="adamw_bnb_8bit",
    learning_rate=2e-4,
    warmup_steps=10,
    num_train_epochs=10,
    max_steps=-1,  # no step cap; run length is set by num_train_epochs
    # --- precision: bfloat16 mixed precision, fp16 off ---
    fp16=False,
    bf16=True,
    # --- step-based cadence for evaluation, logging and checkpoints ---
    eval_strategy="steps",
    eval_steps=50,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
)

# Collator for causal language modeling (mlm=False). Batches are padded to a
# multiple of 8; the padding side is whatever the tokenizer is configured with
# (set to "right" where the tokenizer is prepared).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8
)


In [None]:
def formatting_func(example):
    """
    Return the chat-formatted string stored on a dataset example.

    Args:
        example (dict): A dataset example that has a 'formatted_text' entry.

    Returns:
        str: The value of 'formatted_text', used as the training text.
    """
    formatted_text = example['formatted_text']
    return formatted_text

# Build the supervised fine-tuning trainer.
# No peft_config is passed because `model` has already been wrapped with the
# LoRA adapters via get_peft_model.
trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    formatting_func=formatting_func,
    data_collator=data_collator,
    packing=False,  # packing disabled
)


In [None]:
# Run the fine-tuning loop configured on `trainer`
print("Starting training...")
trainer.train()

# Persist the results. save_model() is called without a path (presumably it
# falls back to the trainer's configured output directory - confirm); the
# tokenizer is written explicitly to output_dir.
trainer.save_model()
tokenizer.save_pretrained(output_dir)

print(f"Training completed! Model saved to {output_dir}")