# Train

In [1]:
# CELL 1
!pip install -q torch transformers datasets accelerate peft bitsandbytes sentencepiece sacrebleu rouge_score matplotlib tqdm
!pip install -q trl==0.8.6 accelerate --no-deps
!pip install -q sentence-transformers
!pip install -q --upgrade bitsandbytes transformers accelerate peft
import torch
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m29.4 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.2/245.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m103.5 MB/s[0m eta [36m0:00:00[0m00:01[0m:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.9/380.9 kB[0m [31m23.0 MB/s[0

In [2]:
# CELL 2
from datasets import load_dataset, DatasetDict, Dataset
import os

def load_local_text(file_path):
    """Read a text file and return its non-blank lines without surrounding whitespace.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with every non-empty line of the file, stripped, in file order.
        If the file does not exist, a warning is printed and an empty list is
        returned.
    """
    if not os.path.exists(file_path):
        print(f"Warning: File {file_path} not found.")
        return []
    # "utf-8-sig" reads plain UTF-8 as well, but additionally drops a leading
    # byte-order mark. str.strip() does not remove U+FEFF, so with plain "utf-8"
    # a BOM would stay attached to the first sentence of the file.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]

# Load the raw parallel corpora (one sentence per line, blank lines dropped).
train_vi_lines = load_local_text('/kaggle/input/vlspdata/train.vi.txt')
train_en_lines = load_local_text('/kaggle/input/vlspdata/train.en.txt')
test_vi_lines = load_local_text('/kaggle/input/vlspdata/public_test.vi (1).txt')
test_en_lines = load_local_text('/kaggle/input/vlspdata/public_test.en.txt')


def _common_length(en_lines, vi_lines, split_name):
    """Return the number of pairs both sides can supply, warning on a mismatch.

    The two lists are paired by position, so a difference in length means some
    sentences have no counterpart and the remaining pairs may be shifted against
    each other. The truncation itself is kept, but it is no longer silent.
    """
    common = min(len(en_lines), len(vi_lines))
    if len(en_lines) != len(vi_lines):
        print(
            f"Warning: {split_name} has {len(en_lines):,} en lines but "
            f"{len(vi_lines):,} vi lines; keeping the first {common:,}. "
            "The pairs may be misaligned."
        )
    return common


min_len_train = _common_length(train_en_lines, train_vi_lines, "train")
if min_len_train == 0:
    # Fail here with a clear message instead of letting an empty dataset flow
    # into the train/validation split below.
    raise ValueError("No training pairs were loaded; check the train file paths.")
train_vi_lines = train_vi_lines[:min_len_train]
train_en_lines = train_en_lines[:min_len_train]

# Hold out 5% of the training pairs for validation (fixed seed for a stable split).
full_train_dataset = Dataset.from_dict({"en": train_en_lines, "vi": train_vi_lines})
split_dataset = full_train_dataset.train_test_split(test_size=0.05, seed=42)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

min_len_test = _common_length(test_en_lines, test_vi_lines, "test")
test_dataset = Dataset.from_dict({"en": test_en_lines[:min_len_test], "vi": test_vi_lines[:min_len_test]})

dataset = DatasetDict({
    "train": train_dataset,
    "validation": val_dataset,
    "test": test_dataset
})

print("\nDataset Structure:")
print(dataset)
print(f"Train: {len(dataset['train']):,} pairs")


Dataset Structure:
DatasetDict({
    train: Dataset({
        features: ['en', 'vi'],
        num_rows: 475000
    })
    validation: Dataset({
        features: ['en', 'vi'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['en', 'vi'],
        num_rows: 3000
    })
})
Train: 475,000 pairs


In [3]:
# CELL 3
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

model_name = "Qwen/Qwen2.5-1.5B"

# --- Tokenizer -------------------------------------------------------------
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    # No dedicated padding token: reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

# --- 4-bit quantized base model ---------------------------------------------
# float16 is used both as the load dtype and as the 4-bit compute dtype.
half_precision = torch.float16

# NF4 4-bit weights with double quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=half_precision,
)

# Load the quantized model, then prepare it for k-bit training.
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        trust_remote_code=True,
        torch_dtype=half_precision,
        device_map="auto",
        quantization_config=quant_config,
    )
)

# --- LoRA adapters ------------------------------------------------------------
# Optimized LoRA configuration: adapters on the attention projections and on
# the MLP projections.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,  # rank raised to 32 to learn medical terminology better
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

2025-12-22 10:37:53.768946: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766399874.162388      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766399874.272379      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766399875.267842      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766399875.267875      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766399875.267878      55 computation_placer.cc:177] computation placer alr

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/684 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

trainable params: 36,929,536 || all params: 1,580,643,840 || trainable%: 2.3364


In [5]:
# CELL 4 - OPTIMIZED FOR SPEED
from transformers import TrainingArguments
from trl import SFTTrainer
import torch
import random

# Upper bounds on how many pairs are used for training and for evaluation.
max_train_samples = 100000
max_val_samples = 2000

# Shuffle with a fixed seed, then keep the first N rows of the training split.
full_train = dataset["train"].shuffle(seed=42)
train_row_count = min(max_train_samples, len(full_train))
train_subset = full_train.select(range(train_row_count))

# Same procedure for the validation split.
val_shuffled = dataset["validation"].shuffle(seed=42)
val_row_count = min(max_val_samples, len(dataset["validation"]))
val_subset = val_shuffled.select(range(val_row_count))

print(f"Training on: {len(train_subset)} samples")

def formatting_prompts_func(example):
    """Turn a batch of parallel sentences into prompt + answer training texts.

    Every (en, vi) pair produces exactly one text, either in the
    English->Vietnamese or in the Vietnamese->English direction. The direction
    is derived from the pair's own content, so a given pair always gets the same
    direction: the result does not depend on the global RNG state, on batch
    boundaries, or on which worker process handles the batch.

    Args:
        example: Batch mapping with "en" and "vi" keys holding parallel lists
            of strings.

    Returns:
        A dict {"text": [...]} with one formatted string per input pair, each
        terminated by "<|im_end|>".
    """
    texts = []
    for en, vi in zip(example["en"], example["vi"]):
        en, vi = en.strip(), vi.strip()
        # Mix the two translation directions roughly 50/50. A private RNG seeded
        # with the pair itself replaces the unseeded module-level random.random(),
        # which gave different training/validation texts on every run.
        pair_rng = random.Random(f"{en}\n{vi}")
        if pair_rng.random() < 0.5:
            text = f"Translate English to Vietnamese (Medical domain):\nEnglish: {en}\nVietnamese: {vi}<|im_end|>"
        else:
            text = f"Translate Vietnamese to English (Medical domain):\nVietnamese: {vi}\nEnglish: {en}<|im_end|>"
        texts.append(text)
    return {"text": texts}

def _to_text_column(subset):
    """Replace the "en"/"vi" columns of `subset` with the formatted "text" column."""
    return subset.map(
        formatting_prompts_func,
        batched=True,
        remove_columns=["en", "vi"],
        num_proc=4,
    )


processed_train = _to_text_column(train_subset)
processed_val = _to_text_column(val_subset)

training_args = TrainingArguments(
    output_dir="./qwen-medical-vlsp",
    report_to="none",
    # Schedule: a single epoch, 8 sequences per device x 2 accumulation steps.
    num_train_epochs=1,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    warmup_steps=200,
    # Optimizer
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    # Precision / memory
    fp16=True,
    bf16=False,
    gradient_checkpointing=True,
    # Logging, evaluation and checkpointing cadence
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=2,
    # Data loading
    dataloader_num_workers=2,
    group_by_length=True,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

# The message below is Vietnamese for
# "Starting training with the speed-optimized configuration...".
print("Bắt đầu training với cấu hình tối ưu tốc độ...")
trainer.train()

# Save the trained model and the tokenizer side by side.
save_dir = "qwen2.5-1.5b-vlsp-final"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)
print(f"Saved model to {save_dir}")

Training on: 100000 samples


Map (num_proc=4):   0%|          | 0/100000 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

  super().__init__(


Bắt đầu training với cấu hình tối ưu tốc độ...


Step,Training Loss,Validation Loss
1000,1.6641,1.675312
2000,1.6106,1.609338
3000,1.5738,1.567672
4000,1.5668,1.535496
5000,1.5318,1.514541
6000,1.4833,1.499875


Saved model to qwen2.5-1.5b-vlsp-final
