In [1]:
# CELL 1
!pip install -q torch transformers datasets accelerate peft bitsandbytes sentencepiece sacrebleu rouge_score matplotlib tqdm
!pip install -q trl==0.8.6 accelerate --no-deps
!pip install -q sentence-transformers
!pip install -q --upgrade bitsandbytes transformers accelerate peft
import torch
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

GPU: CPU


In [None]:
# CELL 2 – LOAD MedEV
from datasets import load_dataset, DatasetDict
# Load a text file and drop a Git-LFS pointer header if one is present (safe: a file without the header is returned unchanged)
def load_text_file(url):
    """Load the text file at `url` as the "train" split of a `datasets` "text" dataset.

    If the first row contains the marker "version https://git-lfs", a message is
    printed and the first three rows are dropped; otherwise the dataset is returned
    untouched.

    Args:
        url: Location passed straight to `load_dataset(..., data_files=url)`.

    Returns:
        The loaded dataset, minus its first three rows when the marker was found.
    """
    lines = load_dataset("text", data_files=url, split="train")

    # An empty dataset has no first row to inspect, so it is returned as-is.
    starts_with_lfs_pointer = len(lines) > 0 and "version https://git-lfs" in lines[0]["text"]
    if not starts_with_lfs_pointer:
        return lines

    print(f"B·ªè header GIT-LFS cho file {url}")
    # Skip rows 0..2 (the pointer header) and keep everything after them.
    return lines.select(range(3, len(lines)))

# All six MedEV files live under the same Hub path; only the file name differs.
MEDEV_ROOT = "https://huggingface.co/datasets/nhuvo/MedEV/resolve/main"

train_en = load_text_file(f"{MEDEV_ROOT}/train.en.txt")
train_vi = load_text_file(f"{MEDEV_ROOT}/train.vi.txt")
val_en = load_text_file(f"{MEDEV_ROOT}/val.en.new.txt")
val_vi = load_text_file(f"{MEDEV_ROOT}/val.vi.new.txt")
test_en = load_text_file(f"{MEDEV_ROOT}/test.en.new.txt")
test_vi = load_text_file(f"{MEDEV_ROOT}/test.vi.new.txt")


def _to_parallel(en_side, vi_side):
    """Add the Vietnamese rows as a "vi" column, then rename the English "text" column to "en".

    The two sides are joined purely by row position, so they are assumed to be line-aligned.
    """
    with_vi = en_side.add_column("vi", vi_side["text"])
    return with_vi.rename_column("text", "en")


# Build the parallel corpus one split at a time.
train = _to_parallel(train_en, train_vi)
val = _to_parallel(val_en, val_vi)
test = _to_parallel(test_en, test_vi)

dataset = DatasetDict({"train": train, "validation": val, "test": test})

# Sanity check: show the merged structure.
print("\nC·∫•u tr√∫c dataset sau gh√©p:")
print(dataset)

# Sanity check: show the first five training pairs.
print("\nM·∫´u 5 example t·ª´ train (medical real data):")
for sample_no in range(1, 6):
    pair = dataset["train"][sample_no - 1]
    print(f"M·∫´u {sample_no}:")
    print(f"  EN: {pair['en']}")
    print(f"  VI: {pair['vi']}")
    print("")

# Report the number of sentence pairs per split.
print("\nK√≠ch th∆∞·ªõc MedEV:")
for label, split_name in (("Train", "train"), ("Validation", "validation"), ("Test", "test")):
    print(f"{label}: {len(dataset[split_name]):,} pairs")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# CELL 3 – LOAD MODEL + LoRA
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
# Checkpoint that gets quantized and adapted below.
model_name = "Qwen/Qwen2-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# Only fall back to EOS as the padding token when the tokenizer defines no pad token at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization with double quantization and an fp16 compute dtype.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

quantized_base = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# LoRA adapters on the q/k/v/o projection modules; bias terms are left untouched.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Prepare the quantized model for k-bit training first, then wrap it with the adapters.
model = get_peft_model(prepare_model_for_kbit_training(quantized_base), peft_config)
model.print_trainable_parameters()

In [None]:
# CELL 4 – FINE-TUNE

from transformers import TrainingArguments
from trl import SFTTrainer
import torch
# Upper bounds on how many shuffled pairs are used for training / validation.
TRAIN_SUBSET_SIZE = 20000
VAL_SUBSET_SIZE = 3000

# Clamp to the real split size: `select(range(n))` raises when n exceeds the number of rows,
# so a smaller split is used in full instead of failing. Splits at least as large as the
# bounds behave exactly as before (same seed, same first-n selection).
train_subset_raw = dataset["train"].shuffle(seed=42).select(
    range(min(TRAIN_SUBSET_SIZE, len(dataset["train"])))
)
val_subset_raw = dataset["validation"].shuffle(seed=42).select(
    range(min(VAL_SUBSET_SIZE, len(dataset["validation"])))
)
def formatting_prompts_func(example):
    """Render a batch of EN/VI pairs into training strings.

    Args:
        example: Batched mapping with parallel lists under "en" and "vi".

    Returns:
        {"text": [...]}, one string per pair: the instruction line, the stripped
        English sentence, and the stripped Vietnamese sentence followed by the
        literal "<|im_end|>" marker. Pairs are matched with `zip`, so extra items
        on the longer side are ignored.
    """
    template = (
        "Translate English to Vietnamese (Medical domain):\n"
        "English: {src}\n"
        "Vietnamese: {tgt}<|im_end|>"
    )
    return {
        "text": [
            template.format(src=en.strip(), tgt=vi.strip())
            for en, vi in zip(example["en"], example["vi"])
        ]
    }
# Render both subsets into a single "text" column; the raw "en"/"vi" columns are dropped.
map_options = dict(batched=True, remove_columns=["en", "vi"])
processed_train = train_subset_raw.map(formatting_prompts_func, **map_options)
processed_val = val_subset_raw.map(formatting_prompts_func, **map_options)

training_args = TrainingArguments(
    output_dir="./qwen2-medical-fast",
    # --- schedule / batch shape (12 x 2 accumulation steps per device) ---
    num_train_epochs=1,
    per_device_train_batch_size=12,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_steps=50,
    # --- optimizer ---
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    weight_decay=0.01,
    # --- precision / memory ---
    fp16=True,
    bf16=False,
    gradient_checkpointing=True,
    # --- logging, evaluation, checkpoints ---
    logging_steps=10,
    report_to="none",
    eval_strategy="steps",
    eval_steps=200,
    save_steps=400,
    save_total_limit=2,
    # Reload the checkpoint with the lowest eval_loss once training finishes.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # --- data loading ---
    dataloader_pin_memory=False,
    dataloader_num_workers=2,
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=processed_train,
    eval_dataset=processed_val,
    dataset_text_field="text",
    max_seq_length=512,
    packing=False,
)

trainer.train()

# Save the model, then the tokenizer, into the same directory.
save_dir = "./qwen2-1.5b-medical-vi-fast"
for artifact in (model, tokenizer):
    artifact.save_pretrained(save_dir)

In [None]:
# CELL 5

import torch
from tqdm import tqdm
import sacrebleu
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel
# Directory the fine-tuning cell saved the adapter and tokenizer into.
save_dir = "./qwen2-1.5b-medical-vi-fast"

tokenizer = AutoTokenizer.from_pretrained(save_dir, trust_remote_code=True)
# Only fall back to EOS as the padding token when the tokenizer defines no pad token at all.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Same 4-bit NF4 / double-quant / fp16-compute settings as used when loading for training.
quant_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
    device_map="auto",
    quantization_config=quant_config,
    torch_dtype=torch.float16,
    trust_remote_code=True,
)

# Attach the saved adapter to the freshly loaded base model and switch to eval mode.
model = PeftModel.from_pretrained(base_model, save_dir)
model.eval()

# Device of the first parameter; input tensors are moved here before generation.
device = next(model.parameters()).device
# Prompt inference
def create_prompt(en_text):
    """Build the inference prompt for one English sentence.

    The instruction line must be identical to the one used when formatting the
    training examples ("Translate English to Vietnamese (Medical domain):"),
    otherwise the fine-tuned model is queried with a prompt it never saw. The
    prompt ends right after "Vietnamese:" so the model continues with the translation.

    Args:
        en_text: English source sentence; surrounding whitespace is stripped.

    Returns:
        The prompt string.
    """
    return f"Translate English to Vietnamese (Medical domain):\nEnglish: {en_text.strip()}\nVietnamese:"
# Number of test pairs to translate, clamped so `select` cannot run past the end of the split.
NUM_TEST_SAMPLES = min(2000, len(dataset["test"]))
print(f"\nGenerate tr√™n {NUM_TEST_SAMPLES} c√¢u test...")

# Read both columns from a single selection and materialize them as plain Python lists.
test_subset = dataset["test"].select(range(NUM_TEST_SAMPLES))
test_en = list(test_subset["en"])
test_vi = list(test_subset["vi"])

prompts = [create_prompt(text) for text in test_en]

# Batched generation with a decoder-only model needs LEFT padding: with right padding the
# model would have to continue after pad tokens, and the generated part would not start at
# the same column for every row of the batch.
tokenizer.padding_side = "left"

# Greedy decoding. `temperature` is omitted on purpose: it is only used when sampling.
generation_kwargs = {
    "max_new_tokens": 256,
    "do_sample": False,
    "repetition_penalty": 1.2,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
}

batch_size = 8
preds = []


def _clean_prediction(text):
    """Collapse whitespace and upper-case only the first character of a generated translation.

    `str.capitalize()` is deliberately avoided: it lower-cases the rest of the string,
    which would destroy acronyms and proper nouns and lower the case-sensitive BLEU score.
    """
    pred = re.sub(r"\s+", " ", text).strip()
    if pred and pred[0].islower():
        pred = pred[0].upper() + pred[1:]
    return pred


print("Generating...")
for start in tqdm(range(0, len(prompts), batch_size)):
    # Tokenize per batch so each batch is padded only to its own longest prompt.
    batch = tokenizer(
        prompts[start:start + batch_size],
        return_tensors="pt",
        padding="longest",
        truncation=True,
        max_length=448,
    ).to(device)
    with torch.no_grad():
        output_ids = model.generate(**batch, **generation_kwargs)

    # Keep only the newly generated tokens. Thanks to left padding every prompt ends at the
    # same position, so slicing is safer than splitting the decoded text on "Vietnamese:"
    # (which would misfire if the source sentence itself contained that word).
    new_tokens = output_ids[:, batch["input_ids"].shape[1]:]
    for text in tokenizer.batch_decode(new_tokens, skip_special_tokens=True):
        preds.append(_clean_prediction(text))

# Compute BLEU. sacrebleu expects a list of reference STREAMS, each stream holding one
# reference per hypothesis — i.e. [test_vi], not one single-element list per sentence.
refs = [test_vi]
bleu = sacrebleu.corpus_bleu(preds, refs)
print("\n" + "="*80)
print(f"GENERATE {len(preds)} C√ÇU")
print(f" CORPUS BLEU: {bleu.score:.2f}")
print("="*80)

# Print the first 10 examples for a manual quality check.
print("\nüîç 10 V√ç D·ª§ ƒê·∫¶U TI√äN:")
for i in range(min(10, len(test_en))):
    print(f"EN   : {test_en[i]}")
    print(f"REF  : {test_vi[i]}")
    print(f"PRED : {preds[i]}")
    print(f"Sentence BLEU: {sacrebleu.sentence_bleu(preds[i], [test_vi[i]]).score:.1f}")
    print("-" * 80)

In [None]:
# CELL 6 – PLOT BLEU + ROUGE-L + ERROR ANALYSIS

import matplotlib.pyplot as plt
import sacrebleu
from rouge_score import rouge_scorer
import random

import re
import unicodedata


class _UnicodeWordTokenizer:
    """Tokenizer for rouge_score that keeps accented (e.g. Vietnamese) words intact.

    rouge_score's default tokenizer keeps only [a-z0-9], which splits Vietnamese
    words at every diacritic and makes ROUGE-L meaningless for this language.
    """

    def tokenize(self, text):
        # NFC first so that letters with combining marks count as single word characters.
        return re.findall(r"\w+", unicodedata.normalize("NFC", text).lower())


# The Porter stemmer is English-only, so stemming is disabled for Vietnamese output.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False, tokenizer=_UnicodeWordTokenizer())
rouge_l_scores = [scorer.score(ref, pred)['rougeL'].fmeasure for ref, pred in zip(test_vi, preds)]
# Guard the averages so an empty prediction list does not raise ZeroDivisionError.
avg_rouge_l = sum(rouge_l_scores) / len(rouge_l_scores) if rouge_l_scores else 0.0

sent_bleus = [sacrebleu.sentence_bleu(pred, [ref]).score for pred, ref in zip(preds, test_vi)]
avg_sent_bleu = sum(sent_bleus) / len(sent_bleus) if sent_bleus else 0.0

# Per-sentence BLEU with the corpus-level score and the sentence-level mean as reference lines.
plt.figure(figsize=(12, 6))
plt.plot(sent_bleus, color='dodgerblue', alpha=0.7, linewidth=1.5, label='Sentence BLEU')
plt.axhline(y=bleu.score, color='green', linestyle='--', label=f'Corpus BLEU: {bleu.score:.2f}')
plt.axhline(y=avg_sent_bleu, color='red', linestyle='--', label=f'Avg Sentence BLEU: {avg_sent_bleu:.1f}')
plt.title(f"Corpus BLEU: {bleu.score:.2f} | ROUGE-L: {avg_rouge_l:.4f} | Samples: {NUM_TEST_SAMPLES}")
plt.xlabel("Sentence Index")
plt.ylabel("BLEU Score")
plt.grid(True, alpha=0.3)
plt.legend()
plt.tight_layout()
plt.savefig("bleu_curve_fast.png", dpi=300)
plt.show()

print(f"ROUGE-L trung b√¨nh: {avg_rouge_l:.4f}")

# Error analysis on up to 8 randomly chosen sentences. A dedicated seeded generator keeps
# the selection reproducible across runs, and the sample size is capped by len(preds).
print("\n" + "="*90)
print("ERROR ANALYSIS")
print("="*90)
samples_idx = random.Random(42).sample(range(len(preds)), min(8, len(preds)))
for idx in samples_idx:
    print(f"\n[{idx+1:3d}] EN   : {test_en[idx]}")
    print(f"      REF  : {test_vi[idx]}")
    print(f"      PRED : {preds[idx]}")
    print(f"      BLEU : {sent_bleus[idx]:5.1f}")
    print("-" * 80)

In [None]:
# CELL 7 – COMPARE AGAINST BASELINES (ZERO-SHOT QWEN2 + NLLB)

from transformers import pipeline
import torch

# The pipeline API takes an integer device index (-1 = CPU), while tensors need a real torch
# device. They are kept in separate names so the torch `device` from the evaluation cell is
# not overwritten with an int (`.to(-1)` is not a valid tensor device).
pipeline_device = 0 if torch.cuda.is_available() else -1
model_device = next(model.parameters()).device


def _clean_baseline(text):
    """Collapse whitespace and upper-case only the first character.

    Same normalization the fine-tuned predictions are intended to get, so every
    system is scored on equal footing.
    """
    cleaned = re.sub(r"\s+", " ", text).strip()
    if cleaned and cleaned[0].islower():
        cleaned = cleaned[0].upper() + cleaned[1:]
    return cleaned


print("=== BASELINE 1: Zero-shot Qwen2-1.5B-Instruct (c√≥ domain prompt) ===")
zero_prompts = [create_prompt(text) for text in test_en]  # same prompt as the fine-tuned run

# Decoder-only batched generation requires left padding.
tokenizer.padding_side = "left"

zero_preds = []
# PeftModel.from_pretrained injects the LoRA layers into `base_model` in place, so calling
# base_model.generate would still run the fine-tuned weights. Disabling the adapter on the
# PEFT wrapper is what actually yields the untouched base model.
with model.disable_adapter():
    for i in tqdm(range(0, len(zero_prompts), 8), desc="Zero-shot"):
        batch = tokenizer(
            zero_prompts[i:i+8],
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=448,
        ).to(model_device)
        with torch.no_grad():
            outs = model.generate(**batch, **generation_kwargs)
        # Decode only the continuation; with left padding it starts at the same column in every row.
        new_tokens = outs[:, batch["input_ids"].shape[1]:]
        for text in tokenizer.batch_decode(new_tokens, skip_special_tokens=True):
            zero_preds.append(_clean_baseline(text))

# sacrebleu expects a list of reference streams: one list holding a reference per hypothesis.
zero_bleu = sacrebleu.corpus_bleu(zero_preds, [test_vi])
print(f"Zero-shot BLEU: {zero_bleu.score:.2f}")

print("\n=== BASELINE 2: NLLB-200-distilled-600M ===")
# NLLB is only run on the first sentences to keep the cell fast; it must then be scored
# against exactly those references, not the whole test set.
NLLB_NUM_SAMPLES = min(80, len(test_en))
nllb = pipeline("translation", model="facebook/nllb-200-distilled-600M", src_lang="eng_Latn", tgt_lang="vie_Latn", device=pipeline_device, batch_size=16)
nllb_preds = [_clean_baseline(tr["translation_text"]) for tr in nllb(test_en[:NLLB_NUM_SAMPLES])]
nllb_refs = test_vi[:NLLB_NUM_SAMPLES]
nllb_bleu = sacrebleu.corpus_bleu(nllb_preds, [nllb_refs])
print(f"NLLB-200 BLEU: {nllb_bleu.score:.2f}")

# Fine-tuned score on the same subset, so the NLLB comparison is apples-to-apples.
ft_subset_bleu = sacrebleu.corpus_bleu(preds[:NLLB_NUM_SAMPLES], [nllb_refs])

print("\n=== T·ªîNG K·∫æT ===")
print(f"Fine-tuned Qwen2-1.5B (medical) : {bleu.score:.2f}")
print(f"Zero-shot Qwen2-1.5B           : {zero_bleu.score:.2f}")
print(f"NLLB-200 dedicated MT          : {nllb_bleu.score:.2f}")
print(f"(NLLB scored on the first {NLLB_NUM_SAMPLES} sentences; fine-tuned model on the same subset: {ft_subset_bleu.score:.2f})")
# Only claim an improvement when the measured scores actually support it.
if bleu.score > zero_bleu.score and ft_subset_bleu.score > nllb_bleu.score:
    print("‚Üí Fine-tune domain mang l·∫°i c·∫£i thi·ªán r√µ r·ªát!")
else:
    print("NOTE: the fine-tuned model does not beat every baseline on these samples.")