In [None]:
!pip install -q torch transformers datasets accelerate peft bitsandbytes sentencepiece sacrebleu rouge_score matplotlib google-generativeai tqdm

import torch
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")

In [None]:
from datasets import load_dataset
import random

# Download the OPUS-100 English-Vietnamese configuration from the Hugging Face Hub.
dataset_full = load_dataset(
    "opus100",
    "en-vi",
)

# Confirm the download and report the raw size of every split.
print("Đã tải xong OPUS100 EN-VI từ Hugging Face .")
split_report = f"Splits: Train={len(dataset_full['train'])}, Val={len(dataset_full['validation'])}, Test={len(dataset_full['test'])}"
print(split_report)

# Hàm lọc y tế
def filter_medical_domain(dataset_split, keywords=None):
    """Keep only the sentence pairs whose English side mentions a medical keyword.

    Keywords are matched case-insensitively as whole words, tolerating a plural
    ``s``/``es`` suffix. Plain substring matching lets short keywords fire
    inside unrelated words (``ill`` in "will", ``ear`` in "year", ``fit`` in
    "benefit", ``care`` in "career"), which fills the subset with non-medical
    sentences. The trade-off is that derived forms such as "healthy" or
    "healing" only match if they are listed as keywords themselves.

    Args:
        dataset_split: Iterable of examples shaped like
            ``{"translation": {"en": str, "vi": str}}``.
        keywords: Optional iterable of English keywords replacing the built-in
            medical list.

    Returns:
        A list of ``{"en": ..., "vi": ...}`` dicts (text stripped of surrounding
        whitespace), in the order the matching examples were encountered.
    """
    import re  # local import: the cell header only imports `datasets` and `random`

    if keywords is None:
        # Default English medical vocabulary.
        keywords = [
            'doctor', 'nurse', 'hospital', 'patient', 'medicine', 'drug', 'disease', 'virus', 'pain',
            'surgery', 'health', 'cancer', 'treatment', 'symptom', 'diagnosis', 'therapy', 'vaccine',
            'infection', 'epidemic', 'mental', 'physical', 'wellness', 'care', 'heal', 'cure', 'illness',
            'blood', 'heart', 'brain', 'lung', 'stomach', 'bone', 'skin', 'eye', 'ear', 'dental',
            'body', 'mind', 'fit', 'diet', 'exercise', 'sleep', 'sick', 'ill', 'medical', 'pharmacy', 'recovery'
        ]

    cleaned = [k.strip().lower() for k in keywords if k and k.strip()]
    if not cleaned:
        # An empty alternation would match every sentence; nothing to look for.
        return []

    alternatives = "|".join(re.escape(k) for k in cleaned)
    # \b on both sides enforces whole-word matches; (?:es|s)? admits simple plurals.
    pattern = re.compile(rf"\b(?:{alternatives})(?:es|s)?\b", re.IGNORECASE)

    filtered_data = []
    for example in dataset_split:
        pair = example['translation']
        if pattern.search(pair['en']):
            filtered_data.append({
                "en": pair['en'].strip(),
                "vi": pair['vi'].strip()
            })

    return filtered_data

# Run the medical keyword filter over every split of the raw corpus.
medical_train, medical_val, medical_test = (
    filter_medical_domain(dataset_full[split_name])
    for split_name in ('train', 'validation', 'test')
)

# Report how many sentence pairs survived in each split, and the train keep-rate.
print(f"  Đã lọc Medical domain:")
print(f"  Train: {len(medical_train)} câu (từ {len(dataset_full['train'])})")
print(f"  Val: {len(medical_val)} câu (từ {len(dataset_full['validation'])})")
print(f"  Test: {len(medical_test)} câu (từ {len(dataset_full['test'])})")
print(f" Tỷ lệ lọc train: {len(medical_train)/len(dataset_full['train'])*100:.2f}%")

# Preview at most the first five filtered training pairs.
print("\n Sample 5 câu đầu (EN → VI, Medical filtered từ OPUS100):")
for i, _ in enumerate(medical_train[:5]):
    print(f"  [{i+1}] EN: {medical_train[i]['en']}")
    print(f"     VI: {medical_train[i]['vi']}\n")

# Build the DatasetDict consumed by the rest of the notebook.
from datasets import Dataset, DatasetDict

MIN_EVAL_SIZE = 200    # an evaluation split smaller than this gets topped up
EXTRA_EVAL_SIZE = 200  # number of training pairs moved into such a split
SPLIT_SEED = 42        # fixed seed so Restart & Run All reproduces the same splits

split_rng = random.Random(SPLIT_SEED)
borrowed_pairs = set()  # (en, vi) text of every pair moved out of train


def _borrow_from_train(count):
    """Pick up to `count` random training pairs not borrowed before and record them as borrowed."""
    pool = [row for row in medical_train if (row["en"], row["vi"]) not in borrowed_pairs]
    picked = split_rng.sample(pool, min(count, len(pool)))
    borrowed_pairs.update((row["en"], row["vi"]) for row in picked)
    return picked


# Top up small validation/test splits with pairs taken from train.
val_rows = list(medical_val)
if len(val_rows) < MIN_EVAL_SIZE:
    val_rows += _borrow_from_train(EXTRA_EVAL_SIZE)

test_rows = list(medical_test)
if len(test_rows) < MIN_EVAL_SIZE:
    test_rows += _borrow_from_train(EXTRA_EVAL_SIZE)

# Borrowed pairs (and exact duplicates of them) must leave the training split;
# otherwise the model is evaluated on sentences it was trained on.
train_rows = [row for row in medical_train if (row["en"], row["vi"]) not in borrowed_pairs]

dataset = DatasetDict({
    "train": Dataset.from_list(train_rows),
    "validation": Dataset.from_list(val_rows),
    "test": Dataset.from_list(test_rows)
})

print(f"\n Splits cuối: Train={len(dataset['train'])}, Val={len(dataset['validation'])}, Test={len(dataset['test'])}")
print(" Dữ liệu VLSP Medical (OPUS100 filtered) sẵn sàng! ")

In [None]:
# FINETUNE QWEN2
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch

print("Đang load Qwen/Qwen2-1.5B-Instruct 4-bit…")

model_name = "Qwen/Qwen2-1.5B-Instruct"

# Tokenizer. Fall back to EOS for padding only when the checkpoint defines no
# pad token of its own: if pad and EOS are the same token, any collator that
# masks padding in the labels also masks EOS, and the model is never trained
# to stop generating.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model with 4-bit NF4 weights, double quantization and bfloat16 compute.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.bfloat16,
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    ),
    trust_remote_code=True
)

# peft helper that readies a quantized model for adapter training.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the four attention projection matrices.
peft_config = LoraConfig(
    r=64, lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)

print(" HOÀN TẤT!")
model.print_trainable_parameters()

In [None]:

def preprocess(examples, tok=None, max_length=512):
    """Turn a batch of EN/VI pairs into causal-LM training features.

    A decoder-only model is trained on one sequence, ``prompt + target + EOS``,
    with ``labels`` aligned token-for-token to ``input_ids``. Prompt positions
    are set to -100 so the loss covers only the Vietnamese translation and the
    end-of-sequence token. Tokenizing prompt and target into two unrelated
    sequences (input_ids = prompt, labels = target) gives mismatched lengths
    and never shows the model the target.

    Args:
        examples: Batch dict with parallel lists under ``"en"`` and ``"vi"``.
        tok: Tokenizer to use; defaults to the notebook-level ``tokenizer``.
        max_length: Maximum length of each returned sequence, EOS included.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``: one
        unpadded list of ints per example.
    """
    tok = tokenizer if tok is None else tok

    prompts = [
        f"Translate English to Vietnamese (Medical domain):\nEnglish: {en}\nVietnamese:"
        for en in examples["en"]
    ]
    # Leading space so the training text reads "Vietnamese: <translation>".
    targets = [" " + vi for vi in examples["vi"]]

    # No automatic special tokens: the sequence is assembled by hand below.
    prompt_ids = tok(prompts, add_special_tokens=False)["input_ids"]
    target_ids = tok(targets, add_special_tokens=False)["input_ids"]

    eos_id = tok.eos_token_id
    tail = [eos_id] if eos_id is not None else []
    budget = max(0, max_length - len(tail))  # room left for prompt + target

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for p_ids, t_ids in zip(prompt_ids, target_ids):
        p_ids, t_ids = list(p_ids), list(t_ids)
        ids = (p_ids + t_ids)[:budget] + tail
        # -100 is the index ignored by the cross-entropy loss.
        labels = ([-100] * len(p_ids) + t_ids)[:budget] + tail
        features["input_ids"].append(ids)
        features["attention_mask"].append([1] * len(ids))
        features["labels"].append(labels)
    return features

# Tokenize every split in batches (4 worker processes) and drop the raw text columns.
print(" Đang preprocess ")
map_options = dict(batched=True, remove_columns=["en","vi"], num_proc=4)
tokenized = dataset.map(preprocess, **map_options)
print("Dữ liệu đã sẵn sàng train")

In [None]:
# decoding.py
from transformers import pipeline
from tqdm import tqdm
import sacrebleu
import os

print("Tạo pipeline từ model đã finetune")

# Qwen2 is a decoder-only (causal) LM, so it is served through the
# "text-generation" task; "text2text-generation" targets encoder-decoder models.
translator = pipeline(
    "text-generation",
    model=model,          # reuse the model object already loaded in this kernel
    tokenizer=tokenizer,
)

# Budget for the translation itself; unlike max_length it does not shrink as
# the prompt gets longer.
MAX_NEW_TOKENS = 256


def beam_translate(text):
    """Translate one English sentence to Vietnamese with 5-beam search.

    Args:
        text: English source sentence.

    Returns:
        The first line of the model's continuation, stripped; an empty string
        if the model produced nothing.
    """
    prompt = f"Translate English to Vietnamese (Medical domain):\nEnglish: {text}\nVietnamese:"
    out = translator(
        prompt,
        max_new_tokens=MAX_NEW_TOKENS,
        num_beams=5,
        early_stopping=True,
        do_sample=False,
        return_full_text=False,  # only the continuation, so no prompt to strip off
    )
    completion = out[0]["generated_text"].strip()
    # The answer is the first line; anything after it is the model rambling on
    # (for example, starting another "English: ..." block).
    return completion.splitlines()[0].strip() if completion else ""

# Translate the whole test split.
test_en = [ex["en"] for ex in dataset["test"]]
test_vi_refs = [ex["vi"] for ex in dataset["test"]]
print(f"Đang dịch {len(test_en)} câu test bằng beam search")

preds = [beam_translate(text) for text in tqdm(test_en, desc="Decoding")]

# Corpus BLEU. sacreBLEU expects a list of reference *streams*, each as long as
# the hypothesis list: a single-reference corpus is [[ref_1, ..., ref_n]],
# not [[ref_1], ..., [ref_n]].
refs = [test_vi_refs]
bleu = sacrebleu.corpus_bleu(preds, refs)

print(f"\n HOÀN TẤT!")
print(f"BLEU SCORE = {bleu.score:.2f} ")
print(f"Test size: {len(test_en)} câu")


def _one_line(sentence):
    """Collapse line breaks so every sentence occupies exactly one line on disk."""
    return " ".join(sentence.splitlines())


# Persist hypotheses, sources and references as three line-aligned text files.
os.makedirs("results", exist_ok=True)
with open("results/predictions.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(_one_line(s) for s in preds))
with open("results/test_en.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(_one_line(s) for s in test_en))
with open("results/test_vi.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(_one_line(s) for s in test_vi_refs))


In [None]:
# evaluation.py
import matplotlib.pyplot as plt
import sacrebleu
from rouge_score import rouge_scorer
import random

# Rebuild the source/reference lists from the dataset so this cell only needs
# `dataset`, `preds` and `bleu` from earlier cells.
import re
import unicodedata

test_en_list = [ex["en"] for ex in dataset["test"]]
test_vi = [ex["vi"] for ex in dataset["test"]]

print(f"Final BLEU (corpus): {bleu.score:.2f}")


class _UnicodeWordTokenizer:
    """rouge_score tokenizer that keeps accented letters (lower-cased Unicode word tokens)."""

    def tokenize(self, text):
        # NFC first so a base letter and its combining marks form one word character.
        return re.findall(r"\w+", unicodedata.normalize("NFC", text).lower())


# ROUGE-L. rouge_score's default tokenizer keeps only [a-z0-9] and
# use_stemmer=True applies an English Porter stemmer; both break Vietnamese
# words apart, so a Unicode-aware tokenizer is supplied instead.
scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=False, tokenizer=_UnicodeWordTokenizer())
rouge_scores = [scorer.score(r, p)['rougeL'].fmeasure for r, p in zip(test_vi, preds)]
mean_rouge_l = sum(rouge_scores) / len(rouge_scores) if rouge_scores else 0.0
print(f"ROUGE-L: {mean_rouge_l:.4f}")

# Sentence-level BLEU for every hypothesis/reference pair (used by the plot and error report).
sent_bleus = [sacrebleu.sentence_bleu(p, [r]).score for p, r in zip(preds, test_vi)]

# Plot sentence-level BLEU for every test sentence, in test-set order, and save the figure.
fig, ax = plt.subplots(figsize=(14, 6))
ax.plot(sent_bleus, color='dodgerblue', alpha=0.8, linewidth=1.5)
ax.set_title(
    f"BLEU per Sentence – VLSP 2025 Medical Domain (OPUS100 filtered)\n"
    f"Corpus BLEU = {bleu.score:.2f} | Test size = {len(preds)} sentences",
    fontsize=14,
    fontweight='bold',
)
ax.set_xlabel("Sentence Index")
ax.set_ylabel("Sentence-level BLEU")
ax.grid(True, alpha=0.3)
fig.tight_layout()
fig.savefig("results/bleu_curve_vlsp2025.png", dpi=300, bbox_inches='tight')
plt.show()

print("ĐÃ LƯU ẢNH results/bleu_curve_vlsp2025.png ")

# Error analysis on up to 10 randomly drawn test sentences.
print("\n" + "="*90)
print("ERROR ANALYSIS ")
print("="*90)
samples_idx = random.sample(range(len(preds)), min(10, len(preds)))

for idx in samples_idx:
    # Source, reference, hypothesis and sentence BLEU with a coarse quality tag.
    print(f"\n[{idx+1:3d}] EN : {test_en_list[idx]}")
    print(f"     REF: {test_vi[idx]}")
    print(f"     PRED: {preds[idx]}")
    print(f"     BLEU: {sent_bleus[idx]:5.1f} → {'RẤT TỐT' if sent_bleus[idx] > 35 else 'CẦN CẢI THIỆN'}")

    # Glossary spot checks: the English term is present but its expected
    # Vietnamese rendering is absent from the hypothesis.
    source_lower = test_en_list[idx].lower()
    if 'treatment' in source_lower:
        if 'điều trị' not in preds[idx].lower():
            print("     → LỖI: Thiếu từ vựng y khoa (treatment → điều trị)")
    if 'patient' in source_lower:
        if 'bệnh nhân' not in preds[idx].lower():
            print("     → LỖI: Thiếu từ vựng y khoa (patient → bệnh nhân)")

In [None]:
import random

# Banner for the rule-based error report (blank line, rule, title, rule).
print("\n" + "="*100, "PHÂN TÍCH LỖI DỊCH MÁY – VLSP 2025 MEDICAL DOMAIN", "="*100, sep="\n")

def manual_error_analysis(en, ref, pred, bleu_score):
    """List heuristic problems found in one machine translation.

    Two kinds of checks run, in this order:
      1. Glossary: an English medical term occurs in the source but its
         expected Vietnamese rendering does not occur in the prediction
         (both compared case-insensitively).
      2. Length: the prediction has fewer than 0.7x or more than 1.5x the
         whitespace-separated word count of the reference.

    Args:
        en: English source sentence.
        ref: Reference Vietnamese translation.
        pred: Predicted Vietnamese translation.
        bleu_score: Accepted for the caller's convenience; not used by any check.

    Returns:
        A list of messages; a single "no significant error" message when no
        check fires.
    """
    source_lc = en.lower()
    output_lc = pred.lower()

    # (English term, expected Vietnamese rendering, message when it is missing)
    glossary_checks = (
        ('treatment', 'điều trị', "Sai từ vựng y khoa: 'treatment' → nên là 'điều trị'"),
        ('patient', 'bệnh nhân', "Sai từ vựng y khoa: 'patient' → nên là 'bệnh nhân'"),
        ('doctor', 'bác sĩ', "Sai từ vựng y khoa: 'doctor' → nên là 'bác sĩ'"),
        ('hospital', 'bệnh viện', "Sai từ vựng y khoa: 'hospital' → nên là 'bệnh viện'"),
    )
    findings = [
        message
        for term, rendering, message in glossary_checks
        if term in source_lc and rendering not in output_lc
    ]

    pred_words = len(pred.split())
    ref_words = len(ref.split())
    if pred_words < ref_words * 0.7:
        findings.append("Thiếu thông tin (bản dịch ngắn hơn tham chiếu)")
    if pred_words > ref_words * 1.5:
        findings.append("Thừa thông tin (dịch dài dòng)")

    return findings or ["Không có lỗi đáng kể "]

# Draw up to 10 distinct test indices. The cap keeps random.sample from raising
# ValueError when fewer than 10 predictions are available.
samples_idx = random.sample(range(len(preds)), min(10, len(preds)))

for i, idx in enumerate(samples_idx, 1):
    en = test_en_list[idx]
    ref = test_vi[idx]
    pred = preds[idx]
    # Fall back to 0 when the evaluation cell that defines sent_bleus has not been run.
    bleu_val = sent_bleus[idx] if 'sent_bleus' in globals() else 0
    print(f"\n{'='*25} MẪU {i} {'='*25}")
    print(f"EN : {en}")
    print(f"REF: {ref}")
    print(f"PRED: {pred}")
    print(f"BLEU: {bleu_val:.1f}")
    print("→ PHÂN TÍCH LỖI:")
    # One bullet per heuristic finding for this sample.
    for err in manual_error_analysis(en, ref, pred, bleu_val):
        print(f"   • {err}")
    print("-" * 70)