In [1]:
from pathlib import Path

# Directory that holds the raw parallel corpus files used by this notebook.
DATA_DIR = Path("/kaggle/input/vlst-medical-mt")

# English / Vietnamese files for the training split and the public test split.
train_en_path = DATA_DIR.joinpath("train.en.txt")
train_vi_path = DATA_DIR.joinpath("train.vi.txt")
test_en_path = DATA_DIR.joinpath("public_test.en.txt")
test_vi_path = DATA_DIR.joinpath("public_test.vi.txt")

# Report, one line per file, whether each expected input is present.
for p in (train_en_path, train_vi_path, test_en_path, test_vi_path):
    print(p, "-> exists:", p.exists())


/kaggle/input/vlst-medical-mt/train.en.txt -> exists: True
/kaggle/input/vlst-medical-mt/train.vi.txt -> exists: True
/kaggle/input/vlst-medical-mt/public_test.en.txt -> exists: True
/kaggle/input/vlst-medical-mt/public_test.vi.txt -> exists: True


In [None]:
import re
import unicodedata

# Zero-width / invisible characters that are always deleted from the text.
ZERO_WIDTH = {
    "\u200b",  # zero-width space
    "\u200c",  # zero-width non-joiner
    "\u200d",  # zero-width joiner
    "\u2060",  # word joiner
    "\ufeff",  # BOM
}

# Any run of whitespace (for str patterns ``\s`` also covers tabs, CR/LF, NBSP, ...).
_ws_re = re.compile(r"\s+")

def normalize_text(s: str) -> str:
    """Return a cleaned, single-line, NFC-normalized version of ``s``.

    Steps, in order:
      1. every run of whitespace becomes one ASCII space;
      2. the characters in ``ZERO_WIDTH`` are deleted;
      3. every remaining character whose Unicode general category starts
         with "C" is deleted;
      4. the text is normalized to NFC;
      5. whitespace is collapsed once more and the result is trimmed.

    Args:
        s: Raw text (typically one line read from a file). ``None`` is accepted.

    Returns:
        The normalized string; ``""`` when ``s`` is ``None`` or nothing
        visible is left.
    """
    if s is None:
        return ""

    # 1) Collapse whitespace BEFORE stripping control characters. Tab, CR, LF
    #    and similar characters are in category "Cc"; if they were deleted
    #    first, the words on both sides would be glued together
    #    ("a\tb" -> "ab"). Turning them into a space keeps the word boundary.
    #    This also takes care of the trailing newline of a line read from a file.
    s = _ws_re.sub(" ", s)

    # 2) Remove BOM / zero-width characters
    #    (a BOM may sit at the start of the file or be mixed into the text).
    for ch in ZERO_WIDTH:
        s = s.replace(ch, "")

    # 3) Remove the remaining control-like characters
    #    (Unicode general category starting with "C").
    s = "".join(c for c in s if unicodedata.category(c)[0] != "C")

    # 4) Unicode normalize to NFC.
    s = unicodedata.normalize("NFC", s)

    # 5) Deleting invisible characters can leave two spaces side by side
    #    ("a \u200b b"), so collapse again, then trim both ends.
    return _ws_re.sub(" ", s).strip()


In [None]:
from itertools import zip_longest

def clean_parallel(
    src_path: Path,
    tgt_path: Path,
    out_src_path: Path,
    out_tgt_path: Path,
    encoding: str = "utf-8",
):
    """Normalize a line-aligned parallel corpus and drop pairs that end up empty.

    The two inputs are read in lockstep, one line at a time. Each line is
    passed through ``normalize_text``; a pair is written out only when both
    sides are non-empty afterwards, so the two output files stay line-aligned.

    Args:
        src_path: Source-side input file.
        tgt_path: Target-side input file.
        out_src_path: Where the cleaned source lines are written.
        out_tgt_path: Where the cleaned target lines are written.
        encoding: Encoding used for all four files. Undecodable input bytes
            are replaced rather than raising (``errors="replace"``).

    Returns:
        A dict with:
          * ``total_pairs_read``: pairs read before any mismatch was detected;
          * ``kept_pairs``: pairs written to the outputs;
          * ``dropped_empty_after_norm``: pairs skipped because at least one
            side was empty after normalization;
          * ``alignment_error``: ``True`` when one input ran out of lines
            before the other;
          * ``first_mismatch_index``: 1-based line number at which that
            happened, else ``None``.

        On an alignment error processing stops immediately; pairs written
        before that point remain in the output files.
    """
    stats = {
        "total_pairs_read": 0,
        "kept_pairs": 0,
        "dropped_empty_after_norm": 0,
        "alignment_error": False,
        "first_mismatch_index": None,
    }

    # Create the parent directory of BOTH outputs. Only the source side used
    # to be created, so a target path in a different, not-yet-existing
    # directory made the open() below fail.
    for out_path in (out_src_path, out_tgt_path):
        Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    with open(src_path, "r", encoding=encoding, errors="replace") as fsrc, \
         open(tgt_path, "r", encoding=encoding, errors="replace") as ftgt, \
         open(out_src_path, "w", encoding=encoding) as osrc, \
         open(out_tgt_path, "w", encoding=encoding) as otgt:

        # zip_longest pads the shorter file with None, which is how a
        # difference in line counts is detected.
        for i, (src_line, tgt_line) in enumerate(zip_longest(fsrc, ftgt, fillvalue=None), start=1):
            if src_line is None or tgt_line is None:
                stats["alignment_error"] = True
                stats["first_mismatch_index"] = i
                break

            stats["total_pairs_read"] += 1

            src = normalize_text(src_line)
            tgt = normalize_text(tgt_line)

            # Drop the whole pair when either side is empty, never just one side.
            if not src or not tgt:
                stats["dropped_empty_after_norm"] += 1
                continue

            osrc.write(src + "\n")
            otgt.write(tgt + "\n")
            stats["kept_pairs"] += 1

    return stats

# Cleaned copies of both splits are written under the working directory.
work_dir = Path("/kaggle/working")
clean_train_en = work_dir.joinpath("clean_train.en.txt")
clean_train_vi = work_dir.joinpath("clean_train.vi.txt")
clean_test_en = work_dir.joinpath("clean_public_test.en.txt")
clean_test_vi = work_dir.joinpath("clean_public_test.vi.txt")

# Clean the training split, then the public test split.
train_stats = clean_parallel(
    src_path=train_en_path,
    tgt_path=train_vi_path,
    out_src_path=clean_train_en,
    out_tgt_path=clean_train_vi,
)
test_stats = clean_parallel(
    src_path=test_en_path,
    tgt_path=test_vi_path,
    out_src_path=clean_test_en,
    out_tgt_path=clean_test_vi,
)

print("TRAIN stats:", train_stats)
print("TEST  stats:", test_stats)

# Stop the notebook if the EN and VI files of a split had different line counts.
if train_stats["alignment_error"]:
    raise RuntimeError(f"TRAIN alignment mismatch at line {train_stats['first_mismatch_index']}")
if test_stats["alignment_error"]:
    raise RuntimeError(f"TEST alignment mismatch at line {test_stats['first_mismatch_index']}")

def preview(path, n=3):
    """Print a header with the file name, then at most the first ``n`` lines.

    Args:
        path: File to show; must expose ``.name`` (e.g. a ``pathlib.Path``).
        n: Maximum number of lines to print. Fewer are printed when the file
            is shorter.
    """
    print(f"\n--- Preview {path.name} ---")
    with open(path, "r", encoding="utf-8") as f:
        # zip stops at whichever ends first: the line budget or the file.
        for _, line in zip(range(n), f):
            print(line.rstrip("\n"))

# Show the first three lines of each cleaned training file.
preview(clean_train_en, n=3)
preview(clean_train_vi, n=3)


TRAIN stats: {'total_pairs_read': 500000, 'kept_pairs': 500000, 'dropped_empty_after_norm': 0, 'alignment_error': False, 'first_mismatch_index': None}
TEST  stats: {'total_pairs_read': 3000, 'kept_pairs': 3000, 'dropped_empty_after_norm': 0, 'alignment_error': False, 'first_mismatch_index': None}

--- Preview clean_train.en.txt ---
To evaluate clinical, subclinical symptoms of patients with otitis media with effusion and V.a at otorhinolaryngology department – Thai Nguyen national hospital
Evaluate clinical, subclinical symptoms of patients with otittis media effusion and V a at otorhinolaryngology department - Thai Nguyên National Hospital.
There was a relation between vasodilatation and vaginal dysfunction.

--- Preview clean_train.vi.txt ---
Nghiên cứu đặc điểm lâm sàng, cận lâm sàng bệnh nhân viêm tai ứ dịch trên viêm V.A tại Khoa Tai mũi họng - Bệnh viện Trung ương Thái Nguyên
Đánh giá đặc điểm lâm sàng, cận lâm sàng bệnh nhân viêm tai ứ dịch trên viêm V.a tại Khoa Tai mũi họng - 

In [None]:
import hashlib
from pathlib import Path

work_dir = Path("/kaggle/working")

# Inputs: the cleaned training files.
clean_train_en = work_dir.joinpath("clean_train.en.txt")
clean_train_vi = work_dir.joinpath("clean_train.vi.txt")

# Outputs: the training files after duplicate pairs have been removed.
dedup_train_en = work_dir.joinpath("dedup_train.en.txt")
dedup_train_vi = work_dir.joinpath("dedup_train.vi.txt")

def pair_key(src: str, tgt: str) -> int:
    """Return a 64-bit integer fingerprint of the ``(src, tgt)`` pair.

    The UTF-8 bytes of ``src``, one tab byte, and the UTF-8 bytes of ``tgt``
    are hashed with BLAKE2b using an 8-byte digest; the digest is read as a
    little-endian unsigned integer.
    """
    payload = src.encode("utf-8") + b"\t" + tgt.encode("utf-8")
    digest = hashlib.blake2b(payload, digest_size=8).digest()
    return int.from_bytes(digest, "little")

def dedup_parallel(src_in, tgt_in, src_out, tgt_out):
    """Copy a parallel corpus, keeping only the first occurrence of each pair.

    The two inputs are read in lockstep. A pair counts as a duplicate when
    ``pair_key`` of its (source, target) lines was already seen; only the
    keys are held in memory, not the sentences.

    Args:
        src_in: Source-side input file.
        tgt_in: Target-side input file.
        src_out: Source-side output file.
        tgt_out: Target-side output file.

    Returns:
        ``{"total": pairs read, "kept": pairs written, "duplicates": pairs skipped}``.
    """
    seen_keys = set()
    n_total = n_kept = 0

    with open(src_in, "r", encoding="utf-8") as fsrc, \
         open(tgt_in, "r", encoding="utf-8") as ftgt, \
         open(src_out, "w", encoding="utf-8") as osrc, \
         open(tgt_out, "w", encoding="utf-8") as otgt:
        for raw_src, raw_tgt in zip(fsrc, ftgt):
            n_total += 1
            src_text = raw_src.rstrip("\n")
            tgt_text = raw_tgt.rstrip("\n")
            key = pair_key(src_text, tgt_text)
            if key not in seen_keys:
                # First time this pair shows up: remember it and keep it.
                seen_keys.add(key)
                osrc.write(src_text + "\n")
                otgt.write(tgt_text + "\n")
                n_kept += 1

    # Every pair read was either kept or skipped as a duplicate.
    return {"total": n_total, "kept": n_kept, "duplicates": n_total - n_kept}

# Remove repeated (en, vi) pairs from the cleaned training set.
dedup_stats = dedup_parallel(
    src_in=clean_train_en,
    tgt_in=clean_train_vi,
    src_out=dedup_train_en,
    tgt_out=dedup_train_vi,
)
print("DEDUP stats:", dedup_stats)


DEDUP stats: {'total': 500000, 'kept': 348218, 'duplicates': 151782}


In [None]:
# Inspect duplicates in the cleaned training set: join each EN line to its VI
# line with a tab, count how often every exact pair occurs, and list the 20
# most repeated pairs (highest count first).
!paste -d $'\t' /kaggle/working/clean_train.en.txt /kaggle/working/clean_train.vi.txt \
| sort \
| uniq -c \
| sort -nr \
| head -n 20


      4 The thyroid gland is a small, butterfly-shaped gland that sits near the base of the front of your neck.	Tuyến giáp là một tuyến nhỏ, hình con bướm, nằm phía trước cổ của bạn.
      4 The software Epi Info 6.04 and 2 test are used for data analysis and comparison.	Công cụ dùng để phân tích và xử lý số liệu là phần mềm Epi Info 6.04 và sử dụng phép kiểm 2 để so sánh các tỉ lệ.
      4 The disease is not spread from pets, but it can be spread by person to person.	Bệnh này không lây từ vật nuôi, nhưng nó có thể được lây lan từ người sang người.
      4 Stress level of healthcare workers had a statistically significant correlation with occupational factors including the average number of working hours per day (OR = 1.96; 95% CI: 1.02 - 3.74, p<0.05), pressure from negative reaction of COVID-19 patients / their family (OR = 2.23, 95% CI: 1.17 - 4.28; p<0.05) and time pressure to complete work (OR= 5.88, 95% CI: 2.95 - 11.73, p<0.05).	Mức độ stress của nhân viên y tế có mối tương q

In [None]:
from pathlib import Path
import numpy as np

try:
    from transformers import AutoTokenizer
except ImportError:
    import sys
    !{sys.executable} -m pip -q install transformers
    from transformers import AutoTokenizer

# Tokenizer whose token counts are used for the length statistics below.
MODEL_NAME = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)

# The deduplicated training pair is what gets measured.
work_dir = Path("/kaggle/working")
src_path = work_dir.joinpath("dedup_train.en.txt")
tgt_path = work_dir.joinpath("dedup_train.vi.txt")

def token_len_batch(texts):
    """Return the tokenizer's ``"length"`` entry for a batch of strings.

    The global ``tokenizer`` is called without special tokens, padding or
    truncation, so each length reflects the text alone.
    """
    return tokenizer(
        texts,
        add_special_tokens=False,
        padding=False,
        truncation=False,
        return_length=True,
    )["length"]

# Token counts per sentence, gathered batch by batch so the tokenizer is
# called on lists of strings instead of once per line.
src_lens, tgt_lens = [], []
batch = 2048
buf_s, buf_t = [], []

with open(src_path, "r", encoding="utf-8") as fsrc, open(tgt_path, "r", encoding="utf-8") as ftgt:
    for s, t in zip(fsrc, ftgt):
        buf_s.append(s.rstrip("\n"))
        buf_t.append(t.rstrip("\n"))
        if len(buf_s) < batch:
            continue
        # Buffer is full: measure it and start a fresh one.
        src_lens += token_len_batch(buf_s)
        tgt_lens += token_len_batch(buf_t)
        buf_s, buf_t = [], []
    # Measure whatever is left after the last full batch.
    if buf_s:
        src_lens += token_len_batch(buf_s)
        tgt_lens += token_len_batch(buf_t)

# Arrays make the percentile and ratio computations below vectorized.
src_lens = np.asarray(src_lens)
tgt_lens = np.asarray(tgt_lens)

def pct(a, p):
    """Return the ``p``-th percentile of ``a``, truncated to an ``int``."""
    value = np.percentile(a, p)
    return int(value)

# Token-length distribution of each side.
print(
    "SRC tokens: p50", pct(src_lens, 50),
    "p90", pct(src_lens, 90),
    "p95", pct(src_lens, 95),
    "p99", pct(src_lens, 99),
    "max", int(np.max(src_lens)),
)
print(
    "TGT tokens: p50", pct(tgt_lens, 50),
    "p90", pct(tgt_lens, 90),
    "p95", pct(tgt_lens, 95),
    "p99", pct(tgt_lens, 99),
    "max", int(np.max(tgt_lens)),
)

# Target/source token ratio per pair; the denominator is floored at 1 so a
# zero-token source cannot cause a division by zero.
ratio = np.divide(tgt_lens, np.maximum(1, src_lens))
print(
    "RATIO tgt/src: p50", float(np.percentile(ratio, 50)),
    "p95", float(np.percentile(ratio, 95)),
    "p99", float(np.percentile(ratio, 99)),
    "max", float(np.max(ratio)),
)


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

SRC tokens: p50 27 p90 58 p95 75 p99 127 max 1249
TGT tokens: p50 37 p90 75 p95 94 p99 155 max 1337
RATIO tgt/src: p50 1.3333333333333333 p95 1.9333333333333333 p99 2.4444444444444446 max 50.5


In [None]:
import numpy as np

# Length ratios in both directions; each denominator is floored at 1 token.
ratio_ts = np.divide(tgt_lens, np.maximum(1, src_lens))
ratio_st = np.divide(src_lens, np.maximum(1, tgt_lens))

def fpercentiles(x, ps=(0, 0.05, 0.1, 1, 5, 50, 95, 99, 99.9, 100)):
    """Return ``{"p<q>": q-th percentile of x as a float}`` for every ``q`` in ``ps``."""
    table = {}
    for q in ps:
        table[f"p{q}"] = float(np.percentile(x, q))
    return table

# Symmetric ratio: how many times longer the longer side is, whichever it is.
sym = np.maximum(ratio_st, ratio_ts)

print("tgt/src percentiles:", fpercentiles(ratio_ts))
print("src/tgt percentiles:", fpercentiles(ratio_st))
print("sym=max(tgt/src,src/tgt) percentiles:", fpercentiles(sym))


tgt/src percentiles: {'p0': 0.03890489913544669, 'p0.05': 0.38010333333333335, 'p0.1': 0.4473684210526316, 'p1': 0.7428571428571429, 'p5': 0.9534883720930233, 'p50': 1.3333333333333333, 'p95': 1.9333333333333333, 'p99': 2.4444444444444446, 'p99.9': 3.532482352941388, 'p100': 50.5}
src/tgt percentiles: {'p0': 0.019801980198019802, 'p0.05': 0.25, 'p0.1': 0.28308710691823896, 'p1': 0.4090909090909091, 'p5': 0.5172413793103449, 'p50': 0.75, 'p95': 1.048780487804878, 'p99': 1.3461538461538463, 'p99.9': 2.235294117647059, 'p100': 25.703703703703702}
sym=max(tgt/src,src/tgt) percentiles: {'p0': 1.0, 'p0.05': 1.0, 'p0.1': 1.0, 'p1': 1.0, 'p5': 1.04, 'p50': 1.3333333333333333, 'p95': 1.9413043478260572, 'p99': 2.4642857142857144, 'p99.9': 3.592654761905405, 'p100': 50.5}


In [None]:
from pathlib import Path
import numpy as np
from transformers import AutoTokenizer

# Tokenizer used to measure sentence lengths for the ratio filter.
MODEL_NAME = "Qwen/Qwen3-0.6B"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)

def filter_parallel_by_ratio(
    src_in: Path, tgt_in: Path,
    src_out: Path, tgt_out: Path,
    min_ratio=0.2, max_ratio=4.0,
    max_src=512, max_tgt=512,
    batch_size=2048,
    show_drops=8
):
    """Filter a parallel corpus by token length and target/source length ratio.

    Sentences are tokenized in batches with the global ``tokenizer`` (no
    special tokens, padding or truncation). For every pair the checks run in
    this order and the first failing one decides the drop reason:

      1. length: source longer than ``max_src`` or target longer than ``max_tgt``;
      2. low ratio: ``tgt_tokens / max(1, src_tokens) < min_ratio``;
      3. high ratio: the same ratio ``> max_ratio``.

    Pairs that pass every check are written to ``src_out`` / ``tgt_out``.

    Args:
        src_in: Source-side input file, one sentence per line.
        tgt_in: Target-side input file, read in lockstep with ``src_in``.
        src_out: Output file for kept source sentences.
        tgt_out: Output file for kept target sentences.
        min_ratio: Lower bound on the target/source token ratio; ``None``
            disables the check.
        max_ratio: Upper bound on the target/source token ratio; ``None``
            disables the check.
        max_src: Maximum source length in tokens; ``None`` disables the check.
        max_tgt: Maximum target length in tokens; ``None`` disables the check.
        batch_size: Number of pairs tokenized per tokenizer call.
        show_drops: At most this many dropped pairs are printed, with the
            first 200 characters of each side.

    Returns:
        A dict with ``total``, ``kept``, ``dropped_len``,
        ``dropped_low_ratio``, ``dropped_high_ratio`` and ``params`` (the
        thresholds that were used).
    """
    def lens(texts):
        # Token count of every string in the batch, as an int32 array.
        enc = tokenizer(
            texts,
            add_special_tokens=False,
            padding=False,
            truncation=False,
            return_length=True
        )
        return np.array(enc["length"], dtype=np.int32)

    # Counter key -> tag printed in front of a shown dropped pair. One table
    # replaces three copy-pasted drop/report branches.
    drop_tags = {
        "dropped_len": "\n[DROPPED length]",
        "dropped_low_ratio": "\n[DROPPED low ratio]",
        "dropped_high_ratio": "\n[DROPPED high ratio]",
    }
    dropped = dict.fromkeys(drop_tags, 0)

    def drop_reason(n_src, n_tgt, r):
        # Return the counter key of the first failing check, or None to keep.
        if (max_src is not None and n_src > max_src) or (max_tgt is not None and n_tgt > max_tgt):
            return "dropped_len"
        # Ratio bounds tolerate None exactly like the length bounds do.
        if min_ratio is not None and r < min_ratio:
            return "dropped_low_ratio"
        if max_ratio is not None and r > max_ratio:
            return "dropped_high_ratio"
        return None

    total = kept = 0
    shown = 0

    with open(src_in, "r", encoding="utf-8") as fsrc, \
         open(tgt_in, "r", encoding="utf-8") as ftgt, \
         open(src_out, "w", encoding="utf-8") as osrc, \
         open(tgt_out, "w", encoding="utf-8") as otgt:

        def flush(batch_s, batch_t):
            # Tokenize one batch, then keep or drop each of its pairs.
            nonlocal total, kept, shown

            ls = lens(batch_s)
            lt = lens(batch_t)
            # Denominator floored at 1 so a zero-token source cannot divide by zero.
            ratio = lt / np.maximum(1, ls)

            for s, t, l1, l2, r in zip(batch_s, batch_t, ls, lt, ratio):
                total += 1

                reason = drop_reason(l1, l2, r)
                if reason is None:
                    osrc.write(s + "\n")
                    otgt.write(t + "\n")
                    kept += 1
                    continue

                dropped[reason] += 1
                # Print only the first few drops so the output stays readable.
                if shown < show_drops:
                    print(drop_tags[reason], {"src_tok": int(l1), "tgt_tok": int(l2), "ratio": float(r)})
                    print("SRC:", s[:200])
                    print("TGT:", t[:200])
                    shown += 1

        buf_s, buf_t = [], []
        for s, t in zip(fsrc, ftgt):
            buf_s.append(s.rstrip("\n"))
            buf_t.append(t.rstrip("\n"))
            if len(buf_s) >= batch_size:
                flush(buf_s, buf_t)
                buf_s, buf_t = [], []

        # Leftover pairs after the last full batch.
        if buf_s:
            flush(buf_s, buf_t)

    return {
        "total": total,
        "kept": kept,
        **dropped,
        "params": {
            "min_ratio": min_ratio, "max_ratio": max_ratio,
            "max_src": max_src, "max_tgt": max_tgt
        }
    }

# Input: the deduplicated training pair. Output: the length/ratio-filtered pair.
work_dir = Path("/kaggle/working")
src_in = work_dir.joinpath("dedup_train.en.txt")
tgt_in = work_dir.joinpath("dedup_train.vi.txt")
src_out = work_dir.joinpath("train_ratio_filtered.en.txt")
tgt_out = work_dir.joinpath("train_ratio_filtered.vi.txt")

# Thresholds for this run; these override the function defaults.
stats = filter_parallel_by_ratio(
    src_in=src_in,
    tgt_in=tgt_in,
    src_out=src_out,
    tgt_out=tgt_out,
    min_ratio=0.25,
    max_ratio=4.0,
    max_src=256,
    max_tgt=256,
    batch_size=2048,
    show_drops=8,
)

print("\nFILTER stats:", stats)
print("Saved:", src_out, tgt_out)



[DROPPED length] {'src_tok': 276, 'tgt_tok': 270, 'ratio': 0.9782608695652174}
SRC: Independent factors that increase mortality are immunosuppression (OR 2, 3; 95% CI 1, 4 - 3, 86, p < 0, 01), mechanical ventilation (OR 14, 6, 95% CI 5, 4 - 39, 6; p <0, 01), catheter (OR 1, 83; 95% C
TGT: Các yếu tố độc lập làm tăng tỷ lệ tử vong là suy giảm miễn dịch (OR 2,3; 95% CI 1,4-3,86), p < 0,01), thở máy (OR 14,6; 95% CI 5,4 - 39,6; p < 0,01), đặt catheter tĩnh mạch trung tâm (OR 1,83; 95% CI 

[DROPPED high ratio] {'src_tok': 2, 'tgt_tok': 10, 'ratio': 5.0}
SRC: Methods.
TGT: Đối tượng-Phương pháp nghiên cứu.

[DROPPED high ratio] {'src_tok': 2, 'tgt_tok': 9, 'ratio': 4.5}
SRC: Method.
TGT: Đối tượng - Phương pháp nghiên cứu.

[DROPPED length] {'src_tok': 223, 'tgt_tok': 261, 'ratio': 1.1704035874439462}
SRC: The study was completed in May 2015 with 473 children aged 5-year-old in 9 preschools and 476 children aged 12-year-old in 9 middle schools in the district, according to the method of i