# Kriol → English NMT

This notebook trains a Kriol → English translator using NLLB (facebook/nllb-200-distilled-600M) with the Hugging Face Trainer.

- Cleans and preprocesses pairs, caching a cleaned CSV to speed reruns
- Trains a single-GPU baseline and saves a `final/` checkpoint with HF artifacts and `.pth`
- Optional: back-translation plan and custom tokenizer scaffolding (placeholders)

References:
- NLLB model card: https://huggingface.co/facebook/nllb-200-distilled-600M
- Transformers Seq2Seq docs: https://huggingface.co/docs/transformers/en/tasks/translation


### Step 1 — Environment & imports

In [2]:
import os
import torch
from torch.utils.data import Dataset

torch.set_float32_matmul_precision("high")

import pandas as pd
from sklearn.model_selection import train_test_split

from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    AutoConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer as Trainer,
    Seq2SeqTrainingArguments as TrainingArguments,
)

print(torch.__version__)
print(torch.cuda.is_available())

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


2.8.0+cu129
True


### Step 2 — Config

In [None]:

class CFG:
    # Model & paths
    MODEL_NAME = "facebook/nllb-200-distilled-600M"
    OUTPUT_DIR = "../model/"

    # Data
    DATA_FILE = "../data/train_data.xlsx"
    CLEAN_DATA_FILE = "../data/train_data_cleaned.csv"
    SRC_COL = "kriol"
    TGT_COL = "english"
    VAL_SIZE = 0.2
    SEED = 42

    # NLLB language tags (proxy Kriol as Tok Pisin for tokenizer)
    SRC_LANG = "tpi_Latn"
    TGT_LANG = "eng_Latn"

    # Preprocessing
    APPLY_ENGLISH_LID = True
    MAX_TOKENS = 128
    LEN_RATIO = 3.0
    STRIP_PUNCT_SRC = False
    STRIP_PUNCT_TGT = False

    # Cleaning control
    SKIP_CLEAN_IF_EXISTS = True

    # Cross-validation
    USE_CV = True
    K_FOLDS = 5
    CV_FOLD = 0

    # Training
    NUM_EPOCHS = 15
    BATCH_SIZE = 8
    LR = 3e-5
    MAX_LEN = 128
    DROPOUT = 0.2
    ATTENTION_DROPOUT = 0.1

    # Decoding
    BEAM_SIZE = 6
    LENGTH_PENALTY = 1.0
    EARLY_STOPPING = True

    # Back-translation
    ENABLE_BT = True
    # Use NLLB to generate Kriol-proxy via Tok Pisin (tpi) from English
    EN2KR_MODEL = "facebook/nllb-200-distilled-600M"
    EN2KR_SRC_LANG = "eng_Latn"
    EN2KR_TGT_LANG = "tpi_Latn"
    SYNTH_CSV = "../data/synthetic/en_to_kriol_v1.csv"
    SYNTH_CSV_SAMPLE = "../data/synthetic/en_to_kriol_v1_sample.csv"
    BT_FAST = False
    BT_BEAM_SIZE = 8
    BT_LENGTH_PENALTY = 1.1
    BT_EARLY_STOPPING = True
    BT_BATCH = 64

    # Back-translation decode controls
    BT_NUM_BEST = 3
    BT_MIN_LENGTH = 6
    BT_REPETITION_PENALTY = 1.1

    # Synthetic integration
    SYNTH_MAX_RATIO = 1.0

    # Tokenizer (custom placeholder)
    USE_SPM = True
    SPM_DIR = "../outputs/tokenizers/spm_kriol_en_v1"

    # Decoding/generation extras
    GEN_MAX_NEW_TOKENS = 32

    # COMET
    COMET_MODEL = "Unbabel/wmt22-comet-da"
    COMET_BATCH = 32

    # Trainer args
    WARMUP_STEPS = 500
    GRAD_ACCUM_STEPS = 2
    LABEL_SMOOTHING = 0.1
    LOGGING_STEPS = 50
    SAVE_STEPS = 1000
    SAVE_TOTAL_LIMIT = 3
    FP16 = True
    REPORT_TO = ["tensorboard"]

    DROPOUT = 0.2
    ATTENTION_DROPOUT = 0.1
    
    # Increase these for better results
    WARMUP_RATIO = 0.1  # Better warmup
    WEIGHT_DECAY = 0.01
    
    # Mixed precision training
    BF16 = torch.cuda.is_bf16_supported()  # Use BF16 if available
    
    # Learning rate schedule
    LR_SCHEDULER = "cosine"
    
    # Gradient clipping
    MAX_GRAD_NORM = 1.0


# dynamic ties
CFG.BT_BEAM_SIZE = 1 if CFG.BT_FAST else 8

os.makedirs(CFG.OUTPUT_DIR, exist_ok=True)

torch.manual_seed(CFG.SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(CFG.SEED)


### Step 3 — Load data

In [4]:
# Prefer cleaned CSV if present; otherwise load raw and proceed to Step 4
if os.path.exists(CFG.CLEAN_DATA_FILE):
    df = pd.read_csv(CFG.CLEAN_DATA_FILE)
else:
    if CFG.DATA_FILE.endswith(".csv"):
        df = pd.read_csv(CFG.DATA_FILE)
    elif CFG.DATA_FILE.endswith((".xlsx", ".xls")):
        df = pd.read_excel(CFG.DATA_FILE)
    else:
        raise ValueError("Unsupported data file format: use .csv or .xlsx")

assert CFG.SRC_COL in df.columns and CFG.TGT_COL in df.columns, f"Columns not found: {CFG.SRC_COL}, {CFG.TGT_COL}"

# Keep minimal shape only; Step 4 handles full cleaning
df = df[[CFG.SRC_COL, CFG.TGT_COL]].dropna()
df = df[(df[CFG.SRC_COL].astype(str).str.strip() != "") & (df[CFG.TGT_COL].astype(str).str.strip() != "")]



### Step 4 — Clean and persist dataset (merged cleaner + execution)

In [5]:
import re
import html
import unicodedata

try:
    import langid as _langid
except Exception:
    _langid = None


def _to_str(x):
    return "" if x is None else str(x)


def _normalize_unicode(s: str) -> str:
    if not isinstance(s, str):
        s = _to_str(s)
    s = unicodedata.normalize("NFC", s)
    s = re.sub(r"[\u200B-\u200F\u202A-\u202E\u2066-\u2069\uFEFF]", "", s)
    s = "".join(ch for ch in s if (ch in "\t\n\r" or unicodedata.category(ch)[0] != "C"))
    return s


def _fix_mojibake(s: str) -> str:
    replacements = {
        "â€™": "'", "â€˜": "'", "â€œ": '"', "â€�": '"',
        "â€“": "-", "â€”": "-", "Â ": " ", "Â ": " ",
    }
    for k, v in replacements.items():
        s = s.replace(k, v)
    return html.unescape(s)


def _normalize_quotes_and_spacing_no_punct_space(s: str) -> str:
    # Standardize quotes/dashes and collapse spaces; do not add spaces around punctuation here
    s = s.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
    s = s.replace("–", "-").replace("—", "-")
    s = s.replace("\r", " ").replace("\n", " ")
    s = re.sub(r"\s+", " ", s).strip()
    return s


def _strip_punct(s: str) -> str:
    return re.sub(r"[^\w\s]", "", s)


def _simple_tok_count(s: str) -> int:
    return 0 if not s else len(s.split())


def _kriol_token_score(s: str) -> int:
    kriol_markers = {"bin","langa","blanga","det","im","imbin","garra","olabat","nomo","wal","mob","deya",
    "garram","gin","seim"}
    tokens = set(_to_str(s).lower().split())
    return sum(1 for t in tokens if t in kriol_markers)


def _maybe_swap(src: str, tgt: str, use_langid: bool = True):
    kriol_src = _kriol_token_score(src)
    kriol_tgt = _kriol_token_score(tgt)
    en_src = en_tgt = 0
    if use_langid and _langid is not None:
        try:
            en_src = 1 if _langid.classify(_to_str(src))[0] == "en" else 0
            en_tgt = 1 if _langid.classify(_to_str(tgt))[0] == "en" else 0
        except Exception:
            pass
    should_swap = (en_src > en_tgt and kriol_tgt > kriol_src and (kriol_tgt - kriol_src) >= 1)
    return (tgt, src) if should_swap else (src, tgt)


def clean_parallel_dataframe(
    df,
    src_col: str,
    tgt_col: str,
    lowercase_src: bool = True,
    lowercase_tgt: bool = False,
    strip_punct_src: bool = True,
    strip_punct_tgt: bool = False,
    max_tokens: int = 128,
    len_ratio: float = 3.0,
    apply_english_lid_on_tgt: bool = False,
    try_swap_misplaced_rows: bool = True,
    drop_identical_pairs: bool = True,
):
    work = df[[src_col, tgt_col]].copy()

    for c in (src_col, tgt_col):
        work[c] = (
            work[c]
            .astype(str)
            .map(_normalize_unicode)
            .map(_fix_mojibake)
            .map(_normalize_quotes_and_spacing_no_punct_space)
        )

    if try_swap_misplaced_rows:
        work[[src_col, tgt_col]] = work.apply(
            lambda r: _maybe_swap(r[src_col], r[tgt_col], use_langid=True), axis=1, result_type="expand"
        )

    if lowercase_src:
        work[src_col] = work[src_col].str.lower()
    if lowercase_tgt:
        work[tgt_col] = work[tgt_col].str.lower()
    if strip_punct_src:
        work[src_col] = work[src_col].map(_strip_punct)
    if strip_punct_tgt:
        work[tgt_col] = work[tgt_col].map(_strip_punct)

    work = work[(work[src_col].str.strip() != "") & (work[tgt_col].str.strip() != "")]
    if drop_identical_pairs:
        work = work[work[src_col] != work[tgt_col]]

    work = work.drop_duplicates(subset=[src_col, tgt_col])

    def _keep_len(row) -> bool:
        s_len = _simple_tok_count(row[src_col])
        t_len = _simple_tok_count(row[tgt_col])
        if s_len == 0 or t_len == 0:
            return False
        if s_len > max_tokens or t_len > max_tokens:
            return False
        ratio = max(s_len / max(1, t_len), t_len / max(1, s_len))
        return ratio <= len_ratio

    work = work[work.apply(_keep_len, axis=1)]

    if apply_english_lid_on_tgt and _langid is not None:
        try:
            work = work[work[tgt_col].map(lambda s: _langid.classify(_to_str(s))[0] == "en")]
        except Exception:
            pass

    return work.reset_index(drop=True)


# Execute cleaning and persist (skippable when cleaned exists)
if CFG.SKIP_CLEAN_IF_EXISTS and os.path.exists(CFG.CLEAN_DATA_FILE):
    print(f"Using existing cleaned dataset: {CFG.CLEAN_DATA_FILE}")
    df = pd.read_csv(CFG.CLEAN_DATA_FILE)
else:
    cleaned = clean_parallel_dataframe(
        df,
        src_col=CFG.SRC_COL,
        tgt_col=CFG.TGT_COL,
        lowercase_src=True,
        lowercase_tgt=False,
        strip_punct_src=CFG.STRIP_PUNCT_SRC,
        strip_punct_tgt=CFG.STRIP_PUNCT_TGT,
        max_tokens=CFG.MAX_TOKENS,
        len_ratio=CFG.LEN_RATIO,
        apply_english_lid_on_tgt=CFG.APPLY_ENGLISH_LID,
        try_swap_misplaced_rows=True,
    )

    os.makedirs(os.path.dirname(CFG.CLEAN_DATA_FILE), exist_ok=True)
    cleaned.to_csv(CFG.CLEAN_DATA_FILE, index=False)
    print(f"Saved cleaned dataset: {CFG.CLEAN_DATA_FILE} rows={len(cleaned)}")

    # Use cleaned data downstream
    df = cleaned.copy()


Using existing cleaned dataset: ../data/train_data_cleaned.csv


### Step 5 — Preprocess & normalize

This step prepares pairs for Kriol→English only:
- Lowercase both sides; normalize whitespace
- Deduplicate pairs to avoid train/val leakage
- Length filter: max 128 tokens on each side
- Length ratio filter: src/tgt and tgt/src ≤ 3.0
- Optional: language ID filter for English targets (off by default)

Note: These filters run before the split to ensure clean train/val sets.


In [6]:
try:
    import langid
except Exception:
    langid = None
from sklearn.model_selection import KFold

def _normalize_common(s: str) -> str:
    s = str(s)
    s = s.replace("“", '"').replace("”", '"').replace("‘", "'").replace("’", "'")
    s = re.sub(r"\s+", " ", s).strip()
    s = re.sub(r"\s*([,;:?!\.])\s*", r" \1 ", s)
    return re.sub(r"\s+", " ", s)

def _strip_punct(s: str) -> str:
    # remove common punctuation; keep alphanumerics and spaces
    return re.sub(r"[^\w\s]", "", s)

def normalize_src_text(s: str) -> str:
    s = _normalize_common(s)
    if CFG.STRIP_PUNCT_SRC:
        s = _strip_punct(s)
    return s.lower()

def normalize_tgt_text(s: str) -> str:
    s = _normalize_common(s)
    if CFG.STRIP_PUNCT_TGT:
        s = _strip_punct(s)
    return s


def simple_token_count(text: str) -> int:
    return len(str(text).split())


def passes_filters(row) -> bool:
    src = normalize_src_text(row[CFG.SRC_COL])
    tgt = normalize_tgt_text(row[CFG.TGT_COL])
    if src == "" or tgt == "":
        return False
    # length tokens
    s_len = simple_token_count(src)
    t_len = simple_token_count(tgt)
    if s_len > CFG.MAX_TOKENS or t_len > CFG.MAX_TOKENS:
        return False
    # ratio
    if s_len > 0 and t_len > 0:
        if s_len / t_len > CFG.LEN_RATIO or t_len / s_len > CFG.LEN_RATIO:
            return False
    # optional English LID on target
    if CFG.APPLY_ENGLISH_LID and langid is not None:
        lid, _ = langid.classify(tgt)
        if lid != "en":
            return False
    return True

# Apply normalization + filters before split
df_filtered = df[[CFG.SRC_COL, CFG.TGT_COL]].dropna().copy()
df_filtered[CFG.SRC_COL] = df_filtered[CFG.SRC_COL].apply(normalize_src_text)
df_filtered[CFG.TGT_COL] = df_filtered[CFG.TGT_COL].apply(normalize_tgt_text)
df_filtered = df_filtered.drop_duplicates(subset=[CFG.SRC_COL, CFG.TGT_COL])
df_filtered = df_filtered[df_filtered.apply(passes_filters, axis=1)]

# Split: K-Fold when enabled, else single random split
if CFG.USE_CV:
    assert CFG.K_FOLDS >= 2, "K_FOLDS must be >= 2 for cross-validation"
    kf = KFold(n_splits=CFG.K_FOLDS, shuffle=True, random_state=CFG.SEED)
    folds = list(kf.split(df_filtered))
    fold_idx = CFG.CV_FOLD % CFG.K_FOLDS
    train_index, val_index = folds[fold_idx]
    train_df = df_filtered.iloc[train_index].reset_index(drop=True)
    val_df = df_filtered.iloc[val_index].reset_index(drop=True)
    print(f"KFold split: fold {fold_idx+1}/{CFG.K_FOLDS} -> train {len(train_df)} val {len(val_df)}")
else:
    train_df, val_df = train_test_split(df_filtered, test_size=CFG.VAL_SIZE, random_state=CFG.SEED)
    print("After filters:", len(train_df), len(val_df))



KFold split: fold 1/5 -> train 18155 val 4539


### Step 6 — Data augmentation (applied to training only)

Applies light noise to Kriol sources and optional synonym replacement on English targets for a small subset of the training set. Validation data is never augmented.


In [7]:
import random
try:
    import nltk
    from nltk.corpus import wordnet as wn
    try:
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet')
except Exception:
    nltk = None
    wn = None

random.seed(CFG.SEED)

def inject_noise(sentence: str, drop_p: float = 0.05, swap_p: float = 0.05) -> str:
    toks = sentence.split()
    # random drop
    keep = []
    for t in toks:
        if random.random() > drop_p:
            keep.append(t)
    toks = keep if keep else toks
    # random swap (one pair)
    if len(toks) >= 2 and random.random() < swap_p:
        i = random.randrange(0, len(toks)-1)
        toks[i], toks[i+1] = toks[i+1], toks[i]
    return " ".join(toks)


def synonym_replace_en(sentence: str, prob: float = 0.1) -> str:
    if wn is None:
        return sentence
    toks = sentence.split()
    out = []
    for t in toks:
        if random.random() < prob:
            syns = set()
            for syn in wn.synsets(t):
                for l in syn.lemmas():
                    w = l.name().replace("_", " ")
                    if w.lower() != t.lower():
                        syns.add(w)
            if syns:
                out.append(random.choice(list(syns)))
                continue
        out.append(t)
    return " ".join(out)

# Apply augmentation: noisy Kriol source, slight synonym replacement on English target (train only)
aug_factor = 0.2  # 20% of train set
num_aug = int(aug_factor * len(train_df))
_sel = train_df.sample(n=num_aug, random_state=CFG.SEED)
_aug = _sel.copy()
_aug[CFG.SRC_COL] = _aug[CFG.SRC_COL].map(lambda s: inject_noise(s, 0.07, 0.07))
_aug[CFG.TGT_COL] = _aug[CFG.TGT_COL].map(lambda s: synonym_replace_en(s, 0.08))

train_df = (
    pd.concat([train_df, _aug], ignore_index=True)
      .drop_duplicates(subset=[CFG.SRC_COL, CFG.TGT_COL])
      .reset_index(drop=True)
)
print(f"Augmented +{len(train_df)-len(df_filtered)} examples (train={len(train_df)}, val={len(val_df)})")



[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\TARIK\AppData\Roaming\nltk_data...


Augmented +-932 examples (train=21762, val=4539)


### Step 7 — Generate synthetic data
Creates English→Kriol synthetic pairs using a reverse model.


In [11]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import os

os.makedirs(os.path.dirname(CFG.SYNTH_CSV), exist_ok=True)

# If CSV exists, just report and skip generation; otherwise, generate
if os.path.exists(CFG.SYNTH_CSV):
    print(f"Using existing synthetic: {CFG.SYNTH_CSV}")
    try:
        _samp = CFG.SYNTH_CSV.replace(".csv", "_sample.csv")
        if os.path.exists(_samp):
            print("Sample exists:", _samp)
    except Exception:
        pass
else:
    _tok = AutoTokenizer.from_pretrained(CFG.EN2KR_MODEL)
    _mdl = AutoModelForSeq2SeqLM.from_pretrained(CFG.EN2KR_MODEL).to(device).eval()

    # Collect English and filter extreme-length sentences for cleaner synthetic
    _eng = train_df[CFG.TGT_COL].tolist()
    _eng_all = [e for e in _eng if 4 <= len(str(e).split()) <= 40]
    _syn = []
    try:
        from tqdm import tqdm as _tqdm
        _iter = _tqdm(range(0, len(_eng_all), CFG.BT_BATCH), total=(len(_eng_all)+CFG.BT_BATCH-1)//CFG.BT_BATCH, desc="Back-translation")
    except Exception:
        _iter = range(0, len(_eng_all), CFG.BT_BATCH)

    for i in _iter:
        chunk = _eng_all[i:i+CFG.BT_BATCH]
        batch = _tok(chunk, return_tensors="pt", padding=True, truncation=True, max_length=CFG.MAX_LEN)
        batch = {k: v.to(device) for k, v in batch.items()}
        out = _mdl.generate(
            **batch,
            max_new_tokens=CFG.GEN_MAX_NEW_TOKENS,
            num_beams=max(3, CFG.BT_BEAM_SIZE),
            num_return_sequences=CFG.BT_NUM_BEST,
            length_penalty=CFG.BT_LENGTH_PENALTY,
            early_stopping=CFG.BT_EARLY_STOPPING,
            no_repeat_ngram_size=3,
            repetition_penalty=CFG.BT_REPETITION_PENALTY,
            min_length=CFG.BT_MIN_LENGTH,
        )
        # Select best per source by simple length/uniqueness heuristic
        decoded = _tok.batch_decode(out, skip_special_tokens=True)
        for j in range(0, len(decoded), CFG.BT_NUM_BEST):
            cand = decoded[j:j+CFG.BT_NUM_BEST]
            cand = [c for c in cand if c and len(c.split()) >= CFG.BT_MIN_LENGTH]
            if not cand:
                _syn.append("")
                continue
            # prefer higher unique token ratio
            def _u(x):
                t = x.split()
                return 0 if not t else len(set(t))/len(t)
            cand.sort(key=_u, reverse=True)
            _syn.append(cand[0])

    _synth = pd.DataFrame({CFG.SRC_COL: _syn, CFG.TGT_COL: _eng_all})
    _synth.to_csv(CFG.SYNTH_CSV, index=False)
    try:
        _samp = CFG.SYNTH_CSV.replace(".csv", "_sample.csv")
        _synth.head(min(100, len(_synth))).to_csv(_samp, index=False)
    except Exception:
        pass
    print("Saved:", CFG.SYNTH_CSV, "rows:", len(_synth))


Back-translation:   0%|          | 0/283 [00:00<?, ?it/s]

: 

### Step 8 — Integrate synthetic data
Applies the same normalization & filtering rules as real data, dedups, and preserves real-only validation (train-only merge).


In [10]:
# Round-trip filter (keeps rows whose back-translation matches English)
try:
    _final_dir = os.path.join(CFG.OUTPUT_DIR, "final")
    if os.path.isdir(_final_dir):
        _tok_f = AutoTokenizer.from_pretrained(_final_dir)
        _mdl_f = AutoModelForSeq2SeqLM.from_pretrained(_final_dir).to(device).eval()
        def _bt(texts):
            _in = _tok_f(texts, return_tensors="pt", padding=True, truncation=True, max_length=CFG.MAX_LEN)
            _in = {k: v.to(device) for k, v in _in.items()}
            _out = _mdl_f.generate(**_in, max_new_tokens=CFG.GEN_MAX_NEW_TOKENS, num_beams=4, early_stopping=True)
            return _tok_f.batch_decode(_out, skip_special_tokens=True)
        _cand = _sel.copy()  # pyright: ignore[reportUndefinedVariable]
        _bt_eng = _bt(_cand[CFG.SRC_COL].tolist())
        from sacrebleu import corpus_chrf
        _scores = []
        for hyp, ref in zip(_bt_eng, _cand[CFG.TGT_COL].tolist()):
            try:
                _scores.append(corpus_chrf([hyp], [[ref]]).score)
            except Exception:
                _scores.append(0.0)
        _rt_mask = pd.Series(_scores) >= 55.0
        _sel = _cand[_rt_mask.values]
        print(f"Round-trip kept {len(_sel)} / {len(_cand)}")
except Exception as _e:
    pass


### Step 8.1 — Minimal integration (train-only, 1:1 cap)

In [10]:
# Step 8 — Minimal integration (train-only, 1:1 cap)
import pandas as _pd
_syn_path = CFG.SYNTH_CSV
if os.path.exists(_syn_path):
    _raw = _pd.read_csv(_syn_path)[[CFG.SRC_COL, CFG.TGT_COL]].dropna()
    _raw[CFG.SRC_COL] = _raw[CFG.SRC_COL].apply(normalize_src_text)
    _raw[CFG.TGT_COL] = _raw[CFG.TGT_COL].apply(normalize_tgt_text)
    _raw = _raw.drop_duplicates(subset=[CFG.SRC_COL, CFG.TGT_COL])
    _raw = _raw[_raw.apply(passes_filters, axis=1)]

    # Extra hygiene: drop synthetic that looks English or gibberish
    try:
        import langid as _lid
    except Exception:
        _lid = None

    def _looks_english(s):
        if _lid is None:
            return False
        try:
            return _lid.classify(str(s))[0] == "en"
        except Exception:
            return False

    def _has_low_var(s):
        t = str(s).split()
        if not t:
            return True
        uniq = len(set(t)) / max(1, len(t))
        if uniq < 0.3:
            return True
        if any(ch * 4 in s for ch in "abcdefghijklmnopqrstuvwxyz"):  # long char repeats
            return True
        return False

    _raw = _raw[~_raw[CFG.SRC_COL].map(_looks_english)]
    _raw = _raw[~_raw[CFG.SRC_COL].map(_has_low_var)]

    # Relax Kriol-marker filter: allow 0 to keep diverse tpi-like sentences, but penalize low-variance
    def _kriol_marker_score(s: str) -> int:
        markers = {"bin","langa","blanga","det","im","imbin","garra","olabat","nomo","wal","mob","deya","garram","gin","seim"}
        toks = str(s).lower().split()
        return sum(1 for t in set(toks) if t in markers)
    # keep rows with score >= 0 (no drop), but later heuristics already removed English/low-var

    _cap = min(len(_raw), int(CFG.SYNTH_MAX_RATIO * len(train_df)))
    _sel = _raw.sample(n=_cap, random_state=CFG.SEED) if _cap < len(_raw) else _raw

    _before = len(train_df)
    train_df = (
        pd.concat([train_df, _sel], ignore_index=True)
          .drop_duplicates(subset=[CFG.SRC_COL, CFG.TGT_COL])
          .reset_index(drop=True)
    )
    print(f"Integrated +{len(train_df)-_before} synthetic pairs (train={len(train_df)}, val={len(val_df)})")
else:
    print(f"Synthetic CSV not found at {_syn_path}")


Integrated +176 synthetic pairs (train=18331, val=4539)


### Step 9 — Custom tokenizer training
Trains a shared SentencePiece tokenizer on combined Kriol+English corpus when enabled. Skipped unless `CFG.USE_SPM=True` and `sentencepiece` is installed.


In [None]:
# Helpers for custom tokenizer training (SentencePiece)
if CFG.USE_SPM:
    try:
        import sentencepiece as spm
        from transformers import AlbertTokenizer
        
        # Prepare corpus
        all_texts = []
        all_texts.extend(train_df[CFG.SRC_COL].tolist() * 2)  # Weight Kriol 2x
        all_texts.extend(train_df[CFG.TGT_COL].tolist())
        
        # Write corpus
        corpus_path = os.path.join(CFG.SPM_DIR, "corpus.txt")
        os.makedirs(CFG.SPM_DIR, exist_ok=True)
        with open(corpus_path, "w", encoding="utf-8") as f:
            for text in all_texts:
                f.write(text.strip() + "\n")
        
        # Train SPM
        spm.SentencePieceTrainer.train(
            input=corpus_path,
            model_prefix=os.path.join(CFG.SPM_DIR, "spm"),
            vocab_size=12000,  # Larger for better coverage
            model_type="unigram",
            character_coverage=0.9995,
            byte_fallback=True,
            split_digits=False,
            user_defined_symbols=["<tpi_Latn>", "<eng_Latn>"]
        )
        
        # Create HF-compatible tokenizer
        tokenizer_custom = AlbertTokenizer(
            vocab_file=os.path.join(CFG.SPM_DIR, "spm.model"),
            do_lower_case=False,
            keep_accents=True
        )
        tokenizer_custom.save_pretrained(CFG.SPM_DIR)
        print(f"Custom tokenizer saved to {CFG.SPM_DIR}")
        
    except Exception as e:
        print(f"Tokenizer training failed: {e}")
        CFG.USE_SPM = False


### Step 9.1 — Custom tokenizer (SentencePiece 10k)

This cell trains a shared unigram SentencePiece tokenizer (10k vocab) over combined Kriol+English corpus. It saves to `CFG.SPM_DIR` and does not change the active tokenizer unless `CFG.USE_SPM=True`.


In [None]:
# Step 9.1 — Train SPM and save Hugging Face tokenizer
if CFG.USE_SPM:
    try:
        import sentencepiece as spm
        from transformers import PreTrainedTokenizerFast

        # 1) Train SentencePiece on combined corpus
        lines = prepare_tokenizer_corpus(pd.concat([train_df, val_df], ignore_index=True), CFG.SRC_COL, CFG.TGT_COL)
        prefix = os.path.join(CFG.SPM_DIR, "spm_kriol_en_v1")
        os.makedirs(CFG.SPM_DIR, exist_ok=True)
        train_sentencepiece_corpus(lines, model_prefix=prefix, vocab_size=10000)

        # 2) Wrap SPM into a HF fast tokenizer and add language tokens
        spm_model_path = prefix + ".model"
        tokenizer_sp = PreTrainedTokenizerFast(bos_token="<s>", eos_token="</s>", unk_token="<unk>", pad_token="<pad>")
        tokenizer_sp.backend_tokenizer = None
        tokenizer_sp.sp_model = spm.SentencePieceProcessor()
        tokenizer_sp.sp_model.Load(spm_model_path)
        tokenizer_sp.add_special_tokens({"additional_special_tokens": [CFG.SRC_LANG, CFG.TGT_LANG]})

        # 3) Save tokenizer in HF format for AutoTokenizer.from_pretrained
        tokenizer_sp.save_pretrained(CFG.SPM_DIR)
        print("SPM trained and HF tokenizer saved at:", CFG.SPM_DIR)
    except Exception as e:
        print("SPM training/packaging failed:", e)
else:
    print("CFG.USE_SPM is False; skipping SPM training and packaging.")


### 9.2 Train SentencePiece tokenizer

In [None]:
# Train SentencePiece tokenizer (optional)
if CFG.USE_SPM:
    try:
        import sentencepiece  # noqa: F401
        lines = prepare_tokenizer_corpus(pd.concat([train_df, val_df], ignore_index=True), CFG.SRC_COL, CFG.TGT_COL)
        train_sentencepiece_corpus(lines, model_prefix=os.path.join(CFG.SPM_DIR, "spm_kriol_en_v1"), vocab_size=10000)
        print("SentencePiece trained at:", CFG.SPM_DIR)
    except Exception as e:
        print("SentencePiece training skipped/failed:", e)
else:
    print("CFG.USE_SPM is False; skipping SentencePiece training.")



### Step 10 — Tokenizer & Model

In [11]:

# Tokenizer & Model (use custom tokenizer when CFG.USE_SPM=True)
src_lang = CFG.SRC_LANG
tgt_lang = CFG.TGT_LANG

if CFG.USE_SPM:
    print("Load tokenizer built in Step 9.1")
    tokenizer = AutoTokenizer.from_pretrained(CFG.SPM_DIR)
    # Ensure language tokens exist
    tokenizer.add_special_tokens({"additional_special_tokens": [src_lang, tgt_lang]})
    # Load base model and resize embeddings
    model = AutoModelForSeq2SeqLM.from_pretrained(CFG.MODEL_NAME)
    model.resize_token_embeddings(len(tokenizer))
    # Configure decoder start to target language token
    forced_bos = tokenizer.convert_tokens_to_ids(tgt_lang)
    if hasattr(model, "config"):
        model.config.use_cache = False
        if forced_bos is not None and forced_bos != tokenizer.unk_token_id:
            model.config.forced_bos_token_id = forced_bos
        if getattr(model.config, "decoder_start_token_id", None) is None:
            model.config.decoder_start_token_id = model.config.forced_bos_token_id
else:
    print("Default NLLB tokenizer with language tags")
    tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME, src_lang=src_lang, tgt_lang=tgt_lang)
    model = AutoModelForSeq2SeqLM.from_pretrained(CFG.MODEL_NAME)
    if hasattr(model, "config"):
        model.config.use_cache = False
    if hasattr(model, "config") and hasattr(tokenizer, "lang_code_to_id"):
        forced_bos = tokenizer.convert_tokens_to_ids(tgt_lang)
        if forced_bos is not None and forced_bos != tokenizer.unk_token_id:
            model.config.forced_bos_token_id = forced_bos
        if getattr(model.config, "decoder_start_token_id", None) is None:
            model.config.decoder_start_token_id = model.config.forced_bos_token_id

assert torch.cuda.is_available(), "CUDA is not available. Please check your GPU drivers and PyTorch install."
device = torch.device("cuda")
model.to(device)
print(device, torch.cuda.get_device_name(0))


[warn] CFG.SRC_LANG is unset. Using fallback src_lang=eng_Latn. Set CFG.SRC_LANG explicitly when decided.
cuda NVIDIA GeForce RTX 5060 Laptop GPU


### Step 11 — Dataset

In [12]:

class PairedTextDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: AutoTokenizer, max_len: int):
        self.src = df[CFG.SRC_COL].tolist()
        self.tgt = df[CFG.TGT_COL].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src)

    def __getitem__(self, idx: int):
        src_text = str(self.src[idx])
        tgt_text = str(self.tgt[idx])
        model_inputs = self.tokenizer(
            src_text,
            max_length=self.max_len,
            truncation=True,
            padding=False,
            return_tensors="pt",
        )
        labels = self.tokenizer(
            text_target=tgt_text,
            max_length=self.max_len,
            truncation=True,
            padding=False,
            return_tensors="pt",
        )
        item = {k: v.squeeze(0) for k, v in model_inputs.items()}
        item["labels"] = labels["input_ids"].squeeze(0)
        return item

train_ds = PairedTextDataset(train_df, tokenizer, CFG.MAX_LEN)
val_ds = PairedTextDataset(val_df, tokenizer, CFG.MAX_LEN)
len(train_ds), len(val_ds)


(18331, 4539)

### Step 12.1 — Trainer setup (DDP-ready) / Length-grouped sampler
Creates `LengthGroupedSampler` to cluster sequences of similar lengths, reducing padding and stabilizing training.


In [None]:
from transformers.trainer_pt_utils import LengthGroupedSampler
# Curriculum-like batching: length-grouped sampler to reduce padding and stabilize training
train_sampler = LengthGroupedSampler(lengths=[len(str(x).split()) for x in train_df[CFG.SRC_COL].tolist()],
                                     batch_size=CFG.BATCH_SIZE,
                                     mega_batch_mult=4)



### Step 12.2 — Custom Trainer class
Defines `CleanSeq2SeqTrainer` that sanitizes inputs (drops unintended embed keys) to avoid HF arg conflicts.


In [None]:
# Utility: Trainer that drops unintended *_embeds keys to avoid HF arg conflicts
from transformers import Seq2SeqTrainer, EarlyStoppingCallback

class CleanSeq2SeqTrainer(Seq2SeqTrainer):
    def _prepare_inputs(self, inputs):
        # Sanitize at input-prep stage too
        inputs.pop("decoder_inputs_embeds", None)
        inputs.pop("inputs_embeds", None)
        inputs.pop("decoder_input_ids", None)
        allowed = {"input_ids", "attention_mask", "labels", "decoder_attention_mask"}
        filtered = {k: v for k, v in inputs.items() if k in allowed}
        return super()._prepare_inputs(filtered)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Drop any embed keys and strictly whitelist safe args for seq2seq training
        inputs.pop("decoder_inputs_embeds", None)
        inputs.pop("inputs_embeds", None)
        inputs.pop("decoder_input_ids", None)
        allowed = {"input_ids", "attention_mask", "labels", "decoder_attention_mask"}
        filtered = {k: v for k, v in inputs.items() if k in allowed}
        # Call model explicitly with safe kwargs to avoid decoder ids/embeds conflicts
        outputs = model(
            decoder_input_ids=None,
            decoder_inputs_embeds=None,
            use_cache=False,
            **filtered,
        )
        loss = outputs["loss"] if isinstance(outputs, dict) else outputs.loss
        return (loss, outputs) if return_outputs else loss



### Step 12.3 — Regularization
Applies dropout and attention-dropout settings on the model config for regularization.


In [18]:
# Regularization: set model-level dropouts if supported by config
if hasattr(model, "config"):
    model.config.use_cache = False
    if hasattr(model.config, "dropout"):
        model.config.dropout = CFG.DROPOUT
    if hasattr(model.config, "activation_dropout"):
        model.config.activation_dropout = CFG.DROPOUT
    if hasattr(model.config, "decoder_attention_dropout"):
        model.config.decoder_attention_dropout = CFG.ATTENTION_DROPOUT
    if hasattr(model.config, "encoder_attention_dropout"):
        model.config.encoder_attention_dropout = CFG.ATTENTION_DROPOUT



AttributeError: type object 'CFG' has no attribute 'DROPOUT'

### Step 12.4 — Trainer args and instantiation
Builds `DataCollatorForSeq2Seq`, `TrainingArguments`, and instantiates `CleanSeq2SeqTrainer` with early stopping.


In [14]:
label_pad_token_id = -100
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=label_pad_token_id, padding=True)


args = TrainingArguments(
    output_dir=CFG.OUTPUT_DIR,
    num_train_epochs=max(CFG.NUM_EPOCHS, 6),
    per_device_train_batch_size=CFG.BATCH_SIZE,
    per_device_eval_batch_size=CFG.BATCH_SIZE,
    learning_rate=CFG.LR,
    warmup_steps=CFG.WARMUP_STEPS,
    gradient_accumulation_steps=CFG.GRAD_ACCUM_STEPS,
    label_smoothing_factor=CFG.LABEL_SMOOTHING,
    weight_decay=CFG.WEIGHT_DECAY,
    optim="adamw_torch",
    logging_steps=CFG.LOGGING_STEPS,
    save_steps=CFG.SAVE_STEPS,
    save_total_limit=CFG.SAVE_TOTAL_LIMIT,
    fp16=CFG.FP16,
    report_to=CFG.REPORT_TO,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    eval_accumulation_steps=1,
    remove_unused_columns=False,
    label_names=["labels"],
    group_by_length=True,

    # Advanced optimization
    optim="adamw_8bit" if torch.cuda.is_available() else "adamw_torch",
    lr_scheduler_type="cosine_with_restarts",
    warmup_ratio=0.1,
    
    # Better evaluation
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    
    # Gradient optimization
    gradient_checkpointing=True,
    max_grad_norm=1.0,
    
    # Mixed precision
    bf16=CFG.BF16,
    bf16_full_eval=CFG.BF16,
    
    # Save best checkpoints
    save_strategy="steps",
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
)

# Add BLEU/chrF++ as training metrics
from datasets import load_metric
bleu = load_metric("sacrebleu")
chrf = load_metric("chrf")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    bleu_score = bleu.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    chrf_score = chrf.compute(predictions=decoded_preds, references=[[l] for l in decoded_labels])
    
    return {
        "bleu": bleu_score["score"],
        "chrf": chrf_score["score"]
    }

trainer = CleanSeq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tokenizer,
    data_collator=collator,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)



  trainer = CleanSeq2SeqTrainer(


### Step 13 — Train

In [15]:

trainer.train()

# Evaluate once to log metrics
metrics = trainer.evaluate()
print("eval_loss:", metrics.get("eval_loss"))

# Launch TensorBoard from notebook
%load_ext tensorboard
%tensorboard --logdir "../model"


  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss
50,3.9095
100,3.2922
150,2.9397
200,2.8111
250,2.6528
300,2.5255
350,2.4325
400,2.3166
450,2.2369
500,2.175




eval_loss: 1.5678737163543701


### Step 14 — Save HF artifacts and a .pth checkpoint

In [16]:

final_dir = os.path.join(CFG.OUTPUT_DIR, "final")
os.makedirs(final_dir, exist_ok=True)
trainer.save_model(final_dir)
model_path = os.path.join(final_dir, "model_state.pth")
torch.save(model.state_dict(), model_path)
print(f"Saved .pth to: {model_path}")


Saved .pth to: ../model/final\model_state.pth


### Step 15 — Inference helper (final only)

In [17]:
final_dir = os.path.join(CFG.OUTPUT_DIR, "final")

final_tok = AutoTokenizer.from_pretrained(final_dir, src_lang=(CFG.SRC_LANG or "eng_Latn"), tgt_lang=(CFG.TGT_LANG or "eng_Latn"))
final_model = AutoModelForSeq2SeqLM.from_pretrained(final_dir).to(device)

@torch.no_grad()
def generate_with(model, tok, texts):
    inputs = tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=CFG.MAX_LEN)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    forced_bos = tok.convert_tokens_to_ids(CFG.TGT_LANG or "eng_Latn")
    out = model.generate(
        **inputs,
        max_new_tokens=CFG.GEN_MAX_NEW_TOKENS,
        num_beams=CFG.BEAM_SIZE,
        length_penalty=CFG.LENGTH_PENALTY,
        early_stopping=CFG.EARLY_STOPPING,
        forced_bos_token_id=forced_bos,
    )
    return tok.batch_decode(out, skip_special_tokens=True)

# Pick 3 random samples from training data and show Kriol / predicted / original
rows = val_df.sample(n=3)
kriols = rows[CFG.SRC_COL].astype(str).tolist()
eng_refs = rows[CFG.TGT_COL].astype(str).tolist()
eng_preds = generate_with(final_model, final_tok, kriols)

for i, (kriol, pred, ref) in enumerate(zip(kriols, eng_preds, eng_refs), start=1):
    print(f"Sample {i}")
    print("Kriol:", kriol)
    print("English predicted:", pred)
    print("English original:", ref)
    print("-")



Sample 1
Kriol: brom deya neksdei jisas en detlot thribala wekinmen bin godan brom det hil en loda pipul bin kaman langa jisas
English predicted: On the next day , when they came down from the mountain , a great crowd came to him . 
English original: On the next day , when they had come down from the mountain , a great multitude met him . 
-
Sample 2
Kriol: maitbi yumob nomo sabi ol yumob kristjan pipul yumob na det serramoni pleis blanga god en det holi spirit im jidanbat langa yumob
English predicted: Don't you know that you are the temple of God , and of the Holy Spirit who lives in you ? 
English original: Don't you know that you are God's temple and that God's Spirit lives in you ? 
-
Sample 3
Kriol: if ai bina dum tharran wal ai bina prei en askim im blanga jandim jigiwan gras langa main fam en nomo larram detlot sid daga blanga gro langa main famâ lagijat na job bin tok en imbin jidan kwaitbala na
English predicted: If I had done so , I would have prayed that he would have given

### Step 16 — COMET evaluation (final only)
Scores validation translations with Unbabel COMET if available; otherwise prints a note (Python 3.13 may lack wheels).


In [16]:
# COMET: evaluate final only
try:
    from comet import download_model, load_from_checkpoint

    BATCH = CFG.COMET_BATCH
    refs = val_df[CFG.TGT_COL].tolist()
    srcs = val_df[CFG.SRC_COL].tolist()

    # Show progress while generating hypotheses for COMET
    try:
        from tqdm import tqdm as _tqdm
        _iter = _tqdm(
            range(0, len(val_df), BATCH),
            total=(len(val_df) + BATCH - 1) // BATCH,
            desc="COMET prep: generating translations",
        )
    except Exception:
        _iter = range(0, len(val_df), BATCH)

    # Faster generation for COMET (greedy, mixed precision)
    @torch.no_grad()
    def fast_generate(texts):
        inputs = final_tok(texts, return_tensors="pt", padding=True, truncation=True, max_length=CFG.MAX_LEN)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        forced_bos = final_tok.convert_tokens_to_ids(CFG.TGT_LANG or "eng_Latn")
        use_cuda = torch.cuda.is_available()
        with torch.autocast(device_type="cuda", dtype=torch.float16, enabled=use_cuda):
            out = final_model.generate(
                **inputs,
                max_new_tokens=CFG.GEN_MAX_NEW_TOKENS,
                num_beams=1,
                do_sample=False,
                length_penalty=1.0,
                early_stopping=True,
                forced_bos_token_id=forced_bos,
            )
        return final_tok.batch_decode(out, skip_special_tokens=True)

    def batched_hyps_final():
        hyps = []
        for i in _iter:
            hyps.extend(fast_generate(srcs[i:i+BATCH]))
        return hyps

    hyps_final = batched_hyps_final()
    data_final = [{"src": s, "mt": h, "ref": r} for s, h, r in zip(srcs, hyps_final, refs)]

    model_path = download_model(CFG.COMET_MODEL)
    comet_model = load_from_checkpoint(model_path)

    def get_score(output):
        if isinstance(output, dict):
            return output.get("system_score") or output.get("score") or output.get("mean_score")
        try:
            _, s = output
            return s
        except Exception:
            return output

    def get_segments(output):
        if isinstance(output, dict):
            segs = output.get("segments") or output.get("scores") or output.get("segment_scores")
            if isinstance(segs, list):
                return segs
        return None

    # Try to enable COMET's internal progress bar if supported
    try:
        out_final = comet_model.predict(
            data_final,
            batch_size=BATCH,
            gpus=1 if torch.cuda.is_available() else 0,
            progress_bar=True,
        )
    except TypeError:
        out_final = comet_model.predict(
            data_final,
            batch_size=BATCH,
            gpus=1 if torch.cuda.is_available() else 0,
        )

    sf = get_score(out_final)
    try:
        print("COMET (final):", f"{float(sf):.4f}")
    except Exception:
        print("COMET (final, raw):", sf)

    # Save COMET outputs to disk
    final_dir = os.path.join(CFG.OUTPUT_DIR, "final")
    os.makedirs(final_dir, exist_ok=True)

    def safe_float(x):
        try:
            return float(x)
        except Exception:
            return None

    # Write system score
    sf_f = safe_float(sf)
    try:
        with open(os.path.join(final_dir, "system_score.txt"), "w", encoding="utf-8") as f:
            f.write(f"{sf_f if sf_f is not None else sf}\n")
    except Exception as _e:
        print("Could not save final system score:", _e)

    # Write per-segment CSV (src, mt, ref, score)
    seg_final = get_segments(out_final)

    try:
        if isinstance(seg_final, list) and len(seg_final) == len(hyps_final):
            df_final = pd.DataFrame({
                "src": srcs,
                "mt": hyps_final,
                "ref": refs,
                "comet_score": seg_final,
            })
            df_final.to_csv(os.path.join(final_dir, "comet_segments.csv"), index=False)
    except Exception as _e:
        print("Could not save final segments CSV:", _e)

except Exception as e:
    print("COMET evaluation unavailable:", e)



COMET prep: generating translations:   0%|          | 0/142 [00:00<?, ?it/s]The following generation flags are not valid and may be ignored: ['early_stopping']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
COMET prep: generating translations: 100%|██████████| 142/142 [19:38<00:00,  8.30s/it]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.5. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\TARIK\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.
c:\Users\TARIK\Desktop\Charles Darwin University\4 - Year 1 - Semester 2\IT CODE FAIR\AI Challenge\venv\Lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
💡 Tip: For seamless cloud uploads and versioning, try installing 

COMET (final): 0.5833
