In [1]:
# GPU check

!nvidia-smi

Mon Jun 16 15:55:36 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 560.35.03              Driver Version: 560.35.03      CUDA Version: 12.6     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   42C    P8              9W /   70W |       1MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [2]:
# Package download

!pip install sentencepiece transformers datasets -q

!python3 -m pip install --no-cache-dir llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q

!wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K.gguf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.12.0 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cublas-cu12==12.4.5.8; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cublas-cu12 12.8.4.1 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cudnn-cu12==9.1.0.70; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cudnn-cu12 9.3.0.75 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cufft-cu12==11.2.1.3; platform_system == "Linux" and platform_machine == "x86_64", but you have nvidia-cufft-cu12 11.3.3.83 which is incompatible.
torch 2.5.1+cu124 requires nvidia-cu

In [4]:
# Package import

import gc
import random
import asyncio
from typing import Dict, List, Tuple
from tqdm.auto import tqdm, trange
from collections import Counter
from pathlib import Path

import pandas as pd
import numpy as np
from datasets import load_dataset

import re
import jieba

import torch
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM, get_constant_schedule_with_warmup
from transformers.optimization import Adafactor
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model

from llama_cpp import Llama

In [5]:
# Load datasets into dataframes

lexicon = pd.read_csv('/kaggle/input/dataset/lexicon_no_en.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
lexicon.columns = ['pyu', 'zho']

lexicon_en = pd.read_csv('/kaggle/input/dataset/lexicon.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
lexicon_en.columns = ['pyu', 'eng', 'zho']

sentences = pd.read_csv('/kaggle/input/dataset/sentences_no_en.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
sentences.columns = ['pyu', 'zho']

sentences_en = pd.read_csv('/kaggle/input/dataset/sentences.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
sentences_en.columns = ['pyu', 'eng', 'zho']

to_pyu = pd.read_csv('/kaggle/input/dataset/zh_to_pyu_test.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
to_pyu.columns = ['zho']

to_zho = pd.read_csv('/kaggle/input/dataset/pyu_to_zh_test.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
to_zho.columns = ['pyu']

# Part1 Examine and Training Tokenizer

In [6]:
# Nllb tokenizer loading

model_name = "facebook/nllb-200-distilled-600M"

tokenizer = NllbTokenizer.from_pretrained(model_name)

tokenizer.src_lang = "zho_Hant" # zho_Hant for Chinese traditional
tokenizer.tgt_lang = "tgl_Latn" # tgl_Latn for Puyuma (Use existing language tag, originally for Tagalog)
                                # eng_Latn for English

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)6cea38b9e3d5efcdcb9c251d6b40538e1aab555a:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)b3c438311629547285129b0b81dadabd01bca665:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

In [7]:
# Testing the performances of original tokenizer

_WORD_RE = re.compile(r"(\w+|[^\w\s])")
def word_tokenize(text: str) -> List[str]: # word tokenization for puyuma (jieba package for zho)
    return _WORD_RE.findall(text)

def tokenize_dataframe(
    df: pd.DataFrame,
    tokenizer: tokenizer,
    src_col: str = "zho",
    tgt_col: str = "pyu",
) -> pd.DataFrame:
    
    df = df.copy()

    df[f"{src_col}_toks"] = df[src_col].apply(tokenizer.tokenize)
    df[f"{tgt_col}_toks"] = df[tgt_col].apply(tokenizer.tokenize)

    df[f"{src_col}_words"] = df[src_col].apply(lambda x: list(jieba.cut(x)))
    df[f"{tgt_col}_words"] = df[tgt_col].apply(word_tokenize)

    return df


def _mean_tokens_per_word(
    tok_col: pd.Series, word_col: pd.Series
) -> float:

    return tok_col.map(len).mean() / word_col.map(len).mean()


def _count_unk(
    texts: pd.Series, tokenizer: tokenizer
) -> int:

    return sum(tokenizer.unk_token_id in tokenizer(t).input_ids for t in texts)


def analyze_dataset(
    df: pd.DataFrame,
    name: str,
    tokenizer: tokenizer,
    src_col: str = "zho",
    tgt_col: str = "pyu",
) -> Dict[str, float]:
    
    report = {
        "dataset": name,
        "mean_token_per_word_src": _mean_tokens_per_word(
            df[f"{src_col}_toks"], df[f"{src_col}_words"]
        ),
        "mean_token_per_word_tgt": _mean_tokens_per_word(
            df[f"{tgt_col}_toks"], df[f"{tgt_col}_words"]
        ),
        "num_sentence_with_unk_src": _count_unk(df[src_col], tokenizer),
        "num_sentence_with_unk_tgt": _count_unk(df[tgt_col], tokenizer),
    }
    return report


def run_all_analyses(
    datasets: Dict[str, pd.DataFrame],
    tokenizer: tokenizer,
    src_col: str = "zho",
    tgt_col: str = "pyu",
) -> pd.DataFrame:
    
    reports: List[Dict[str, float]] = []

    for name, raw_df in datasets.items():
        df_tok = tokenize_dataframe(
            raw_df, tokenizer, src_col=src_col, tgt_col=tgt_col
        )
        reports.append(
            analyze_dataset(
                df_tok, name, tokenizer, src_col=src_col, tgt_col=tgt_col
            )
        )

    return pd.DataFrame(reports).set_index("dataset")


datasets = {
    "lexicon": lexicon,
    "lexicon_en": lexicon_en,
    "sentences": sentences,
    "sentences_en": sentences_en,
}

summary_df = run_all_analyses(datasets, tokenizer)
print(summary_df)

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.651 seconds.
Prefix dict has been built successfully.


              mean_token_per_word_src  mean_token_per_word_tgt  \
dataset                                                          
lexicon                      1.819939                 2.284020   
lexicon_en                   1.881565                 1.738603   
sentences                    1.328140                 1.580606   
sentences_en                 1.249425                 1.505004   

              num_sentence_with_unk_src  num_sentence_with_unk_tgt  
dataset                                                             
lexicon                              97                          0  
lexicon_en                          203                          0  
sentences                           223                         36  
sentences_en                        142                         14  


In [8]:
# Training new tokenizer for missing tokens

datasets = [lexicon, sentences, lexicon_en, sentences_en, to_pyu, to_zho]
languages = ['zho', 'pyu']
all_texts = []

for lang in languages:
    for ds in datasets:
        if lang in ds.columns:
            all_texts.extend(ds[lang].dropna().tolist())

all_texts_file = 'all_texts_plain.txt'
with open(all_texts_file, 'w', encoding='utf-8') as f:
    f.write('\n'.join(all_texts))

spm.SentencePieceTrainer.train(
    input=all_texts_file,
    model_prefix='spm_new',
    vocab_size=9909,
    character_coverage=1,
    num_threads=16,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192 * 4,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    required_chars=None,
)

sentencepiece_trainer.cc(78) LOG(INFO) Starts training with : 
trainer_spec {
  input: all_texts_plain.txt
  input_format: 
  model_prefix: spm_new
  model_type: UNIGRAM
  vocab_size: 9909
  self_test_sample_size: 0
  character_coverage: 1
  input_sentence_size: 0
  shuffle_input_sentence: 1
  seed_sentencepiece_size: 1000000
  shrinking_factor: 0.75
  max_sentence_length: 16768
  num_threads: 16
  num_sub_iterations: 2
  max_sentencepiece_length: 128
  split_by_unicode_script: 1
  split_by_number: 1
  split_by_whitespace: 1
  split_digits: 0
  pretokenization_delimiter: 
  treat_whitespace_as_suffix: 0
  allow_whitespace_only_pieces: 0
  required_chars: None
  byte_fallback: 0
  vocabulary_output_piece_score: 1
  train_extremely_large_corpus: 0
  seed_sentencepieces_file: 
  hard_vocab_limit: 1
  use_all_vocab: 0
  unk_id: 2
  bos_id: -1
  eos_id: 1
  pad_id: 0
  unk_piece: <unk>
  bos_piece: <s>
  eos_piece: </s>
  pad_piece: <pad>
  unk_surface:  ⁇ 
  enable_differential_privacy: 0


In [9]:
# Add trained tokens to tokenizer 

tokenizer_old = NllbTokenizer.from_pretrained(model_name)

sp_trained = spm.SentencePieceProcessor(model_file='spm_new.model')
added_spm = sp_pb2_model.ModelProto()
added_spm.ParseFromString(sp_trained.serialized_model_proto())
old_spm_nllb = sp_pb2_model.ModelProto()
old_spm_nllb.ParseFromString(tokenizer_old.sp_model.serialized_model_proto())

nllb_tokens_set = {p.piece for p in old_spm_nllb.pieces}
prev_min_score = old_spm_nllb.pieces[-1].score
for p in added_spm.pieces:
    piece = p.piece
    if p.type != 1:
        continue
    if piece not in nllb_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = p.score + prev_min_score
        old_spm_nllb.pieces.append(new_p)

NEW_SPM_NAME = 'spm_nllb_extended.model'
with open(NEW_SPM_NAME, 'wb') as f:
    f.write(old_spm_nllb.SerializeToString())

tokenizer = NllbTokenizer.from_pretrained(model_name, vocab_file=NEW_SPM_NAME)
print(len(tokenizer_old), len(tokenizer))
added_vocab = set(tokenizer.get_vocab()).difference(set(tokenizer_old.get_vocab()))
print(len(added_vocab))

256204 262441
6237


In [10]:
# Add trained tokens to model

model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

for t in tqdm(added_vocab):
    tt = tokenizer_old(t, add_special_tokens=False).input_ids
    if len(tt) == 0:
        tt = [tokenizer_old.unk_token_id]
    idx = tokenizer.convert_tokens_to_ids(t)
    model.model.shared.weight.data[idx] = model.model.shared.weight.data[tt].mean(0)

config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

2025-06-16 15:56:48.852013: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750089409.277891      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750089409.401151      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


(…)1ecdf1e485509035f6b51dfe84f1ada83eefcc42:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


  0%|          | 0/6237 [00:00<?, ?it/s]

# PART2 Model Fine-tuning

In [11]:
# Prepare training pair

LANGS = [('zho', 'zho_Hant'), ('pyu', 'tgl_Latn')]

dfs = [lexicon, sentences, lexicon_en, sentences_en]
df_train = pd.concat([df[['pyu', 'zho']] for df in dfs], ignore_index=True)

def get_batch_pairs(batch_size, data=df_train):
    (l1, long1), (l2, long2) = random.sample(LANGS, 2)
    xx, yy = [], []
    for _ in range(batch_size):
        item = data.iloc[random.randint(0, len(data)-1)]
        xx.append(item[l1])
        yy.append(item[l2])
    return xx, yy, long1, long2

In [12]:
# Trainer setting

model.cuda();
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=5e-3,
)
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=2000)

batch_size = 16
max_length = 256
training_steps = 60000
losses = []
MODEL_SAVE_PATH = '/kaggle/working/nllb_extended'

In [13]:
# Training

def cleanup():
    gc.collect()
    torch.cuda.empty_cache()

model.train()
x, y, loss = None, None, None
cleanup()

tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # -100 is a magic value ignored in the loss function
        # because we don't want the model to learn to predict padding ids
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 2000 == 0: # Steps to report loss
        print(i, np.mean(losses[-1000:]))

    if i % 2000 == 0 and i > 0: # Steps to save model
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

  0%|          | 0/60000 [00:00<?, ?it/s]

0 8.860976219177246


KeyboardInterrupt: 

# PART3 NLLB Inference

(This part can be run independent of PART1 PART2 if model is already fine-tuned)

In [6]:
# Inference the 1st result with fine-tuned NLLB model

model_dir = "/kaggle/input/nllb-extended/other/45000steps/1/results/nllb_extended" # Change this to your trained model directory
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True).cuda()
tokenizer = NllbTokenizer.from_pretrained(model_dir, local_files_only=True)

def clean_output(text):
    text = text.strip()
    text = re.sub(r'^[\"“”「」『』、,.;:?!]+', '', text)  # Remove starting punctuation
    text = re.sub(r'[\"“”「」『』、,.;:?!]+$', '', text)  # Remove ending punctuation
    return text if text != "" else "ERROR"

def translate(
    text, src_lang='zho_Hant', tgt_lang='tgl_Latn', 
    a=32, b=3, max_input_length=1024, num_beams=1, **kwargs
):
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    inputs = tokenizer(
        text, return_tensors='pt', padding=True, truncation=True, 
        max_length=max_input_length
    ).to(model.device)

    model.eval()
    result = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams, **kwargs
    )
    decoded = tokenizer.batch_decode(result, skip_special_tokens=True)
    decoded = [clean_output(t) for t in decoded]
    return decoded

translated_pyu = translate(to_pyu['zho'].tolist(), src_lang='zho_Hant', tgt_lang='pyu_Latn')
translated_zho = translate(to_zho['pyu'].tolist(), src_lang='pyu_Latn', tgt_lang='zho_Hant')

final = pd.DataFrame({
    "ID": range(1, len(translated_pyu) + len(translated_zho) + 1),
    "answer": translated_pyu + translated_zho
})
final['answer'] = final['answer'].fillna('ERROR')
final.to_csv("submission_NLLB.csv", index=False, encoding='utf-8')
final.head()

2025-06-16 15:52:24.237868: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750089144.441948      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750089144.502056      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Unnamed: 0,ID,answer
0,1,aaydan ziya na rahan na avukulr mu maw na sala...
1,2,"a maranger na variw, kavang zi sazu na 'azi ta..."
2,3,a Katratripulr na pinarahan在怕 kana sa'ami mu
3,4,"a pakamiisi za lramlram, zapilra、到pilra且are'"
4,5,"a meredek harem, kavulay nu kema ku kanmu"


# PART4 Llama Inference

(This part can be run independently)

In [5]:
# Load the Llama model

llama3 = Llama(
    "/kaggle/working/Llama-3.2-3B-Instruct-Q6_K.gguf",
    verbose=False,
    n_gpu_layers=-1,
    n_ctx=30000,
)

def generate_response(_model: Llama, _messages: str) -> str:
    _output = _model.create_chat_completion(
        _messages,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        max_tokens=512,
        temperature=0,
        repeat_penalty=2.0,
    )["choices"][0]["message"]["content"]
    return _output

llama_new_context_with_model: n_ctx_per_seq (30016) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [6]:
# Reference translation pair preparation

dfs = [lexicon, sentences, lexicon_en, sentences_en]
combined = pd.concat(
    [df[['pyu', 'zho']] for df in dfs],
    ignore_index=True
)
combined = combined.drop_duplicates().reset_index(drop=True)
pairs = combined['zho'] + ' = ' + combined['pyu']
reference_text = "\n".join(pairs)

In [7]:
# Reference puyuma grammar book

grammar_book = "第三章 詞彙與構詞本章主要討論知本卑南語的詞彙結構及主要構詞方式。3.1 主要構詞單位3.1.1 詞及詞素詞 (Word)：是句子結構中的最小單位 。 音節多寡：詞可以是單音節（如 mu「當...的時候」）、雙音節（如 ru.ma「房子」、pa.kan「餵」）、三音節（如 mu.di.ngan「臉」）或四音節以上（如 pu.a.li.ma「戴戒指」） 。組成成份：有些詞由單一成分組成，無法再分解，稱為單純詞（如 ru.ma「房子」） 。有些則由兩個（如 pa+kan「餵(使...吃)」）或更多成分組成，稱為複雜詞 。語意與句法功能：具有實質語意的稱為實詞（如名詞 pu.ran「檳榔」、動詞 pakan「餵」），屬於開放性詞類 。具有句法功能的稱為虛詞或功能詞（如格位標記 za「斜格」、代名詞 inku「我,主格」），屬於封閉性詞類 。詞素 (Morpheme)：是語言系統中具有意義或語法功能的最小單位 。 自由詞素：可以獨立存在的詞素，如 velrvelr「香蕉」、kayan「坐」、inku「我、主格」 。附著詞素：一定要附加在某個詞上，不能單獨使用的詞素 。可分為詞綴（如 ki-「取得」、pa-「使...」）和依附詞（如代名詞 =ku「我 主格」） 。3.1.2 詞根及詞幹詞根 (Root)：是最小且具有意義的詞素，不包括任何附加成份（如重疊或詞綴） 。例如 matra「眼睛」是詞根，因為無法再切割成有意義的 *ma- 或 *-tra 。詞根不分長短，例如 velrvelr「香蕉」也是一個詞根 。詞幹 (Stem)：可以單純由一個詞根構成，也可以包含詞根再加上詞綴 。例如，在 pa-nadam「教」中，詞幹是 nadam「學習」；而在 ki-pa-nadam「受教」中，詞幹則是 pa-nadam 。3.1.3 詞綴及依附詞詞綴與依附詞都不能單獨使用。在本書中，詞綴用連字符號 - 標示，依附詞則用等號 = 標示 。例如：在 tu veray-ay=ku「他給我」一句中，-ay 是詞綴，=ku 是依附詞 。臺灣南島語在加詞綴的過程中，通常會影響重音。例如 inavă「好」加上後綴 -an 後，重音會移至最後音節，變成 inava-án 。詞綴可分為兩類：屈折詞綴：附加在特定詞類上，用來表示語法功能（如語氣、時貌），但不改變該詞的詞類。例如：動詞 pukpuk「打」→ pukpuk-u!「打！」（命令式動詞） 。衍生詞綴：會產生不同的語意並（或）造成詞類的改變。例如：名詞 avay「年糕」→ 動詞 tu-avay「做年糕」；動詞 ekan「吃」→ 名詞 a-ekan-an「食物」 。依附詞與詞綴的不同在於，依附詞不選擇其「寄主詞」的詞類或語意，通常依附於句中第一個成分。例如，依附詞 =ku「我」可以依附於動詞、名詞或否定詞 。 mapungaw=ku.（我頭暈）vs. a sinsi=ku.（我是老師） 'azi=ku mapungaw.（我沒頭暈）vs. melri=ku a sinsi.（我不是老師） 3.1.4 同位詞 (Allomorph)同位詞是一個詞素在不同語音環境下的變體 。主事焦點中綴 <em> 有三個同位詞：m-、me- 及 <en> 。 m-：出現在母音開頭的動詞上，如 m-abak「裝」、m-alak「拿」 。me-：出現在 n 及 ng 開頭的動詞上，如 me-na'u「看」、me-ngara「等」 。<en>：出現在 v 及 p 開頭的動詞上，如 v<en>usus「騙」、p<en>a'ing「打噴嚏」 。<em>：出現在其餘語音環境，如 k<em>ayan「坐下」、tr<em>evel「理髮」 。表完成的中綴 <in> 有兩個同位詞：in- 及 ni- 。 in-：出現在母音開頭的動詞上，如 in-abak「被裝了的」、in-alak「被拿了的」 。ni-：出現在 n 及 ng 開頭的動詞上，如 ni-na'u「被看了的」、ni-ngara「被等了的」 。<in>：出現在其餘語音環境，如 v<in>usus「被騙了的」、k<in>ayan「被坐下」 。3.2 構詞方法知本卑南語的主要構詞方法包括加綴、重疊及複合 。3.2.1 加綴 (Affixation)前綴：改變詞類（名詞 → 動詞） ki-：「取得」，如 ki-paisu「要錢」 。mi-：「穿、戴、帶、有」，如 mi-kavang「穿衣」、mi-paisu「有錢」 。mutu-：「變成」，如 mutu-trau「變成人」 。tara-：「使用」，如 tara-puyuma「說卑南語」 。tu-：「製造、產生」，如 tua-avay「做糯米糕」 。tinu-：「模擬」，如 tinu-maizang「實習長老」 。不改變詞類 mare-（名詞→名詞）：「互相」，如 mare-wadi「兄弟姊妹」 。kara-（動詞→動詞）：「一起」，如 kara-kayan「坐在一起」 。pa-（動詞→動詞）：「使、讓」，如 pa-ekan「餵、使吃」 。mara-（動詞→動詞）：「比較」，如 mara-lriketri「較短」 。中綴 ： <in>：表示「完成」，如 d<in>away「做好的」 。<em>：表示「主事焦點」，如 k<em>ayan「坐」 。後綴 ： -an：將動詞轉為名詞，表「地方」，如 takesi-an「學校」、tra'i-tra'i-an「廁所」 。環綴 ： ka-...-an：表示「做...的時期」或「真正的...」，如 ka-salem-an「種植的季節」、ka-ruma-an「主屋」 。<in>...anan：表示「...的成員」，如 z<in>pekalr-anan「村民」 。3.2.2 重疊 (Reduplication)Ca-重疊：重疊詞根倒數第二音節的輔音再加上母音 /a/ 。 在數詞上表達「數人」：如 zuwa「二」 → za-zuwa「兩人」 。在動詞上表示「進行」或「非實現」：如 senay「唱」 → s<em>a-senay「正在唱」 。表示「互相」：如 karatr「咬」 → ma-ka-karatr「互咬」 。在名詞上表示「通稱」或「多數」：如 trau「人」 → tra-trau-an「人類」 。形成表示「處所」的名詞：如 dirus「洗澡」 → da-dirus-an「洗澡間」 。形成表示「工具」的名詞：如 ngisil「刷」 → nga-ngisil「牙刷」 。雙音節重疊：重疊字根倒數兩個音節 。 加在名詞表示「複數」或「總稱」：如 zenan → zena-zenan「山脈」、tralun「草」→ tralu-tralun「草叢」 。加在動態動詞表「動作重複」：如 me-na'u「看」 → me-nau-na'u「不斷地看」 。加在靜態動詞「加重程度」：如 dawilr「遠」 → dawidawilr「很遠」 。複雜重疊：結合兩種以上方式的重疊 。例如 wari「天」 → wa-wari-wari「每天」 。3.3 擬聲詞擬聲詞是用聲音摹仿事物、動作或自然界聲音的詞彙 。動物：ngiaw「貓」、up'up「牛蛙」、maymay「鴨」、wa wa「烏鴉」、tutur「鴿子」 。昆蟲：tengteng「蜻蜓」、kengkeng「蚊子」 。動詞 (模擬動作聲音)：tiktik「雕刻聲」、tuktuk「鐵鎚聲」、taktak「砍樹聲」、pukpuk「用棍子打孩子聲」 。動詞 (模擬自然界聲音)：zerung「打雷聲」、treli「閃電」 。3.4 借詞知本卑南語的借詞來源有日語、台語及中文 。日語借詞：kupu「杯子」、layta「打火機」、sulippa「拖鞋」、iga「電影」、kikay「機器」、sinsi「老師」、hikoki「飛機」、dingwa「電話」、tuki「時鐘/手錶」、wasabi「芥末」、tomato「番茄」、sibiru「西裝」 。台語借詞：dolayba「螺ising起子」、tangsuy「雨衣」、ising「醫生」、voksi「牧師」、tu「桌子」、tawyu「醬油」、pisay「白菜」、kiw「茄子」 。3.5 詞類類別詞類可分為成員數量有限的封閉性詞類（如代名詞、副詞）和成員沒有限制的開放性詞類（如動詞、名詞） 。3.5.1 開放性詞類動詞和名詞的區分：從構詞上區分不易，句法上的證據比較可靠 。 指示代名詞可以出現在名詞前（ini na alrak「這個孩子」），但不能出現在動詞前 。自由式的代名詞可以出現在名詞前，但不能在動詞前。例如在 tu ngarayaw tu sinsi「他等他的老師」中，tu sinsi 可以被 nantu sinsi 取代，但 tu ngarayaw 不能被取代 。名詞用 melri 來否定（melri a sinsi intaw.「他不是老師」），動詞用 'azi 來否定（'azi maekan za vulraw.「他不吃魚」） 。名詞：可分為三類，各由不同格位標記來標示 。 人稱專有名詞：包含人名及親屬稱謂，有單複數之分。如 zua i tainataw.「他的媽媽來了。」 。處所名詞：如 adawilr i Tayhok.「台北很遠。」 。一般名詞：有「限定」與「非限定」之分。如 ulra a trau i ruma.「房子裡有人。」 。動詞：動詞上的焦點詞綴決定了主語的語意角色，主要有四種焦點：主事者 (<em>)、受事者 (-aw)、處所 (-ay)、受惠者/工具 (-anay) 。知本卑南語沒有獨立的「形容詞」詞類，其功能由靜態動詞（如「喜歡」、「害怕」）承擔 。動詞分為動態動詞和靜態動詞。動態動詞通常帶 <em>（或其同位詞），而靜態動詞帶 ma- 。兩者在命令句、否定句、非實現貌及使役句中有不同的標記方式 。3.5.2 封閉性詞類格位標記：出現在名詞或名詞組之前，標示其語意角色或文法關係 。人稱代名詞：指「我」、「你」、「他」等。第一人稱複數常區分「包含式」（咱們）和「排除式」（我們） 。指定代名詞：即指示代名詞，可單獨使用或修飾名詞，形式可能因與說話者距離、是否可見、單複數等因素而異 。疑問詞：用於構成特殊問句，如「誰」、「什麼」、「何處」等 。數字：分為基數詞與序數詞等 。詞組標記和子句標記：詞組標記：如連繫詞 na，常出現在名詞之間。例：zua na tatelru na trau.「那三個人來了。」 。並列連詞如 zi「和」。例：vi'as na kadaw zi, pitalupung...「太陽熱，而且要戴帽子...」 。子句標記：如主題標記 mu 和從屬連詞 nu「當」。例：na vavuy mu, tu kuwangaw ni ama za kuwang.「（那隻）山豬，爸爸用槍射了。」 。感嘆詞：表示驚訝、痛苦、悲傷等感情，如 iwa「唉呀」 。"

In [8]:
# Define LLM agent

def clean(text):
    return re.sub(r"[^\w\s]", "", text).strip()
    
def is_valid_word(word: str, min_alpha: int = 4) -> bool:
    num_alpha = sum(c.isalpha() for c in word)
    return num_alpha >= min_alpha

def fuzzy_search_entries(reference_text: str, keyword: str, top_k: int = 50) -> str:
    keyword = clean(keyword)

    ignore_words = {"的", "一", "在", " "}
    raw_words = jieba.cut(keyword)

    words = []
    for w in raw_words:
        if w in ignore_words:
            continue
        if re.fullmatch(r'[a-zA-Z\']+', w):
            if not is_valid_word(w):
                continue
        words.append(w)

    if not words:
        return "無可參考條目"

    matches = []
    for line in reference_text.split("\n"):
        cleaned_line = clean(line)
        if any(w in cleaned_line for w in words):
            matches.append(line)

    return "\n".join(matches[:top_k]) if matches else "找不到相關翻譯資料。"

class LLMAgent:
    def __init__(self, role_description, task_description, references=None):
        self.role_description = role_description
        self.task_description = task_description
        self.references = references

    def inference(self, message: str, ref: str = None, nllb: str = None) -> str:
        used_references = ref if ref is not None else self.references
        messages = [
            {"role": "system", "content": self.role_description},
            {"role": "user", "content": f"以下是卑南語的文法說明，可作為翻譯參考：「{grammar_book}」"},
            {"role": "user", "content": f"以下是相關詞彙的翻譯，可作為翻譯參考：「{used_references}」"},
            {"role": "user", "content": f"以下是NLLB模型輸出的翻譯結果，可作為翻譯參考：「{nllb}」"}, # if no output from previous part, comment this
            {"role": "user", "content": f"{self.task_description}：「{message}」"},
        ]
        return generate_response(llama3, messages)

In [9]:
# LLM agent prompting

transtopyu_agent = LLMAgent(
    role_description="你是聰明的語言模型，擅長翻譯繁體中文與卑南語，幫我參考以下資料後，利用特性嘗試翻譯文字至卑南語，並只保留卑南語的翻譯結果，不須說明。",
    task_description="翻譯以下文字為卑南語，並只保留翻譯內容",
)

transtozho_agent = LLMAgent(
    role_description="你是聰明的語言模型，擅長翻譯繁體中文與卑南語，幫我參考以下資料後，利用特性嘗試翻譯文字至繁體中文，並只保留繁體中文的翻譯結果，不須說明。",
    task_description="翻譯以下文字為繁體中文，並只保留翻譯內容",
)

In [10]:
# Inference the final result with llama

nllb_df = pd.read_csv("/kaggle/input/nllboutput/submission (3).csv") # Change this to your previous output file
nllb_outputs = nllb_df['answer'].tolist()

def translate_with_agent(texts, src_lang='zho'):
    results = []
    for i, text in enumerate(tqdm(texts, desc=f"Translating from {src_lang}")):
        ref = fuzzy_search_entries(reference_text, text)
        try:
            nllb_result = nllb_outputs[i] if i < len(nllb_outputs) else ""
            if src_lang == 'zho':
                result = transtopyu_agent.inference(text, ref=ref, nllb=nllb_result)
            else:
                result = transtozho_agent.inference(text, ref=ref, nllb=nllb_result)
        except Exception as e:
            result = "ERROR"
        cleaned_result = result.replace("\n", "").replace("\r", "").strip()
        results.append(cleaned_result)
    return results

translated_pyu = translate_with_agent(to_pyu['zho'].tolist(), src_lang='zho')
translated_zho = translate_with_agent(to_zho['pyu'].tolist(), src_lang='pyu')

final = pd.DataFrame({
    "ID": range(1, len(translated_pyu) + len(translated_zho) + 1),
    "answer": translated_pyu + translated_zho
})
final['answer'] = final['answer'].fillna('ERROR')
final.to_csv("submission_FINAL.csv", index=False, encoding='utf-8')

Translating from zho:   0%|          | 0/60 [00:00<?, ?it/s]

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 0.721 seconds.
Prefix dict has been built successfully.


Translating from pyu:   0%|          | 0/145 [00:00<?, ?it/s]