In [None]:
# Show the GPU(s) visible to this kernel before loading any model.
!nvidia-smi

In [None]:
# Package download
# Installs the third-party packages imported by the cells below (quiet mode).
# NOTE(review): versions are not pinned, so a fresh run may resolve different releases.

!pip install sentencepiece -q
!pip install transformers -q
!pip install datasets -q
!pip install peft -q

## Part1

In [None]:
# Nllb loading

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint selection: the distilled 600M checkpoint is kept as a commented-out alternative.
# model_name = "facebook/nllb-200-distilled-600M"
model_name = "facebook/nllb-200-3.3B" # Larger model

# Load the tokenizer and the seq2seq model for the selected checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Default direction: Traditional Chinese -> Puyuma.
tokenizer.src_lang = "zho_Hant"
tokenizer.tgt_lang = "tgl_Latn"
# Language tags used throughout this notebook:
# zho_Hant for Chinese traditional
# eng_Latn for English
# tgl_Latn for Puyuma (Use existing language tag)

In [None]:
# Load data into dataframes

import pandas as pd

def _read_headerless_csv(path, column_names):
    """Read a comma-separated, double-quoted UTF-8 CSV without a header row and name its columns."""
    frame = pd.read_csv(path, sep=",", quotechar='"', header=None, encoding="utf-8")
    frame.columns = column_names
    return frame


# Word lists: one without and one with an English column.
lexicon = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/lexicon_no_en.csv', ['pyu', 'zho'])
lexicon_en = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/lexicon.csv', ['pyu', 'eng', 'zho'])

# Sentence pairs: one without and one with an English column.
sentences = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/sentences_no_en.csv', ['pyu', 'zho'])
sentences_en = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/sentences.csv', ['pyu', 'eng', 'zho'])

#lexicon.sample(5)
#lexicon_en.sample(10)
#sentences.sample(5)
#sentences_en.sample(10)

In [None]:
# Testing the performances of original tokenization

import re

def word_tokenize(text):
    """Split text into word tokens and single punctuation tokens.

    A token is either a maximal run of word characters (``\\w+``) or one
    character that is neither a word character nor whitespace.

    Args:
        text: The string to split.

    Returns:
        A list of token strings in order of appearance; whitespace is dropped.
    """
    # Raw string: '\w' in a normal string literal is an invalid escape sequence
    # and triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    return re.findall(r'(\w+|[^\w\s])', text)

def df_tokenize(df):
    """Add subword-token and word-token columns for the `pyu` and `zho` columns.

    The frame is modified in place and also returned. Subword tokens come from
    the notebook-level `tokenizer`; word tokens come from `word_tokenize`.
    """
    derived_columns = (
        ('pyu_toks', 'pyu', tokenizer.tokenize),
        ('zho_toks', 'zho', tokenizer.tokenize),
        ('pyu_words', 'pyu', word_tokenize),
        ('zho_words', 'zho', word_tokenize),
    )
    for target, source, splitter in derived_columns:
        df[target] = df[source].apply(splitter)

    return df

def cal_tokperword(df):

    stats = df[['pyu_toks', 'zho_toks', 'pyu_words', 'zho_words']].map(len).describe()
    print(stats.pyu_toks['mean'] / stats.pyu_words['mean'])
    print(stats.zho_toks['mean'] / stats.zho_words['mean'])

    return stats

def check_unk(df, column):
    """Print how many texts in `df[column]` encode to at least one unknown-token id.

    Uses the notebook-level `tokenizer`; nothing is returned.
    """
    unk_hits = 0
    for text in df[column]:
        if tokenizer.unk_token_id in tokenizer(text).input_ids:
            unk_hits += 1
    print(unk_hits)

# Add the token/word columns to every dataset (df_tokenize mutates and returns the frame).
lexicon = df_tokenize(lexicon)
lexicon_en = df_tokenize(lexicon_en)
sentences = df_tokenize(sentences)
sentences_en = df_tokenize(sentences_en)

# Tokens-per-word ratios. Each dataset keeps its own statistics table; the
# original reused `stats_lexicon` / `stats_sentences` for the *_en datasets,
# silently discarding the first results.
print("toks per word of lexicon:")
stats_lexicon = cal_tokperword(lexicon)
print("toks per word of lexicon_en:")
stats_lexicon_en = cal_tokperword(lexicon_en)
print("toks per word of sentences:")
stats_sentences = cal_tokperword(sentences)
print("toks per word of sentences_en:")
stats_sentences_en = cal_tokperword(sentences_en)

# Count texts containing unknown tokens, per dataset and per language column.
# Generating the label from the dataset name keeps label and data in sync
# (the last label used to say "sentences" while checking sentences_en).
for dataset_name, dataset in (
    ("lexicon", lexicon),
    ("lexicon_en", lexicon_en),
    ("sentences", sentences),
    ("sentences_en", sentences_en),
):
    for column in ("zho", "pyu"):
        print(f"total unk in {dataset_name} {column}:")
        check_unk(dataset, column)

#show datas
#lexicon.sample(10)
#sentences.sample(10)
#stats_lexicon
#stats_sentences

In [None]:
# Training tokenizer for missing tokens

import pandas as pd
from tqdm.auto import tqdm
import re
from collections import Counter
import sentencepiece as spm
from datasets import load_dataset

# Collect every non-null Chinese text, then every non-null Puyuma text, from all four datasets.
corpora = [lexicon, sentences, lexicon_en, sentences_en]
all_texts = [
    text
    for column in ('zho', 'pyu')
    for corpus in corpora
    for text in corpus[column].dropna().tolist()
]

# Write the corpus as one text per line; this file is the SentencePiece training input.
all_texts_file = 'all_texts_plain.txt'
with open(all_texts_file, 'w', encoding='utf-8') as f:
    for text in all_texts:
        print(text, file=f)

# Find characters the current tokenizer maps to <unk>. Each distinct character
# is probed once instead of once per occurrence; the resulting set is the same.
unique_chars = {char for text in all_texts for char in text}
required_chars = {
    char
    for char in tqdm(sorted(unique_chars))
    if tokenizer.tokenize(char) == ['▁', '<unk>']
}

required_chars_str = "".join(sorted(required_chars))
print(f"需要強制包含的單字元: {required_chars_str}")

# Train a new SentencePiece model on the corpus, forcing the characters found above into its vocabulary.
spm.SentencePieceTrainer.train(
    input=all_texts_file,
    model_prefix='spm_new',
    vocab_size=5800,
    character_coverage=1,
    num_threads=16,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192 * 4,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    required_chars=required_chars_str,
)

In [None]:
# Add trained tokens to tokenizer and model

from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from transformers import NllbTokenizer

# Reload the stock NLLB tokenizer so its SentencePiece model can be inspected and extended.
model_name = 'facebook/nllb-200-3.3B'
tokenizer_nllb = NllbTokenizer.from_pretrained(model_name)

# Parse both SentencePiece models (newly trained and stock NLLB) into protobuf objects.
sp_trained = spm.SentencePieceProcessor(model_file='spm_new.model')
added_spm = sp_pb2_model.ModelProto()
added_spm.ParseFromString(sp_trained.serialized_model_proto())
old_spm_nllb = sp_pb2_model.ModelProto()
old_spm_nllb.ParseFromString(tokenizer_nllb.sp_model.serialized_model_proto())

# Append to the NLLB model every piece of the new model that it does not already contain.
nllb_tokens_set = {p.piece for p in old_spm_nllb.pieces}
# Score of the last NLLB piece; added to new scores, presumably so new pieces rank below existing ones.
prev_min_score = old_spm_nllb.pieces[-1].score
for p in added_spm.pieces:
    piece = p.piece
    # Only pieces of type 1 are copied (presumably the "normal" piece type — TODO confirm against the proto).
    if p.type != 1:
        continue
    if piece not in nllb_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = p.score + prev_min_score
        old_spm_nllb.pieces.append(new_p)

# Serialize the merged model to disk.
NEW_SPM_NAME = 'spm_nllb_extended_268k.model'
with open(NEW_SPM_NAME, 'wb') as f:
    f.write(old_spm_nllb.SerializeToString())

# NOTE(review): the tokenizer below is built from 'spm_new.model' (the small, newly trained
# model), not from the merged file NEW_SPM_NAME written just above. The inference cell later in
# this notebook also points at an 'spm_new.model', so this may be deliberate — confirm. No
# embedding resize is visible in this notebook either.
tokenizer = NllbTokenizer.from_pretrained(model_name, vocab_file='spm_new.model')
print(len(tokenizer_nllb), len(tokenizer))
# Tokens present in the new tokenizer's vocabulary but absent from the stock one.
added_vocab = set(tokenizer.get_vocab()).difference(set(tokenizer_nllb.get_vocab()))
#print(added_vocab)(0)

## PART2

In [None]:
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
# Move the model to the GPU, then optimise only the parameters that require gradients.
model.cuda();
trainable_parameters = [param for param in model.parameters() if param.requires_grad]

# Adafactor with a fixed learning rate (no relative step, no parameter scaling).
adafactor_options = dict(
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=5e-3,
)
optimizer = Adafactor(trainable_parameters, **adafactor_options)

# Constant learning rate after a 2000-step warmup.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=2000)

In [None]:
import random
# (dataframe column, NLLB language code) pairs; Puyuma reuses the existing tgl_Latn tag.
LANGS = [('zho', 'zho_Hant'), ('pyu', 'tgl_Latn')]

# Training data: the pyu/zho columns of all four datasets stacked into one frame.
dfs = [lexicon, sentences, lexicon_en, sentences_en]
df_train = pd.concat([df[['pyu', 'zho']] for df in dfs], ignore_index=True)

def get_batch_pairs(batch_size, data=df_train):
    """Sample a random batch of parallel texts in a random translation direction.

    Args:
        batch_size: Number of rows to draw (with replacement).
        data: Frame holding the columns named in LANGS.

    Returns:
        (source_texts, target_texts, source_lang_code, target_lang_code)
    """
    # Shuffle the two languages so either one may end up as the source.
    (src_col, src_code), (tgt_col, tgt_code) = random.sample(LANGS, 2)
    rows = [data.iloc[random.randint(0, len(data) - 1)] for _ in range(batch_size)]
    xx = [row[src_col] for row in rows]
    yy = [row[tgt_col] for row in rows]
    return xx, yy, src_code, tgt_code

# Smoke test: draw one random pair and show the texts with their language codes.
print(get_batch_pairs(1))

In [None]:
batch_size = 12  # 32 already doesn't fit well to 15GB of GPU memory
max_length = 256  # token sequences will be truncated
training_steps = 50000  # Usually, I set a large number of steps,
# and then just interrupt the training manually
losses = []  # with this list, I do very simple tracking of average loss
MODEL_SAVE_PATH = '/kaggle/working/nllb_extended'  # checkpoint directory under the Kaggle working dir

In [None]:
import gc
import torch
import numpy as np
from tqdm.auto import tqdm, trange

def cleanup():
    """Run Python garbage collection, then release PyTorch's cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()

# Fine-tuning loop: random bidirectional zho<->pyu batches, one optimizer step per batch.
model.train()
x, y, loss = None, None, None
cleanup()

# Start from len(losses) so re-running this cell continues the step count.
tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # The target texts are encoded with src_lang switched to the target language code.
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # -100 is a magic value ignored in the loss function
        # because we don't want the model to learn to predict padding ids
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:  # usually, it is out-of-memory
        # Drop gradients and batch tensors, free memory, and skip this batch.
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 2000 == 0:
        # every 2000 steps, report the average loss over the last 1000 recorded steps
        print(i, np.mean(losses[-1000:]))

    if i % 2000 == 0 and i > 0:
        # Checkpoint model and tokenizer every 2000 steps.
        # NOTE(review): nothing is saved after the loop ends, so steps after the
        # last multiple of 2000 are not persisted.
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

## PART3

In [None]:
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
# Load the fine-tuned checkpoint from a Kaggle input dataset and move it to the GPU.
model_dir = '/kaggle/input/nllb-extended/other/45000steps/1/results/nllb_extended'
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True).cuda()
# The tokenizer is rebuilt on top of the SentencePiece model stored next to the checkpoint.
tokenizer = NllbTokenizer.from_pretrained(model_dir, vocab_file='/kaggle/input/nllb-extended/other/45000steps/1/results/spm_new.model')

In [None]:
def translate(
    text, src_lang='zho_Hant', tgt_lang='tgl_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Translate one text or a list of texts and return a list of translations.

    The generation budget is `a + b * <padded input length>` new tokens; extra
    keyword arguments are passed to `model.generate`.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    model.eval()  # inference mode for dropout etc.
    encoded = encoded.to(model.device)

    # Force the decoder to start with the target-language tag.
    target_tag_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    token_budget = int(a + b * encoded.input_ids.shape[1])
    generated = model.generate(
        **encoded,
        forced_bos_token_id=target_tag_id,
        max_new_tokens=token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)

# Quick check: translate one Chinese sentence into Puyuma (tagged tgl_Latn).
t = '我也沒帶錢耶!'
print(translate(t, 'zho_Hant', 'tgl_Latn'))

## PART Additional

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import re

# Reload the fine-tuned checkpoint and the tokenizer saved in the same directory.
# NOTE(review): this uses AutoTokenizer on model_dir, whereas the earlier inference cell
# builds an NllbTokenizer with an explicit vocab_file — confirm both yield the same vocabulary.
model_dir = "/kaggle/input/nllb-extended/other/45000steps/1/results/nllb_extended"
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True).to("cuda")

def clean_output(text):
    """Trim whitespace, then strip leading and trailing punctuation; return "ERROR" if nothing is left."""
    leading_punct = re.compile(r'^[\"“”「」『』、,.;:?!]+')   # punctuation run at the start
    trailing_punct = re.compile(r'[\"“”「」『』、,.;:?!]+$')  # punctuation run at the end
    stripped = leading_punct.sub('', text.strip())
    stripped = trailing_punct.sub('', stripped)
    return stripped or "ERROR"

def translate(
    text, src_lang='zho_Hant', tgt_lang='tgl_Latn',
    a=32, b=3, max_input_length=1024, num_beams=4, **kwargs
):
    """Translate a text or list of texts and return the cleaned translations.

    Same generation settings as the earlier `translate`, with every decoded
    string additionally passed through `clean_output`.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang
    batch = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    batch = batch.to(model.device)

    model.eval()
    generation_args = dict(
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * batch.input_ids.shape[1]),
        num_beams=num_beams,
    )
    output_ids = model.generate(**batch, **generation_args, **kwargs)
    raw_texts = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return list(map(clean_output, raw_texts))

# Test inputs: first column of each headerless CSV.
to_pyu = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/zh_to_pyu_test.csv', header=None)[0]
to_zho = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/pyu_to_zh_test.csv', header=None)[0]

# Puyuma was trained under the existing 'tgl_Latn' tag (see LANGS and the
# defaults of translate). 'pyu_Latn' was never used as a language code in this
# notebook, so passing it would send the wrong language tag to the model.
PYU_LANG_CODE = 'tgl_Latn'

translated_pyu = translate(to_pyu.tolist(), src_lang='zho_Hant', tgt_lang=PYU_LANG_CODE)
translated_zho = translate(to_zho.tolist(), src_lang=PYU_LANG_CODE, tgt_lang='zho_Hant')

# Submission layout: zh->pyu answers first, then pyu->zh answers, with 1-based IDs.
final = pd.DataFrame({
    "ID": range(1, len(translated_pyu) + len(translated_zho) + 1),
    "answer": translated_pyu + translated_zho
})
final['answer'] = final['answer'].fillna('ERROR')
final.to_csv("submission.csv", index=False, encoding='utf-8')
final.head()

## PART4

In [4]:
# Install llama-cpp-python 0.3.4 from the cu122 wheel index, plus the web/search helper packages.
!python3 -m pip install --no-cache-dir llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
!python3 -m pip install googlesearch-python bs4 charset-normalizer requests-html lxml_html_clean -q

# Download the quantized Llama 3.2 3B Instruct weights (GGUF) into the current directory.
!wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K.gguf

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m445.2/445.2 MB[0m [31m120.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.9/82.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.2/144.2 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.4/107.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the 

In [5]:
from llama_cpp import Llama

# Load the model onto GPU
# The GGUF file is expected in /kaggle/working (where the wget cell saves it when run from there).
llama3 = Llama(
    "/kaggle/working/Llama-3.2-3B-Instruct-Q6_K.gguf",
    verbose=False,
    n_gpu_layers=-1,  # presumably "offload all layers" — confirm against the llama-cpp-python docs
    n_ctx=30000,  # context window; must hold the grammar text plus references in every prompt
)

def generate_response(_model: Llama, _messages: list) -> str:
    """Run one chat completion and return the text of the first choice.

    Args:
        _model: The loaded Llama instance.
        _messages: Chat messages as passed to `create_chat_completion`; callers
            in this notebook pass a list of {"role", "content"} dicts.

    Returns:
        The `content` string of the first returned choice.
    """
    _output = _model.create_chat_completion(
        _messages,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        max_tokens=512,
        temperature=0.1,
        repeat_penalty=2.0,
    )["choices"][0]["message"]["content"]
    return _output

llama_new_context_with_model: n_ctx_per_seq (30016) < n_ctx_train (131072) -- the full capacity of the model will not be utilized


In [44]:
import pandas as pd
import re
import asyncio

# === 資料載入 ===
def _load_pairs_csv(path, column_names):
    """Read a headerless, comma-separated, double-quoted UTF-8 CSV and name its columns."""
    frame = pd.read_csv(path, sep=",", quotechar='"', header=None, encoding="utf-8")
    frame.columns = column_names
    return frame


# === Load data ===
lexicon = _load_pairs_csv(
    '/kaggle/input/ml2025-bonus/dataset/lexicon_no_en.csv', ['pyu', 'zho'])
lexicon_en = _load_pairs_csv(
    '/kaggle/input/ml2025-bonus/dataset/lexicon.csv', ['pyu', 'eng', 'zho'])
sentences = _load_pairs_csv(
    '/kaggle/input/ml2025-bonus/dataset/sentences_no_en.csv', ['pyu', 'zho'])
sentences_en = _load_pairs_csv(
    '/kaggle/input/ml2025-bonus/dataset/sentences.csv', ['pyu', 'eng', 'zho'])

# === Combine and clean ===
# Keep only the Puyuma/Chinese columns, stack all sources, and drop exact duplicate pairs.
df1, df2, df3, df4 = (
    frame[['pyu', 'zho']] for frame in (lexicon, lexicon_en, sentences, sentences_en)
)
combined = pd.concat([df1, df2, df3, df4], ignore_index=True).drop_duplicates()


def _format_pair(row):
    """Render one row as '<Chinese> = <Puyuma>'."""
    return f"{row['zho']} = {row['pyu']}"


# One "zho = pyu" line per pair; this text is the lookup source for prompt references.
pairs = combined.apply(_format_pair, axis=1)
reference_text = "\n".join(pairs.tolist())

In [None]:
grammar_book = "第三章 詞彙與構詞本章主要討論知本卑南語的詞彙結構及主要構詞方式。3.1 主要構詞單位3.1.1 詞及詞素詞 (Word)：是句子結構中的最小單位 。 音節多寡：詞可以是單音節（如 mu「當...的時候」）、雙音節（如 ru.ma「房子」、pa.kan「餵」）、三音節（如 mu.di.ngan「臉」）或四音節以上（如 pu.a.li.ma「戴戒指」） 。組成成份：有些詞由單一成分組成，無法再分解，稱為單純詞（如 ru.ma「房子」） 。有些則由兩個（如 pa+kan「餵(使...吃)」）或更多成分組成，稱為複雜詞 。語意與句法功能：具有實質語意的稱為實詞（如名詞 pu.ran「檳榔」、動詞 pakan「餵」），屬於開放性詞類 。具有句法功能的稱為虛詞或功能詞（如格位標記 za「斜格」、代名詞 inku「我,主格」），屬於封閉性詞類 。詞素 (Morpheme)：是語言系統中具有意義或語法功能的最小單位 。 自由詞素：可以獨立存在的詞素，如 velrvelr「香蕉」、kayan「坐」、inku「我、主格」 。附著詞素：一定要附加在某個詞上，不能單獨使用的詞素 。可分為詞綴（如 ki-「取得」、pa-「使...」）和依附詞（如代名詞 =ku「我 主格」） 。3.1.2 詞根及詞幹詞根 (Root)：是最小且具有意義的詞素，不包括任何附加成份（如重疊或詞綴） 。例如 matra「眼睛」是詞根，因為無法再切割成有意義的 *ma- 或 *-tra 。詞根不分長短，例如 velrvelr「香蕉」也是一個詞根 。詞幹 (Stem)：可以單純由一個詞根構成，也可以包含詞根再加上詞綴 。例如，在 pa-nadam「教」中，詞幹是 nadam「學習」；而在 ki-pa-nadam「受教」中，詞幹則是 pa-nadam 。3.1.3 詞綴及依附詞詞綴與依附詞都不能單獨使用。在本書中，詞綴用連字符號 - 標示，依附詞則用等號 = 標示 。例如：在 tu veray-ay=ku「他給我」一句中，-ay 是詞綴，=ku 是依附詞 。臺灣南島語在加詞綴的過程中，通常會影響重音。例如 inavă「好」加上後綴 -an 後，重音會移至最後音節，變成 inava-án 。詞綴可分為兩類：屈折詞綴：附加在特定詞類上，用來表示語法功能（如語氣、時貌），但不改變該詞的詞類。例如：動詞 pukpuk「打」→ pukpuk-u!「打！」（命令式動詞） 。衍生詞綴：會產生不同的語意並（或）造成詞類的改變。例如：名詞 avay「年糕」→ 動詞 tu-avay「做年糕」；動詞 ekan「吃」→ 名詞 a-ekan-an「食物」 。依附詞與詞綴的不同在於，依附詞不選擇其「寄主詞」的詞類或語意，通常依附於句中第一個成分。例如，依附詞 =ku「我」可以依附於動詞、名詞或否定詞 。 mapungaw=ku.（我頭暈）vs. a sinsi=ku.（我是老師） 'azi=ku mapungaw.（我沒頭暈）vs. 
melri=ku a sinsi.（我不是老師） 3.1.4 同位詞 (Allomorph)同位詞是一個詞素在不同語音環境下的變體 。主事焦點中綴 <em> 有三個同位詞：m-、me- 及 <en> 。 m-：出現在母音開頭的動詞上，如 m-abak「裝」、m-alak「拿」 。me-：出現在 n 及 ng 開頭的動詞上，如 me-na'u「看」、me-ngara「等」 。<en>：出現在 v 及 p 開頭的動詞上，如 v<en>usus「騙」、p<en>a'ing「打噴嚏」 。<em>：出現在其餘語音環境，如 k<em>ayan「坐下」、tr<em>evel「理髮」 。表完成的中綴 <in> 有兩個同位詞：in- 及 ni- 。 in-：出現在母音開頭的動詞上，如 in-abak「被裝了的」、in-alak「被拿了的」 。ni-：出現在 n 及 ng 開頭的動詞上，如 ni-na'u「被看了的」、ni-ngara「被等了的」 。<in>：出現在其餘語音環境，如 v<in>usus「被騙了的」、k<in>ayan「被坐下」 。3.2 構詞方法知本卑南語的主要構詞方法包括加綴、重疊及複合 。3.2.1 加綴 (Affixation)前綴：改變詞類（名詞 → 動詞） ki-：「取得」，如 ki-paisu「要錢」 。mi-：「穿、戴、帶、有」，如 mi-kavang「穿衣」、mi-paisu「有錢」 。mutu-：「變成」，如 mutu-trau「變成人」 。tara-：「使用」，如 tara-puyuma「說卑南語」 。tu-：「製造、產生」，如 tua-avay「做糯米糕」 。tinu-：「模擬」，如 tinu-maizang「實習長老」 。不改變詞類 mare-（名詞→名詞）：「互相」，如 mare-wadi「兄弟姊妹」 。kara-（動詞→動詞）：「一起」，如 kara-kayan「坐在一起」 。pa-（動詞→動詞）：「使、讓」，如 pa-ekan「餵、使吃」 。mara-（動詞→動詞）：「比較」，如 mara-lriketri「較短」 。中綴 ： <in>：表示「完成」，如 d<in>away「做好的」 。<em>：表示「主事焦點」，如 k<em>ayan「坐」 。後綴 ： -an：將動詞轉為名詞，表「地方」，如 takesi-an「學校」、tra'i-tra'i-an「廁所」 。環綴 ： ka-...-an：表示「做...的時期」或「真正的...」，如 ka-salem-an「種植的季節」、ka-ruma-an「主屋」 。<in>...anan：表示「...的成員」，如 z<in>pekalr-anan「村民」 。3.2.2 重疊 (Reduplication)Ca-重疊：重疊詞根倒數第二音節的輔音再加上母音 /a/ 。 在數詞上表達「數人」：如 zuwa「二」 → za-zuwa「兩人」 。在動詞上表示「進行」或「非實現」：如 senay「唱」 → s<em>a-senay「正在唱」 。表示「互相」：如 karatr「咬」 → ma-ka-karatr「互咬」 。在名詞上表示「通稱」或「多數」：如 trau「人」 → tra-trau-an「人類」 。形成表示「處所」的名詞：如 dirus「洗澡」 → da-dirus-an「洗澡間」 。形成表示「工具」的名詞：如 ngisil「刷」 → nga-ngisil「牙刷」 。雙音節重疊：重疊字根倒數兩個音節 。 加在名詞表示「複數」或「總稱」：如 zenan → zena-zenan「山脈」、tralun「草」→ tralu-tralun「草叢」 。加在動態動詞表「動作重複」：如 me-na'u「看」 → me-nau-na'u「不斷地看」 。加在靜態動詞「加重程度」：如 dawilr「遠」 → dawidawilr「很遠」 。複雜重疊：結合兩種以上方式的重疊 。例如 wari「天」 → wa-wari-wari「每天」 。3.3 擬聲詞擬聲詞是用聲音摹仿事物、動作或自然界聲音的詞彙 。動物：ngiaw「貓」、up'up「牛蛙」、maymay「鴨」、wa wa「烏鴉」、tutur「鴿子」 。昆蟲：tengteng「蜻蜓」、kengkeng「蚊子」 。動詞 (模擬動作聲音)：tiktik「雕刻聲」、tuktuk「鐵鎚聲」、taktak「砍樹聲」、pukpuk「用棍子打孩子聲」 。動詞 (模擬自然界聲音)：zerung「打雷聲」、treli「閃電」 。3.4 借詞知本卑南語的借詞來源有日語、台語及中文 
。日語借詞：kupu「杯子」、layta「打火機」、sulippa「拖鞋」、iga「電影」、kikay「機器」、sinsi「老師」、hikoki「飛機」、dingwa「電話」、tuki「時鐘/手錶」、wasabi「芥末」、tomato「番茄」、sibiru「西裝」 。台語借詞：dolayba「螺ising起子」、tangsuy「雨衣」、ising「醫生」、voksi「牧師」、tu「桌子」、tawyu「醬油」、pisay「白菜」、kiw「茄子」 。3.5 詞類類別詞類可分為成員數量有限的封閉性詞類（如代名詞、副詞）和成員沒有限制的開放性詞類（如動詞、名詞） 。3.5.1 開放性詞類動詞和名詞的區分：從構詞上區分不易，句法上的證據比較可靠 。 指示代名詞可以出現在名詞前（ini na alrak「這個孩子」），但不能出現在動詞前 。自由式的代名詞可以出現在名詞前，但不能在動詞前。例如在 tu ngarayaw tu sinsi「他等他的老師」中，tu sinsi 可以被 nantu sinsi 取代，但 tu ngarayaw 不能被取代 。名詞用 melri 來否定（melri a sinsi intaw.「他不是老師」），動詞用 'azi 來否定（'azi maekan za vulraw.「他不吃魚」） 。名詞：可分為三類，各由不同格位標記來標示 。 人稱專有名詞：包含人名及親屬稱謂，有單複數之分。如 zua i tainataw.「他的媽媽來了。」 。處所名詞：如 adawilr i Tayhok.「台北很遠。」 。一般名詞：有「限定」與「非限定」之分。如 ulra a trau i ruma.「房子裡有人。」 。動詞：動詞上的焦點詞綴決定了主語的語意角色，主要有四種焦點：主事者 (<em>)、受事者 (-aw)、處所 (-ay)、受惠者/工具 (-anay) 。知本卑南語沒有獨立的「形容詞」詞類，其功能由靜態動詞（如「喜歡」、「害怕」）承擔 。動詞分為動態動詞和靜態動詞。動態動詞通常帶 <em>（或其同位詞），而靜態動詞帶 ma- 。兩者在命令句、否定句、非實現貌及使役句中有不同的標記方式 。3.5.2 封閉性詞類格位標記：出現在名詞或名詞組之前，標示其語意角色或文法關係 。人稱代名詞：指「我」、「你」、「他」等。第一人稱複數常區分「包含式」（咱們）和「排除式」（我們） 。指定代名詞：即指示代名詞，可單獨使用或修飾名詞，形式可能因與說話者距離、是否可見、單複數等因素而異 。疑問詞：用於構成特殊問句，如「誰」、「什麼」、「何處」等 。數字：分為基數詞與序數詞等 。詞組標記和子句標記：詞組標記：如連繫詞 na，常出現在名詞之間。例：zua na tatelru na trau.「那三個人來了。」 。並列連詞如 zi「和」。例：vi'as na kadaw zi, pitalupung...「太陽熱，而且要戴帽子...」 。子句標記：如主題標記 mu 和從屬連詞 nu「當」。例：na vavuy mu, tu kuwangaw ni ama za kuwang.「（那隻）山豬，爸爸用槍射了。」 。感嘆詞：表示驚訝、痛苦、悲傷等感情，如 iwa「唉呀」 。"

In [76]:
import re
import jieba

def clean(text):
    """Remove every character that is neither a word character nor whitespace, then trim the ends."""
    without_punctuation = re.sub(r"[^\w\s]", "", text)
    return without_punctuation.strip()
    
def is_valid_word(word: str, min_alpha: int = 4) -> bool:
    """Return True when `word` contains at least `min_alpha` alphabetic characters."""
    alphabetic = [char for char in word if char.isalpha()]
    return len(alphabetic) >= min_alpha

def fuzzy_search_entries(reference_text: str, keyword: str, top_k: int = 50) -> str:
    """Return up to `top_k` reference lines that contain any segmented word of `keyword`.

    The keyword is stripped of punctuation and segmented with jieba. Stop words
    are dropped, as are purely Latin tokens that fail `is_valid_word`. A line
    matches when any kept word is a substring of the punctuation-stripped line.
    A fixed message is returned when no word is kept or no line matches.
    """
    ignore_words = {"的", "一", "在", " "}

    def _is_search_term(token):
        # Drop stop words and too-short Latin-only tokens.
        if token in ignore_words:
            return False
        if re.fullmatch(r'[a-zA-Z\']+', token) and not is_valid_word(token):
            return False
        return True

    words = [token for token in jieba.cut(clean(keyword)) if _is_search_term(token)]

    print("有效詞彙：", words)

    if not words:
        return "無可參考條目"

    def _mentions_any_word(line):
        # Clean the line once, then test every word against it.
        searchable = clean(line)
        return any(word in searchable for word in words)

    matches = [line for line in reference_text.split("\n") if _mentions_any_word(line)]

    if not matches:
        return "找不到相關翻譯資料。"
    return "\n".join(matches[:top_k])

class LLMAgent:
    """A prompt template around the notebook-level `llama3` model.

    Each call sends the role description as the system message, followed by the
    grammar text, optional reference entries, an optional NLLB draft, and the
    task description together with the message to process.
    """

    def __init__(self, role_description, task_description, references=None):
        """Store the system prompt, the task prefix, and optional default references."""
        self.role_description = role_description
        self.task_description = task_description
        self.references = references

    def inference(self, message: str, ref: str = None, nllb: str = None) -> str:
        """Build the chat prompt for `message` and return the model's reply.

        Args:
            message: The text the task applies to.
            ref: Reference entries for this call; falls back to `self.references`.
            nllb: A draft translation from the NLLB model, if available.

        Returns:
            The reply text from `generate_response`.
        """
        used_references = ref if ref is not None else self.references
        messages = [
            {"role": "system", "content": self.role_description},
            {"role": "user", "content": f"以下是卑南語的文法說明，可作為翻譯參考：「{grammar_book}」"},
        ]
        # Only include optional context that was actually supplied; otherwise the
        # prompt would contain the literal text 「None」 as if it were a reference.
        if used_references is not None:
            messages.append(
                {"role": "user", "content": f"以下是相關詞彙的翻譯，可作為翻譯參考：「{used_references}」"}
            )
        if nllb is not None:
            messages.append(
                {"role": "user", "content": f"以下是NLLB模型輸出的翻譯結果，可作為翻譯參考：「{nllb}」"}
            )
        messages.append(
            {"role": "user", "content": f"{self.task_description}：「{message}」"}
        )
        return generate_response(llama3, messages)

In [77]:
# Agent prompting for Traditional Chinese -> Puyuma, asked to output the translation only.
transtopyu_agent = LLMAgent(
    role_description="你是聰明的語言模型，擅長翻譯繁體中文與卑南語，幫我參考以下資料後，利用特性嘗試翻譯文字至卑南語，並只保留卑南語的翻譯結果，不須說明。",
    task_description="翻譯以下文字為卑南語，並只保留翻譯內容",
)

# Agent prompting for Puyuma -> Traditional Chinese, asked to output the translation only.
transtozho_agent = LLMAgent(
    role_description="你是聰明的語言模型，擅長翻譯繁體中文與卑南語，幫我參考以下資料後，利用特性嘗試翻譯文字至繁體中文，並只保留繁體中文的翻譯結果，不須說明。",
    task_description="翻譯以下文字為繁體中文，並只保留翻譯內容",
)

## PART Additional

In [79]:
import pandas as pd
from tqdm import tqdm

# Test inputs: first column of each headerless CSV.
to_pyu = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/zh_to_pyu_test.csv', header=None)[0]
to_zho = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/pyu_to_zh_test.csv', header=None)[0]
# NLLB drafts produced earlier, used as hints in the prompt.
nllb_df = pd.read_csv("/kaggle/input/nllboutput/submission (3).csv")
nllb_outputs = nllb_df['answer'].tolist()
# `grammar_book` is defined in its own cell above. The dangling `grammar_book = `
# that stood here was a SyntaxError and has been removed.


def translate_with_agent(texts, src_lang='zho', nllb_offset=0):
    """Translate texts one by one with the LLM agents, using NLLB drafts as hints.

    Args:
        texts: Source sentences.
        src_lang: 'zho' routes to the Chinese->Puyuma agent; any other value
            routes to the Puyuma->Chinese agent.
        nllb_offset: Position in `nllb_outputs` of the draft for `texts[0]`.

    Returns:
        One single-line string per input; "ERROR" where the agent call failed.
    """
    results = []
    for i, text in enumerate(tqdm(texts, desc=f"Translating from {src_lang}")):
        ref = fuzzy_search_entries(reference_text, text)
        try:
            nllb_index = nllb_offset + i
            nllb_result = nllb_outputs[nllb_index] if nllb_index < len(nllb_outputs) else ""
            print(nllb_result)
            if src_lang == 'zho':
                result = transtopyu_agent.inference(text, ref=ref, nllb=nllb_result)
            else:
                result = transtozho_agent.inference(text, ref=ref, nllb=nllb_result)
        except Exception as e:
            # Best effort: keep going, but say what failed instead of hiding it.
            print(f"translation failed for item {i}: {e!r}")
            result = "ERROR"
        cleaned_result = result.replace("\n", "").replace("\r", "").strip()
        results.append(cleaned_result)
    return results


translated_pyu = translate_with_agent(to_pyu.tolist(), src_lang='zho')
# The NLLB submission stores the zh->pyu answers first and the pyu->zh answers
# after them, so the drafts for this half start at len(to_pyu).
# NOTE(review): assumes the NLLB file was produced from these same test sets — confirm.
translated_zho = translate_with_agent(to_zho.tolist(), src_lang='pyu', nllb_offset=len(to_pyu))

# Submission layout: zh->pyu answers first, then pyu->zh answers, with 1-based IDs.
final = pd.DataFrame({
    "ID": range(1, len(translated_pyu) + len(translated_zho) + 1),
    "answer": translated_pyu + translated_zho
})
final['answer'] = final['answer'].fillna('ERROR')
final.to_csv("submission.csv", index=False, encoding='utf-8')

Translating from zho:   0%|          | 0/60 [00:00<?, ?it/s]

有效詞彙： ['現在', '初鹿', '部落', '祭司', '是', '由長', '老', '推舉', '選任']
aaydan ziya na rahan naulid i zekalr zi natrepa kana valalisen,


Translating from zho:   2%|▏         | 1/60 [00:01<01:04,  1.09s/it]

有效詞彙： ['你', '今天', '穿', '得', '好', '漂亮衣服', '上', '有', '好多', '鈴鐺']
a treme'utran mu, mutrungutrungulr kana 'azi marekamelri za kavang.


Translating from zho:   3%|▎         | 2/60 [00:01<00:47,  1.21it/s]

有效詞彙： ['搖擺', '竹蔭下', '得', '永眠', '之', '所']
a menadanadam muwaarak kanizu na suwan mu, ulra na suwan mu,


Translating from zho:   5%|▌         | 3/60 [00:03<01:07,  1.18s/it]

有效詞彙： ['在家', '準備', '男人', '衣物', '到', '凱旋門', '及', '集會', '所', '佈', '置']
a puresaring, ki物'urian到valenan, kitrepa kana matengetengez,


Translating from zho:   5%|▌         | 3/60 [00:03<01:10,  1.23s/it]


KeyboardInterrupt: 

## PART5

In [None]:
import pandas as pd
from tqdm import tqdm
import re

# Agent used to compare two translations; the prompt asks the model to answer only 「一」 (first) or 「二」 (second).
compare_agent = LLMAgent(
    role_description="你是翻譯品質評估專家，請從以下兩個翻譯版本中選出較佳者，只回覆「一」或「二」，不要輸出其他文字或說明，以下為第一版本。",
    task_description="以下為第二版本。"
)

def compare_translations(version_a, version_b):
    """Ask the comparison agent which translation is better.

    Args:
        version_a: First candidate (sent as the agent's reference text).
        version_b: Second candidate (sent as the agent's message).

    Returns:
        "一" if the reply picks the first version, "二" for the second, or
        "ERROR" when the reply contains neither or the call fails.
    """
    try:
        result = compare_agent.inference(message=version_b, ref=version_a)
        # Extract the first 「一」 or 「二」 from the model's reply.
        match = re.search(r"(一|二)", result)
        if match:
            return match.group(1)
    except Exception as exc:
        # Still best effort, but report the failure instead of hiding it.
        print(f"compare_translations failed: {exc!r}")
    return "ERROR"

# Load the two candidate submissions.
df_a = pd.read_csv('/kaggle/input/puyuma-translation/submission (3).csv')
df_b = pd.read_csv('/kaggle/working/submission.csv')

# For every row, keep the translation chosen by the comparison agent.
final_answers = []
for i in tqdm(range(len(df_a))):
    ans_a = df_a.loc[i, "answer"]
    ans_b = df_b.loc[i, "answer"]
    choice = compare_translations(ans_a, ans_b)
    # Any reply other than 「一」 or 「二」 yields "ERROR".
    answer_by_choice = {"一": ans_a, "二": ans_b}
    final_answers.append(answer_by_choice.get(choice, "ERROR"))

# Export the final result with 1-based IDs.
submission_columns = {
    "ID": range(1, len(final_answers) + 1),
    "answer": final_answers,
}
final_df = pd.DataFrame(submission_columns)
final_df.to_csv("final_submission.csv", index=False, encoding='utf-8')