In [None]:
# Run nvidia-smi to report the NVIDIA GPU status before any GPU work is done.
!nvidia-smi

In [None]:
# Package download
# Installs the third-party packages used by the cells below (-q keeps pip quiet).
# NOTE(review): versions are not pinned, so the environment depends on what pip
# resolves at run time — consider pinning for reproducibility.

!pip install sentencepiece -q
!pip install transformers -q
!pip install datasets -q
!pip install peft -q

## Part1

In [None]:
# Nllb loading

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Checkpoint to load; the commented-out line below selects the larger 3.3B model instead.
model_name = "facebook/nllb-200-distilled-600M"
# model_name = "facebook/nllb-200-3.3B" # Larger model

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Default direction: Traditional Chinese -> Puyuma.
tokenizer.src_lang = "zho_Hant"
tokenizer.tgt_lang = "tgl_Latn"
# Language tags used throughout this notebook:
# zho_Hant for Chinese traditional
# eng_Latn for English
# tgl_Latn for Puyuma (Use existing language tag)

In [None]:
# Load data into dataframes

import pandas as pd

def _read_headerless_csv(path, columns):
    """Read a header-less, comma-separated UTF-8 CSV and name its columns."""
    frame = pd.read_csv(path, sep=",", quotechar='"', header=None, encoding="utf-8")
    frame.columns = columns
    return frame


# Two-column files hold (Puyuma, Chinese); three-column files add English in the middle.
lexicon = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/lexicon_no_en.csv', ['pyu', 'zho']
)
lexicon_en = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/lexicon.csv', ['pyu', 'eng', 'zho']
)
sentences = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/sentences_no_en.csv', ['pyu', 'zho']
)
sentences_en = _read_headerless_csv(
    '/kaggle/input/ml2025-bonus/dataset/sentences.csv', ['pyu', 'eng', 'zho']
)

# Uncomment one of these to eyeball a few random rows:
#lexicon.sample(5)
#lexicon_en.sample(10)
#sentences.sample(5)
#sentences_en.sample(10)

In [None]:
# Testing the performances of original tokenization

import re

def word_tokenize(text):
    """Split text into word runs and single punctuation marks.

    Args:
        text: The string to split.

    Returns:
        A list of tokens: each maximal run of word characters is one token and
        every non-word, non-space character is its own token. Whitespace is
        dropped.
    """
    # Raw string: in a plain literal '\w' and '\s' are invalid escape sequences,
    # which Python 3.12+ reports as a SyntaxWarning.
    return re.findall(r'(\w+|[^\w\s])', text)

def df_tokenize(df):
    """Add model-token and word-token columns for the 'pyu' and 'zho' columns.

    The frame is modified in place and also returned. New columns are created
    in this order: pyu_toks, zho_toks, pyu_words, zho_words.
    """
    # target column -> (source column, splitter applied to every cell)
    plan = {
        'pyu_toks': ('pyu', tokenizer.tokenize),
        'zho_toks': ('zho', tokenizer.tokenize),
        'pyu_words': ('pyu', word_tokenize),
        'zho_words': ('zho', word_tokenize),
    }
    for target, (source, splitter) in plan.items():
        df[target] = df[source].apply(splitter)

    return df

def cal_tokperword(df):

    stats = df[['pyu_toks', 'zho_toks', 'pyu_words', 'zho_words']].map(len).describe()
    print(stats.pyu_toks['mean'] / stats.pyu_words['mean'])
    print(stats.zho_toks['mean'] / stats.zho_words['mean'])

    return stats

def check_unk(df, column):
    """Print how many entries of ``df[column]`` encode to at least one unknown-token id."""
    unk_id = tokenizer.unk_token_id
    hits = 0
    for text in df[column]:
        if unk_id in tokenizer(text).input_ids:
            hits += 1
    print(hits)

# Add the *_toks / *_words columns to every frame.
lexicon = df_tokenize(lexicon)
lexicon_en = df_tokenize(lexicon_en)
sentences = df_tokenize(sentences)
sentences_en = df_tokenize(sentences_en)

# Tokens-per-word ratios. Each frame gets its own stats variable so the *_en
# results no longer overwrite the stats of the non-English frames.
print("toks per word of lexicon:")
stats_lexicon = cal_tokperword(lexicon)
print("toks per word of lexicon_en:")
stats_lexicon_en = cal_tokperword(lexicon_en)
print("toks per word of sentences:")
stats_sentences = cal_tokperword(sentences)
print("toks per word of sentences_en:")
stats_sentences_en = cal_tokperword(sentences_en)

# Number of entries per frame/column that contain an unknown token.
print("total unk in lexicon zho:")
check_unk(lexicon, "zho")
print("total unk in lexicon pyu:")
check_unk(lexicon, "pyu")
print("total unk in lexicon_en zho:")
check_unk(lexicon_en, "zho")
print("total unk in lexicon_en pyu:")
check_unk(lexicon_en, "pyu")
print("total unk in sentences zho:")
check_unk(sentences, "zho")
print("total unk in sentences pyu:")
check_unk(sentences, "pyu")
print("total unk in sentences_en zho:")
check_unk(sentences_en, "zho")
# Label fixed: this call inspects sentences_en, not sentences.
print("total unk in sentences_en pyu:")
check_unk(sentences_en, "pyu")

#show datas
#lexicon.sample(10)
#sentences.sample(10)
#stats_lexicon
#stats_sentences

In [None]:
# Training tokenizer for missing tokens

import pandas as pd
from tqdm.auto import tqdm
import re
from collections import Counter
import sentencepiece as spm
from datasets import load_dataset

# Collect every non-null Chinese string, then every non-null Puyuma string,
# from the four frames (same order as before: all 'zho' first, then all 'pyu').
text_frames = [lexicon, sentences, lexicon_en, sentences_en]
all_texts = [
    text
    for column in ('zho', 'pyu')
    for frame in text_frames
    for text in frame[column].dropna().tolist()
]

# SentencePiece trains from a plain-text file, one entry per line.
all_texts_file = 'all_texts_plain.txt'
with open(all_texts_file, 'w', encoding='utf-8') as f:
    for text in all_texts:
        print(text, file=f)

# Find the characters the current tokenizer cannot represent. Each distinct
# character is tokenized once, instead of once per occurrence in the corpus;
# the resulting set is the same.
unique_chars = {char for text in all_texts for char in text}
required_chars = set()

for char in tqdm(sorted(unique_chars)):
    if tokenizer.tokenize(char) == ['▁', '<unk>']:
        required_chars.add(char)

required_chars_str = "".join(sorted(required_chars))
print(f"需要強制包含的單字元: {required_chars_str}")

# Train a small SentencePiece model on the corpus, forcing the characters found
# above into its vocabulary.
spm.SentencePieceTrainer.train(
    input=all_texts_file,
    model_prefix='spm_new',
    vocab_size=5800,
    character_coverage=1,
    num_threads=16,
    train_extremely_large_corpus=False,
    add_dummy_prefix=False,
    max_sentencepiece_length=128,
    max_sentence_length=4192 * 4,
    pad_id=0,
    eos_id=1,
    unk_id=2,
    bos_id=-1,
    required_chars=required_chars_str,
)

In [None]:
# Add trained tokens to tokenizer and model

from sentencepiece import sentencepiece_model_pb2 as sp_pb2_model
from transformers import NllbTokenizer

model_name = 'facebook/nllb-200-distilled-600M'
tokenizer_nllb = NllbTokenizer.from_pretrained(model_name)

# Parse both SentencePiece models into protobufs so their piece lists can be merged.
sp_trained = spm.SentencePieceProcessor(model_file='spm_new.model')
added_spm = sp_pb2_model.ModelProto()
added_spm.ParseFromString(sp_trained.serialized_model_proto())
old_spm_nllb = sp_pb2_model.ModelProto()
old_spm_nllb.ParseFromString(tokenizer_nllb.sp_model.serialized_model_proto())

# Append every new ordinary piece that NLLB does not already have. New scores
# are shifted by the score of the last original piece.
nllb_tokens_set = {p.piece for p in old_spm_nllb.pieces}
prev_min_score = old_spm_nllb.pieces[-1].score
for p in added_spm.pieces:
    piece = p.piece
    # Type 1 is NORMAL in the SentencePiece proto; control/unknown/etc. pieces are skipped.
    if p.type != 1:
        continue
    if piece not in nllb_tokens_set:
        new_p = sp_pb2_model.ModelProto().SentencePiece()
        new_p.piece = piece
        new_p.score = p.score + prev_min_score
        old_spm_nllb.pieces.append(new_p)

NEW_SPM_NAME = 'spm_nllb_extended_268k.model'
with open(NEW_SPM_NAME, 'wb') as f:
    f.write(old_spm_nllb.SerializeToString())

# Load the tokenizer from the EXTENDED model written above. Loading
# 'spm_new.model' here used only the newly trained pieces, discarded the whole
# original NLLB vocabulary and left NEW_SPM_NAME unused.
tokenizer = NllbTokenizer.from_pretrained(model_name, vocab_file=NEW_SPM_NAME)
print(len(tokenizer_nllb), len(tokenizer))
added_vocab = set(tokenizer.get_vocab()).difference(set(tokenizer_nllb.get_vocab()))
# print(len(added_vocab))

# The extended tokenizer emits ids beyond the original embedding matrix, so the
# model must grow to match before training.
# NOTE(review): confirm that the language-code tokens keep valid ids with the
# installed transformers version, and that inference loads this same tokenizer.
model.resize_token_embeddings(len(tokenizer))

## PART2

In [None]:
from transformers.optimization import Adafactor
from transformers import get_constant_schedule_with_warmup
# Move the model to the GPU (the trailing ';' suppresses the notebook's output repr).
model.cuda();
# Adafactor over the trainable parameters only, with a fixed learning rate:
# parameter scaling and relative step sizing are both switched off.
optimizer = Adafactor(
    [p for p in model.parameters() if p.requires_grad],
    scale_parameter=False,
    relative_step=False,
    lr=1e-4,
    clip_threshold=1.0,
    weight_decay=1e-3,
)
# Constant learning-rate schedule with 1000 warm-up steps.
scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=1000)

In [None]:
import random
# (dataframe column, NLLB language tag) for the two sides of the parallel data.
LANGS = [('zho', 'zho_Hant'), ('pyu', 'tgl_Latn')]

# Training pool: the Puyuma/Chinese columns of all four frames, stacked.
dfs = [lexicon, sentences, lexicon_en, sentences_en]
df_train = pd.concat(
    [frame[['pyu', 'zho']] for frame in dfs],
    ignore_index=True,
)


def get_batch_pairs(batch_size, data=df_train):
    """Sample a random batch of parallel texts in a random direction.

    Args:
        batch_size: Number of rows to draw (with replacement).
        data: Frame with 'pyu' and 'zho' columns; defaults to ``df_train``.

    Returns:
        (source texts, target texts, source language tag, target language tag).
    """
    # Shuffle the two languages to pick which one is the source for this batch.
    source, target = random.sample(LANGS, 2)
    source_col, source_tag = source
    target_col, target_tag = target

    last_index = len(data) - 1
    rows = [data.iloc[random.randint(0, last_index)] for _ in range(batch_size)]
    xx = [row[source_col] for row in rows]
    yy = [row[target_col] for row in rows]
    return xx, yy, source_tag, target_tag


print(get_batch_pairs(1))

In [None]:
batch_size = 16  # 32 already doesn't fit well to 15GB of GPU memory
max_length = 128  # token sequences will be truncated
training_steps = 50000  # upper bound on steps; set high on purpose,
# the training cell is expected to be interrupted manually
losses = []  # per-step loss history, used for simple average-loss reporting
MODEL_SAVE_PATH = '/kaggle/working/nllb_extended'  # checkpoint directory under /kaggle/working

In [None]:
import gc
import torch
import numpy as np
from tqdm.auto import tqdm, trange

def cleanup():
    """Run the garbage collector and release PyTorch's cached CUDA memory."""
    gc.collect()
    torch.cuda.empty_cache()

# Switch to training mode and start with no stale batch tensors held.
model.train()
x, y, loss = None, None, None
cleanup()

# Start counting at len(losses) so re-running this cell continues from the
# number of steps already recorded instead of restarting at 0.
tq = trange(len(losses), training_steps)
for i in tq:
    xx, yy, lang1, lang2 = get_batch_pairs(batch_size)
    try:
        # Encode the source batch under its own language tag.
        tokenizer.src_lang = lang1
        x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # The target batch is encoded the same way, with src_lang switched to the target tag.
        tokenizer.src_lang = lang2
        y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_length).to(model.device)
        # -100 is a magic value ignored in the loss function
        # because we don't want the model to learn to predict padding ids
        y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

        loss = model(**x, labels=y.input_ids).loss
        loss.backward()
        losses.append(loss.item())

        optimizer.step()
        optimizer.zero_grad(set_to_none=True)
        scheduler.step()

    except RuntimeError as e:  # usually, it is out-of-memory
        # Drop gradients and batch tensors, free memory, then skip this batch.
        optimizer.zero_grad(set_to_none=True)
        x, y, loss = None, None, None
        cleanup()
        print('error', max(len(s) for s in xx + yy), e)
        continue

    if i % 1000 == 0:
        # every 1000 steps, report the average loss over the last (up to) 1000 recorded steps
        print(i, np.mean(losses[-1000:]))

    if i % 1000 == 0 and i > 0:
        # Checkpoint both the model and the tokenizer to the same directory.
        model.save_pretrained(MODEL_SAVE_PATH)
        tokenizer.save_pretrained(MODEL_SAVE_PATH)

## PART3

In [None]:
from transformers import NllbTokenizer, AutoModelForSeq2SeqLM
# Load a previously fine-tuned checkpoint from a Kaggle input dataset and move it to the GPU.
model_dir = '/kaggle/input/nllb-extended/other/45000steps/1/results/nllb_extended'
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True).cuda()
# The tokenizer is built from the checkpoint directory plus an explicit SentencePiece file.
# NOTE(review): confirm this spm_new.model is the vocabulary the checkpoint was trained with.
tokenizer = NllbTokenizer.from_pretrained(model_dir, vocab_file='/kaggle/input/nllb-extended/other/45000steps/1/results/spm_new.model')

In [None]:
def translate(
    text, src_lang='zho_Hant', tgt_lang='tgl_Latn',
    a=32, b=3, max_input_length=1024, num_beams=1, **kwargs
):
    """Translate one text or a list of texts; always returns a list of strings.

    The generation budget is ``a + b * (padded input length)`` new tokens.
    Extra keyword arguments are forwarded to ``model.generate``.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    batch = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )

    model.eval()  # inference mode
    batch = batch.to(model.device)

    # Force decoding to begin with the target language tag.
    target_bos = tokenizer.convert_tokens_to_ids(tgt_lang)
    token_budget = int(a + b * batch.input_ids.shape[1])

    generated = model.generate(
        **batch,
        forced_bos_token_id=target_bos,
        max_new_tokens=token_budget,
        num_beams=num_beams,
        **kwargs
    )
    return tokenizer.batch_decode(generated, skip_special_tokens=True)


# Quick smoke test on a single Chinese sentence.
t = '我也沒帶錢耶!'
print(translate(t, 'zho_Hant', 'tgl_Latn'))

## PART Additional

In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Reload the fine-tuned checkpoint; here the tokenizer comes from the checkpoint directory itself.
model_dir = "/kaggle/input/nllb-extended/other/45000steps/1/results/nllb_extended"
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir, local_files_only=True).to("cuda")

def translate(
    text, src_lang='zho_Hant', tgt_lang='tgl_Latn',
    a=32, b=3, max_input_length=1024, num_beams=1, **kwargs
):
    """Translate text(s) and return a list of strings, never containing blanks.

    Works like a plain generate-and-decode call, except that any translation
    that is empty or whitespace-only is replaced by the literal 'ERROR'.
    """
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=max_input_length,
    )
    inputs = encoded.to(model.device)

    model.eval()
    output_ids = model.generate(
        **inputs,
        forced_bos_token_id=tokenizer.convert_tokens_to_ids(tgt_lang),
        max_new_tokens=int(a + b * inputs.input_ids.shape[1]),
        num_beams=num_beams, **kwargs
    )

    # Substitute 'ERROR' for every decoded string that is blank.
    decoded = []
    for candidate in tokenizer.batch_decode(output_ids, skip_special_tokens=True):
        if candidate.strip():
            decoded.append(candidate)
        else:
            decoded.append("ERROR")

    return decoded

# Test inputs: first column of each header-less CSV.
to_pyu = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/zh_to_pyu_test.csv', header=None)[0]
to_zho = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/pyu_to_zh_test.csv', header=None)[0]

# Puyuma is represented by the borrowed NLLB tag 'tgl_Latn' everywhere else in
# this notebook (training pairs and the tokenizer defaults). 'pyu_Latn' is not
# used anywhere else, so passing it here set the source tag and the forced
# first decoder token to something the model was never trained on.
translated_pyu = translate(to_pyu.tolist(), src_lang='zho_Hant', tgt_lang='tgl_Latn')
translated_zho = translate(to_zho.tolist(), src_lang='tgl_Latn', tgt_lang='zho_Hant')

# One submission row per test sentence: Chinese->Puyuma answers first, then
# Puyuma->Chinese, with IDs numbered from 1.
final = pd.DataFrame({
    "ID": range(1, len(translated_pyu) + len(translated_zho) + 1),
    "answer": translated_pyu + translated_zho
})
final['answer'] = final['answer'].fillna('ERROR')
final.to_csv("submission.csv", index=False, encoding='utf-8')
final.head()

## PART4

In [None]:
# Install llama-cpp-python 0.3.4, using the project's cu122 wheel index as an extra source.
!python3 -m pip install --no-cache-dir llama-cpp-python==0.3.4 --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu122 -q
# Extra packages (unpinned).
!python3 -m pip install googlesearch-python bs4 charset-normalizer requests-html lxml_html_clean -q

# Download the Llama-3.2-3B-Instruct Q6_K GGUF weights into the current directory.
!wget https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q6_K.gguf

In [None]:
from llama_cpp import Llama

# Load the model onto GPU
# The path assumes the GGUF file was downloaded into /kaggle/working.
# n_gpu_layers=-1 presumably offloads all layers to the GPU — confirm against
# the llama-cpp-python docs; n_ctx sets the context window size.
llama3 = Llama(
    "/kaggle/working/Llama-3.2-3B-Instruct-Q6_K.gguf",
    verbose=False,
    n_gpu_layers=-1,
    n_ctx=30000,
)

def generate_response(_model: "Llama", _messages: "list[dict]") -> str:
    """Run one deterministic chat completion and return the reply text.

    Args:
        _model: Object exposing ``create_chat_completion`` (a loaded ``Llama``).
        _messages: Chat history as a list of ``{"role": ..., "content": ...}``
            dicts. It was previously annotated as ``str``, which does not match
            how the function is called.

    Returns:
        The ``content`` of the first choice's message.
    """
    _completion = _model.create_chat_completion(
        _messages,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        max_tokens=512,
        temperature=0,
        repeat_penalty=2.0,
    )
    return _completion["choices"][0]["message"]["content"]

In [None]:
import pandas as pd
import re
import asyncio

# === Load data ===
# Two-column files are (Puyuma, Chinese); three-column files are (Puyuma, English, Chinese).
lexicon = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/lexicon_no_en.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
lexicon.columns = ['pyu', 'zho']

lexicon_en = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/lexicon.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
lexicon_en.columns = ['pyu', 'eng', 'zho']

sentences = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/sentences_no_en.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
sentences.columns = ['pyu', 'zho']

sentences_en = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/sentences.csv', sep=",", quotechar='"', header=None, encoding="utf-8")
sentences_en.columns = ['pyu', 'eng', 'zho']

# === Combine and clean data ===
# Keep only the Puyuma/Chinese columns of each frame.
df1 = lexicon[['pyu', 'zho']]
df2 = lexicon_en[['pyu', 'zho']]
df3 = sentences[['pyu', 'zho']]
df4 = sentences_en[['pyu', 'zho']]

# Stack all pairs and drop exact duplicate rows.
combined = pd.concat([df1, df2, df3, df4], ignore_index=True)
combined = combined.drop_duplicates()

# Render each pair as one "Chinese = Puyuma" line; the joined text is the retrieval corpus.
pairs = combined.apply(lambda row: f"{row['zho']} = {row['pyu']}", axis=1)
reference_text = "\n".join(pairs.tolist())

In [None]:
def clean(text):
    """Remove every character that is neither a word character nor whitespace, then trim the ends."""
    return re.sub(r"[^\w\s]", "", text).strip()


# A search unit is either one CJK ideograph or one maximal run of non-space,
# non-CJK characters (i.e. a whole Latin-script word).
_SEARCH_UNIT_RE = re.compile(r"[\u3400-\u9fff]|[^\s\u3400-\u9fff]+")


def fuzzy_search_entries(reference_text: str, keyword: str, top_k: int = 50) -> str:
    """Return the first ``top_k`` reference lines that share a search unit with ``keyword``.

    Chinese text is matched character by character, as before. Two defects of
    plain per-character splitting are fixed:

    * whitespace inside the keyword is no longer a search character (it matched
      every "zho = pyu" line, because each line contains spaces);
    * Latin-script words are matched as whole words rather than letter by
      letter, so a Puyuma query no longer matches almost every line.

    Args:
        reference_text: Newline-separated reference lines.
        keyword: The text to look up; punctuation is ignored.
        top_k: Maximum number of lines to return.

    Returns:
        The matching lines joined by newlines, in reference order, or a fixed
        "not found" message when nothing matches.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order.
    units = list(dict.fromkeys(_SEARCH_UNIT_RE.findall(clean(keyword))))

    matches = []
    if units:
        for line in reference_text.split("\n"):
            cleaned_line = clean(line)
            if any(unit in cleaned_line for unit in units):
                matches.append(line)
                # Only the first top_k hits are returned, so stop scanning early.
                if len(matches) == top_k:
                    break

    return "\n".join(matches[:top_k]) if matches else "找不到相關翻譯資料。"


class LLMAgent:
    """A fixed system role plus task instruction, run against the shared Llama model."""

    def __init__(self, role_description, task_description, references=None):
        """Store the prompt parts.

        Args:
            role_description: Text sent as the system message.
            task_description: Instruction prefixed to every user message.
            references: Optional default reference text, used when
                ``inference`` is called without ``ref``.
        """
        self.role_description = role_description
        self.task_description = task_description
        self.references = references

    def _build_messages(self, message: str, ref: str = None) -> list:
        """Assemble the chat messages for one request.

        The reference message is left out when neither ``ref`` nor the default
        references are set; previously a message with ``content=None`` was sent
        in that case.
        """
        used_references = ref if ref is not None else self.references
        messages = [{"role": "system", "content": self.role_description}]
        if used_references is not None:
            messages.append({"role": "user", "content": used_references})
        messages.append(
            {"role": "user", "content": f"{self.task_description}：「{message}」"}
        )
        return messages

    def inference(self, message: str, ref: str = None) -> str:
        """Send ``message`` (with optional per-call reference text) to the model and return its reply."""
        return generate_response(llama3, self._build_messages(message, ref))

In [None]:
# Agent for Chinese -> Puyuma. The Chinese prompt tells the model to consult the
# supplied reference data and to output only the Puyuma translation.
transtopyu_agent = LLMAgent(
    role_description="你是聰明的語言模型，擅長翻譯繁體中文與卑南語，根據卑南語Agglutinative的特性，幫我參考以下資料後，利用特性嘗試翻譯文字至卑南語，並只保留卑南語的翻譯結果，不須說明。",
    task_description="翻譯以下文字為卑南語，並只保留翻譯內容",
)

# Agent for Puyuma -> Traditional Chinese; same prompt with the target language swapped.
transtozho_agent = LLMAgent(
    role_description="你是聰明的語言模型，擅長翻譯繁體中文與卑南語，根據卑南語Agglutinative的特性，幫我參考以下資料後，利用特性嘗試翻譯文字至繁體中文，並只保留繁體中文的翻譯結果，不須說明。",
    task_description="翻譯以下文字為繁體中文，並只保留翻譯內容",
)

## PART Additional

In [None]:
import pandas as pd
from tqdm import tqdm

# Test inputs: first column of each header-less CSV.
to_pyu = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/zh_to_pyu_test.csv', header=None)[0]
to_zho = pd.read_csv('/kaggle/input/ml2025-bonus/dataset/pyu_to_zh_test.csv', header=None)[0]

def translate_with_agent(texts, src_lang='zho'):
    """Translate each text with the retrieval-augmented LLM agents.

    Args:
        texts: Iterable of source strings.
        src_lang: 'zho' translates into Puyuma; any other value translates
            into Traditional Chinese.

    Returns:
        One single-line string per input. A row becomes 'ERROR' when retrieval
        or generation raises, or when the model returns nothing usable, so the
        submission never contains a blank answer.
    """
    agent = transtopyu_agent if src_lang == 'zho' else transtozho_agent
    results = []
    for text in tqdm(texts, desc=f"Translating from {src_lang}"):
        try:
            # Retrieval is inside the try so that a malformed input (e.g. a
            # non-string cell) costs one row instead of aborting the run.
            ref = fuzzy_search_entries(reference_text, text)
            result = agent.inference(text, ref=ref)
        except Exception as e:
            # Keep the best-effort behaviour, but report what went wrong.
            print(f"translate_with_agent failed for {text!r}: {e!r}")
            result = "ERROR"
        cleaned_result = (result or "").replace("\n", "").replace("\r", "").strip()
        results.append(cleaned_result or "ERROR")
    return results

translated_pyu = translate_with_agent(to_pyu.tolist(), src_lang='zho')
translated_zho = translate_with_agent(to_zho.tolist(), src_lang='pyu')

# One submission row per test sentence: Chinese->Puyuma answers first, then
# Puyuma->Chinese, with IDs numbered from 1.
final = pd.DataFrame({
    "ID": range(1, len(translated_pyu) + len(translated_zho) + 1),
    "answer": translated_pyu + translated_zho
})
final['answer'] = final['answer'].fillna('ERROR')
final.to_csv("submission.csv", index=False, encoding='utf-8')

## PART5

In [None]:
import pandas as pd
from tqdm import tqdm
import re

# Comparison agent: the prompt asks the model to pick the better of two
# translation versions and to answer only with 「一」 (first) or 「二」 (second).
# The system prompt ends by introducing the first version; the task text introduces the second.
compare_agent = LLMAgent(
    role_description="你是翻譯品質評估專家，請從以下兩個翻譯版本中選出較佳者，只回覆「一」或「二」，不要輸出其他文字或說明，以下為第一版本。",
    task_description="以下為第二版本。"
)

# Extract the model's choice (「一」 or 「二」) from its reply.
def compare_translations(version_a, version_b):
    """Ask the comparison agent which of two translations is better.

    ``version_a`` is sent as the reference message and ``version_b`` as the task
    message. The source sentence is not part of the prompt.

    Returns:
        "一" if the first version is chosen, "二" if the second is chosen, or
        "ERROR" when the call fails or the reply contains neither character.
    """
    try:
        result = compare_agent.inference(message=version_b, ref=version_a)
        match = re.search(r"(一|二)", result)
        if match:
            return match.group(1)
    except Exception as e:
        # Still best-effort, but no longer silent: report the failure.
        print(f"compare_translations failed: {e!r}")
    return "ERROR"

# Load the two translation versions to compare.
# NOTE(review): the loop below indexes both frames by df_a's row positions, so
# it assumes both files have the same number of rows in the same order — confirm.
df_a = pd.read_csv('/kaggle/input/puyuma-translation/submission (3).csv')
df_b = pd.read_csv('/kaggle/working/submission.csv')

# Compare row by row and keep the translation the agent prefers.
final_answers = []
for i in tqdm(range(len(df_a))):
    ans_a = df_a.loc[i, "answer"]
    ans_b = df_b.loc[i, "answer"]
    choice = compare_translations(ans_a, ans_b)
    if choice == "一":
        final_answers.append(ans_a)
    elif choice == "二":
        final_answers.append(ans_b)
    else:
        # No usable verdict: neither candidate is kept.
        final_answers.append("ERROR")

# Export the final result.
final_df = pd.DataFrame({
    "ID": range(1, len(final_answers) + 1),
    "answer": final_answers
})
final_df.to_csv("final_submission.csv", index=False, encoding='utf-8')