In [192]:
from collections import defaultdict

# Cantonese -> Mandarin phrase table. Each non-empty line of phrase_table.txt holds two
# "|"-separated fields; the first field is appended to the list stored under the second
# field, so one Cantonese key can collect several Mandarin renderings.
can2man_table = defaultdict(list)

# Explicit UTF-8: the default text encoding is platform-dependent and the file holds CJK text.
with open("phrase_table.txt", "r", encoding="utf-8") as input_file:
    for line in input_file.read().splitlines():
        if not line:
            # Tolerate blank lines instead of failing the two-field unpack below.
            continue
        # Still strict about malformed lines: anything other than exactly two fields raises.
        [man_word, can_word] = line.split("|")
        can2man_table[can_word].append(man_word)

print(f"Generated Cantonese to Mandarin phrase table of size {len(can2man_table)}")
print(list(can2man_table.items())[0:10])

Generated Cantonese to Mandarin phrase table of size 11753
[('少少', ['一丁點兒', '一點兒', '一點', '一點點兒', '很少份量', '很少']), ('一上一落', ['一上一下']), ('下', ['一下']), ('一搊', ['一串']), ('啲', ['一些', '些', '某些', '這些']), ('單嘢', ['一件事']), ('件', ['一件']), ('一班', ['一伙', '全班', '那班']), ('單拖', ['一個人']), ('獨贏', ['一個人得頭彩'])]


In [156]:
# Set of the characters listed in common_trad_chars.txt, used for membership tests.
# Explicit UTF-8: the default text encoding is platform-dependent and the file holds CJK text.
with open("common_trad_chars.txt", "r", encoding="utf-8") as input_file:
    # Whitespace (e.g. line breaks in the file) is layout, not a character entry.
    common_trad_chars = {char for char in input_file.read() if not char.isspace()}

print("A sample of common traditional characters: ", list(common_trad_chars)[0:10])

A sample of common traditional characters:  ['蠍', '元', '癰', '額', '攀', '法', '爐', '戊', '莆', '穢']


In [157]:
import pandas as pd
from StarCC import PresetConversion
# StarCC converter configured with src='cn', dst='hk' and phrase-level conversion off.
# NOTE(review): presumably Simplified -> Hong Kong Traditional; confirm against StarCC docs.
convert = PresetConversion(src='cn', dst='hk', with_phrase=False)

# The "word" column of the tab-separated list, passed through the converter and de-duplicated.
df = pd.read_csv("common_man_words.csv", sep="\t")
common_man_words = set(map(convert, df["word"]))

print(f"Got {len(common_man_words)} Mandarin words")

Got 55735 Mandarin words


In [158]:
# De-duplicated entries of the "char" column of the comma-separated Cantonese list.
df = pd.read_csv("common_can_words.csv", sep=",")
common_can_words = {entry for entry in df["char"]}

print(f"Got {len(common_can_words)} Cantonese words")

Got 92568 Cantonese words


In [193]:
# Words found in both vocabularies are valid in either language, so each one is
# registered in the phrase table as a possible translation of itself.
common_words = common_can_words & common_man_words

num_added_words = 0
for word in common_words:
    # The key check comes first so that the defaultdict lookup never creates an entry
    # just to test it.
    already_listed = word in can2man_table and word in can2man_table[word]
    if already_listed:
        continue
    can2man_table[word].append(word)
    num_added_words += 1

print(f"Got {len(common_words)} common words")
print(f"Added {num_added_words} shared words to can2man_table")

Got 28543 common words
Added 28523 shared words to can2man_table


In [179]:
def longest_match_translate(s, phrase_table):
    """Segment ``s`` greedily by longest phrase-table match and list candidates per segment.

    At each position the longest key of ``phrase_table`` that prefixes the remaining
    text is consumed and its translation list becomes one segment. Characters with no
    matching key are accumulated into an out-of-vocabulary segment that is passed
    through unchanged. Whitespace directly after a consumed piece is dropped.

    A single-character match that is also in the module-level ``common_trad_chars`` is
    offered as a candidate itself, ahead of its translations.

    Finally, consecutive segments that have exactly one candidate ("anchors") are
    concatenated into a single one-candidate segment.

    Args:
        s: Text to translate.
        phrase_table: Mapping from source phrase to a list of candidate translations.

    Returns:
        A list of segments, each a list of candidate strings. Anchor runs whose
        concatenation is empty are omitted.
    """
    # Longest key length, computed once per call, bounds the prefix lengths worth probing.
    max_phrase_len = max(map(len, phrase_table), default=0)

    man_phrases: list[list[str]] = []
    oov_word = ""
    while s:
        # Probe prefixes from longest to shortest; the first hit is the longest match.
        # This replaces a scan over every table key at every position.
        longest_match = None
        for length in range(min(max_phrase_len, len(s)), 0, -1):
            if s[:length] in phrase_table:
                longest_match = s[:length]
                break
        if longest_match:
            if oov_word:
                # Flush pending out-of-vocabulary text before the matched segment.
                man_phrases.append([oov_word])
                oov_word = ""
            keep_original = len(longest_match) == 1 and longest_match in common_trad_chars
            can_original = [longest_match] if keep_original else []
            # Concatenation builds a new list, so the table's own list is never mutated.
            man_phrases.append(can_original + phrase_table[longest_match])
            s = s[len(longest_match):].lstrip()
        else:
            oov_word += s[0]
            s = s[1:].lstrip()
    if oov_word:
        man_phrases.append([oov_word])

    # Merge anchor phrases (those with a single mandarin translation)
    merged_man_phrases = []
    i = 0
    while i < len(man_phrases):
        if len(man_phrases[i]) == 1:
            merged_phrase = ""
            while i < len(man_phrases) and len(man_phrases[i]) == 1:
                merged_phrase += man_phrases[i][0]
                i += 1
            # An anchor run can concatenate to "" (e.g. a phrase translated as the empty
            # string). Skip it rather than indexing past the end of the list.
            if merged_phrase:
                merged_man_phrases.append([merged_phrase])
        else:
            merged_man_phrases.append(man_phrases[i])
            i += 1
    return merged_man_phrases

In [180]:
# Inspect the raw segmentation: one list of Mandarin candidates per segment, before any
# language-model disambiguation.
longest_match_translate("唔該你細聲啲，我喺度做緊嘢。", can2man_table)

[['不好意思', '勞', '勞駕', '有勞您', '請你', '請', '請教', '謝', '謝謝', '麻煩您'],
 ['你小聲點，我'],
 ['在此', '在這裡', '在這邊', '在那裡', '在那邊'],
 ['正在做'],
 ['事情', '東西'],
 ['。']]

In [181]:
from transformers import BertTokenizerFast, GPT2LMHeadModel
# Checkpoint names for the tokenizer and the causal language model used for scoring.
TOKENIZER_CHECKPOINT = 'bert-base-chinese'
LM_CHECKPOINT = 'ckiplab/gpt2-base-chinese'

tokenizer = BertTokenizerFast.from_pretrained(TOKENIZER_CHECKPOINT)
model = GPT2LMHeadModel.from_pretrained(LM_CHECKPOINT)

In [182]:
from datasets import Dataset
import torch

def score_sentence_ppl(s: str) -> float:
    """Return the perplexity of ``s`` under the module-level ``model``.

    The text is tokenized with the module-level ``tokenizer`` and scored in windows of
    at most ``model.config.n_positions`` tokens that advance by 512 tokens. In each
    window only the tokens not scored by a previous window contribute to the loss;
    the earlier tokens serve as context (their labels are set to -100).

    Args:
        s: Sentence to score.

    Returns:
        exp(mean negative log-likelihood per scored token), or NaN when the encoding
        is too short for any token to be scored.
    """
    encodings = tokenizer(s, return_tensors="pt")

    max_length = model.config.n_positions
    stride = 512
    seq_len = encodings.input_ids.size(1)

    nlls = []
    n_scored_tokens = 0
    prev_end_loc = 0
    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        trg_len = end_loc - prev_end_loc  # may be different from stride on last loop
        input_ids = encodings.input_ids[:, begin_loc:end_loc]
        target_ids = input_ids.clone()
        # Tokens already scored in an earlier window are context only.
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)

        # The model shifts labels by one position internally, so its loss is the mean
        # over the non-ignored labels in target_ids[:, 1:]. Weight the mean by that
        # exact count to recover this window's summed negative log-likelihood.
        n_window_tokens = int((target_ids[:, 1:] != -100).sum())
        if n_window_tokens > 0:
            nlls.append(outputs.loss * n_window_tokens)
            n_scored_tokens += n_window_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

    if n_scored_tokens == 0:
        # Nothing to predict (fewer than two tokens): perplexity is undefined.
        return float("nan")

    # Average over exactly the tokens that were scored, then exponentiate.
    return torch.exp(torch.stack(nlls).sum() / n_scored_tokens).item()

In [196]:
import regex

# Matches a run of Unicode punctuation characters and everything after it on the line.
punctuation_pattern = regex.compile(r"\p{P}+.*", flags=regex.UNICODE)
# Matches a single Han-script character. The script property covers the CJK extension
# blocks as well as U+4E00-U+9FFF, so Cantonese-specific characters encoded outside the
# basic block (e.g. U+35CE) are recognised too.
chinese_char_pattern = regex.compile(r"\p{Han}")

def chop_off_at_punctuation(s: str) -> str:
    """Return the part of ``s`` before the first ``punctuation_pattern`` match.

    ``s`` is returned unchanged when the pattern does not match anywhere.
    """
    found = punctuation_pattern.search(s)
    return s if found is None else s[:found.start()]

def chop_off_at_canto_char(s: str) -> str:
    """Truncate ``s`` just before its first uncommon Chinese character.

    A character triggers the cut when it matches ``chinese_char_pattern`` and is not
    in ``common_trad_chars``. ``s`` is returned unchanged when no character does.
    """
    for position, char in enumerate(s):
        if char in common_trad_chars:
            continue
        if chinese_char_pattern.match(char):
            return s[:position]
    return s

def flatten(l):
    """Concatenate the items of every iterable in ``l`` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

def can2man(s: str, verbose: bool = True) -> str:
    """Translate ``s`` by choosing, per segment, the candidate with the lowest perplexity.

    The sentence is segmented with ``longest_match_translate`` against the module-level
    ``can2man_table``. Segments are resolved left to right. Each candidate of an
    ambiguous segment is scored with ``score_sentence_ppl`` on the string
    ``backward_context + candidate + forward_context``, where:

    * ``backward_context`` is all earlier segments, which are already resolved;
    * ``forward_context`` is the run of single-candidate segments that follows,
      extended with the first candidate of later segments until it has at least five
      characters, then cut at the first punctuation mark and at the first uncommon
      Chinese character.

    Args:
        s: Sentence to translate.
        verbose: When true (the default), print the candidate segments, each scored
            sentence with its perplexity, and the resolved segments.

    Returns:
        The concatenation of the chosen candidate of every segment.
    """
    man_phrases = longest_match_translate(s, can2man_table)
    if verbose:
        print(man_phrases)
    for i, candidates in enumerate(man_phrases):
        if len(candidates) == 1:
            # Anchor segment: a single rendering, nothing to disambiguate.
            continue

        # Segments before i were replaced by their chosen candidate in earlier iterations.
        backward_context = "".join(flatten(man_phrases[:i]))

        # Start the forward context with the anchor segments that directly follow.
        j = i + 1
        while j < len(man_phrases) and len(man_phrases[j]) == 1:
            j += 1
        forward_context = "".join(flatten(man_phrases[i + 1:j]))
        # forward context is too small: borrow the first candidate of later segments
        while len(forward_context) < 5 and j < len(man_phrases):
            if man_phrases[j]:
                forward_context += man_phrases[j][0]
            j += 1
        forward_context = chop_off_at_canto_char(chop_off_at_punctuation(forward_context))
        if verbose:
            print(f"i={i} backward_context={backward_context}, forward_context={forward_context}")

        best_ppl = float("+inf")
        # Fall back to the first candidate if no score compares below +inf (e.g. NaN).
        best_phrase = candidates[0] if candidates else ""
        for phrase in candidates:
            candidate_sentence = backward_context + phrase + forward_context
            ppl = score_sentence_ppl(candidate_sentence)
            if verbose:
                print(candidate_sentence, ppl)
            if ppl < best_ppl:
                best_ppl = ppl
                best_phrase = phrase
        man_phrases[i] = [best_phrase]
    if verbose:
        print(man_phrases)
    return "".join(flatten(man_phrases))

In [197]:
# End-to-end demo: can2man prints its candidates and perplexity scores, then returns the
# chosen Mandarin sentence.
can2man("唔該你細聲啲，我喺度做緊嘢。")

[['不好意思', '勞', '勞駕', '有勞您', '請你', '請', '請教', '謝', '謝謝', '麻煩您'], ['你小聲點，我'], ['在此', '在這裡', '在這邊', '在那裡', '在那邊'], ['正在做'], ['事情', '東西'], ['。']]
i=0 backward_context=, forward_context=你小聲點
不好意思你小聲點 1170.188232421875
勞你小聲點 23191.947265625
勞駕你小聲點 23770.203125
有勞您你小聲點 12233.4599609375
請你你小聲點 5392.3173828125
請你小聲點 6540.541015625
請教你小聲點 3959.528564453125
謝你小聲點 8292.0556640625
謝謝你小聲點 2870.170654296875
麻煩您你小聲點 7786.3974609375
i=2 backward_context=不好意思你小聲點，我, forward_context=正在做事情
不好意思你小聲點，我在此正在做事情 290.8885803222656
不好意思你小聲點，我在這裡正在做事情 186.99769592285156
不好意思你小聲點，我在這邊正在做事情 234.96160888671875
不好意思你小聲點，我在那裡正在做事情 191.083740234375
不好意思你小聲點，我在那邊正在做事情 232.5902557373047
i=4 backward_context=不好意思你小聲點，我在這裡正在做, forward_context=
不好意思你小聲點，我在這裡正在做事情 186.99769592285156
不好意思你小聲點，我在這裡正在做東西 173.3293914794922
[['不好意思'], ['你小聲點，我'], ['在這裡'], ['正在做'], ['東西'], ['。']]


'不好意思你小聲點，我在這裡正在做東西。'

In [198]:
# Example in which every segment has several candidates, so each one is scored in turn.
can2man("邊個整到本書甩皮甩骨")

[['哪一個', '哪位'], ['整', '修理', '整理', '生產', '製作', '製造', '調整', '造'], ['到', '達到'], ['一本書', '書本'], ['散了架子', '皮開骨散']]
i=0 backward_context=, forward_context=整到一本書
哪一個整到一本書 1213.12451171875
哪位整到一本書 3353.59228515625
i=1 backward_context=哪一個, forward_context=到一本書散了架子
哪一個整到一本書散了架子 1153.7410888671875
哪一個修理到一本書散了架子 826.3753662109375
哪一個整理到一本書散了架子 742.5451049804688
哪一個生產到一本書散了架子 768.58544921875
哪一個製作到一本書散了架子 782.8555297851562
哪一個製造到一本書散了架子 917.5545654296875
哪一個調整到一本書散了架子 655.1337890625
哪一個造到一本書散了架子 1464.9222412109375
i=2 backward_context=哪一個調整, forward_context=一本書散了架子
哪一個調整到一本書散了架子 655.1337890625
哪一個調整達到一本書散了架子 793.9451293945312
i=3 backward_context=哪一個調整到, forward_context=散了架子
哪一個調整到一本書散了架子 655.1337890625
哪一個調整到書本散了架子 1099.811279296875
i=4 backward_context=哪一個調整到一本書, forward_context=
哪一個調整到一本書散了架子 655.1337890625
哪一個調整到一本書皮開骨散 1256.7835693359375
[['哪一個'], ['調整'], ['到'], ['一本書'], ['散了架子']]


'哪一個調整到一本書散了架子'

In [199]:
# Longer example with a comma; text with no phrase-table entry is passed through unchanged.
can2man("佢舉重嗰時掬住度氣堅持住，終於破咗世界紀錄")

[['他', '她', '它'], ['舉重嗰時掬'], ['住', '先'], ['度', '反復衡量', '度數', '揣度', '測量', '琢磨', '程度', '處', '裡', '那裏', '量'], ['氣', '人的精神狀態', '心情', '氣息', '氣體', '病象', '空氣'], ['堅持'], ['住', '先'], ['，終於破了'], ['生活', '世界'], ['紀錄']]
i=0 backward_context=, forward_context=舉重
他舉重 10582.169921875
她舉重 16104.935546875
它舉重 26628.703125
i=2 backward_context=他舉重嗰時掬, forward_context=度氣堅持住
他舉重嗰時掬住度氣堅持住 12438.759765625
他舉重嗰時掬先度氣堅持住 11879.0673828125
i=3 backward_context=他舉重嗰時掬先, forward_context=氣堅持住
他舉重嗰時掬先度氣堅持住 11879.0673828125
他舉重嗰時掬先反復衡量氣堅持住 10575.3505859375
他舉重嗰時掬先度數氣堅持住 10932.5673828125
他舉重嗰時掬先揣度氣堅持住 16721.763671875
他舉重嗰時掬先測量氣堅持住 8536.3984375
他舉重嗰時掬先琢磨氣堅持住 8938.1884765625
他舉重嗰時掬先程度氣堅持住 10696.958984375
他舉重嗰時掬先處氣堅持住 16819.1953125
他舉重嗰時掬先裡氣堅持住 17449.294921875
他舉重嗰時掬先那裏氣堅持住 12883.935546875
他舉重嗰時掬先量氣堅持住 12750.7490234375
i=4 backward_context=他舉重嗰時掬先測量, forward_context=堅持住
他舉重嗰時掬先測量氣堅持住 8536.3984375
他舉重嗰時掬先測量人的精神狀態堅持住 1234.3973388671875
他舉重嗰時掬先測量心情堅持住 4336.623046875
他舉重嗰時掬先測量氣息堅持住 6795.72509765625
他舉重嗰時掬先測量氣體堅持住 4760.0786132

'他舉重嗰時掬先測量人的精神狀態堅持住，終於破了世界紀錄'

In [195]:
# Example containing digits and a decimal point inside the sentence.
can2man("而家男子100米嘅世界記錄係9.58秒。")

[['現下', '現而今', '這會兒'], ['男子100'], ['米', '公尺'], ['的'], ['生活', '世界'], ['記錄是9.58秒。']]
i=0 backward_context=, forward_context=男子100
現下男子100 6701.765625
現而今男子100 5827.7001953125
這會兒男子100 7303.15625
i=2 backward_context=現而今男子100, forward_context=的生活
現而今男子100米的生活 1117.426513671875
現而今男子100公尺的生活 673.7601318359375
i=4 backward_context=現而今男子100公尺的, forward_context=記錄是9
現而今男子100公尺的生活記錄是9 373.15582275390625
現而今男子100公尺的世界記錄是9 190.93446350097656
[['現而今'], ['男子100'], ['公尺'], ['的'], ['世界'], ['記錄是9.58秒。']]


'現而今男子100公尺的世界記錄是9.58秒。'