In [1]:
from collections import defaultdict

can2man_table = defaultdict(list)

with open("phrase_table.txt", "r") as input_file:
    for line in input_file.read().splitlines():
        [man_word, can_word] = line.split("|")
        can2man_table[can_word].append(man_word)

print(f"Generated Cantonese to Mandarin phrase table of size {len(can2man_table)}")
print(list(can2man_table.items())[0:10])

Generated Cantonese to Mandarin phrase table of size 11751
[('少少', ['一丁點兒', '一點兒', '一點', '一點點兒', '很少份量', '很少']), ('一上一落', ['一上一下']), ('下', ['一下']), ('一搊', ['一串']), ('啲', ['一些', '些', '某些', '這些']), ('單嘢', ['一件事']), ('件', ['一件']), ('一班', ['一伙', '全班', '那班']), ('單拖', ['一個人']), ('獨贏', ['一個人得頭彩'])]


In [2]:
common_trad_chars = None

with open("common_trad_chars.txt", "r") as input_file:
    common_trad_chars = set(input_file.read())

print("A sample of common traditional characters: ", list(common_trad_chars)[0:10])

A sample of common traditional characters:  ['羨', '磕', '階', '基', '慷', '吻', '已', '畸', '婪', '剖']


In [3]:
import pandas as pd
from StarCC import PresetConversion
convert = PresetConversion(src='cn', dst='hk', with_phrase=False)

# df = pd.read_csv("common_man_words.csv", sep="\t")
# common_man_words = { convert(word) for word in df["word"] }
common_man_words = set()
with open("common_man_words.dict.yaml", "r") as input_file:
    for line in input_file.read().splitlines():
        if not line.startswith("#"):
            word = line.split("\t")[0]
            common_man_words.add(convert(word))

with open("common_man_words.txt", "w+") as output_file:
    for word in common_man_words:
        output_file.write(word + "\n")

print(f"Got {len(common_man_words)} Mandarin words")

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/kk/n4ff6h1n3t170b1m4zv09yf40000gn/T/jieba.cache
Loading model cost 0.708 seconds.
Prefix dict has been built successfully.


Got 526508 Mandarin words


In [4]:
df = pd.read_csv("common_can_words.csv", sep=",")
common_can_words = set(df["char"])

print(f"Got {len(common_can_words)} Cantonese words")

Got 92568 Cantonese words


In [5]:
common_words = common_can_words.intersection(common_man_words)

num_added_words = 0
for word in common_words:
    if not word in can2man_table or not word in can2man_table[word]:
        num_added_words += 1
        can2man_table[word].append(word)

with open("can2man_phrase_table.txt", "w+") as output_file:
    for can, mans in can2man_table.items():
        for man in mans:
            output_file.write(can + "|" + man + "\n")

print(f"Got {len(common_words)} common words")
print(f"Added {num_added_words} shared words to can2man_table")

Got 48844 common words
Added 48820 shared words to can2man_table


In [6]:
def longest_match_translate(s, phrase_table):
    man_phrases: list[list[str]] = []
    oov_word = ""
    while s:
        longest_match = None
        for phrase in phrase_table:
            if s.startswith(phrase) and (longest_match is None or len(phrase) > len(longest_match)):
                longest_match = phrase
        if longest_match:
            if len(oov_word) > 0:
                man_phrases.append([oov_word])
                oov_word = ""
            can_original = [longest_match] if len(longest_match) <= 1 and all(c in common_trad_chars for c in longest_match) else []
            man_phrase = phrase_table[longest_match]
            man_phrases.append(can_original + man_phrase)
            s = s[len(longest_match):].lstrip()
        else:
            oov_word += s[0]
            s = s[1:].lstrip()
    if len(oov_word) > 0:
        man_phrases.append([oov_word])
    # Merge anchor phrases (those with a single mandarin translation)
    i = 0
    merged_man_phrases = []
    while i < len(man_phrases):
        merged_phrase = ""
        while i < len(man_phrases) and len(man_phrases[i]) == 1:
            merged_phrase += man_phrases[i][0]
            i += 1
        if len(merged_phrase) > 0:
            merged_man_phrases.append([merged_phrase])
            merged_phrase = ""
        else:
            merged_man_phrases.append(man_phrases[i])
            i += 1
    return merged_man_phrases

In [7]:
longest_match_translate("唔該你細聲啲，我喺度做緊嘢。", can2man_table)

[['不好意思', '勞', '勞駕', '有勞您', '請你', '請', '請教', '謝', '謝謝', '麻煩您'],
 ['你小聲點，我'],
 ['在此', '在這裡', '在這邊', '在那裡', '在那邊'],
 ['正在做'],
 ['事情', '東西'],
 ['。']]

In [46]:
from transformers import BertTokenizerFast, GPT2LMHeadModel
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
model = GPT2LMHeadModel.from_pretrained('ckiplab/gpt2-base-chinese')

In [47]:
import torch

# https://huggingface.co/docs/transformers/perplexity
def get_most_fluent_sentence_index(candidates: list[str]) -> int:
    encodings = [tokenizer(candidate, return_tensors="pt") for candidate in candidates]
    ppls = []
    for encoding in encodings:
        target_ids_list = []
        seq_len = encoding.input_ids.size(1)
        for end_loc in range(2, seq_len + 1, 2):
            target_ids = encoding.input_ids[0].clone()
            target_ids[end_loc:] = -100
            target_ids_list.append(target_ids)
        target_ids = torch.stack(target_ids_list)
        input_ids = encoding.input_ids.expand(target_ids.shape)
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
            ppl = outputs.loss.item() * seq_len
            ppls.append(ppl)
    return torch.argmin(torch.tensor(ppls))

In [67]:
import regex

# match any unicode punctuation character and anything after it
punctuation_pattern = regex.compile(r"\p{P}+.*", flags=regex.UNICODE)
chinese_char_pattern = regex.compile(r"[\u4e00-\u9fff]")

def chop_off_at_punctuation(s: str) -> str:
    match = punctuation_pattern.search(s)
    if match:
        index = match.start()
        return s[:index]
    else:
        return s

def chop_off_at_canto_char(s: str) -> str:
    for i, c in enumerate(s):
        if chinese_char_pattern.match(c) and not c in common_trad_chars:
            return s[:i]
    return s

def flatten(l):
    return [item for sublist in l for item in sublist]

def can2man(s: str) -> str:
    man_phrases = longest_match_translate(s, can2man_table)
    # print(man_phrases)
    for i, phrases in enumerate(man_phrases):
        if len(phrases) == 1:
            continue
        else:
            j = i + 1
            while j < len(man_phrases) and man_phrases[j] == 1:
                j += 1
            backward_context = "".join(flatten(man_phrases[max(0, i-5):i]))
            forward_context = "".join(flatten(man_phrases[i + 1:j]))
            # forward context is too small
            while len(forward_context) < 10 and j < len(man_phrases):
                forward_context += man_phrases[j][0]
                j += 1
            forward_context = chop_off_at_canto_char(chop_off_at_punctuation(forward_context))
            # print(f"i={i} backward_context={backward_context}, forward_context={forward_context}")
            candidates = [backward_context + phrase + forward_context for phrase in man_phrases[i]]
            j = get_most_fluent_sentence_index(candidates)
            man_phrases[i] = [man_phrases[i][j]]
    # print(man_phrases)
    return "".join(flatten(man_phrases))

import random
def can2man_random(s: str) -> str:
    man_phrases = longest_match_translate(s, can2man_table)
    # print(man_phrases)
    for i, phrases in enumerate(man_phrases):
        if len(phrases) == 1:
            continue
        else:
            man_phrases[i] = [random.choice(man_phrases[i])]
    # print(man_phrases)
    return "".join(flatten(man_phrases))

In [70]:
print(can2man("唔該你細聲啲，我喺度做緊嘢。"))
print(can2man_random("唔該你細聲啲，我喺度做緊嘢。"))

請你小聲點，我在這裡正在做東西。
請教你小聲點，我在那裡正在做事情。


In [71]:
print(can2man("邊個整到本書甩皮甩骨"))
print(can2man_random("邊個整到本書甩皮甩骨"))

哪一個調整到一本書散了架子
哪一個造達到一本書皮開骨散


In [72]:
print(can2man("佢舉重嗰時掬住度氣堅持住，終於破咗世界紀錄"))
print(can2man_random("佢舉重嗰時掬住度氣堅持住，終於破咗世界紀錄"))

他舉重嗰時掬先度氣堅持先，終於破了世界紀錄
他舉重嗰時掬先那裏氣體堅持先，終於破了世界紀錄


In [73]:
print(can2man("而家男子100米嘅世界記錄係9.58秒。"))
print(can2man_random("而家男子100米嘅世界記錄係9.58秒。"))

現下男子100米的世界記錄是9.58秒。
這會兒男子100米的生活記錄是9.58秒。


In [75]:
print(can2man("絕對 唔 可以 同等 ， 母語 （ 粵語 ） 一 定係 第一 ， 至於 英文 你 覺得 重要 唔係 問題 ， 但 地位 唔 可以 超越 母語 ， 你 有 見 過 其他 國家 （ 除咗 新加坡 ） 會 將 母語 ge 地位 擺 係 外語 之後 ？".replace(" ", "")))
print(can2man_random("絕對 唔 可以 同等 ， 母語 （ 粵語 ） 一 定係 第一 ， 至於 英文 你 覺得 重要 唔係 問題 ， 但 地位 唔 可以 超越 母語 ， 你 有 見 過 其他 國家 （ 除咗 新加坡 ） 會 將 母語 ge 地位 擺 係 外語 之後 ？".replace(" ", "")))

絕對不可以同等，母語（粵語）一定是第一，至於英文你覺得重要不然問題，但地位不可以超越母語，你有見過其他國家（除了新加坡）會把母語ge地位擺是外語之後？
絕對不可以同等，母語（粵語）一定是第一，至於英文你覺得重要否則問題，但是地位不可以超越母語，你有見比其他國家（摘了新加坡）會將母語ge地位擺設是外語之後？


In [57]:
from tqdm import tqdm

with open("dev.can", "r") as input_file, open("dev.pred.base.man", "w+") as output_file:
    for line in tqdm(input_file.read().splitlines()[0:1000]):
        output_file.write(can2man(line) + "\n")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 3/1000 [00:03<20:11,  1.22s/it]


KeyboardInterrupt: 

In [77]:
from tqdm import tqdm

with open("can.txt", "r") as input_file, open("man_40K_to_120K.txt", "w+") as output_file:
    for line in tqdm(input_file.read().splitlines()[40000:40000*3]):
        output_file.write(can2man_random(line) + "\n")

100%|██████████| 80000/80000 [1:16:57<00:00, 17.32it/s]


In [6]:
# Extend phrase table with wordshk
import json
import math

with open("wordshk_phrase_table.json", "r") as input_file:
    wordshk_table = json.load(input_file)

def max_man_len(can_word_len: int) -> int:
    return math.ceil(-2 * math.tanh(.5 * can_word_len - 1.9) + 3.1)

print(f"Showing first few pairs added from wordshk")
num_added_words = 0
for word, mans in wordshk_table.items():
    if not word in common_trad_chars and (not word in can2man_table or not word in can2man_table[word]):
        num_added_words += 1
        mans = [m for ms in mans for m in ms if len(m) <= max_man_len(len(word))]
        if num_added_words <= 10 and len(mans) > 0:
            print(f"Adding the pair {word} -> {mans}")
        if len(mans) > 0:
            can2man_table[word].extend(mans)

with open("can2man_phrase_table_all.txt", "w+") as output_file:
    for can, mans in can2man_table.items():
        for man in mans:
            output_file.write(can + "|" + man + "\n")

print(f"Added {num_added_words} new words from wordshk to can2man_table")

Showing first few pairs added from wordshk
Adding the pair 田螺厴 -> ['鰓蓋']
Adding the pair 腍滋滋 -> ['柔軟的']
Adding the pair 淋滋滋 -> ['柔軟的']
Adding the pair 走犯 -> ['逃犯']
Adding the pair 打正旗號 -> ['公開地']
Adding the pair 咇 -> ['打', '巡邏']
Added 11982 new words from wordshk to can2man_table


In [21]:
can2man("唔該你細聲啲，我喺度做緊嘢。")

'不好意思你小聲點，我在這裡正在做東西。'

In [22]:
can2man("邊個整到本書甩皮甩骨")

'哪一個受傷一本書散了架子'

In [23]:
can2man("佢舉重嗰時掬住度氣堅持住，終於破咗世界紀錄")

'他舉重什麼時候提升住程度人的精神狀態堅持住，終於打破了世界紀錄'

In [24]:
can2man("而家男子100米嘅世界記錄係9.58秒。")

'現下男子100公尺的世界記錄是9.58秒。'

In [25]:
from tqdm import tqdm

with open("dev.can", "r") as input_file, open("dev.pred.wordshk.man", "w+") as output_file:
    for line in tqdm(input_file.read().splitlines()[0:1000]):
        output_file.write(can2man(line) + "\n")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


100%|██████████| 1000/1000 [10:05<00:00,  1.65it/s]


In [26]:
# Measure BLEU of base model

import os
import sacrebleu

def eval_bleu(ref, hyp):
    """
    Given a file of hypothesis and reference files,
    evaluate the BLEU score using Moses scripts.
    """
    assert os.path.isfile(ref) and os.path.isfile(hyp)
    with open(ref, "r") as ref_file, open(hyp, "r") as hyp_file:
        refs = [ref_file.read().splitlines()]
        hyp = hyp_file.read().splitlines()
        bleu = sacrebleu.BLEU(trg_lang="zh")
        return bleu.corpus_score(hyp, refs).score


def eval_chrf(ref, hyp):
    """
    Given a file of hypothesis and reference files,
    evaluate the BLEU score using Moses scripts.
    """
    assert os.path.isfile(ref) and os.path.isfile(hyp)
    with open(ref, "r") as ref_file, open(hyp, "r") as hyp_file:
        refs = [ref_file.read().splitlines()]
        hyp = hyp_file.read().splitlines()
        chrf = sacrebleu.CHRF()
        return chrf.corpus_score(hyp, refs).score


print("Identity charBLEU:", eval_bleu("dev.man", "dev.can"))
print("Identity CHRF:", eval_chrf("dev.man", "dev.can"))

print()

print("Phrase-Base charBLEU:", eval_bleu("dev.man", "dev.pred.base.man"))
print("Phrase-Base CHRF:", eval_chrf("dev.man", "dev.pred.base.man"))

print()

print("Phrase-Wordshk charBLEU:", eval_bleu("dev.man", "dev.pred.wordshk.man"))
print("Phrase-Wordshk CHRF:", eval_chrf("dev.man", "dev.pred.wordshk.man"))


Identity charBLEU: 11.916798739593405
Identity CHRF: 11.892985577296292

Phrase-Base charBLEU: 18.834564873844943
Phrase-Base CHRF: 20.091559992795194

Phrase-Wordshk charBLEU: 12.415971775936248
Phrase-Wordshk CHRF: 16.061245936906026


In [24]:
# from joblib import Parallel, delayed

import os
os.environ["TOKENIZERS_PARALLELISM"] = "true"

with open("cantonese_18M_freq_10_up.txt", "r") as input_file, open("cantonese_18M_freq_10_up.pred.base.txt", "w+") as output_file:
    # Parallel(prefer="threads", n_jobs=-1, verbose=2)(delayed(translate)(line) for line in input_file.read().splitlines()[0:100000])
    for line in input_file.read().splitlines()[0:100000]:
        output_file.write(can2man(line.replace(" ", "")) + "\n")


KeyboardInterrupt: 