In [3]:
from collections import defaultdict

# Maps each Cantonese word to the list of Mandarin words paired with it in
# phrase_table.txt.
can2man_table = defaultdict(list)

# Every non-blank line of the file is "<Mandarin word>|<Cantonese word>".
# The encoding is pinned so the Chinese text decodes the same way on every
# platform instead of depending on the locale's default codec
# (assumes the file is UTF-8 -- TODO confirm).
with open("phrase_table.txt", "r", encoding="utf-8") as input_file:
    for line in input_file.read().splitlines():
        if not line.strip():
            # A blank line would otherwise crash the two-element unpack below.
            continue
        # Malformed lines (zero or several "|") still raise ValueError on purpose.
        man_word, can_word = line.split("|")
        can2man_table[can_word].append(man_word)

print(f"Generated Cantonese to Mandarin phrase table of size {len(can2man_table)}")
print(list(can2man_table.items())[0:10])

Generated Cantonese to Mandarin phrase table of size 11751
[('少少', ['一丁點兒', '一點兒', '一點', '一點點兒', '很少份量', '很少']), ('一上一落', ['一上一下']), ('下', ['一下']), ('一搊', ['一串']), ('啲', ['一些', '些', '某些', '這些']), ('單嘢', ['一件事']), ('件', ['一件']), ('一班', ['一伙', '全班', '那班']), ('單拖', ['一個人']), ('獨贏', ['一個人得頭彩'])]


In [2]:
# Set of every distinct non-whitespace character found in common_trad_chars.txt.
# Whitespace (notably the newlines separating lines of the file) is dropped so
# that it is not treated as a "common character".
# Assumes the file is UTF-8 -- TODO confirm.
with open("common_trad_chars.txt", "r", encoding="utf-8") as input_file:
    common_trad_chars = {char for char in input_file.read() if not char.isspace()}

# Set iteration order is arbitrary, so this sample differs between runs.
print("A sample of common traditional characters: ", list(common_trad_chars)[0:10])

A sample of common traditional characters:  ['梳', '燭', '粳', '憑', '賢', '曾', '舔', '刨', '寞', '箭']


In [None]:
import pandas as pd
from StarCC import PresetConversion
# StarCC converter built with src='cn', dst='hk' and phrase conversion disabled;
# presumably Simplified (Mainland) -> Traditional (Hong Kong) -- confirm against StarCC docs.
convert = PresetConversion(src='cn', dst='hk', with_phrase=False)

# Collect the first tab-separated field of every entry line, converted with
# `convert`. (An earlier revision read the "word" column of common_man_words.csv instead.)
# NOTE(review): only "#" comments and blank lines are skipped. If this file carries
# a YAML header block, its lines are still treated as words -- verify.
common_man_words = set()
with open("common_man_words.dict.yaml", "r", encoding="utf-8") as input_file:
    for line in input_file.read().splitlines():
        if not line.strip() or line.startswith("#"):
            continue
        word = line.split("\t")[0]
        if word:
            # An empty first field would otherwise add "" as a word.
            common_man_words.add(convert(word))

# Write one word per line. Sorting makes the file identical across runs,
# since raw set iteration order changes with string hash randomisation.
with open("common_man_words.txt", "w", encoding="utf-8") as output_file:
    for word in sorted(common_man_words):
        output_file.write(word + "\n")

print(f"Got {len(common_man_words)} Mandarin words")

In [None]:
# Read the Cantonese word list and keep the distinct values of its "char" column.
df = pd.read_csv(
    "common_can_words.csv",
    sep=",",
)
common_can_words = {*df["char"]}

print(f"Got {len(common_can_words)} Cantonese words")

In [None]:
# Words present in both the Cantonese and the Mandarin vocabularies.
common_words = common_can_words.intersection(common_man_words)

# Give every shared word an identity mapping (word -> word) unless it already has one.
# Iterating in sorted order keeps the insertion order of new keys, and therefore
# the written file, identical from run to run.
num_added_words = 0
for word in sorted(common_words):
    # The membership test comes first so the defaultdict is not touched
    # (and no empty entry is created) just by looking a word up.
    if word not in can2man_table or word not in can2man_table[word]:
        num_added_words += 1
        can2man_table[word].append(word)

# Dump the table as "<Cantonese>|<Mandarin>" lines, one per pair.
with open("can2man_phrase_table.txt", "w", encoding="utf-8") as output_file:
    for can, mans in can2man_table.items():
        for man in mans:
            output_file.write(can + "|" + man + "\n")

print(f"Got {len(common_words)} common words")
print(f"Added {num_added_words} shared words to can2man_table")

In [None]:
# Extend phrase table with wordshk
import json
import math

# Load the wordshk phrase table. It is later iterated as a mapping from a word to
# a nested list of candidate translations.
# JSON text is decoded as UTF-8 explicitly rather than with the locale default.
with open("wordshk_phrase_table.json", "r", encoding="utf-8") as input_file:
    wordshk_table = json.load(input_file)

def max_man_len(can_word_len: int) -> int:
    """Return the length cap computed for a word of length ``can_word_len``.

    The cap is a decreasing tanh curve rounded up to an integer:
    5 for lengths 1-2, 4 for length 3, 3 for lengths 4-5 and 2 from length 6 on.
    """
    tanh_arg = can_word_len / 2 - 1.9
    raw_cap = 3.1 - 2 * math.tanh(tanh_arg)
    return math.ceil(raw_cap)

print("Showing first few pairs added from wordshk")

# Extend can2man_table with wordshk translations. A word is skipped when it is a
# single common traditional character or when it already maps to itself.
# NOTE(review): a word that is already in the table but lacks an identity mapping
# gets the wordshk translations appended without de-duplication -- confirm intended.
num_added_words = 0
for word, mans in wordshk_table.items():
    if word in common_trad_chars:
        continue
    # The membership test comes first so the defaultdict is not populated by the lookup.
    if word in can2man_table and word in can2man_table[word]:
        continue

    # Flatten the nested candidate lists and keep only the short enough translations.
    # The cap depends only on the word, so it is computed once per word.
    length_cap = max_man_len(len(word))
    short_mans = [m for ms in mans for m in ms if len(m) <= length_cap]
    if not short_mans:
        # Nothing survives the length filter: the word is not added,
        # so it must not be counted either.
        continue

    num_added_words += 1
    if num_added_words <= 10:
        print(f"Adding the pair {word} -> {short_mans}")
    can2man_table[word].extend(short_mans)

# Dump the extended table as "<Cantonese>|<Mandarin>" lines, one per pair.
with open("can2man_phrase_table_all.txt", "w", encoding="utf-8") as output_file:
    for can, mans in can2man_table.items():
        for man in mans:
            output_file.write(can + "|" + man + "\n")

print(f"Added {num_added_words} new words from wordshk to can2man_table")