In [41]:
import json
from collections import defaultdict
from tqdm import tqdm

# Phrase table keyed by the "can" side of each line (presumably Cantonese ->
# Mandarin, going by the file name); one key may map to several "man" values.
can2man_table = defaultdict(list)

# The files hold CJK text, so decode explicitly as UTF-8 instead of relying on
# the platform's locale default (which is not UTF-8 on every system).
with open("can2man_phrase_table.txt", "r", encoding="utf-8") as input_file:
    for line in input_file.read().splitlines():
        # Each line must be exactly "can|man"; any other shape raises
        # ValueError here rather than being silently mis-parsed.
        [can, man] = line.split("|")
        can2man_table[can].append(man)
    print(f"Loaded {len(can2man_table)} word pairs from can2man_table")

# One word per line; used below to skip entries that are already known.
with open("common_man_words.txt", "r", encoding="utf-8") as input_file:
    common_man_words = set(input_file.read().splitlines())

# Filled by the wordshk processing loop: variant -> list of phrase lists.
wordshk_table = defaultdict(list)

import re

def is_ascii(s: str) -> bool:
    """Return True when every character of ``s`` is ASCII (code point < 128).

    An empty string counts as ASCII. Delegates to the built-in
    ``str.isascii`` instead of checking ``ord`` character by character.
    """
    return s.isascii()

def filter_empty_string(strs: list[str]) -> list[str]:
    """Return a new list holding the items of ``strs`` whose length is non-zero.

    Order is preserved; whitespace-only strings are kept because they are not
    empty.
    """
    # filter(len, ...) keeps exactly the items for which len(item) != 0.
    return list(filter(len, strs))

def process_eng_definition(d: str) -> list[str]:
    """Break one English definition into short, clean candidate phrases.

    Processing steps:

    1. Remove bracketed spans on a single line: an opening ``(``, ``[`` or
       full-width ``（`` up to the nearest closing ``)``, ``]`` or ``）``.
    2. Drop everything from the first newline onwards.
    3. Split the remainder on ``;``, ``|`` and ``,``.
    4. Discard fragments that look like commentary rather than a gloss
       (see the guard clauses below), that contain non-ASCII characters, or
       that contain more than five spaces.
    5. In each kept fragment, collapse ``x/y`` alternatives to the part before
       the slash, then strip surrounding whitespace.

    Args:
        d: The raw definition text.

    Returns:
        The surviving phrases, in order, with empty strings removed. The list
        is empty when nothing survives.
    """
    skipped_prefixes = ("literally", "usually", "this word", "note that", "for example")
    # Same checks as separate searches for "eg", "e.g.", "etc", "homophone"
    # and "i.e."; these are case-sensitive and run on the unstripped fragment.
    marker_pattern = r"\beg\b|\be[.]g[.]|\betc\b|\bhomophone\b|\bi[.]e[.]"

    # Raw strings throughout: "\(" and "\g<1>" are invalid escape sequences in
    # ordinary string literals and trigger warnings on recent Python versions.
    no_paren = re.sub(r"([\(\[（]).*?([\)\]）])", "", d).strip()
    no_newline = re.sub(r"\n.*", "", no_paren)

    phrases = []
    for fragment in re.split("[;|,]", no_newline):
        normalized = fragment.strip().lower()
        if normalized.startswith(skipped_prefixes):
            continue
        if re.search(marker_pattern, fragment):
            continue
        if '"' in fragment:
            continue
        if normalized in ("surname", "a surname"):
            continue
        # Both checks deliberately look at the unstripped fragment, so a
        # leading space left over from the split counts towards the limit.
        if not is_ascii(fragment) or fragment.count(" ") > 5:
            continue
        # The first group is greedy, so with several slashes the cut happens
        # at the last one ("a/b/c" -> "a/b").
        # NOTE(review): confirm that keeping "a/b" there is intended.
        phrases.append(re.sub(r"\b(.+)/(.+)\b", r"\g<1>", fragment.strip()))
    return filter_empty_string(phrases)

# Load the wordshk dictionary. The loop below treats each key as one or more
# "/"-separated spellings and each value as an iterable of English definition
# strings (assumed from how they are used here; verify against the JSON file).
with open("wordshk_can_eng.json", "r", encoding="utf-8") as input_file:
    data = json.load(input_file)
print(f"Loaded {len(data.keys())} entries from wordshk")

for word, eng in tqdm(data.items()):
    variants = word.split("/")
    for variant in variants:
        # Filter out words like "⋯嚟⋯去"
        if "⋯" in variant:
            if variant.count("⋯") > 1:
                continue
            dot_index = variant.index("⋯")
            if dot_index == 0:
                # Leading ellipsis: keep the text after it.
                variant = variant[1:]
            elif dot_index == len(variant) - 1:
                # Trailing ellipsis: keep the text before it.
                variant = variant[:-1]
            else:
                # Ellipsis in the middle: not a usable standalone phrase.
                continue
        # A bare "⋯" or an empty segment from "a//b" would otherwise become an
        # empty-string key in the output table.
        if not variant:
            continue
        if variant not in can2man_table and variant not in common_man_words:
            # Process each definition once and keep only non-empty results.
            defs = [phrases for d in eng if (phrases := process_eng_definition(d))]
            if defs:
                wordshk_table[variant].extend(defs)

# ensure_ascii=False writes CJK characters verbatim, so the file must be
# opened with an encoding that can represent them on every platform.
with open("wordshk_phrase_table_eng.json", "w", encoding="utf-8") as output_file:
    json.dump(wordshk_table, output_file, ensure_ascii=False)

print(f"Found {len(wordshk_table.keys())} new definitions from wordshk")

Loaded 58876 word pairs from can2man_table
Loaded 46456 entries from wordshk


100%|██████████| 46456/46456 [00:00<00:00, 76252.64it/s]

Found 14272 new definitions from wordshk





In [61]:
# Set GOOGLE_APPLICATION_CREDENTIALS for this kernel to a JSON file that is
# expected to sit next to the notebook.
# NOTE(review): presumably a service-account key that the Google client
# library reads for authentication — confirm, and keep the file out of
# version control.
%env GOOGLE_APPLICATION_CREDENTIALS=translate-english-to-mandarin-c793a8415d64.json

from google.cloud import translate_v2 as translate
# Module-level client shared by translate_texts() below.
translate_client = translate.Client()

# Give every distinct English phrase a dense integer id (0, 1, 2, ... in
# first-seen order) and mirror wordshk_table with those ids in place of the
# phrases, so that each phrase only has to be translated once.
wordshk_id_table = defaultdict(list)
eng_phrase_to_id = {}

for word, defs in wordshk_table.items():
    for phrase_group in defs:
        # setdefault evaluates len() before inserting, so an unseen phrase
        # receives the next free id and a known phrase keeps its existing one.
        group_ids = [
            eng_phrase_to_id.setdefault(phrase, len(eng_phrase_to_id))
            for phrase in phrase_group
        ]
        wordshk_id_table[word].append(group_ids)

# Next unused id, i.e. the number of distinct phrases registered.
max_id = len(eng_phrase_to_id)

# https://stackoverflow.com/a/312464/6798201
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst``, each at most ``n`` items long.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    yield from (lst[offset:offset + n] for offset in range(0, len(lst), n))

def translate_texts(source, target, texts, batch_size=128):
    """Translate ``texts`` in batches using the module-level ``translate_client``.

    Args:
        source: Source language code, passed as ``source_language``.
        target: Target language code, passed as ``target_language``.
        texts: List of strings to translate.
        batch_size: Maximum number of strings sent per request. Defaults to
            128, the value that was previously hard-coded (presumably chosen
            to stay within a per-request API limit — TODO confirm).

    Returns:
        A list with the ``"translatedText"`` value of every result, in the
        order the results were returned. Empty when ``texts`` is empty.
    """
    # Text can also be a sequence of strings, in which case this method
    # will return a sequence of results for each text.
    # NOTE(review): results may come back HTML-escaped unless a plain-text
    # format is requested from the client — confirm against the API docs.
    final_results = []
    # Materialise the batches so tqdm knows the total and can show progress.
    for segment in tqdm(list(chunks(texts, batch_size))):
        results = translate_client.translate(segment, source_language=source, target_language=target)
        final_results.extend(result["translatedText"] for result in results)
    return final_results

# Smoke test against the live API before spending time on the full run.
assert translate_texts("en", "zh-hk", ["section", "length", "festival", "holiday"]) == ["部分", "長度", "節日", "假期"]

print("Calling Google Translate API...")
eng_phrases = list(eng_phrase_to_id.keys())
man_phrases = translate_texts("en", "zh-hk", eng_phrases)
print("Google Translate API Finished translation...")

# zip() below would silently truncate on a length mismatch and pair ids with
# the wrong translations, so fail loudly instead.
if len(man_phrases) != len(eng_phrases):
    raise RuntimeError(
        f"Expected {len(eng_phrases)} translations but received {len(man_phrases)}"
    )

# Dict iteration order is insertion order, so ids and translations line up.
id_to_man_phrase = dict(zip(eng_phrase_to_id.values(), man_phrases))
wordshk_man_table = {
    word: [[id_to_man_phrase[d_id] for d_id in ds_id] for ds_id in defs_id]
    for word, defs_id in wordshk_id_table.items()
}

# ensure_ascii=False writes CJK characters verbatim, so the file must be
# opened with an encoding that can represent them on every platform.
with open("wordshk_phrase_table.json", "w", encoding="utf-8") as output_file:
    json.dump(wordshk_man_table, output_file, ensure_ascii=False)

print(f"Translated {len(wordshk_man_table.keys())} English definitions to Mandarin")

env: GOOGLE_APPLICATION_CREDENTIALS=translate-english-to-mandarin-c793a8415d64.json


100%|██████████| 1/1 [00:00<00:00,  3.64it/s]


Calling Google Translate API...


100%|██████████| 147/147 [00:25<00:00,  5.77it/s]

Google Translate API Finished translation...
Translated 14272 English definitions to Mandarin



