# Convert to JSON

In [13]:
import json

def txt_to_json(input_file: str, output_file: str):
    """Turn a whitespace-delimited text file into a key-sorted JSON object.

    Every line is split on whitespace. The first token becomes the key and the
    remaining tokens, re-joined with single spaces, become the value. Lines
    with fewer than two tokens (including blank lines) are ignored, and a key
    that appears more than once keeps the value from its last occurrence.

    Args:
        input_file: Path of the UTF-8 text file to read.
        output_file: Path of the JSON file to write (UTF-8, 4-space indent,
            non-ASCII characters written verbatim).
    """
    entries = {}

    with open(input_file, 'r', encoding='utf-8') as source:
        for raw_line in source:
            tokens = raw_line.strip().split()
            if len(tokens) < 2:
                # Blank line, or a key with nothing after it.
                continue
            head, *tail = tokens
            entries[head] = " ".join(tail)

    # Rebuild the mapping in ascending key order before serialising.
    ordered = {key: entries[key] for key in sorted(entries)}

    with open(output_file, 'w', encoding='utf-8') as sink:
        json.dump(ordered, sink, ensure_ascii=False, indent=4)

    print(f"Successfully converted '{input_file}' to '{output_file}'")

# Build data/translate.json from the raw whitespace-delimited text file.
txt_to_json('data/translate_raw.txt', 'data/translate.json')


Successfully converted 'data/translate_raw.txt' to 'data/translate.json'


# Add words from wugniu_dict.json

### Preprocess wugniu_dict.json

In [14]:
import json

def remove_endings_but_keep_self(input_file: str, output_file: str):
    """Drop dictionary entries whose key ends with one of a fixed set of characters.

    A key that is exactly one of those characters is kept; only longer keys
    ending in one of them are removed. The number of removed entries and the
    output path are printed.

    Args:
        input_file: Path of the UTF-8 JSON object to filter.
        output_file: Path where the filtered JSON object is written.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    # Suffixes that disqualify a key, unless the key *is* the suffix itself.
    suffixes = ('些', '的', '他', '你', '給', '與', '被', '誰', '呢', '掉')
    standalone = set(suffixes)

    kept = {}
    for word, meaning in entries.items():
        if word in standalone or not word.endswith(suffixes):
            kept[word] = meaning

    print(f"Removed {len(entries) - len(kept)} entries.")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(kept, f, ensure_ascii=False, indent=4)

    print(f"Filtered data saved to {output_file}")


# Write the filtered vocabulary to a separate file; the input file is not modified.
remove_endings_but_keep_self('data/wugniu_dict_vocab.json', 'data/wugniu_dict_updated.json')

Removed 272 entries.
Filtered data saved to data/wugniu_dict_updated.json


In [15]:
import json

def remove_startswith_any_but_keep_self(input_file: str, output_file: str, prefixes: list):
    """Drop dictionary entries whose key starts with, but is not equal to, a prefix.

    Args:
        input_file: Path of the UTF-8 JSON object to filter.
        output_file: Path where the filtered JSON object is written.
        prefixes: Strings to test each key against. A key is removed when it
            starts with any of them and is not identical to that prefix.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    def _extends_some_prefix(word):
        # True when `word` is strictly longer than a prefix it begins with.
        for prefix in prefixes:
            if word.startswith(prefix) and word != prefix:
                return True
        return False

    kept = {
        word: meaning
        for word, meaning in entries.items()
        if not _extends_some_prefix(word)
    }

    dropped = len(entries) - len(kept)
    print(f"Removed {dropped} entries starting with {prefixes}.")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(kept, f, ensure_ascii=False, indent=4)

    print(f"Filtered data saved to {output_file}")

# Input and output paths are identical, so the filtered result overwrites
# data/wugniu_dict_updated.json in place.
remove_startswith_any_but_keep_self(
    'data/wugniu_dict_updated.json',
    'data/wugniu_dict_updated.json',
    prefixes=['你', '妳', '被', '誰', '喝']
)

Removed 140 entries starting with ['你', '妳', '被', '誰', '喝'].
Filtered data saved to data/wugniu_dict_updated.json


### Add words from pre-processed dict

In [16]:
import json

def enrich_translate_with_matches(translate_file: str, wugniu_file: str, output_file: str):
    """Add identity entries for vocabulary words that contain a known single character.

    Every word from ``wugniu_file`` that is not already a key of the translate
    dictionary, and that contains at least one character which is itself a
    single-character key of that dictionary, is added as ``word: word``. The
    combined dictionary is then sorted by key and written out.

    Args:
        translate_file: Path of the UTF-8 JSON translate dictionary.
        wugniu_file: Path of the UTF-8 JSON vocabulary whose keys are scanned.
        output_file: Path where the enriched dictionary is written.
    """
    # Load both JSON files
    with open(translate_file, 'r', encoding='utf-8') as f:
        translate_dict = json.load(f)

    with open(wugniu_file, 'r', encoding='utf-8') as f:
        wugniu_dict = json.load(f)

    # Snapshot the single-character keys once. Words added below always have
    # more than one character, so this set cannot change during the scan.
    single_chars = {key for key in translate_dict if len(key) == 1}

    # One pass over the vocabulary instead of one pass per single character.
    for word in wugniu_dict:
        if word not in translate_dict and not single_chars.isdisjoint(word):
            translate_dict[word] = word  # Add it as word: word

    translate_dict = dict(sorted(translate_dict.items(), key=lambda x: x[0]))

    # Save the updated translate dictionary
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(translate_dict, f, ensure_ascii=False, indent=4)

    print(f"Updated translate dictionary saved to {output_file}")

# The translate file is both input and output, so data/translate.json is overwritten in place.
enrich_translate_with_matches('data/translate.json', 'data/wugniu_dict_updated.json', 'data/translate.json')

Updated translate dictionary saved to data/translate.json


### Group and sort entries

In [17]:
import json
from collections import defaultdict, OrderedDict

def group_and_sort_translate(input_file: str, output_file: str):
    """Reorder a translate dictionary so longer keys follow their anchor character.

    Single-character keys are emitted in sorted order. Each is followed
    immediately by the longer keys anchored to it, where a key's anchor is the
    first of its characters that is also a key of the dictionary. Keys without
    an anchor are appended at the end. Every group is sorted by key.

    Args:
        input_file: Path of the UTF-8 JSON dictionary to read.
        output_file: Path where the reordered dictionary is written.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        entries = json.load(f)

    def _by_word(pair):
        return pair[0]

    single_chars = {}
    buckets = defaultdict(list)  # anchor character (or None) -> [(word, meaning), ...]

    for word, meaning in entries.items():
        if len(word) == 1:
            single_chars[word] = meaning
            continue
        # First character of the word that is itself a key; None when there is none.
        anchor = next((ch for ch in word if ch in entries), None)
        buckets[anchor].append((word, meaning))

    ordered_pairs = []
    for ch in sorted(single_chars):
        ordered_pairs.append((ch, single_chars[ch]))
        ordered_pairs.extend(sorted(buckets.get(ch, []), key=_by_word))

    # Words with no anchor character go last.
    ordered_pairs.extend(sorted(buckets.get(None, []), key=_by_word))

    final_dict = OrderedDict(ordered_pairs)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_dict, f, ensure_ascii=False, indent=4)

    print(f"Grouped and sorted dictionary saved to {output_file}")

# Input and output paths are identical, so data/translate.json is rewritten in place.
group_and_sort_translate('data/translate.json', 'data/translate.json')

Grouped and sorted dictionary saved to data/translate.json


# Convert to Simplified Chinese

In [18]:
import json
from opencc import OpenCC
from typing import Dict, Union

def preserve_convert(text: str, exclude_char: str, cc: OpenCC) -> str:
    """Convert ``text`` one character at a time, leaving ``exclude_char`` untouched.

    Args:
        text: String to convert.
        exclude_char: Character that is copied through without conversion.
        cc: Converter whose ``convert`` method is applied to every other character.

    Returns:
        The concatenation of the per-character results.
    """
    pieces = []
    for ch in text:
        if ch == exclude_char:
            pieces.append(ch)
        else:
            pieces.append(cc.convert(ch))
    return ''.join(pieces)

def add_simplified_forms(json_file_path: str, output_file_path: str) -> None:
    """Add Simplified-Chinese variants of every entry to a translation dictionary.

    Each ``key: translation`` pair from the input is kept. When the converted
    key differs from the original key, a second entry keyed by the converted
    form is added. Its translation is converted character by character via
    ``preserve_convert`` so that "箇" is never rewritten. If the converted key
    is already present, the translations are merged into a flat list without
    duplicates; a single unchanged string stays a string.

    Args:
        json_file_path: Path of the UTF-8 JSON dictionary to read. Values may
            be strings or lists of strings; other values are passed through
            unconverted.
        output_file_path: Path where the extended dictionary is written.

    Returns:
        None. A missing or undecodable input file, or a failure while saving,
        is reported with ``print`` rather than raised.
    """
    cc = OpenCC('t2s')  # Traditional to Simplified Chinese
    EXCLUDE_CHAR = "箇"

    def _as_list(entry):
        # Copy lists so that merging never mutates a value shared with the input.
        return list(entry) if isinstance(entry, list) else [entry]

    # Load the JSON file
    try:
        with open(json_file_path, 'r', encoding='utf-8') as f:
            words_dict: Dict[str, Union[str, list]] = json.load(f)
    except FileNotFoundError:
        print(f"Error: The file {json_file_path} was not found.")
        return
    except json.JSONDecodeError:
        print(f"Error: Failed to decode JSON from {json_file_path}.")
        return

    updated_dict: Dict[str, Union[str, list]] = {}

    for traditional_word, translation in words_dict.items():
        simplified_word = cc.convert(traditional_word)

        # Apply special preserve logic to translation
        if isinstance(translation, str):
            simplified_translation = preserve_convert(translation, EXCLUDE_CHAR, cc)
        elif isinstance(translation, list):
            simplified_translation = [
                preserve_convert(t, EXCLUDE_CHAR, cc) for t in translation
            ]
        else:
            simplified_translation = translation  # Just in case of odd format

        # Add original traditional pair (an entry already stored under this key wins)
        updated_dict.setdefault(traditional_word, translation)

        # Nothing more to do when conversion leaves the key unchanged
        if simplified_word == traditional_word:
            continue

        if simplified_word not in updated_dict:
            updated_dict[simplified_word] = simplified_translation
            continue

        # The simplified key already exists: merge element-wise so that a list
        # translation is never nested inside another list.
        existing = updated_dict[simplified_word]
        merged = _as_list(existing)
        for candidate in _as_list(simplified_translation):
            if candidate not in merged:
                merged.append(candidate)

        # Keep a lone string as a string; switch to a list only when needed.
        if isinstance(existing, list) or len(merged) > 1:
            updated_dict[simplified_word] = merged

    # Save updated dictionary
    try:
        with open(output_file_path, 'w', encoding='utf-8') as f:
            json.dump(updated_dict, f, ensure_ascii=False, indent=4)
        print(f"Updated dictionary saved to {output_file_path}")
    except Exception as e:
        print(f"Error saving file: {e}")

# Run it: input and output paths are identical, so data/translate.json is rewritten in place.
add_simplified_forms('data/translate.json', 'data/translate.json')


Updated dictionary saved to data/translate.json
