In [1]:
from hanzipy.decomposer import HanziDecomposer
from hanzipy.dictionary import HanziDictionary

# Build the two hanzipy entry points once; the cells below reuse these objects.
decomposer = HanziDecomposer()
dictionary = HanziDictionary()

# Sanity check: echo the class name of each freshly constructed object.
print('Ready:', *(type(obj).__name__ for obj in (decomposer, dictionary)))

INFO:root:Done compiling 12040 characters
DEBUG:root:Compiling hanzi characters dictionary...
DEBUG:root:Starting to read frequency data
DEBUG:root:Frequency data loaded


Ready: HanziDecomposer HanziDictionary


## Dictionary functions
Try `definition_lookup`, `dictionary_search`, `get_pinyin`, and `get_examples`.

In [11]:
# definition_lookup() returns a list of dictionary entries; each entry in the
# output below is a dict with 'traditional', 'simplified', 'pinyin' and
# 'definition' keys.
dictionary.definition_lookup('大人')

[{'traditional': '大人',
  'simplified': '大人',
  'pinyin': 'da4 ren5',
  'definition': 'adult/grownup/title of respect toward superiors'}]

In [3]:
# dictionary_search() finds occurrences/compounds containing the query.
# The result list is long, so display its length plus only the first five
# entries instead of dumping everything.
results = dictionary.dictionary_search('听')
len(results), results[:5]

(138,
 [{'traditional': '不問就聽不到假話',
   'simplified': '不问就听不到假话',
   'pinyin': 'bu4 wen4 jiu4 ting1 bu4 dao4 jia3 hua4',
   'definition': "Don't ask and you won't be told any lies. (idiom)"},
  {'traditional': '不聽命',
   'simplified': '不听命',
   'pinyin': 'bu4 ting1 ming4',
   'definition': 'to disobey'},
  {'traditional': '不聽老人言，吃虧在眼前',
   'simplified': '不听老人言，吃亏在眼前',
   'pinyin': 'bu4 ting1 lao3 ren2 yan2 , chi1 kui1 zai4 yan3 qian2',
   'definition': '(idiom) ignore your elders at your peril'},
  {'traditional': '且聽下回分解',
   'simplified': '且听下回分解',
   'pinyin': 'qie3 ting1 xia4 hui2 fen1 jie3',
   'definition': 'to listen to the next chapter for an explanation'},
  {'traditional': '中聽',
   'simplified': '中听',
   'pinyin': 'zhong1 ting1',
   'definition': "pleasant to hear (i.e. agreeable news)/to one's liking/music to one's ears/Taiwan pr. [zhong4 ting1]"}])

In [9]:
# get_pinyin() returns all possible pinyin readings for a single character,
# as a list of numbered-pinyin strings (e.g. 'ben3').
dictionary.get_pinyin('本')

['ben3']

In [5]:
# get_examples() groups example words by frequency buckets: the result is a
# dict whose keys (e.g. 'high_frequency', 'mid_frequency') map to lists of
# dictionary entries.
dictionary.get_examples('听')

{'high_frequency': [{'traditional': '听',
   'simplified': '听',
   'pinyin': 'yin3',
   'definition': 'smile (archaic)'},
  {'traditional': '聽',
   'simplified': '听',
   'pinyin': 'ting1',
   'definition': 'to listen/to hear/to obey/a can (loanword from English "tin")/classifier for canned beverages'},
  {'traditional': '聽',
   'simplified': '听',
   'pinyin': 'ting4',
   'definition': '(literary pronunciation, still advocated in Taiwan) to rule/to sentence/to allow'}],
 'mid_frequency': [{'traditional': '傾聽',
   'simplified': '倾听',
   'pinyin': 'qing1 ting1',
   'definition': 'to listen attentively'},
  {'traditional': '好聽',
   'simplified': '好听',
   'pinyin': 'hao3 ting1',
   'definition': 'pleasant to hear'},
  {'traditional': '收聽',
   'simplified': '收听',
   'pinyin': 'shou1 ting1',
   'definition': 'to listen to (a radio broadcast)'},
  {'traditional': '竊聽',
   'simplified': '窃听',
   'pinyin': 'qie4 ting1',
   'definition': 'to eavesdrop/to wiretap'},
  {'traditional': '聆聽',
   'simp

## Decomposer functions
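Try the three decomposition levels (`decompose`, `decompose_many`) and the component helpers (`component_exists`, `get_characters_with_component`, `get_radical_meaning`).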

In [None]:
# decompose(character, decomposition_type=None)
# decomposition_type: 1='Once', 2='Radical', 3='Graphical'
# No decomposition_type is passed here, so the default (None) applies.
# NOTE(review): presumably this returns every decomposition level at once;
# confirm against the hanzipy documentation.
decomposer.decompose('爱')

In [None]:
# decomposition_type=2 selects the 'Radical' decomposition.
decomposer.decompose('爱', 2)

In [None]:
# decomposition_type=3 selects the 'Graphical' decomposition.
decomposer.decompose('爱', 3)

In [None]:
# decompose_many() decomposes each char in a string.
# NOTE(review): the second argument is presumably the same decomposition_type
# as in decompose() (2='Radical'); confirm against the hanzipy documentation.
decomposer.decompose_many('爱橄黃', 2)

In [None]:
# component_exists(): query one CJK component ('乂') and one character that is
# not a CJK component ('$') so the two results can be compared side by side.
# NOTE(review): presumably returns a bool for each call; confirm by running.
decomposer.component_exists('乂'), decomposer.component_exists('$')

In [None]:
# get_characters_with_component(): keep only the first 25 results so the
# cell output stays short.
decomposer.get_characters_with_component('囗')[:25]

In [6]:
# get_radical_meaning() returns a short meaning (when known),
# e.g. 'water' for the radical '氵'.
decomposer.get_radical_meaning('氵')

'water'

## Helper: numbered pinyin → tone marks
If you want tone marks (e.g. `xue3` → `xuě`) you can use this helper.

In [None]:
# Tone-marked forms of each pinyin vowel; the list index is (tone number - 1)
# for tones 1-4. The neutral tone (0/5) carries no mark and has no entry.
TONE_MARKS = {
    'a': ['ā', 'á', 'ǎ', 'à'],
    'e': ['ē', 'é', 'ě', 'è'],
    'i': ['ī', 'í', 'ǐ', 'ì'],
    'o': ['ō', 'ó', 'ǒ', 'ò'],
    'u': ['ū', 'ú', 'ǔ', 'ù'],
    'ü': ['ǖ', 'ǘ', 'ǚ', 'ǜ'],
}

def numbered_pinyin_to_tone_marks(pinyin: str) -> str:
    """Convert numbered pinyin (e.g. ``'da4 ren5'``) to tone-marked pinyin.

    The input is split on whitespace and every token is converted
    independently; the results are re-joined with single spaces.

    Rules applied to each token:

    * ``u:`` / ``U:`` is always rewritten to ``ü`` / ``Ü``.
    * A trailing ASCII digit is taken as the tone number and removed.
    * ``v`` / ``V`` is rewritten to ``ü`` / ``Ü`` only when the token carried
      a tone digit, so bare Latin-letter tokens (``'D V D'``) are untouched.
    * Tones 1-4 place a mark on one vowel: ``a`` if present, else ``e``,
      else the ``o`` of ``ou``, else the ``u`` of ``iu``, else the ``i`` of
      ``ui``, else the last vowel.
    * Any other tone digit (0, 5 for neutral tone, or the invalid 6-9) yields
      the syllable without a mark instead of raising.
    * Tokens without a vowel are returned without their tone digit.

    Args:
        pinyin: Whitespace-separated numbered-pinyin syllables.

    Returns:
        The converted string; an empty or all-whitespace input gives ``''``.
    """
    # Checked in priority order: (substring that triggers the rule, vowel that
    # receives the mark).
    placement_rules = (('a', 'a'), ('e', 'e'), ('ou', 'o'), ('iu', 'u'), ('ui', 'i'))

    def convert_syllable(syllable: str) -> str:
        """Convert a single whitespace-free token (rules in the outer docstring)."""
        if not syllable:
            return syllable
        syllable = syllable.replace('u:', 'ü').replace('U:', 'Ü')
        tone = 5
        # ASCII digits only: str.isdigit() also accepts characters such as '²'
        # that int() cannot parse.
        if syllable[-1] in '0123456789':
            tone = int(syllable[-1])
            syllable = syllable[:-1]
            # 'v' is the keyboard spelling of 'ü' (e.g. 'nv3'); only trusted
            # on tokens that were actually written as numbered syllables.
            syllable = syllable.replace('v', 'ü').replace('V', 'Ü')
        if tone not in (1, 2, 3, 4):
            # Neutral tone, or an out-of-range digit that has no tone mark.
            return syllable
        lower = syllable.lower()
        vowel_positions = [i for i, ch in enumerate(lower) if ch in 'aeiouü']
        if not vowel_positions:
            return syllable
        # Fallback when no rule matches: mark the last vowel.
        mark_index = vowel_positions[-1]
        for trigger, target in placement_rules:
            if trigger in lower:
                mark_index = lower.find(target)
                break
        marked = TONE_MARKS[lower[mark_index]][tone - 1]
        # Preserve the case of the vowel being replaced.
        if syllable[mark_index].isupper():
            marked = marked.upper()
        return syllable[:mark_index] + marked + syllable[mark_index + 1:]

    return ' '.join(convert_syllable(s) for s in pinyin.split())

# Demo: convert three sample syllables; two of them spell ü as 'v' and 'u:'.
tuple(numbered_pinyin_to_tone_marks(sample) for sample in ('xue3', 'nv3', 'lu:4'))