# Ancy —— 小鹤音形

## 目标

1. 尽量利用现有词库——搜狗细胞词库
2. 不考虑支持繁体
3. 尽量减少内存消耗，从而可以使用更多的细胞词库
4. 词组输入：通过首字形+末字形组合减少重复

## 思路

1. 获取规范汉字
2. 获取汉字编码：拼音编码+小鹤形码
3. 拼接编码（输入方案中，利用 Rime 的拼写运算将音形相拼）
4. 构造词组+编码
5. 输出方案

In [3]:
# The canonical character list already lives in stage1/hanzi.txt and the
# single-character database (Xiaohe phonetic-shape codes) in
# stage2/single_ch.db; this cell extracts one shape code per character.
import sqlite3

single_ch_db = sqlite3.connect("stage2/single_ch.db")
chars = {}  # character -> [longest shape code found for it]
cursor = single_ch_db.cursor()
# NOTE(review): the connection is never closed in this cell; confirm that no
# later cell still needs it before adding a close().
rows = cursor.execute("select character, encodings from single_characters")
for ch, encodings in rows:
    # Only the shape part is of interest and it does not differ between the
    # readings of a character, so the first row seen for a character wins.
    if ch not in chars:
        # Remove the '*' markers (hidden, unused shape codes), cut off the
        # first two characters (presumably the phonetic part) and keep the
        # longest remainder; max() returns the first candidate on ties.
        shape_codes = [code.replace('*', '')[2:] for code in encodings.split(' ')]
        chars[ch] = [max(shape_codes, key=len)]


In [4]:
# Load the pinyin readings of every character. Characters that have no shape
# code are left without one for now; there seems to be no way to decompose
# characters automatically.

from pypinyin import pinyin, Style


# Share of each reading of a polyphonic character, taken from the luna_pinyin
# dictionary: character -> {reading -> percentage}.
char_multi_freq = {}
with open('data/luna_pinyin.dict.yaml', 'r', encoding='UTF-8') as f:
    content = f.read()
    for line in content.splitlines():
        # Only rows carrying a percentage weight are relevant.
        if '%' not in line:
            continue
        fields = line.split('\t')
        # Skip anything that is not a "<text>\t<reading>\t<NN%>" row (for
        # example a comment that merely contains '%') instead of crashing on
        # the tuple unpacking.
        if len(fields) != 3:
            continue
        ch, vocal, freq_percent = fields
        if len(ch) > 1:
            continue
        char_multi_freq.setdefault(ch, {})[vocal] = float(freq_percent.strip('%'))

with open('data/pinyin.txt', 'r', encoding='UTF-8') as f:
    content = f.read()

# Deletes the tone digits 1-4 from a reading in a single pass.
tone_digits = str.maketrans('', '', '1234')

char_vocal = {}  # character -> [set of toneless readings]
char_freq = {}   # character -> frequency
for line in content.splitlines():
    splits = line.split()
    # Check the column count BEFORE indexing so that blank or truncated lines
    # are skipped rather than raising IndexError.
    if len(splits) <= 4:
        # No reading column: report the character when the line has one.
        if len(splits) > 1:
            print(splits[1])
        continue
    ch = splits[1]
    vocals = splits[4].translate(tone_digits).split('/')
    char_vocal[ch] = [set(vocals)]
    char_freq[ch] = float(splits[2])
































秊









﨧


In [5]:
import itertools

# Write the single-character entries: one "<char>\t<reading>:<shape>\t<freq>"
# row for every (reading, shape code) combination of a character.
char_encodings = {}  # NOTE(review): never filled in this cell; kept as-is
with open('data/out_char.txt', 'w', encoding='UTF-8') as f:
    for ch, ch_pinyin in char_vocal.items():
        # Characters without a known shape code get an empty shape part.
        shape_encoding = chars.get(ch, [''])
        base_freq = char_freq[ch]
        reading_share = char_multi_freq.get(ch)

        # Sorted so the row order does not depend on set iteration order.
        for vocal in sorted(ch_pinyin[0]):
            if reading_share is None:
                freq = base_freq
            else:
                # Always scale the character's BASE frequency by this
                # reading's percentage. Reusing the previously scaled value
                # would compound the percentages of earlier readings into
                # later ones. Readings without a listed share count as 0%.
                freq = base_freq * (reading_share.get(vocal, 0.0) / 100.0)
            for shape in shape_encoding:
                f.write('{}\t{}:{}\t{}\n'.format(ch, vocal, shape, int(freq)))

In [6]:
from pypinyin import pinyin, Style

# Load the phrase list: the first two whitespace-separated columns of every
# line are the phrase and its integer frequency.
with open('data/dict.txt.big.txt', 'r', encoding='UTF-8') as f:
    content = f.read()
    phrases = [
        (fields[0], int(fields[1]))
        for fields in map(str.split, content.splitlines())
    ]

In [7]:
def gen_workable_phrases(phrases, *, shape_table=None):
    """Build the encodable phrase entries.

    Args:
        phrases: iterable of ``(phrase, freq)`` pairs.
        shape_table: mapping ``character -> [shape_code, ...]``; only the
            first code of each list is used. Defaults to the notebook-level
            ``chars`` table.

    Returns:
        A list of ``(phrase, codes, freq)`` tuples, where ``codes`` is a list
        of ``"<reading>:<shape>"`` strings, one per character. One tuple is
        produced for every combination of the readings returned by
        ``pinyin()``. Single-character phrases and phrases containing a
        character that is missing from ``shape_table`` are left out.
    """
    if shape_table is None:
        shape_table = chars

    workable = []  # (phrase, encoding, freq)
    for phrase, freq in phrases:
        if len(phrase) == 1:
            continue

        # The shape codes do not depend on the reading, so resolve them once
        # per phrase, and drop the phrase before the comparatively expensive
        # pinyin() call when any character has no shape code.
        shapes = [shape_table.get(p) for p in phrase]
        if any(shape is None for shape in shapes):
            continue
        shape_encodings = [shape[0] for shape in shapes]

        vocal = pinyin(phrase, style=Style.NORMAL)
        for phrase_vocal in itertools.product(*vocal):
            out = [':'.join(pair) for pair in zip(phrase_vocal, shape_encodings)]
            workable.append((phrase, out, freq))
    return workable

# Dump every usable phrase as "<phrase>\t<space-separated codes>\t<freq>".
with open('data/out_phrase.txt', 'w', encoding='UTF-8') as f:
    f.writelines(
        '\t'.join((word, ' '.join(codes), str(weight))) + '\n'
        for word, codes, weight in gen_workable_phrases(phrases)
    )

In [8]:
# Add the Sogou cell dictionaries.
# They first have to be converted to the "Sogou Pinyin txt" format with
# https://github.com/studyzy/imewlconverter/releases/tag/v2.9.0
with open('data/sogou.txt', 'r', encoding='UTF-8') as f:
    content = f.read()
    lines = content.splitlines()

with open('data/out_sogou.txt', 'w', encoding='UTF-8') as f:
    for line in lines:
        # Each line is "<pinyin> <word>" separated by exactly one space.
        vocal, char = line.split(' ')
        # The pinyin field is split on apostrophes and the piece before the
        # first apostrophe is discarded.
        syllables = vocal.split("'")[1:]
        # Pair every character with its syllable; zip() stops at the shorter
        # of the two, so the written word is rebuilt from the pairs.
        pairs = list(zip(char, syllables))
        out_phrase = ''.join(ch for ch, _ in pairs)
        codes = []
        for ch, v in pairs:
            shape = chars.get(ch)
            # Characters without a shape code keep an empty shape part.
            codes.append(v + ':' + ('' if shape is None else shape[0]))
        f.write('{}\t{}\n'.format(out_phrase, ' '.join(codes).strip()))
        


In [1]:
# Merge everything: start with the dictionary header, which ends with the
# YAML end-of-document marker "..." followed by a newline.
header = '\n'.join((
    '# Rime default settings',
    '',
    '# Rime schema: ancy_flypy_extend',
    '',
    '# Rime dictionary: ancy_flypy_extend',
    '',
    '---',
    'name: ancy_flypy_extend',
    'version: "0.2"',
    'sort: by_weight',
    'use_preset_vocabulary: false',
    '...',
    '',
))

# One character per single-letter code.
singles = [
    ('一', 'y'), ('个', 'g'), ('非', 'f'), ('他', 't'),
    ('不', 'b'), ('可', 'k'), ('的', 'd'), ('小', 'x'),
    ('三', 's'), ('才', 'c'), ('出', 'i'), ('去', 'q'),
    ('哦', 'o'), ('在', 'z'), ('这', 'v'), ('就', 'j'),
    ('是', 'u'), ('你', 'n'), ('我', 'w'), ('二', 'e'),
    ('人', 'r'), ('没', 'm'), ('和', 'h'), ('平', 'p'),
    ('了', 'l'), ('啊', 'a'),
]

# The single-letter entries follow the header directly, each with the fixed
# weight 10000000.
header += ''.join(
    '{}\t{}\t10000000\n'.format(ch, encoding) for ch, encoding in singles
)

# Concatenate the intermediate outputs in a fixed order (characters, phrases,
# Sogou entries) and write them after the header.
content = ''
for part in ('data/out_char.txt', 'data/out_phrase.txt', 'data/out_sogou.txt'):
    with open(part, 'r', encoding='UTF-8') as f:
        content += f.read()

with open('data/ancy_flypy_extend.dict.yaml', 'w', encoding='UTF-8') as f:
    f.write(header + content)
