# 自製智能中文選字系統  (2)

In [1]:
import sys
# Display the interpreter's version string to confirm the kernel runs Python 3.
sys.version

'3.7.3 (v3.7.3:ef4ec6ed12, Mar 25 2019, 22:22:05) [MSC v.1916 64 bit (AMD64)]'

確認版本為 python3

## 資料前處理

In [2]:
import re

In [3]:
def prepocess_line(line):
    """Extract the Chinese-character runs contained in one line of text.

    Args:
        line: A string of raw text.

    Returns:
        A list with every maximal run of characters in the range
        U+4E00..U+9FFF, in order of appearance. Any other character
        (Latin letters, digits, punctuation, whitespace) acts as a
        separator and is dropped. The list is empty when there is no match.
    """
    cjk_run = re.compile(r'[\u4E00-\u9FFF]+', flags=re.UNICODE)
    return cjk_run.findall(line)

In [5]:
# Collect the Chinese-only segments of every line in the corpus file.
segments = []
with open('./datasets/wiki_zh_small.txt', encoding='utf8') as fr:
    # Iterating the file object yields the same lines as readlines().
    for line in fr:
        segments.extend(prepocess_line(line))

In [6]:
# Preview the first ten extracted segments.
segments[:10]

['英語',
 '英語英語',
 '又稱爲英文',
 '是一種西日耳曼語言',
 '誕生於中世紀早期的英格蘭',
 '如今具有全球通用語的地位',
 '英語',
 '一詞源於遷居英格蘭的日耳曼部落盎格魯',
 '而',
 '盎格魯']

## 斷詞

In [7]:
!pip install jieba

Defaulting to user installation because normal site-packages is not writeable


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [8]:
import jieba

In [9]:
# Reference: https://blog.csdn.net/laobai1015/article/details/80420016
# cut_for_search takes two arguments: the string to segment and whether to use
# the HMM model. It is meant for building search-engine inverted indexes, so
# its segmentation is relatively fine-grained.
# Spot-check the tokenizer on a single segment.
list(jieba.cut_for_search(segments[6001]))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\aband\AppData\Local\Temp\jieba.cache
Loading model cost 0.800 seconds.
Prefix dict has been built successfully.


['所以', '僅用', '於', '還原', '一些', '貴重', '的', '化合', '化合物']

In [11]:
# Tokenise every segment with jieba's cut_for_search and gather all tokens
# into one flat list.
cut_segments = []
for seg in segments:
    cut_segments.extend(jieba.cut_for_search(seg))
cut_segments[:10]

['英語', '英語', '英語', '又', '稱', '爲', '英文', '是', '一種', '西']

## 使用斷詞的結果來作Ngram

In [12]:
from collections import Counter

class Counters:
    """Frequency tables for every n-gram length from 1 up to ``n``.

    After :meth:`fit`, ``counters[k]`` (``1 <= k <= n``) maps each length-``k``
    slice seen in the training segments to its number of occurrences.
    ``counters[0]`` holds the single key ``''`` whose count is the total
    number of 1-grams.
    """

    def __init__(self, n):
        """Create empty tables for n-gram lengths ``0..n``."""
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def fit(self, segments):
        """Accumulate n-gram counts from ``segments``.

        Args:
            segments: A re-iterable collection (it is traversed once per
                n-gram length) of sliceable sequences whose slices are
                hashable, e.g. strings. N-grams never span two segments.

        Calling ``fit`` again adds to the counts already collected.
        """
        for i in range(1, 1 + self.n):
            counter = self.counters[i]
            for segment in segments:
                # Counter.update only touches this segment's keys, whereas
                # ``counter += Counter(...)`` rescans the whole accumulated
                # counter on every segment and becomes quadratic on a corpus.
                counter.update(self._skip(segment, i))

        base_count = sum(self.counters[1].values())
        self.counters[0] = Counter({'': base_count})

    def __getitem__(self, k):
        """Return the frequency table for n-grams of length ``k``."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Yield every contiguous length-``n`` slice of ``segment`` in order.

        Yields nothing when ``segment`` is shorter than ``n``.
        """
        assert n > 0
        # range() is already empty when len(segment) < n, so no extra guard.
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [13]:
# Count all 1- to 5-grams. cut_segments is a flat list of jieba tokens, so the
# n-grams are character sequences taken inside each token.
counters = Counters(n=5)
counters.fit(cut_segments)

In [14]:
class Ngram:
    """Order-``n`` model that scores the next character from n-gram counts.

    The probability of a character following a context of ``n - 1``
    characters is estimated as count(context + character) / count(context).
    """

    def __init__(self, n, counters):
        """Bind the model to the length-``n`` and length-``n - 1`` tables.

        Args:
            n: Order of the model; must not exceed ``counters.n``.
            counters: Object exposing ``n`` and indexable by n-gram length.
        """
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        """Rank candidate next characters for ``prefix``.

        Args:
            prefix: Text typed so far; needs at least ``n - 1`` characters.
                Only its last ``n - 1`` characters are used as context.
            top_k: Number of candidates to return; any value <= 0 returns
                all of them.

        Returns:
            A list of ``(probability, character)`` tuples sorted in
            descending order.
        """
        context_len = self.n - 1
        assert len(prefix) >= context_len

        context = prefix[-context_len:] if self.n > 1 else ''
        context_count = self.minor_counter[context]
        ranked = [
            (count / context_count, gram[-1])
            for gram, count in self.major_counter.items()
            if gram.startswith(context)
        ]
        ranked.sort(reverse=True)
        if top_k > 0:
            return ranked[:top_k]
        return ranked

    def get_proba_dict(self, prefix=''):
        """Return ``{character: probability}`` for every candidate after ``prefix``."""
        proba_by_word = {}
        for prob, word in self.predict_proba(prefix, top_k=-1):
            proba_by_word[word] = prob
        return proba_by_word

In [15]:
# One model per order from 1 to 5: ngrams[k] is the (k + 1)-gram model.
ngrams = [Ngram(i, counters) for i in range(1, 6)]

In [20]:
# ngrams[1] is the bigram model, so only the last character of the prefix
# ('思') serves as context; the result maps each candidate next character to
# its estimated probability.
ngrams[1].get_proba_dict('我思')

{'想': 0.3465346534653465,
 '考': 0.10891089108910891,
 '維': 0.0891089108910891,
 '汗': 0.06930693069306931,
 '聰': 0.009900990099009901,
 '源': 0.009900990099009901,
 '成': 0.009900990099009901,
 '性': 0.009900990099009901,
 '廣': 0.009900990099009901,
 '騁': 0.0049504950495049506,
 '邈': 0.0049504950495049506,
 '華': 0.0049504950495049506,
 '義': 0.0049504950495049506,
 '無': 0.0049504950495049506,
 '樓': 0.0049504950495049506,
 '明': 0.0049504950495049506,
 '州': 0.0049504950495049506,
 '南': 0.0049504950495049506,
 '作': 0.0049504950495049506,
 '主': 0.0049504950495049506}

## 使用Smoothing of Language Models來建立第二版選字系統

In [27]:
class ChineseWordRecommenderV2:
    """Next-character recommender using interpolation smoothing over n-grams.

    The score of a candidate is defined recursively over the model orders:

        p_0 = P_1gram(word)
        p_i = lambda * P_(i+1)gram(word) + (1 - lambda) * p_(i-1)

    so higher-order evidence dominates while lower orders still contribute
    when the longer context was never observed.
    """

    def __init__(self, ngrams, interpolation_lambda=0.99):
        """
        Args:
            ngrams: Sequence of models ordered by increasing order, where
                ``ngrams[i]`` is expected to be the (i + 1)-gram model and
                exposes ``get_proba_dict(prefix)``.
            interpolation_lambda: Weight (hyperparameter, expected in
                [0, 1]) given to the higher-order estimate at each
                interpolation step.
        """
        self.ngrams = ngrams
        self.interpolation_lambda = interpolation_lambda

    def predict_proba(self, prefix='', top_k=5):
        """Rank candidate next characters for ``prefix``.

        Args:
            prefix: Text typed so far (may be empty).
            top_k: Number of candidates to return; any value <= 0 returns
                all of them.

        Returns:
            A list of ``(probability, character)`` tuples sorted in
            descending order.
        """
        # An order-n model needs n - 1 characters of context, so only the
        # first len(prefix) + 1 models apply. Use the models given to this
        # instance, not whatever global happens to be called ``ngrams``.
        usable_ngrams = self.ngrams[:len(prefix) + 1]
        proba_dicts = [ngram.get_proba_dict(prefix) for ngram in usable_ngrams]

        # Every character proposed by at least one model is a candidate.
        words = set()
        for proba_dict in proba_dicts:
            words.update(proba_dict)

        probs = [
            (self._get_interpolation_proba(word, proba_dicts, self.interpolation_lambda), word)
            for word in words
        ]
        sorted_probas = sorted(probs, reverse=True)
        return sorted_probas[:top_k] if top_k > 0 else sorted_probas

    def _get_interpolation_proba(self, word, proba_dicts, interp_lambda, idx=None):
        """Interpolated probability of ``word`` using ``proba_dicts[0..idx]``.

        Args:
            word: Candidate character.
            proba_dicts: Per-order ``{character: probability}`` dicts, lowest
                order first; a missing character counts as probability 0.
            interp_lambda: Weight of the higher-order estimate at each step.
            idx: Highest index of ``proba_dicts`` to include; defaults to
                the last one.
        """
        if idx is None:
            idx = len(proba_dicts) - 1
        # Unrolled form of the recursion, applying the same arithmetic from
        # the lowest order upwards.
        proba = proba_dicts[0].get(word, 0.)
        for level in range(1, idx + 1):
            proba = interp_lambda * proba_dicts[level].get(word, 0.) + \
                    (1 - interp_lambda) * proba
        return proba

In [28]:
# Build the recommender on top of the five n-gram models.
model = ChineseWordRecommenderV2(ngrams)

In [40]:
# Top-10 next-character suggestions, as (probability, character) pairs, for
# the prefix '資料科學'.
probs = model.predict_proba('資料科學', top_k=10)
probs

[(1.7336872249364213e-05, '家'),
 (5.290699545403566e-06, '院'),
 (1.502994017772679e-06, '界'),
 (1.5025083428822606e-06, '及'),
 (7.956629952599319e-07, '校'),
 (7.614313809754809e-07, '系'),
 (7.540063065015268e-07, '上'),
 (7.529960290767157e-07, '性'),
 (7.519978703952214e-07, '史'),
 (7.510346931282596e-07, '化')]

## Demo

In [30]:
!pip install -U pip
!pip install -q ipywidgets

Defaulting to user installation because normal site-packages is not writeable


Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.


In [34]:
import ipywidgets as widgets

# Text area for typing and a label that shows the suggested next characters.
text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    """Refresh the suggestion label whenever the text area content changes.

    ``change.new`` is the updated content of the text area.
    """
    probs = model.predict_proba(change.new, top_k=10)
    label.value = ' ' + '\t'.join([word for prob, word in probs])

# ``names`` must be the name of the trait to watch. The text area's content
# lives in its ``value`` trait; any other name means the callback never runs.
text.observe(func, names='value')

Label(value='')

Textarea(value='')