# 自製智能中文選字系統  (2)

In [1]:
import sys
sys.version

'3.6.12 |Anaconda custom (64-bit)| (default, Sep  9 2020, 00:29:25) [MSC v.1916 64 bit (AMD64)]'

確認版本為 python3

## 資料前處理

In [2]:
import re

In [3]:
def prepocess_line(line):
    """Extract the runs of Chinese characters contained in one line of text.

    Args:
        line: A string, typically one raw line of the corpus.

    Returns:
        A list with every maximal run of consecutive characters in the range
        U+4E00..U+9FFF, in order of appearance. Everything else (Latin
        letters, digits, punctuation, whitespace) acts as a separator and is
        dropped. The list is empty when the line has no such characters.
    """
    return [
        match.group(0)
        for match in re.finditer(r'[\u4E00-\u9FFF]+', line, flags=re.UNICODE)
    ]

In [4]:
# Gather every run of Chinese characters from the corpus, reading it line by line.
segments = []
with open('./wiki_zh_small.txt', encoding='utf-8') as fr:
    for line in fr:
        segments.extend(prepocess_line(line))

In [5]:
segments[:10]

['英語',
 '英語英語',
 '又稱爲英文',
 '是一種西日耳曼語言',
 '誕生於中世紀早期的英格蘭',
 '如今具有全球通用語的地位',
 '英語',
 '一詞源於遷居英格蘭的日耳曼部落盎格魯',
 '而',
 '盎格魯']

## 斷詞

In [6]:
!pip install jieba



In [7]:
import jieba

In [8]:
list(jieba.cut_for_search(segments[6001]))

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\angus.tu\AppData\Local\Temp\jieba.cache
Loading model cost 1.470 seconds.
Prefix dict has been built successfully.


['所以', '僅用', '於', '還原', '一些', '貴重', '的', '化合', '化合物']

In [9]:
list(jieba.cut_for_search(segments[4]))

['誕生', '於', '中世', '中世紀', '早期', '的', '英格蘭']

In [10]:
# Tokenize every segment with jieba's search mode and flatten all tokens into one list.
# NOTE(review): the sample outputs of cut_for_search in this notebook contain overlapping
# tokens (e.g. both '化合' and '化合物'), so the characters of such words are counted more
# than once by the n-gram statistics built from this list — confirm this is intended.
cut_segments = []
for seg in segments:
    cut_segments += jieba.cut_for_search(seg)

In [11]:
len(cut_segments)

222118

## 使用斷詞的結果來作Ngram

In [12]:
from collections import Counter

class Counters:
    """Frequency tables of character n-grams for every order from 1 to ``n``.

    After ``fit``, ``counters[k]`` (``1 <= k <= n``) maps each length-``k``
    slice found in the fitted segments to its number of occurrences, and
    ``counters[0]`` holds the single key ``''`` whose count is the total
    number of 1-grams (the denominator used for unigram probabilities).
    """

    def __init__(self, n):
        """Create empty tables for orders ``0..n``.

        Args:
            n: Highest n-gram order to count.
        """
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]

    def fit(self, segments):
        """Add the n-grams of every segment to the tables.

        Counts accumulate across calls. The segments are traversed exactly
        once, so a one-shot iterator or generator works as well as a list.

        Args:
            segments: Iterable of sliceable sequences (strings in this notebook).
        """
        for segment in segments:
            for order in range(1, self.n + 1):
                # Counter.update adds counts in place. The previous
                # `+= Counter(...)` also re-filtered the whole accumulated
                # table after every segment, which made fitting quadratic.
                self.counters[order].update(self._skip(segment, order))

        # Order 0 is a pseudo-table: its only entry is the total 1-gram count.
        base_count = sum(self.counters[1].values())
        self.counters[0] = Counter({'': base_count})

    def __getitem__(self, k):
        """Return the frequency table for order ``k``."""
        return self.counters[k]

    def _skip(self, segment, n):
        """Yield every contiguous length-``n`` slice of ``segment``, left to right.

        Yields nothing when the segment is shorter than ``n``. This is a
        generator, so the ``n > 0`` assertion only fires once iteration starts.
        """
        assert n > 0
        for start in range(len(segment) - n + 1):
            yield segment[start:start + n]

In [13]:
# Build the frequency tables for character n-grams of order 1 through 5
# from the tokens produced by the word segmentation step.
counters = Counters(n=5)
counters.fit(cut_segments)

In [14]:
# Print the table of every order; index 0 is the single-entry total-count table.
# NOTE(review): each print emits the complete table, so the cell output is very large.
for counter in counters.counters:
    print(counter)

Counter({'': 392780})
Counter({'的': 13274, '國': 4801, '中': 4221, '一': 4189, '在': 3746, '爲': 3637, '是': 3606, '人': 3482, '年': 3338, '有': 3303, '大': 2811, '和': 2735, '以': 2314, '於': 2022, '用': 1995, '學': 1995, '個': 1875, '地': 1835, '文': 1762, '不': 1747, '了': 1722, '時': 1683, '成': 1661, '化': 1658, '西': 1535, '分': 1465, '會': 1456, '上': 1429, '民': 1420, '語': 1416, '其': 1413, '部': 1393, '發': 1378, '方': 1333, '教': 1313, '行': 1304, '後': 1277, '之': 1275, '主': 1274, '作': 1261, '南': 1227, '省': 1212, '生': 1211, '出': 1209, '公': 1199, '多': 1195, '可': 1194, '要': 1187, '家': 1184, '等': 1169, '自': 1166, '子': 1158, '他': 1156, '這': 1153, '合': 1145, '來': 1144, '代': 1137, '法': 1128, '數': 1117, '北': 1115, '能': 1110, '日': 1109, '與': 1088, '到': 1070, '而': 1066, '及': 1066, '第': 1066, '區': 1034, '同': 1026, '全': 1024, '高': 1002, '定': 978, '經': 975, '也': 974, '最': 966, '體': 960, '對': 958, '海': 954, '使': 950, '東': 947, '原': 947, '斯': 943, '理': 943, '十': 940, '元': 936, '世': 914, '本': 904, '種': 891, '名': 889, '族': 88

In [15]:
class Ngram:
    """Maximum-likelihood next-character model of a single order ``n``.

    The probability of a character following a context is
    ``count(context + char) / count(context)``, where the numerator comes
    from the order-``n`` table and the denominator from the order-``n-1``
    table of the supplied counters.
    """

    def __init__(self, n, counters):
        """Bind the two frequency tables this order needs.

        Args:
            n: Order of the model; must not exceed ``counters.n``.
            counters: Object exposing ``n`` and indexable by order.
        """
        assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        """Rank the characters that may follow ``prefix``.

        Args:
            prefix: Text typed so far; needs at least ``n - 1`` characters.
                Only its last ``n - 1`` characters are used as context.
            top_k: Number of results to keep; any value ``<= 0`` keeps all.

        Returns:
            A list of ``(probability, character)`` tuples sorted in
            descending tuple order (probability first, then character).
        """
        context_len = self.n - 1
        assert len(prefix) >= context_len

        # A unigram model has no context; prefix[-0:] would wrongly take the whole prefix.
        context = prefix[-context_len:] if context_len > 0 else ''
        context_count = self.minor_counter[context]

        ranked = sorted(
            (
                (count / context_count, gram[-1])
                for gram, count in self.major_counter.items()
                if gram.startswith(context)
            ),
            reverse=True,
        )
        if top_k > 0:
            return ranked[:top_k]
        return ranked

    def get_proba_dict(self, prefix=''):
        """Return ``{character: probability}`` for every character that can follow ``prefix``."""
        return dict(
            (word, prob)
            for prob, word in self.predict_proba(prefix, top_k=-1)
        )

In [16]:
# One model per order, from the 1-gram up to the highest order the counters hold,
# so that ngrams[k] is the (k+1)-gram model. The upper bound is taken from
# counters.n instead of repeating the literal used when the counters were built.
ngrams = [Ngram(order, counters) for order in range(1, counters.n + 1)]

In [17]:
# Display the first five Ngram objects. Ngram defines no __repr__, so only the
# default object representations are shown.
ngrams[:5]

[<__main__.Ngram at 0x93784e0>,
 <__main__.Ngram at 0x9378f98>,
 <__main__.Ngram at 0x9378c50>,
 <__main__.Ngram at 0x9378d68>,
 <__main__.Ngram at 0x9378f60>]

## 使用Smoothing of Language Models來建立第二版選字系統

In [18]:
class ChineseWordRecommenderV2:
    """Next-character recommender that smooths several n-gram models by interpolation.

    The estimate of the highest usable order is blended recursively with the
    estimates of all lower orders, so a candidate that the high-order model
    has never seen still receives probability mass from the lower orders.
    """

    def __init__(self, ngrams):
        """Store the models to combine.

        Args:
            ngrams: Sequence of models exposing ``get_proba_dict(prefix)``.
                NOTE(review): ``predict_proba`` slices this sequence by prefix
                length, which assumes index ``k`` holds the ``(k+1)``-gram
                model; the ordering is not checked here.
        """
        self.ngrams = ngrams

    def predict_proba(self, prefix='', top_k=5, interpolation_lambda=0.99):
        """Rank the characters most likely to follow ``prefix``.

        Args:
            prefix: Text typed so far.
            top_k: Number of results to keep; any value ``<= 0`` keeps all.
            interpolation_lambda: Weight given to each order's own estimate;
                the remaining ``1 - lambda`` goes to the lower orders.

        Returns:
            A list of ``(probability, character)`` tuples sorted by
            descending probability.
        """
        # Use the instance's own models (not a notebook-level global), limited
        # to the orders whose context fits inside the prefix.
        usable_ngrams = self.ngrams[:len(prefix) + 1]
        proba_dicts = [ngram.get_proba_dict(prefix) for ngram in usable_ngrams]

        # Score each candidate once, even when several orders propose it.
        probas = {}
        for proba_dict in proba_dicts:
            for word in proba_dict:
                if word not in probas:
                    probas[word] = self._get_interpolation_proba(
                        word, proba_dicts, interpolation_lambda)

        sorted_probas = sorted(
            ((proba, word) for word, proba in probas.items()),
            key=lambda pair: pair[0],
            reverse=True,
        )
        return sorted_probas[:top_k] if top_k > 0 else sorted_probas

    def _get_interpolation_proba(self, word, proba_dicts, interp_lambda, idx=None):
        """Return the interpolated probability of ``word`` at level ``idx``.

        Computes ``P_k = lambda * p_k + (1 - lambda) * P_(k-1)`` with
        ``P_0 = p_0``, where ``p_k`` is ``proba_dicts[k].get(word, 0.)``.

        Args:
            word: Candidate character.
            proba_dicts: Probability dictionaries, lowest order first.
            interp_lambda: Weight of the current level's own estimate.
            idx: Level to evaluate; defaults to the last dictionary.
        """
        if idx is None:
            idx = len(proba_dicts) - 1
        if idx == 0:
            return proba_dicts[idx].get(word, 0.)
        return interp_lambda * proba_dicts[idx].get(word, 0.) + \
               (1 - interp_lambda) * self._get_interpolation_proba(word, proba_dicts, interp_lambda, idx=idx-1)

In [19]:
model = ChineseWordRecommenderV2(ngrams)

In [20]:
probs = model.predict_proba('法', top_k=10)
probs

[(0.051904146168121834, '國'),
 (0.04388616116650217, '律'),
 (0.029848241612163377, '院'),
 (0.020222220928179165, '語'),
 (0.011414997351123958, '蘭'),
 (0.010549762034510142, '系'),
 (0.00792383560501087, '定'),
 (0.007908712635409567, '制'),
 (0.007072068387587443, '學'),
 (0.007060356996987107, '西')]

## Demo

In [21]:
!pip install -U pip
!pip install -q ipywidgets



In [22]:
import ipywidgets as widgets

# A text box for the text typed so far and a label showing the suggested next characters.
text = widgets.Textarea()
label = widgets.Label()
display(label, text)


def func(change):
    """Refresh the label with the ten best next-character suggestions for the new text."""
    suggestions = model.predict_proba(change.new, top_k=10)
    candidates = [word for _, word in suggestions]
    label.value = ' ' + '\t'.join(candidates)


# Re-run the recommender whenever the content of the text box changes.
text.observe(func, names='value')

Label(value='')

Textarea(value='')