# 自製智能中文選字系統  (1)

In [1]:
import sys
sys.version

'3.7.9 (default, Aug 31 2020, 17:10:11) [MSC v.1916 64 bit (AMD64)]'

## 資料前處理

確認版本為 python3

In [2]:
import re

In [3]:
def prepocess_line(line):
    # 僅僅挑出中文字元，並且斷開不連續的中文字
    pattern = r'[^\u4e00-\u9fa5]+'
    segments = re.split(pattern, line)
    #segments = [x for x in segments if x!='']
    segments = [x for x in segments]

    return segments

In [4]:
prepocess_line('“英語”一詞源於遷居英格蘭的日耳曼部落盎格魯（），而“盎格魯”得名於')  
# 應該為：['英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

['', '英語', '一詞源於遷居英格蘭的日耳曼部落盎格魯', '而', '盎格魯', '得名於']

In [5]:
segments = []
with open('./wiki_zh_small.txt', encoding ='utf-8') as fr:
    for line in fr.readlines():
        segments += prepocess_line(line)
segments[:10]

['英語',
 '英語英語',
 '又稱爲英文',
 '是一種西日耳曼語言',
 '誕生於中世紀早期的英格蘭',
 '如今具有全球通用語的地位',
 '英語',
 '一詞源於遷居英格蘭的日耳曼部落盎格魯',
 '而',
 '盎格魯']

## Ngram

一開始要先計算字詞出現的次數

In [6]:
from collections import Counter

class Counters:
    def __init__(self, n):
        self.n = n
        self.counters = [Counter() for _ in range(n + 1)]  # 分別代表計算0、1、...個字的出現次數
        #self.counters: [Counter(), Counter(), Counter(), Counter()]

    def fit(self, segments):
        # 因為 self.counters 分別代表計算0、1、...個字的出現次數
        # 請在此實作利用 segments 以及函式 _skip 來統計次數
        self.counters[0] = Counter({'': len(''.join(segments))})
        for i in range(1,self.n+1):
            #for seg in segments:
            seg_skip = self._skip(''.join(segments), i)
            #print(Counter(seg_skip))
            self.counters[i] = Counter(seg_skip)
            #self.counters[i] = Counter(seg_skip) 如果 = 0
            
        return self.counters
    
    def _skip(self, segment, n):
        assert n > 0
        if len(segment) < n:
            return []
        shift = n - 1
        for i in range(len(segment) - shift):
            yield segment[i:i+shift+1]

In [7]:
counters = Counters(n=3)
counters = counters.fit(segments)

In [8]:
print(counters[0])
print(counters[1].most_common()[:5])
print(counters[2].most_common()[:5])
print(counters[3].most_common()[:5])

Counter({'': 371370})
[('的', 13270), ('國', 4801), ('中', 3944), ('在', 3708), ('一', 3659)]
[('中國', 827), ('一個', 674), ('使用', 592), ('年月', 556), ('可以', 510)]
[('年月日', 372), ('西班牙', 225), ('聯合國', 212), ('共和國', 212), ('人民共', 188)]


### 建立N-Gram模型
N-Gram模型在計算詞機率時為(以Trigram為例)
$$
P(W_i|W_{i-1},W_{i-2}) = \frac{count(W_i,W_{i-1},W_{i-2})}{count(W_{i-1},W_{i-2})}
$$

舉例來說
$$
P(the|this,is) = \frac{count(this\ is\ the)}{count(this\ is)}
$$

In [9]:
class Ngram:
    def __init__(self, n, counters):
        #assert n <= counters.n
        self.n = n
        self.major_counter = counters[n]
        #self.minor_counter = counters[n-1]

    def predict_proba(self, prefix='', top_k=5):
        assert len(prefix) >= self.n - 1
        # 使用 Ngram 的公式計算出下一個字出現的機率
        # 輸出為機率與字的tuple列表，詳見下方輸出範例
        # 應該為：[(0.035732269174118744, '的')
        
        minor_gram_freq = dict()
        probs = dict()
        
        major_gram_freq = sorted(self.major_counter.items(), key = lambda word_count: word_count[1], reverse=True)
        #minor_gram_freq = sorted(self.minor_counter.items(), key = lambda word_count: word_count[1], reverse=True)
        #('年月日', 372)
        for key in major_gram_freq:
            minor_gram_freq[key[0][:2]] = minor_gram_freq.get(key[0][:2],0) + key[1]
            
        for key2 in major_gram_freq:
            if key2[0][:2] not in probs.keys():
                probs[key2[0][:2]] = []
                probs[key2[0][:2]] = [[key2[1]/minor_gram_freq[key2[0][:2]], key2[0][2]]]
            else:
                probs[key2[0][:2]] = probs[key2[0][:2]] + [[key2[1]/minor_gram_freq[key2[0][:2]], key2[0][2]]]
        
        #按第一維排序
        probs[prefix].sort(reverse=True)
        
        #按第二維排序(若有別的案例第二維是數字的話使用)
        #probs[prefix].sort(key=lambda x:x[1])
        #retrun probs[prefix][:top_k]
         
        #return sorted_probs[:top_k] if top_k > 0 and len(sorted_probs)>=top_k else sorted_probs   
        if len(probs[prefix])>=top_k and top_k>0:
            return probs[prefix][:top_k]
        else:
            return probs[prefix]

In [10]:
trigram = Ngram(3, counters)

In [11]:
trigram.predict_proba('我思')

[[0.75, '故'], [0.25, '維']]

In [12]:
trigram.predict_proba('可以')

[[0.07254901960784314, '用'],
 [0.06666666666666667, '在'],
 [0.045098039215686274, '使'],
 [0.025490196078431372, '是'],
 [0.023529411764705882, '被']]

## 使用Ngram來建立第一版選字系統

In [13]:
class ChineseWordRecommenderV1:
    def __init__(self, n=0):
        self.n = 0
        #self.unigram = unigram
        #self.bigram = bigram
        #self.trigram = trigram
    
    def predict_proba(self, prefix='', top_k=5):
        minor_gram_freq = dict()
        probs = dict()
        
        #trigram
        if len(prefix)>=2:            
            prefix = prefix[-2:]
            self.n = 3
            self.major_counter = counters[self.n]
            major_gram_freq = sorted(self.major_counter.items(), key = lambda word_count: word_count[1], reverse=True)
            for key in major_gram_freq:
                minor_gram_freq[key[0][:2]] = minor_gram_freq.get(key[0][:2],0) + key[1]
            for key2 in major_gram_freq:
                if key2[0][:2] not in probs.keys():
                    probs[key2[0][:2]] = []
                    probs[key2[0][:2]] = [[key2[1]/minor_gram_freq[key2[0][:2]], key2[0][2]]]
                else:
                    probs[key2[0][:2]] = probs[key2[0][:2]] + [[key2[1]/minor_gram_freq[key2[0][:2]], key2[0][2]]]
                    
            probs[prefix].sort(reverse=True)
            
            #return sorted_probs[:top_k] if top_k > 0 and len(sorted_probs)>=top_k else sorted_probs 
            if len(probs[prefix])>=top_k and top_k>0:
                return probs[prefix][:top_k]
            else:
                return probs[prefix]
            
        #bigram    
        elif len(prefix)>=1:
            self.n = 2
            self.major_counter = counters[self.n]
            major_gram_freq = sorted(self.major_counter.items(), key = lambda word_count: word_count[1], reverse=True)
            for key in major_gram_freq:
                minor_gram_freq[key[0][:1]] = minor_gram_freq.get(key[0][:1],0) + key[1]
            for key2 in major_gram_freq:
                if key2[0][:1] not in probs.keys():
                    probs[key2[0][:1]] = []
                    probs[key2[0][:1]] = [[key2[1]/minor_gram_freq[key2[0][:1]], key2[0][1]]]
                else:
                    probs[key2[0][:1]] = probs[key2[0][:1]] + [[key2[1]/minor_gram_freq[key2[0][:1]], key2[0][1]]]
                    
            probs[prefix].sort(reverse=True)
            
            #return sorted_probs[:top_k] if top_k > 0 and len(sorted_probs)>=top_k else sorted_probs 
            if len(probs[prefix])>=top_k and top_k>0:
                return probs[prefix][:top_k]
            else:
                return probs[prefix] 
        #unigram    
        else:
            self.n=1
            self.major_counter = counters[self.n]
            major_gram_freq = sorted(self.major_counter.items(), key = lambda word_count: word_count[1], reverse=True) 
            
            return major_gram_freq[:top_k]

In [14]:
model = ChineseWordRecommenderV1()

In [15]:
probs = model.predict_proba('我思故我', top_k=10)
probs

[[0.8333333333333334, '在'], [0.16666666666666666, '有']]

In [16]:
probs = model.predict_proba('我思', top_k=10)
probs

[[0.75, '故'], [0.25, '維']]

In [17]:
probs = model.predict_proba('我', top_k=10)
probs

[[0.40298507462686567, '們'],
 [0.07960199004975124, '的'],
 [0.04477611940298507, '在'],
 [0.01990049751243781, '是'],
 [0.01990049751243781, '思'],
 [0.014925373134328358, '比'],
 [0.014925373134328358, '不'],
 [0.009950248756218905, '這'],
 [0.009950248756218905, '深'],
 [0.009950248756218905, '最']]

In [18]:
probs = model.predict_proba('', top_k=10)
probs

[('的', 13270),
 ('國', 4801),
 ('中', 3944),
 ('在', 3708),
 ('一', 3659),
 ('爲', 3637),
 ('是', 3600),
 ('有', 3252),
 ('人', 3201),
 ('年', 3188)]

## Demo

In [19]:
#!pip install -U pip
#!pip install -q ipywidgets

In [20]:
import ipywidgets as widgets

text = widgets.Textarea()
label = widgets.Label()
display(label, text)

def func(change):
    probs = model.predict_proba(change.new, top_k=10)
    label.value = ' ' + '\t'.join([word for prob, word in probs])

text.observe(func, names='value')

Label(value='')

Textarea(value='')