## 数据稀疏和数据平滑

In [10]:
from match import Match
from hmm import HMM
from nlputils import * 
from bigram import Bigram

import jieba
import os
import numpy as np

# Input corpus and the folder holding its matching states file.
text_file_path = './corpus/2021.txt'
dest_folder = './states/'
# The states file has the same base name as the corpus file.
states_file_path = os.path.join(dest_folder, os.path.basename(text_file_path))
# One-off preprocessing step, left disabled (presumably it writes the states
# file into dest_folder — confirm in nlputils):
# text_to_state(text_file_path, dest_folder)

# Both files are read with the same helper.
corpus = read_corpus_or_states_for_hmm(text_file_path)
states = read_corpus_or_states_for_hmm(states_file_path)

# Pair every corpus entry with its states entry, split without shuffling,
# then unzip each split back into parallel sequences.
paired = zip(corpus, states)
train, test = yield_data(paired, shuffle=False)
corpus_train, states_train = zip(*train)
corpus_test, states_test = zip(*test)

训练二元文法模型和HMM模型

In [11]:
# Create the bigram model and fit it on the training split of the corpus.
bigram_model = Bigram()
bigram_model.train(corpus=corpus_train)

In [16]:
# Create the HMM tokenizer and fit it on the training sentences together with
# their aligned state sequences.
hmm = HMM()
hmm.train(corpus_train, states_train)

使用句子测试数据稀疏

In [4]:
# Probe sentences used to expose data sparsity in the unsmoothed bigram model.
text = [
    "我不知道你有多少钱",
    "中国国家航天总局发表演讲",
    "美国在下个月有十条裤子穿"
]

for t in text:
    # Only the token list is needed here. The HMM state sequence goes into a
    # throwaway name so the notebook-level `states` (read from the states file)
    # is not overwritten by this cell.
    _hmm_states, res = hmm.tokenize(t)
    # `details` is printed next to the probability so that bigrams with a zero
    # count can be spotted directly in the output.
    prob, details = bigram_model.get_prob(res)
    print(prob, details)

1.6496403599350137e-16 [{('<BOS>', '我'): 492, '<BOS>': 340964}, {('我', '不'): 75, '我': 18548}, {('不', '知道'): 213, '不': 20832}, {('知道', '你'): 18, '知道': 1039}, {('你', '有'): 19, '你': 2273}, {('有', '多少'): 105, '有': 34660}, {('多少', '钱'): 46, '多少': 795}, {('钱', '<EOS>'): 1, '钱': 918}]
0.0 [{('<BOS>', '中国'): 3259, '<BOS>': 340964}, {('中国', '国家'): 194, '中国': 76064}, {('国家', '航天'): 28, '国家': 39369}, {('航天', '总局'): 0, '航天': 1272}, {('总局', '发表'): 0, '总局': 344}, {('发表', '演讲'): 22, '发表': 2063}, {('演讲', '<EOS>'): 0, '演讲': 367}]
0.0 [{('<BOS>', '美国'): 362, '<BOS>': 340964}, {('美国', '在'): 147, '美国': 6267}, {('在', '下个'): 2, '在': 137717}, {('下个', '月'): 0, '下个': 10}, {('月', '有'): 35, '月': 43971}, {('有', '十条'): 0, '有': 34660}, {('十条', '裤子'): 0, '十条': 50}, {('裤子', '穿'): 0, '裤子': 7}, {('穿', '<EOS>'): 0, '穿': 550}]


使用bigram模块中基于加一法实现的数据平滑，再测试上述句子

In [5]:
# Re-score the same probe sentences with the smoothed estimate.
# NOTE(review): the outputs recorded for this cell are on the order of 1e+108 to
# 1e+121, i.e. far above 1, so the value returned by get_prob_smooth is not a
# normalized probability — verify the add-one implementation in the bigram module.
for t in text:
    # Discard the HMM state sequence into a throwaway name so the notebook-level
    # `states` (read from the states file) is not overwritten by this cell.
    _hmm_states, res = hmm.tokenize(t)
    prob, details = bigram_model.get_prob_smooth(res)
    print(prob)

6.022257637733543e+120
1.830890240596848e+121
5.287413711904295e+108


构建正向、逆向最大匹配分词模型

In [13]:
# Build the vocabulary from the training split; both matchers share it.
# (A bare `vocab_train[:10]` used to sit here; as a mid-cell expression its value
# was discarded and never displayed, so it has been dropped.)
vocab_train = corpus_to_vocab(corpus_train)
forward_match = Match("max_forward", vocab_train)
# The misspelled name `backwad_match` is kept on purpose: later cells use it.
backwad_match = Match("max_backward", vocab_train)

使用数据平滑后的消歧函数进行消歧，并计算PRF值

In [None]:
# Flat token lists accumulated over the whole test split; the scoring cell
# compares `res` (disambiguated output) against `jieba_res` (jieba output).
res = []
jieba_res = []
output_folder = './results/'
# Create the output directory up front so open() does not fail when it is missing.
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder, 'disambiguation_result_smooth.txt')

# The context manager closes the file even if a tokenizer raises mid-loop.
with open(result_path, 'w', encoding='utf-8') as result_file:
    # The loop variable is deliberately not called `test`, so the `test` split
    # produced by yield_data is not overwritten.
    for sentence in corpus_test:
        # Per the original note, each test entry is a jieba-segmented sentence;
        # stripping the spaces recovers the unsegmented text.
        raw_text = sentence.replace(" ", "")
        # The state sequence is unused; a throwaway name keeps the notebook-level
        # `states` intact.
        _hmm_states, hmm_tokens = hmm.tokenize(raw_text)
        forward_tokens = forward_match.tokenize(raw_text)
        backward_tokens = backwad_match.tokenize(raw_text)
        jieba_tokens = jieba.lcut(raw_text, cut_all=False)

        tokens = bigram_model.disambiguation(
            [hmm_tokens, forward_tokens, backward_tokens], smooth=True
        )

        # One sentence per line, tokens separated by single spaces. Joining
        # replaces the old `token != tokens[-1]` check, which compared by value
        # and therefore emitted a newline in the middle of a sentence whenever an
        # earlier token was equal to the last one.
        if tokens:
            result_file.write(' '.join(tokens) + '\n')

        res += tokens
        jieba_res += jieba_tokens

In [10]:
# Score the disambiguated tokens (`res`) against the jieba tokens (`jieba_res`).
# NOTE(review): presumably cal_count returns the reference, predicted and
# correctly matched token counts — confirm in nlputils.
real, pred, correct = cal_count(jieba_res, res)
# Presumably precision, recall and F-score, in that order — confirm in nlputils.
p, r, f = cal_prf(real, pred, correct)
# Bare last expression so the notebook displays the three values.
p, r, f

(0.9043959898725205, 0.8980334262176466, 0.9012034781417373)

测试在未平滑的情况下，因数据稀疏（三种分词结果的概率均为 0）而只能随机选择的句子比例

In [18]:
# Number of test sentences for which every candidate segmentation gets an
# unsmoothed probability of exactly 0.
need = 0
# Per-sentence candidate segmentations, kept for later inspection.
hmms = []
forward = []
backwad = []
# The loop variable is deliberately not called `text`: that name holds the list
# of probe sentences used by the earlier demo cells, and overwriting it with a
# single string would break those cells on re-run.
for sentence in corpus_test:
    raw_text = sentence.replace(" ", "")
    # The state sequence is unused; a throwaway name keeps the notebook-level
    # `states` intact.
    _hmm_states, hmm_tokens = hmm.tokenize(raw_text)
    forward_tokens = forward_match.tokenize(raw_text)
    backward_tokens = backwad_match.tokenize(raw_text)

    hmms.append(hmm_tokens)
    forward.append(forward_tokens)
    backwad.append(backward_tokens)

    # all() short-circuits on the first non-zero probability, exactly like the
    # chained `and` it replaces.
    candidates = (hmm_tokens, forward_tokens, backward_tokens)
    if all(bigram_model.get_prob(candidate)[0] == 0 for candidate in candidates):
        need += 1
# Fraction of test sentences where no candidate can be preferred without smoothing.
need/len(corpus_test)


0.9022195631261585