### 二元文法文本消歧

In [1]:
from match import Match
from hmm import HMM
from nlputils import * 
from bigram import Bigram

import jieba
import os
import numpy as np

导入训练语料，并处理成训练集和测试集

In [2]:
# Raw corpus file and the folder that holds its matching state-sequence file.
text_file_path = './corpus/2021.txt'
dest_folder = './states/'
# The state file carries the same base name as the corpus file, inside dest_folder.
states_file_path = f"{dest_folder}{os.path.basename(text_file_path)}"
# NOTE(review): presumably the one-off step that writes the state file; left disabled
# here -- confirm the file already exists before running this cell.
# text_to_state(text_file_path, dest_folder)
# Both files are read with the same helper; they are zipped pairwise further down.
corpus = read_corpus_or_states_for_hmm(text_file_path)
states = read_corpus_or_states_for_hmm(states_file_path)

在前面的步骤中，使用五折交叉验证说明了训练语料具有很强的泛化性

因此，为了保证和后续平滑的数据一致，这里选择不打乱数据集

In [3]:
# Pair every sentence with its state sequence, then split. Shuffling is disabled so the
# split stays identical to the one used by the later smoothing experiments.
sentence_state_pairs = zip(corpus, states)
train, test = yield_data(sentence_state_pairs, shuffle=False)
# Un-zip each split back into parallel (sentences, states) tuples.
corpus_train, states_train = zip(*train)
corpus_test, states_test = zip(*test)

将训练语料处理成可以给Match使用的格式

In [4]:
# Convert the training sentences into the word list that the Match tokenizers take
# as their vocabulary, then preview the first ten entries.
vocab_train = corpus_to_vocab(corpus_train)
vocab_train[:10]

['站', '在', '“', '两个', '一百年', '”', '奋斗目标', '的', '历史', '交汇点']

传入训练语料，训练得到最大正向匹配、最大逆向匹配和HMM三个分词模型

In [5]:
# Fit the HMM tokenizer on the aligned training sentences and state sequences.
hmm = HMM()
hmm.train(corpus_train, states_train)
# The two dictionary matchers share the training vocabulary and differ only in strategy.
# `backwad_match` (sic) keeps its spelling because later cells refer to it by that name.
forward_match, backwad_match = (
    Match(strategy, vocab_train) for strategy in ("max_forward", "max_backward")
)

测试正向逆向模型的分词结果

In [6]:
# Sanity check: run both matchers on the same short sentence and show the pair of results.
sample_sentence = "今天天气不错"
forward_match.tokenize(sample_sentence), backwad_match.tokenize(sample_sentence)

(['今天', '天气', '不错'], ['今天', '天气', '不错'])

使用训练语料训练用于消歧的二元文法模型

In [7]:
# `Bigram` is already imported in the notebook's first cell, so it is not re-imported here.
# Fit the bigram model on the training sentences; later cells call its
# `disambiguation` method to choose among candidate segmentations.
bigram_model = Bigram()
bigram_model.train(corpus=corpus_train)

消歧：将概率最大的句子返回

In [16]:
# Accumulators read by the PRF cell that follows: disambiguated tokens and jieba's tokens.
res = []
jieba_res = []
output_folder = './results/'
# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder, 'disambiguation_result.txt')

# The context manager closes the file even if a tokenizer raises part-way through.
with open(result_path, 'w', encoding='utf-8') as out_file:
    # Loop-local names deliberately differ from the notebook-level `test` and `states`,
    # so re-running earlier cells afterwards still sees the original data.
    for sentence in corpus_test:
        # Each test sentence is jieba-segmented text; drop the separating spaces
        # to recover the raw string.
        raw_text = sentence.replace(" ", "")
        hmm_states, hmm_tokens = hmm.tokenize(raw_text)
        forward_tokens = forward_match.tokenize(raw_text)
        backward_tokens = backwad_match.tokenize(raw_text)
        jieba_tokens = jieba.lcut(raw_text, cut_all=False)

        # Let the bigram model pick one of the three candidate segmentations.
        tokens = bigram_model.disambiguation([hmm_tokens, forward_tokens, backward_tokens])

        # One sentence per line, tokens separated by single spaces. Joining replaces the
        # earlier value comparison against tokens[-1], which wrote a newline too early
        # whenever an earlier token equalled the last one (e.g. repeated punctuation).
        if tokens:
            out_file.write(' '.join(tokens) + '\n')

        res += tokens
        jieba_res += jieba_tokens

计算消歧后的PRF值

In [9]:
# Compare the disambiguated tokens against jieba's segmentation, then turn the three
# counts into precision / recall / F-score and display them as a tuple.
counts = cal_count(jieba_res, res)
real, pred, correct = counts
p, r, f = cal_prf(real, pred, correct)
(p, r, f)

(0.9003080292107398, 0.8857426426702301, 0.8929659449925084)

计算正向匹配和逆向匹配的PRF值

In [10]:
# Baselines: collect the forward-match, backward-match and jieba tokens for every test
# sentence so the following cells can score each matcher on its own.
forward_res = []
backward_res = []
jieba_res = []
# The loop variable is not called `test`, so the notebook-level test split is left intact.
for sentence in corpus_test:
    # Drop the separating spaces to recover the raw, unsegmented string.
    raw_text = sentence.replace(" ", "")
    forward_res += forward_match.tokenize(raw_text)
    backward_res += backwad_match.tokenize(raw_text)
    jieba_res += jieba.lcut(raw_text, cut_all=False)

In [11]:
# Score the forward matcher against jieba's segmentation and display (P, R, F).
counts_f = cal_count(jieba_res, forward_res)
real_f, pred_f, correct_f = counts_f
p, r, f = cal_prf(real_f, pred_f, correct_f)
(p, r, f)

(0.9336170412659082, 0.9250713972441629, 0.9293245742111881)

In [12]:
# Score the backward matcher against jieba's segmentation and display (P, R, F).
counts_b = cal_count(jieba_res, backward_res)
real_b, pred_b, correct_b = counts_b
p, r, f = cal_prf(real_b, pred_b, correct_b)
(p, r, f)

(0.9368913434645046, 0.9288318815299679, 0.9328442050581789)

In [14]:
import random  # kept in this cell so it runs on its own; ideally belongs in the first import cell

# Supplementary experiment: disambiguate between the HMM result and ONE dictionary-match
# result, where forward or backward is chosen by a fair coin flip per sentence.
res = []
jieba_res = []
output_folder = './results/'
# Make sure the output directory exists so open() does not fail on a fresh checkout.
os.makedirs(output_folder, exist_ok=True)
result_path = os.path.join(output_folder, 'disambiguation_result_supplement.txt')

# A dedicated, seeded generator makes the coin flips (and therefore the PRF reported by
# the next cell) reproducible on every run without touching the global `random` state.
RANDOM_SEED = 42
coin = random.Random(RANDOM_SEED)

# The context manager closes the file even if a tokenizer raises part-way through.
with open(result_path, 'w', encoding='utf-8') as out_file:
    # Loop-local names deliberately differ from the notebook-level `test` and `states`,
    # so re-running earlier cells afterwards still sees the original data.
    for sentence in corpus_test:
        # Each test sentence is jieba-segmented text; drop the separating spaces
        # to recover the raw string.
        raw_text = sentence.replace(" ", "")
        hmm_states, hmm_tokens = hmm.tokenize(raw_text)
        forward_tokens = forward_match.tokenize(raw_text)
        backward_tokens = backwad_match.tokenize(raw_text)
        jieba_tokens = jieba.lcut(raw_text, cut_all=False)
        # Choose the forward or the backward result with equal probability.
        relative_tokens = forward_tokens if coin.random() >= 0.5 else backward_tokens

        # Let the bigram model pick between the HMM candidate and the chosen match candidate.
        tokens = bigram_model.disambiguation([hmm_tokens, relative_tokens])

        # One sentence per line, tokens separated by single spaces. Joining replaces the
        # earlier value comparison against tokens[-1], which wrote a newline too early
        # whenever an earlier token equalled the last one (e.g. repeated punctuation).
        if tokens:
            out_file.write(' '.join(tokens) + '\n')

        res += tokens
        jieba_res += jieba_tokens

In [15]:
# Score the supplementary (HMM vs. randomly chosen matcher) run against jieba's
# segmentation and display (P, R, F).
counts = cal_count(jieba_res, res)
real, pred, correct = counts
p, r, f = cal_prf(real, pred, correct)
(p, r, f)

(0.8820670705870199, 0.864406852610636, 0.8731476721754845)