# 基础配置
https://blog.csdn.net/Yellow_python/article/details/89066366

In [1]:
import numpy as np, pandas as pd
PATH_TRAIN = 'train.txt'  # 训练数据路径
log = lambda p: np.log(p + 1e-9)  # 平滑处理的对数函数
START = '<start>'  # 起始tag

# 数据读取、预处理

In [2]:
train = np.loadtxt(PATH_TRAIN, dtype=str, delimiter='/')
pd.DataFrame(train, columns=['word', 'tag']).head()

Unnamed: 0,word,tag
0,Newsweek,NNP
1,",",","
2,trying,VBG
3,to,TO
4,keep,VB


In [3]:
words = sorted(set(train[:, 0]))
tags = sorted(set(train[:, 1]))

W = len(words)  # 词汇量
T = len(tags)   # 词性种类数

word2id = {words[i]: i for i in range(W)}
tag2id = {tags[i]: i for i in range(T)}
id2tag = {i: tags[i] for i in range(T)}

W, T

(18977, 53)

# HMM模型训练
发射概率矩阵、起始概率矩阵、转移概率矩阵

In [4]:
# 矩阵初始化
emit_p = np.zeros((T, W))  # emission_probability
start_p = np.zeros(T)  # start_probability
trans_p = np.zeros((T, T))  # transition_probability

# 训练
prev_tag = START  # 前一个tag
for word, tag in train:
    wid, tid = word2id[word], tag2id[tag]
    emit_p[tid][wid] += 1
    if prev_tag == START:
        start_p[tid] += 1
    else:
        trans_p[tag2id[prev_tag]][tid] += 1
    prev_tag = START if word == '.' else tag  # 句尾判断

# 频数 --> 概率对数
start_p = log(start_p / sum(start_p))
for i in range(T):
    emit_p[i] = log(emit_p[i] / sum(emit_p[i]))
    trans_p[i] = log(trans_p[i] / sum(trans_p[i]))

## 发射概率矩阵

In [5]:
pd.DataFrame(emit_p[7:14, 9988:9999], index=tags[7:14], columns=words[9988:9999])

Unnamed: 0,developers,developing,development,developments,develops,deviant,deviation,device,devices,devils,devise
CC,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
CD,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
DT,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
EX,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
FW,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
IN,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
JJ,-20.723266,-9.475534,-20.723266,-20.723266,-20.723266,-9.475534,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266


## 起始概率矩阵

In [6]:
pd.DataFrame(start_p[:14].reshape(1, 14), columns=tags[:14])

Unnamed: 0,$,'',(,),",",.,:,CC,CD,DT,EX,FW,IN,JJ
0,-20.723266,-2.754958,-5.127924,-5.127924,-20.723266,-20.723266,-6.360067,-2.958871,-4.736445,-1.526625,-5.908083,-8.999117,-2.192296,-3.305393


## 隐状态转移概率矩阵

In [7]:
pd.DataFrame(trans_p[7:14, 7:14], index=tags[7:14], columns=tags[7:14])

Unnamed: 0,CC,CD,DT,EX,FW,IN,JJ
CC,-7.81278,-3.312973,-2.160294,-5.210093,-20.723266,-2.996542,-2.214361
CD,-3.879582,-1.666299,-3.737806,-20.723266,-8.896854,-2.462315,-3.276461
DT,-7.454313,-3.811479,-6.461062,-20.723266,-8.658282,-4.73962,-1.521804
EX,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266,-20.723266
FW,-3.970292,-20.723266,-20.723266,-20.723266,-1.405343,-3.277145,-3.970292
IN,-6.569239,-2.779579,-1.11983,-6.758481,-8.327093,-3.947574,-2.330645
JJ,-4.055012,-4.100268,-5.604345,-20.723266,-9.475534,-2.871603,-2.599282


# 维特比算法

In [8]:
sentence = 'Newsweek , trying to keep pace with rival Time magazine '\
           ', announced new advertising rates for 1990 and said it '\
           'will introduce a new incentive plan for advertisers .'.strip().split()

In [9]:
obs = [word2id[w] for w in sentence]  # 观测序列
le = len(obs)  # 句子长度

dp = np.array([[-1e99] * T] * le)  # 记录节点最大概率对数
path = np.zeros((le, T), dtype=int)  # 记录上个转移节点

for j in range(T):
    dp[0][j] = start_p[j] + emit_p[j][obs[0]]

for i in range(1, le):
    for j in range(T):
        dp[i][j], path[i][j] = max(
            (dp[i - 1][k] + trans_p[k][j] + emit_p[j][obs[i]], k)
            for k in range(T))

# 隐序列
states = [np.argmax(dp[le - 1])]
# 从后到前的循环来依次求出每个单词的词性
for i in range(le - 2, -1, -1):
    states.append(path[i + 1][states[0]])
states = [id2tag[i] for i in states[::-1]]

## 动态规划矩阵

In [10]:
pd.DataFrame(dp.T[7:14], index=tags[7:14], columns=sentence)

Unnamed: 0,Newsweek,",",trying,to,keep,pace,with,rival,Time,magazine,...,it,will,introduce,a,new,incentive,plan,for,advertisers,.
CC,-23.682136,-33.189414,-34.425319,-43.722532,-49.053794,-52.198943,-61.138477,-69.012975,-76.521668,-86.013875,...,-150.422061,-153.126019,-159.302255,-162.8025,-168.498196,-170.31242,-178.714511,-186.586369,-193.832408,-199.626142
CD,-25.459711,-33.913098,-35.729313,-43.308098,-44.150993,-51.225494,-63.049824,-65.223315,-76.963899,-87.14031,...,-147.232958,-156.622523,-168.066622,-161.829051,-164.855362,-170.357677,-180.625858,-188.497717,-190.042748,-203.080089
DT,-22.24989,-35.90123,-33.964612,-40.94321,-43.846132,-48.98321,-62.951194,-63.563566,-78.334385,-87.826592,...,-146.129777,-153.067178,-157.222815,-140.320617,-167.504945,-171.861754,-180.527227,-188.399086,-188.382999,-201.104852
EX,-26.631348,-48.000536,-37.508169,-60.000289,-57.276186,-56.130763,-67.05107,-69.202217,-82.43426,-91.926467,...,-151.743816,-169.123776,-172.067867,-166.73432,-176.947744,-186.980675,-184.627103,-192.498962,-194.021649,-206.258131
FW,-29.722383,-37.568935,-39.562291,-60.000289,-58.844798,-56.130763,-67.05107,-70.770829,-82.339164,-90.796147,...,-165.093395,-169.123776,-172.067867,-149.981346,-169.702165,-175.732943,-184.627103,-192.498962,-195.590262,-217.471142
IN,-22.915562,-33.152509,-34.503503,-29.794439,-46.933531,-49.649192,-41.72047,-66.39131,-74.686097,-84.178304,...,-146.504719,-151.747329,-159.302255,-149.465997,-165.783503,-169.129012,-176.87894,-166.539903,-191.210743,-198.248116
JJ,-24.028659,-34.741624,-35.073657,-30.660768,-45.083503,-49.93229,-62.690105,-52.140365,-75.462913,-87.565503,...,-147.292381,-153.487032,-159.302255,-148.594974,-145.534143,-168.856691,-180.266139,-188.137998,-189.593814,-200.920605


## 记录节点转移矩阵

In [11]:
pd.DataFrame(path.T, index=tags, columns=sentence)

Unnamed: 0,Newsweek,",",trying,to,keep,pace,with,rival,Time,magazine,...,it,will,introduce,a,new,incentive,plan,for,advertisers,.
$,0,21,4,41,37,39,20,12,13,21,...,40,28,20,39,9,13,20,20,12,23
'',0,21,4,41,37,39,20,12,13,20,...,40,28,19,39,9,13,20,20,12,23
(,0,21,4,41,37,39,20,12,20,21,...,40,28,19,39,9,13,20,20,12,23
),0,21,4,41,13,39,20,12,20,21,...,40,28,19,39,9,13,20,20,30,23
",",0,21,4,41,37,39,20,12,20,20,...,40,28,19,39,9,13,20,20,12,23
.,0,21,4,41,37,39,20,12,20,20,...,40,28,19,39,9,13,20,20,12,23
:,0,21,4,41,13,39,20,12,20,20,...,40,28,20,39,9,13,20,20,12,23
CC,0,21,4,41,37,39,20,12,20,20,...,40,28,19,39,9,13,20,20,12,23
CD,0,21,4,41,37,39,20,12,13,21,...,40,28,20,39,9,13,20,20,12,23
DT,0,21,4,41,37,39,20,12,20,20,...,40,28,19,39,9,13,20,20,12,23


In [12]:
for word, tag in zip(sentence, states):
    print(word, tag)

Newsweek NNP
, ,
trying VBG
to TO
keep VB
pace NN
with IN
rival NN
Time NN
magazine NN
, ,
announced VBD
new JJ
advertising NN
rates NNS
for IN
1990 CD
and NN
said VBD
it PRP
will MD
introduce VB
a DT
new JJ
incentive NN
plan NN
for IN
advertisers NNS
. .
