# 运用条件随机场(CRF)进行医疗数据的实体识别 -- CCKS Task2

In [1]:
import pandas as pd

In [2]:
# Entity categories; each name's position in this list is its integer id.
label = ['检查和检验', '治疗', '疾病和诊断', '症状和体征', '身体部位']
# name -> id and id -> name lookups derived from the list order above.
label2id = dict(zip(label, range(len(label))))
id2label = dict(enumerate(label))

### 读取数据

In [3]:
from collections import defaultdict
import os
import csv

#jieba.load_userdict("字典w词性.txt")


# Builds `label_data_list`: one (character sequence with B-/I-/O tags, label_dict) tuple
# per text file found under the task2data directory.
label_set = {}          # label name -> entity count; only filled when entity_stats is True
label_data_list = []
entity_stats = False    # with False the counting branch below never runs in this cell

feature_dict = defaultdict(dict)  # not used anywhere else in this cell
directory = os.path.join("C:\\Users\\Dawei\\Downloads\\NER","task2data")
for root,dirs,files in os.walk(directory):
    for ind, file in enumerate(files):
        if file.endswith(".txt"):
            # Files at an even position are parsed as tab-separated annotation rows,
            # files at an odd position as the raw text the previous annotations apply to.
            # NOTE(review): this assumes each annotation file is listed immediately
            # before its text file and that the position parity is not shifted by other
            # files in the directory -- confirm against the actual file naming.
            if ind % 2 == 0:
                #tree = Trie()
                with open(root+'\\'+file,'r',encoding='utf-8') as infile:
                    reader = csv.reader(infile,delimiter='\t')
                    tag_seq = []
                    word_set = set()
                    label_dict ={0:[], 1:[], 2:[], 3:[], 4:[]}

                    for line in reader:
                        if line:
                            # Record the entity position: (start, end, label id).
                            tag_seq.append((line[1],line[2],label2id[line[3]]))
                            label_dict[label2id[line[3]]].append(line[0])   
                            # Optional entity statistics.
                            if entity_stats:

                                # Count the number of entities per label name.
                                if line[-1] in label_set:
                                    label_set[line[-1]] += 1
                                else:
                                    label_set[line[-1]] = 1

                            if line[0] not in word_set:
                                word_set.add(line[0])
                                #tree.insert(list(pseg.cut(line[0])),label2id[line[-1]])


                #label_df = pd.read_csv(root+'\\'+file,encoding='utf-8', sep='\t',names=['实体','start_pos','end_pos','label'])
            else:
                #print(file)
                data_label = []
                with open(root+'\\'+file,'r',encoding='utf-8') as infile:
                    # Using '\n' as the delimiter makes each non-empty line arrive as one field.
                    # NOTE(review): csv's default quote handling may alter lines that
                    # contain double quotes, which would shift character offsets -- confirm.
                    reader = csv.reader(infile,delimiter='\n')
                    for line in reader:
                        if line:
                            # Tabs become spaces so every character stays a single token.
                            temp = line[0].replace("\t"," ")
                            data_label += list(temp)

                #print(question)
                # Start with every character tagged 'O' ...
                data_label = [[i,j] for i, j in zip(data_label,['O',]*len(data_label))]
                #print(len(data_label),len(tag_seq))
                # ... then overwrite the span of each entity from the most recently parsed
                # annotation file; `end` is treated as inclusive.
                if len(tag_seq):
                    for start,end,tag in tag_seq:
                        for i in range(int(start),int(end)+1):
                            if i == int(start):

                                data_label[i][-1] ='B-'+str(tag)
                            else:
                                data_label[i][-1] ='I-'+str(tag)


                label_data_list.append((data_label,label_dict))

      

In [4]:
len(label_data_list)

1198

In [31]:
label_set

{'检查和检验': 5785, '治疗': 712, '疾病和诊断': 604, '症状和体征': 6187, '身体部位': 8310}

### 加载外部字典，总结字典 n-gram 的规律（字典来源：ICD9、ICD10 等，合计约 7 万个医学名词）

In [5]:
def load_med_set(dict_name):
    """Load a dictionary file into a set of term strings.

    Args:
        dict_name: Path of the dictionary file. Every row is read into a
            single column named ``word``.

    Returns:
        set of str: the distinct terms found in the file.
    """
    # dtype=str together with keep_default_na=False keeps every entry as text.
    # With the pandas defaults, terms such as "NA" or "null" are converted to
    # float NaN, which later breaks len()/iteration over the terms.
    med_dict = pd.read_csv(dict_name, names=['word'], dtype=str,
                           keep_default_na=False)
    return set(med_dict['word'].tolist())

# Initial term set, read from a file path relative to the working directory.
# NOTE: this same set object is later passed to add_word_from_dict, which adds to it in place.
med_set = load_med_set('字典全.txt')

### 总结常见的医学前缀和后缀

In [6]:
def common_suffix(med_set, cutoff):
    """Count the 2-, 3- and 4-character endings of the terms in ``med_set``.

    Each term contributes one ending per length it is long enough for; terms
    shorter than two characters contribute nothing.

    Args:
        med_set: Iterable of term strings.
        cutoff: Minimum count an ending needs to be kept.

    Returns:
        dict mapping ending -> count, restricted to counts >= ``cutoff``.
    """
    counts = {}
    for term in med_set:
        longest = min(len(term), 4)
        for width in range(2, longest + 1):
            ending = term[-width:]
            counts[ending] = counts.get(ending, 0) + 1
    return {ending: freq for ending, freq in counts.items() if freq >= cutoff}

def common_prefix(med_set,cutoff):
    """Count the 2-, 3- and 4-character beginnings of the terms in ``med_set``.

    Each term contributes one beginning per length it is long enough for;
    terms shorter than two characters contribute nothing.

    Args:
        med_set: Iterable of term strings.
        cutoff: Minimum count a beginning needs to be kept.

    Returns:
        dict mapping beginning -> count, restricted to counts >= ``cutoff``.
    """
    tally = {}
    for term in med_set:
        usable_widths = [width for width in (2, 3, 4) if width <= len(term)]
        for width in usable_widths:
            head = term[:width]
            tally[head] = tally.get(head, 0) + 1
    kept = {}
    for head, freq in tally.items():
        if freq >= cutoff:
            kept[head] = freq
    return kept

In [8]:
def add_word_from_dict(dict_name,label,current_set,current_dict,dict_dir='./dictionary/CLEAN_DICT/'):
    """Merge the terms of one dictionary file into a term set and a term -> label map.

    Args:
        dict_name: File name of the dictionary inside ``dict_dir``.
        label: Label stored for terms that are not yet in ``current_dict``.
        current_set: Set of known terms; new terms are added to it in place.
        current_dict: Term -> label mapping, updated in place. A term that
            is already present keeps its existing label.
        dict_dir: Directory containing the dictionary files.

    Returns:
        The same ``current_set`` and ``current_dict`` objects, after the update.
    """
    # dtype=str together with keep_default_na=False keeps every entry as text.
    # With the pandas defaults, terms such as "NA" or "null" become float NaN
    # and the .strip() call below raises AttributeError.
    med_dict = pd.read_csv(os.path.join(dict_dir, dict_name), names=['word'],
                           dtype=str, keep_default_na=False)
    for word in med_dict['word'].tolist():
        # Remove ideographic (full-width) spaces around the term.
        word = word.strip("\u3000\u3000")
        current_set.add(word)
        current_dict.setdefault(word, label)
    return current_set, current_dict

# Merge the cleaned dictionaries into one term set and one term -> label-id map.
# The integer labels are positions in `label`: 1 = 治疗, 2 = 疾病和诊断, 4 = 身体部位.
# NOTE: `med_set` is passed in as the set and add_word_from_dict adds to it in place, so
# med_set, curr_set and label_set_all all refer to the same, growing set object.
# A term that is already in the map keeps the label of the first file that supplied it.
curr_set, curr_dict = add_word_from_dict('icd_9.csv',1,med_set,{})
curr_set, curr_dict = add_word_from_dict('ICD10.csv',2,curr_set, curr_dict)
curr_set, curr_dict = add_word_from_dict('med_online.csv',1,curr_set, curr_dict)
curr_set, curr_dict = add_word_from_dict('medicine.csv',1,curr_set, curr_dict)
curr_set, curr_dict = add_word_from_dict('pathology.csv',2,curr_set, curr_dict)
label_set_all,label_dict_all = add_word_from_dict('中文身体部位名称.txt',4,curr_set, curr_dict)
# Sizes of the merged term set and of the term -> label map.
print(len(label_set_all),len(label_dict_all))

72517 31385


In [19]:
# Count every 2-4 character ending / beginning of the dictionary terms;
# a cutoff of 1 keeps all of them.
suffix = common_suffix(med_set,1)
prefix = common_prefix(med_set,1)

# Only membership is looked up later, so keep just the n-gram strings.
word_set_suffix = set(suffix)
word_set_prefix = set(prefix)

print(len(word_set_suffix),len(word_set_prefix))

77658 92422


### 根据外部字典，统计所有可能出现在医学名词里的字

In [9]:
def all_possible_char(word_set):
    """Return the set of distinct characters occurring in the words of ``word_set``."""
    return {char for word in word_set for char in word}
# Every distinct character that occurs in the merged dictionary term set.
dict_set = all_possible_char(label_set_all)
print(len(dict_set))

3123


In [9]:
from itertools import chain

import nltk
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

### 总结以每个字为中心，左右两个字的窗口内的特征

In [23]:
import re
def word2features_1(sent, i):
    """Build the CRF feature dict for position ``i`` of ``sent``.

    The character at a position is read as ``sent[pos][0]``. The features
    describe the character itself, its neighbours up to two positions away,
    the joined 2- and 3-character strings around it, and whether those joined
    strings are members of the module-level ``word_set_suffix`` /
    ``word_set_prefix`` sets. ``BOS`` / ``EOS`` mark the first / last position.

    Args:
        sent: Sequence whose items expose the character at index 0.
        i: Position of the character to describe.

    Returns:
        dict mapping feature name -> value.
    """
    def ascii_word_flag(char):
        # 1 when `char` consists only of ASCII letters, digits or "_", else 0.
        # The value is emitted under the historical '...ispunc()' feature names.
        return 1 if re.match('^[a-zA-Z0-9_]*$', char) else 0

    last = len(sent) - 1
    word = sent[i][0]

    features = {}
    features['bias'] = 1.0
    features['word'] = word
    features['word.ispunc()'] = ascii_word_flag(word)
    features['word.isdigit()'] = word.isdigit()
    features['word.isalpha()'] = word.isalpha()

    # Previous character, or the beginning-of-sequence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        word1 = sent[i - 1][0]
        left_pair = word1 + word
        features['-1:word'] = word1
        features['-1:word.isdigit()'] = word1.isdigit()
        features['-1:word.isalpha()'] = word1.isalpha()
        features['-1:word.ispunc()'] = ascii_word_flag(word1)
        features['-1:0.word.suffix'] = left_pair in word_set_suffix
        features['-1:0.word.prefix'] = left_pair in word_set_prefix
        features['-1:0'] = left_pair

    # Next character, or the end-of-sequence marker.
    if i >= last:
        features['EOS'] = True
    else:
        word2 = sent[i + 1][0]
        right_pair = word + word2
        features['+1:word'] = word2
        features['+1:word.isdigit()'] = word2.isdigit()
        features['+1:word.isalpha()'] = word2.isalpha()
        features['+1:word.ispunc()'] = ascii_word_flag(word2)
        features['+1:0'] = right_pair
        features['+1:0.word.suffix'] = right_pair in word_set_suffix
        features['+1:0.word.prefix'] = right_pair in word_set_prefix

    # Character two positions to the left (word1 is set because i > 1 implies i > 0).
    if i > 1:
        word3 = sent[i - 2][0]
        far_left_pair = word3 + word1
        left_triple = far_left_pair + word
        features['-2:word'] = word3
        features['-2:-1:word'] = far_left_pair
        features['-2:-1:0_word'] = left_triple
        features['-2:-1.word.suffix'] = far_left_pair in word_set_suffix
        features['-2:-1.word.prefix'] = far_left_pair in word_set_prefix
        features['-2:-1:0.word.suffix'] = left_triple in word_set_suffix
        features['-2:-1:0.word.prefix'] = left_triple in word_set_prefix

    # Character two positions to the right (word2 is set because this implies i < last).
    if i < last - 1:
        word4 = sent[i + 2][0]
        far_right_pair = word2 + word4
        right_triple = word + far_right_pair
        features['+2:word'] = word4
        features['+2:+1:word'] = far_right_pair
        features['+2:+1.word.suffix'] = far_right_pair in word_set_suffix
        features['+2:+1.word.prefix'] = far_right_pair in word_set_prefix
        features['+2:+1:0_word'] = right_triple
        features['+2:+1:0.word.suffix'] = right_triple in word_set_suffix
        features['+2:+1:0.word.prefix'] = right_triple in word_set_prefix

    return features


In [11]:
def sent2features(sent):
    """Return the list of word2features_1 feature dicts, one per position of ``sent``."""
    features = []
    for position in range(len(sent)):
        features.append(word2features_1(sent, position))
    return features

def sent2labels(sent):
    """Return the label of every ``(token, label)`` pair in ``sent``, in order."""
    labels = []
    for _token, tag in sent:
        labels.append(tag)
    return labels

def sent2tokens(sent):
    """Return the token of every item in ``sent``, in order.

    The token is read as ``item[0]``, the same way word2features_1 reads it.
    This works for ``[token, label]`` pairs and for bare single-character
    strings. The earlier version returned the items themselves, i.e. the
    whole pairs rather than the tokens.
    """
    return [item[0] for item in sent]

### 输出数据，评估数据

In [12]:
def input_data_transform(docs):
    """Transpose every document from a list of rows into a list of columns.

    Args:
        docs: Iterable of documents, each an iterable of equally long rows.

    Returns:
        list with one entry per document; each entry is a list of column
        lists (an empty document yields an empty list).
    """
    return [[list(column) for column in zip(*doc)] for doc in docs]

def write_conll(fstream, data):
    """
    Writes @data to the writable text stream @fstream (e.g. the result of
    `open(fname, 'w')`) in CoNLL file format.
    @data is a list of examples; each example is a group of parallel columns
    such as [(tokens), (labels), (predictions)]. One tab-separated line is
    written per position and a blank line follows every example.
    """
    for example in data:
        for fields in zip(*example):
            fstream.write("\t".join(map(str, fields)) + "\n")
        fstream.write("\n")

def test_ner(index):
    """Run the conlleval perl script on the prediction file numbered ``index``.

    The shell command reads ``pred{index}.conll`` on stdin and redirects the
    script's output to ``ner_result{index}.utf8``.
    """
    paths = (
        "C:\\Users\\Dawei\\Downloads\\NER\\Assignment3\\assignment3\\conlleval",
        "C:\\Users\\Dawei\\Downloads\\NER\\Assignment3\\assignment3\\final\\pred{}.conll".format(index),
        "C:\\Users\\Dawei\\Downloads\\NER\\Assignment3\\assignment3\\final\\ner_result{}.utf8".format(index),
    )
    command = "perl {} < {} > {}".format(*paths)
    os.system(command)

### 将数据转化为可提交的格式的函数

In [4]:
import os,re
def get_entity_type(tag_list):
    """Return the entity tag ('0'..'4') that occurs most often in ``tag_list``.

    Ties go to the tag that comes first in '0'..'4'.

    Args:
        tag_list: List of tag id strings collected for one entity.

    Returns:
        str: the most frequent known tag.

    Raises:
        ValueError: if ``tag_list`` contains none of the known tags. The
            earlier version failed here with an UnboundLocalError.
    """
    normal_tag_list = ['0','1','2','3','4']
    counts = {tag: tag_list.count(tag) for tag in normal_tag_list}
    # max() returns the first maximal element, so ties resolve to the lowest tag.
    real_tag = max(normal_tag_list, key=counts.get)
    if counts[real_tag] == 0:
        raise ValueError("no known entity tag in tag_list: {!r}".format(tag_list))
    return real_tag

def generate_result(file_path):
    """Extract the tagged entities from a tab-separated prediction file.

    Each non-blank line supplies a token (column 0) and its tag (column 1);
    a blank line ends a sentence. An entity starts at a tag containing 'B-'
    and extends over the following tags that are neither 'O' nor 'B-' tags.

    Args:
        file_path: Path of the file to read (UTF-8).

    Returns:
        list with one entry per sentence; each entry is a list of dicts
        ``{'index': (start, end), 'value': (tag, text)}`` where ``end`` is
        the position just past the entity's last token and ``tag`` is chosen
        by get_entity_type from the tag ids of the entity's tokens.
    """
    sents = []      # all sentences read so far
    tempLine = []   # rows of the sentence currently being read
    # The context manager closes the file; it used to be left open.
    with open(file_path, 'r', encoding='utf8') as infile:
        for eachLine in infile:
            if eachLine != '\n':
                colList = eachLine.strip('\n').split('\t')
                tempLine.append([colList[0], colList[1]])
            else:
                sents.append(tempLine)
                tempLine = []
    # Keep a final sentence that is not followed by a blank line.
    if tempLine:
        sents.append(tempLine)

    final_results = []
    for sentence in sents:
        sentence_result = []
        length = len(sentence)
        position = 0
        # Scan through the last row as well: stopping one row early dropped a
        # single-token entity sitting at the end of a sentence.
        while position < length:
            tag = sentence[position][-1]
            if 'B-' not in tag:
                position += 1
                continue
            start_index = position
            entity_word = sentence[position][0]
            tag_list = [tag.split('-')[-1]]
            position += 1
            # Absorb continuation rows until an 'O' or the next 'B-' row.
            while position < length:
                next_tag = sentence[position][-1]
                if next_tag == 'O' or 'B-' in next_tag:
                    break
                entity_word += sentence[position][0]
                tag_list.append(next_tag.split('-')[-1])
                position += 1
            real_tag = get_entity_type(tag_list)
            sentence_result.append({'index': (start_index, position),
                                    'value': (real_tag, entity_word)})
        final_results.append(sentence_result)
    return final_results



def get_str_result(file_path):
    """Format the entities found by generate_result as one string per sentence.

    Every entity becomes "text start end label-name", where ``end`` is the
    stored end index minus one; the entities of a sentence are joined with ';'.
    A sentence without entities yields an empty string.
    """
    tag_mapping_dict = {'0':u'检查和检验','1':u'治疗', '2':u'疾病和诊断', '3':u'症状和体征', '4':u'身体部位'}
    final_str_result = []
    for sentence_entities in generate_result(file_path):
        pieces = []
        for entity in sentence_entities:
            span = entity['index']
            found = entity['value']
            fields = [found[1], str(span[0]), str(span[1] - 1), tag_mapping_dict[found[0]]]
            pieces.append(' '.join(fields))
        final_str_result.append(';'.join(pieces))
    return final_str_result

### 读取测试数据

In [15]:
import os,csv
# Builds two parallel lists from the .txt files under the task2test directory:
#   data_list -- the characters of each file
#   file_list -- "<part after '-'>,<part before '-'>" taken from each file's base name
data_list = []
file_list = []
directory = os.path.join("C:\\Users\\Dawei\\Downloads\\NER","task2test")
for root, dirs, files in os.walk(directory):
    for file in files:
        if not file.endswith(".txt"):
            continue
        file_cat, file_ind = file.split(".")[0].split("-")
        file_list.append(file_ind + "," + file_cat)

        data_label = []
        with open(root+'\\'+file, 'r', encoding='utf-8') as infile:
            # '\n' as the delimiter makes each non-empty line arrive as one field.
            for line in csv.reader(infile, delimiter='\n'):
                if line:
                    # Tabs become spaces so every character stays a single token.
                    data_label.extend(line[0].replace("\t", " "))

        data_list.append(list(data_label))

### 寻找合适的hyperparameter

In [13]:
import scipy
# Tagged character sequences only: element 0 of every label_data_list entry.
result = [entry[0] for entry in label_data_list]

X = list(map(sent2features, result))
y = list(map(sent2labels, result))

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    max_iterations=500,
    all_possible_transitions=True,
)

# c1 and c2 are sampled from exponential distributions with these scales.
params_space = dict(
    c1=scipy.stats.expon(scale=0.5),
    c2=scipy.stats.expon(scale=0.05),
)

# Weighted F1 restricted to the B-/I- labels, i.e. 'O' is left out of the score.
f1_scorer = make_scorer(
    metrics.flat_f1_score,
    average='weighted',
    labels=['B-0','I-0','B-1','I-1','B-2','I-2','B-3','I-3','B-4','I-4'],
)

# Randomized search: 50 sampled settings, each evaluated with 3-fold CV.
rs = RandomizedSearchCV(
    crf,
    params_space,
    cv=3,
    verbose=1,
    n_jobs=3,
    n_iter=50,
    scoring=f1_scorer,
)
rs.fit(X, y)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed: 77.2min
[Parallel(n_jobs=3)]: Done 150 out of 150 | elapsed: 421.8min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=True, averaging=None, c=None, c1=None, c2=None,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error...e,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False),
          fit_params={}, iid=True, n_iter=50, n_jobs=3,
          param_distributions={'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000002D903BDA20>, 'c1': <scipy.stats._distn_infrastructure.rv_frozen object at 0x0000002DEE22C940>},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          scoring=make_scorer(flat_f1_score, labels=['B-0', 'I-0', 'B-1', 'I-1', 'B-2', 'I-2', 'B-3', 'I-3', 'B-4', 'I-4'], average=weighted),
          verbose=1)

In [15]:
rs.best_params_

{'c1': 0.088599253829169486, 'c2': 0.0046537586495652107}

#### 交叉验证评估模型表现

In [24]:
from sklearn.cross_validation import KFold
# 10-fold cross-validation: train a CRF per fold, record its weighted F1 over
# the B-/I- labels, write the fold's predictions in CoNLL format and run the
# conlleval script on them.
f1_score = []
result = [entry[0] for entry in label_data_list]

X = list(map(sent2features, result))
y = list(map(sent2labels, result))

kf = KFold(len(result), n_folds=10, shuffle=True, random_state=1)

index = 0
for train_index, test_index in kf:
    X_train = [X[i] for i in train_index]
    X_test = [X[i] for i in test_index]
    y_train = [y[i] for i in train_index]
    y_test = [y[i] for i in test_index]

    crf = sklearn_crfsuite.CRF(
        algorithm='lbfgs',
        c1=0.089,
        c2=0.004,
        max_iterations=500,
        all_possible_transitions=True,
    )
    crf.fit(X_train, y_train)

    y_pred = crf.predict(X_test)

    f1 = metrics.flat_f1_score(
        y_test,
        y_pred,
        average='weighted',
        labels=['B-0','I-0', 'B-1','I-1','B-2','I-2','B-3','I-3','B-4','I-4'],
    )
    f1_score.append(f1)

    test_char = [result[i] for i in test_index]
    # One row per character: [character, gold tag, predicted tag].
    datawpred = [
        [[data[0], data[-1], pred] for data, pred in zip(chars, preds)]
        for chars, preds in zip(test_char, y_pred)
    ]
    pred_path = "C:\\Users\\Dawei\\Downloads\\NER\\Assignment3\\assignment3\\final\\pred{}.conll".format(index)
    with open(pred_path, 'w', encoding='utf-8') as f:
        write_conll(f, input_data_transform(datawpred))
    test_ner(index)

    index += 1
print(f1_score)

[0.92994874237062708]


### 得到测试数据的预测值

In [41]:
# Train on every labelled sequence, tag the test sequences and write the
# predictions as pred11.conll.
result = [entry[0] for entry in label_data_list]
X = list(map(sent2features, result))
y = list(map(sent2labels, result))
X_test = list(map(sent2features, data_list))

# NOTE(review): c1/c2 here (0.1 / 0.1) differ from both the search result shown
# earlier and the values used in the cross-validation cell -- confirm this is intended.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0.1,
    c2=0.1,
    max_iterations=500,
    all_possible_transitions=True,
)
crf.fit(X, y)

y_pred = crf.predict(X_test)

test_char = X_test
# One row per character: [character, predicted tag].
datawpred = [
    [[data, pred] for data, pred in zip(chars, preds)]
    for chars, preds in zip(data_list, y_pred)
]
pred_path = "C:\\Users\\Dawei\\Downloads\\NER\\Assignment3\\assignment3\\final\\pred{}.conll".format(11)
with open(pred_path, 'w', encoding='utf-8') as f:
    write_conll(f, input_data_transform(datawpred))
    

In [42]:

# One formatted entity string per sequence of pred11.conll, in file order.
final = get_str_result("C:\\Users\\Dawei\\Downloads\\NER\\Assignment3\\assignment3\\final\\pred{}.conll".format(11))

In [43]:
# One line per test file: "<file id>,<entities>;" or just "<file id>," when
# no entity was found for that file.
with open('result.csv','w',encoding='utf-8') as f:
    for i, entities in enumerate(final):
        row = file_list[i] + ","
        if entities:
            row += entities + ";"
        f.write(row)
        f.write("\n")

### 由CRF得到的状态转移矩阵以及特征效果

In [14]:
from collections import Counter

def print_transitions(trans_features):
    """Print one "from -> to weight" line per ((label_from, label_to), weight) item."""
    for pair, weight in trans_features:
        label_from, label_to = pair
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

# Transition weights of the most recently fitted `crf`.
transition_counts = Counter(crf.transition_features_)

print("Top likely transitions:")
print_transitions(transition_counts.most_common(10))

# The ten lowest weights, listed from the most negative upwards.
print("\nTop unlikely transitions:")
print_transitions(transition_counts.most_common()[-10:][::-1])

Top likely transitions:
I-1    -> I-1     4.968283
I-2    -> I-2     4.243605
I-4    -> I-4     4.080847
O      -> B-1     4.026180
B-3    -> I-3     3.795849
I-0    -> I-0     3.646458
O      -> O       3.488403
B-0    -> I-0     3.073345
I-3    -> I-3     3.047348
B-2    -> I-2     2.955577

Top unlikely transitions:
O      -> I-4     -14.387559
O      -> I-0     -12.631001
O      -> I-1     -11.502036
O      -> I-2     -11.324096
I-4    -> I-0     -10.761696
I-4    -> I-2     -10.099810
B-4    -> I-0     -9.719350
B-4    -> I-2     -9.673938
O      -> I-3     -9.505175
I-4    -> I-1     -8.633114


In [25]:
def print_state_features(state_features):
    """Print one "weight label attribute" line per ((attr, label), weight) item."""
    for key, weight in state_features:
        attr, label = key
        print("%0.6f %-8s %s" % (weight, label, attr))

# State-feature weights of the most recently fitted `crf`.
state_counts = Counter(crf.state_features_)

print("Top positive:")
print_state_features(state_counts.most_common(100))

# The hundred lowest weights, listed from the most negative upwards.
print("\nTop negative:")
print_state_features(state_counts.most_common()[-100:][::-1])

Top positive:
10.806019 O        word:，
10.283376 O        word:。
10.094325 O        word:：
9.248702 B-0      +1:0:压痛
9.208959 O        word::
8.920277 O        word:、
8.413235 O        word:及
8.241644 O        word:；
8.130099 O        word:"
7.501996 I-4      -1:word:脊
7.496708 O        word:与
7.482643 O        word:未
7.170733 I-0      -1:0:查体
6.710941 O        +2:+1:0_word:木，结
6.590223 O        word:1
6.456654 O        word:无
6.436837 O        word:伴
6.434000 B-4      +2:+1:0_word:痰，左
6.314114 I-4      word:趾
6.195015 B-4      word:肺
6.180917 O        +2:+1:0_word:吐。4
6.179230 I-0      word:片
6.177268 I-2      word:炎
6.155166 I-4      word:区
6.129004 I-4      word:道
6.113960 I-3      -1:0:无力
6.079527 B-4      +1:0:腹软
6.069166 I-4      word:膜
6.062591 B-4      -2:-1:0_word:肿，心
6.027395 I-4      word:便
6.004802 B-3      +1:0:渗出
5.930228 I-0      -1:0:压痛
5.888827 O        word:可
5.841146 B-4      -2:-1:0_word:可，肌
5.734185 I-0      word:音
5.666206 I-4      -1:0:口唇
5.631642 B-0      -1:wo