In [1]:
import sys
import os
from pyltp import Segmentor, Postagger, Parser, NamedEntityRecognizer# 需装Python后再下载模型，此次运行的模型版本为3.4.0

In [155]:
print("正在加载LTP模型... ...")
# Set your own model path.
# Raw string literal: the Windows path contains backslash sequences ("\p", "\l")
# that are not valid escapes; a plain string triggers an invalid-escape warning
# on recent Python versions. The resulting path value is unchanged.
MODELDIR = r"E:\pyltp_model\ltp_data_v3.4.0"

# Word segmenter, loaded together with a user lexicon.
segmentor = Segmentor()
# segmentor.load(os.path.join(MODELDIR, "cws.model"))  # alternative: load without a lexicon
cws_model_path = os.path.join(MODELDIR, "cws.model")
segmentor.load_with_lexicon(cws_model_path, './dict.txt')  # the lexicon file must sit in the current working directory

# Part-of-speech tagger.
postagger = Postagger()
postagger.load(os.path.join(MODELDIR, "pos.model"))

# Dependency parser.
parser = Parser()
parser.load(os.path.join(MODELDIR, "parser.model"))

# Named-entity recognizer.
recognizer = NamedEntityRecognizer()
recognizer.load(os.path.join(MODELDIR, "ner.model"))

# Semantic role labelling is currently disabled.
#labeller = SementicRoleLabeller()
#labeller.load(os.path.join(MODELDIR, "srl/"))

print("加载模型完毕。")

正在加载LTP模型... ...
加载模型完毕。


In [192]:
# Sample sentence to analyse; read by the segmentation cell and by the triple-extraction loop.
sentence='区块链是分布式数据存储、点对点传输、共识机制、加密算法等计算机技术的新型应用模式'

In [193]:
    # Segment the sentence into words
    words = segmentor.segment(sentence)
    # Part-of-speech tagging
    postags = postagger.postag(words)
    # Named-entity recognition
    netags = recognizer.recognize(words, postags)
    # Dependency parsing; the pretrained model exposes structures such as subject-verb-object
    arcs = parser.parse(words, postags)

    # NOTE(review): build_parse_child_dict is defined in a later cell of this notebook,
    # so that cell must be executed before this one on a fresh kernel.
    # The call also prints every word that has an 'SBV' child.
    child_dict_list = build_parse_child_dict(words, postags, arcs)

是 [0]
存储 [3]
传输 [6]


In [194]:
# Print every segmented word prefixed with its 1-based position, so the numbers
# line up with the 1-based head indices used by the dependency arcs.
for i, word in enumerate(words, start=1):
    print(str(i) + word)

1区块链
2是
3分布式
4数据
5存储
6、
7点对点
8传输
9、
10共识
11机制
12、
13加密
14算法
15等
16计算机
17技术
18的
19新型
20应用
21模式


In [195]:
# Print the part-of-speech tag of every word prefixed with the word's 1-based position.
for i, postag in enumerate(postags, start=1):
    print(str(i) + postag)

1n
2v
3b
4n
5v
6wp
7n
8v
9wp
10n
11n
12wp
13v
14n
15u
16n
17n
18u
19b
20v
21n


In [196]:
# Print every dependency arc as "<word position>-><head position>：<relation>".
# Positions are 1-based; a head of 0 marks the root of the sentence.
# The loop variable keeps the name `arc` because later inspection cells read it.
for i, arc in enumerate(arcs, start=1):
    print(str(i) + '->' + str(arc.head) + '：' + arc.relation)

1->2：SBV
2->0：HED
3->4：ATT
4->5：SBV
5->17：ATT
6->5：WP
7->8：SBV
8->5：COO
9->11：WP
10->11：ATT
11->8：COO
12->13：WP
13->14：ATT
14->8：COO
15->8：RAD
16->17：ATT
17->21：ATT
18->17：RAD
19->21：ATT
20->21：ATT
21->2：VOB


In [197]:
# Display the per-word child dictionaries: entry k maps each dependency relation
# to the 0-based indices of the words whose head is word k.
child_dict_list

[{},
 {'SBV': [0], 'VOB': [20]},
 {},
 {'ATT': [2]},
 {'SBV': [3], 'WP': [5], 'COO': [7]},
 {},
 {},
 {'SBV': [6], 'COO': [10, 13], 'RAD': [14]},
 {},
 {},
 {'WP': [8], 'ATT': [9]},
 {},
 {'WP': [11]},
 {'ATT': [12]},
 {},
 {},
 {'ATT': [4, 15], 'RAD': [17]},
 {},
 {},
 {},
 {'ATT': [16, 18, 19]}]

In [198]:
# NOTE(review): this cell repeats the preceding display of child_dict_list and
# shows the same value; it looks like a leftover and can probably be removed.
child_dict_list

[{},
 {'SBV': [0], 'VOB': [20]},
 {},
 {'ATT': [2]},
 {'SBV': [3], 'WP': [5], 'COO': [7]},
 {},
 {},
 {'SBV': [6], 'COO': [10, 13], 'RAD': [14]},
 {},
 {},
 {'WP': [8], 'ATT': [9]},
 {},
 {'WP': [11]},
 {'ATT': [12]},
 {},
 {},
 {'ATT': [4, 15], 'RAD': [17]},
 {},
 {},
 {},
 {'ATT': [16, 18, 19]}]

In [199]:
def build_parse_child_dict(words, postags, arcs, verbose=True):
    """
    Build, for every word of a sentence, a dictionary of its dependency children.

    Args:
        words: list of segmented words.
        postags: list of part-of-speech tags (unused here; kept so the
            signature stays compatible with existing callers).
        arcs: sequence of dependency arcs, one per word. Each arc exposes
            ``head`` (1-based index of the governing word, 0 for the root)
            and ``relation`` (the dependency label).
        verbose: when true (the default, matching the historical behaviour),
            print every word that has an 'SBV' child together with the
            indices of those children.

    Returns:
        A list with one dict per word. Each dict maps a relation label to the
        list of 0-based indices (in ascending order) of the words attached to
        that word through that relation.
    """
    child_dict_list = [dict() for _ in range(len(words))]

    # Single pass over the arcs instead of rescanning all arcs for every word.
    for arc_index, arc in enumerate(arcs):
        head_index = arc.head - 1  # heads are 1-based; the root (0) becomes -1
        # Skip the root arc and any head that does not point at a known word.
        if 0 <= head_index < len(child_dict_list):
            child_dict_list[head_index].setdefault(arc.relation, []).append(arc_index)

    if verbose:
        for index, child_dict in enumerate(child_dict_list):
            if 'SBV' in child_dict:
                print(words[index], child_dict['SBV'])

    return child_dict_list

In [200]:
def complete_e(words, postags, child_dict_list, word_index):
    """
    Expand the word at ``word_index`` into a fuller entity string.

    The word is surrounded by the recursive expansions of selected dependency
    children, each expansion being preceded by a comma:
      * every 'ATT' child is placed before the word;
      * if the word is tagged 'v', its first 'VOB' child is placed after the
        word and its first 'SBV' child is appended to the part before it.

    Args:
        words: list of segmented words.
        postags: list of part-of-speech tags, parallel to ``words``.
        child_dict_list: per-word dicts mapping a relation label to the
            indices of the children attached through it.
        word_index: 0-based index of the word to expand.

    Returns:
        The expanded string for the word.
    """
    children = child_dict_list[word_index]

    def expand(child_index):
        # Every expanded child is introduced by a comma separator.
        return ',' + complete_e(words, postags, child_dict_list, child_index)

    before = ''.join(expand(att_index) for att_index in children.get('ATT', []))

    after = ''
    if postags[word_index] == 'v':
        if 'VOB' in children:
            after += expand(children['VOB'][0])
        if 'SBV' in children:
            before += expand(children['SBV'][0])

    return before + words[word_index] + after

In [201]:
    # Walk over every word and print candidate (entity1, relation, entity2) triples.
    # Relies on globals produced by earlier cells: words, postags, netags, arcs,
    # child_dict_list, sentence, and on the complete_e helper.
    for index in range(len(postags)):
        # Extract fact triples centred on a predicate
        if postags[index] == 'v':# found a predicate (verb)
            child_dict = child_dict_list[index]
            print(child_dict)
            # Subject-verb-object
            if   ('SBV') in child_dict and  ('VOB') in child_dict:
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                r = words[index]
                e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                print("%s\t%s\t%s\n" % (e1, r, e2)) # the output format of the triple still needs to be standardised
                # (Original author's note: this part of the code is confidential.) ---------
                # Verbs coordinated with this one ('COO') inherit its subject list.
                # NOTE(review): this mutates the global child_dict_list in place and shares
                # the same list object, so re-running the cell sees the modified state.
                if ('COO') in child_dict:
                    for i in child_dict['COO']:
                        child_dict_list[i]['SBV']=child_dict['SBV']
                #----------------------------------------------------
                # out_file.flush()
            # Postposed attributive with a verb-object relation
            
            if arcs[index].relation == 'ATT':
                if  ('VOB') in child_dict:
                    # arcs[index].head is 1-based, hence the "- 1" to index into words
                    e1 = complete_e(words, postags, child_dict_list, arcs[index].head - 1)
                    r = words[index]
                    e2 = complete_e(words, postags, child_dict_list, child_dict['VOB'][0])
                    temp_string = r+e2
                    # Drop a leading "relation + object" from e1 so it is not repeated
                    if temp_string == e1[:len(temp_string)]:
                        e1 = e1[len(temp_string):]
                    # Only emit the triple when e1 does not still contain "relation + object"
                    if temp_string not in e1:
                        print("%s\t%s\t%s\n" % (e1, r, e2)) # the output format of the triple still needs to be standardised
                        # out_file.flush()
            # Subject-verb-complement where the complement carries a preposition-object
            if  ('SBV') in child_dict and  ('CMP') in child_dict:
                #e1 = words[child_dict['SBV'][0]]
                e1 = complete_e(words, postags, child_dict_list, child_dict['SBV'][0])
                cmp_index = child_dict['CMP'][0]
                # The relation is the verb followed by its complement word
                r = words[index] + words[cmp_index]
                if  ('POB') in child_dict_list[cmp_index]:
                    e2 = complete_e(words, postags, child_dict_list, child_dict_list[cmp_index]['POB'][0])
                    print("%s\t%s\t%s\n" % (e1, r, e2)) # the output format of the triple still needs to be standardised
                    # out_file.flush()

        # Try to extract triples that involve named entities
        # Only tags whose first character is 'S' or 'B' start an entity here.
        if netags[index][0] == 'S' or netags[index][0] == 'B':
            ni = index
            if netags[ni][0] == 'B':
                # Advance to the tag starting with 'E' and join the whole span.
                # NOTE(review): no bounds check; an IndexError would be raised if no 'E' tag follows.
                while netags[ni][0] != 'E':
                    ni += 1
                e1 = ''.join(words[index:ni+1])
            else:
                e1 = words[ni]
            # The entity must modify ('ATT') a noun whose own entity tag is exactly 'O'
            if arcs[ni].relation == 'ATT' and postags[arcs[ni].head-1] == 'n' and netags[arcs[ni].head-1] == 'O':
                r = complete_e(words, postags, child_dict_list, arcs[ni].head-1)
                # Keep only the part of r that follows e1
                if e1 in r:
                    r = r[(r.index(e1)+len(e1)):]
                # That noun must in turn modify a word whose entity tag is not 'O'
                if arcs[arcs[ni].head-1].relation == 'ATT' and netags[arcs[arcs[ni].head-1].head-1] != 'O':
                    e2 = complete_e(words, postags, child_dict_list, arcs[arcs[ni].head-1].head-1)
                    mi = arcs[arcs[ni].head-1].head-1
                    li = mi
                    if netags[mi][0] == 'B':
                        # Append the remaining words of the entity, up to the tag starting with 'E'
                        while netags[mi][0] != 'E':
                            mi += 1
                        e = ''.join(words[li+1:mi+1])
                        e2 += e
                    # Keep only the part of e2 that follows r
                    if r in e2:
                        e2 = e2[(e2.index(r)+len(r)):]
                    # Emit the triple only if "relation + entity2" occurs verbatim in the sentence
                    if r+e2 in sentence:
                        print("%s\t%s\t%s\n" % (e1, r, e2)) # the output format of the triple still needs to be standardised
                        # out_file.flush()

{'SBV': [0], 'VOB': [20]}
区块链	是	,,,,分布式数据存储,计算机技术,新型,应用模式

{'SBV': [3], 'WP': [5], 'COO': [7]}
{'SBV': [6], 'COO': [10, 13], 'RAD': [14]}
{'WP': [11]}
{}


In [16]:
# NOTE(review): ad-hoc inspection of `arc`, which is not assigned in this cell;
# presumably it is the variable left over from a `for arc in arcs` loop.
# The recorded output does not match the arcs printed for the current sentence
# (whose heads are at most 21), so it appears to come from an earlier run - TODO confirm or remove.
arc.head

117

In [17]:
# NOTE(review): ad-hoc inspection of `arc`, which is not assigned in this cell;
# presumably it is the variable left over from a `for arc in arcs` loop - TODO confirm or remove.
arc.relation

'WP'