## pyltp

LTP提供了一系列中文自然语言处理工具，用户可以使用这些工具对中文文本进行分词、词性标注、句法分析等工作。从应用角度来看，LTP为用户提供了下列组件：

In [26]:
# Word segmentation with the stock `cws.model`.
import os

from pyltp import Segmentor

# Directory that contains the LTP model files.
LTP_DATA_DIR = 'C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')

segmentor = Segmentor()
segmentor.load(cws_model_path)

text = '亚硝酸盐是一种化学物质'
words = segmentor.segment(text)

# Show the container type returned by segment(), then the tokens on one
# tab-separated line.
print(type(words))
token_line = '\t'.join(words)
print(token_line)

segmentor.release()

<class 'pyltp.VectorOfString'>
亚硝酸盐	是	一	种	化学	物质


In [32]:
## Word segmentation using a custom (external) lexicon
import os

from pyltp import Segmentor

LTP_DATA_DIR = 'C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'
cws_model_path = os.path.join(LTP_DATA_DIR, 'cws.model')
lexicon_path = 'lexicon'  # path to your external dictionary file

segmentor = Segmentor()  # create the instance
# Load the model; the second argument is the external lexicon file path.
segmentor.load_with_lexicon(cws_model_path, lexicon_path)

words = segmentor.segment('亚硝酸盐是一种化学物质')
token_line = '\t'.join(words)
print(token_line)

segmentor.release()

亚硝酸盐	是	一	种	化学物质


In [33]:
## Part-of-speech tagging
import os

from pyltp import Postagger

# Path to the LTP model directory.
LTP_DATA_DIR = 'C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'
# POS tagging model path; the model file is named `pos.model`.
pos_model_path = os.path.join(LTP_DATA_DIR, 'pos.model')

postagger = Postagger()  # create the instance
postagger.load(pos_model_path)  # load the model

# Already-segmented input: one list entry per word.
words = ['元芳', '你', '怎么', '看']
postags = postagger.postag(words)  # run POS tagging

tag_line = '\t'.join(postags)
print(tag_line)

postagger.release()  # release the model

nh	r	r	v


In [37]:
## Named entity recognition
import os

from pyltp import NamedEntityRecognizer

# Path to the LTP model directory.
LTP_DATA_DIR = 'C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'
# NER model path; the model file is named `ner.model`.
ner_model_path = os.path.join(LTP_DATA_DIR, 'ner.model')

recognizer = NamedEntityRecognizer()  # create the instance
recognizer.load(ner_model_path)  # load the model

# The recognizer takes the segmented words together with their POS tags.
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
netags = recognizer.recognize(words, postags)  # run NER

netag_line = '\t'.join(netags)
print(netag_line)

recognizer.release()  # release the model

S-Nh	O	O	O


In [2]:
## Dependency parsing
import os

from pyltp import Parser

# Path to the LTP model directory.
LTP_DATA_DIR = 'C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'
# Dependency parsing model path; the model file is named `parser.model`.
par_model_path = os.path.join(LTP_DATA_DIR, 'parser.model')

parser = Parser()  # create the instance
parser.load(par_model_path)  # load the model

# The parser takes the segmented words together with their POS tags.
words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
arcs = parser.parse(words, postags)  # run dependency parsing

# Render every arc as "<head>:<relation>" and print them tab-separated.
arc_labels = ["%d:%s" % (arc.head, arc.relation) for arc in arcs]
print("\t".join(arc_labels))

parser.release()  # release the model

4:SBV	4:SBV	4:ADV	0:HED


In [4]:
## Semantic role labelling
import os

from pyltp import SementicRoleLabeller

# Path to the LTP model directory.
LTP_DATA_DIR = 'C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'
# SRL model path; this cell loads `pisrl_win.model` from the model directory.
srl_model_path = os.path.join(LTP_DATA_DIR, "pisrl_win.model")

labeller = SementicRoleLabeller()  # create the instance
labeller.load(srl_model_path)  # load the model

words = ['元芳', '你', '怎么', '看']
postags = ['nh', 'r', 'r', 'v']
# NOTE: `arcs` is not defined in this cell; it is the dependency parsing
# result and must already exist (run the parsing cell first).
roles = labeller.label(words, postags, arcs)  # run semantic role labelling

# Print one line per role: its index followed by every argument rendered
# as "<name>:(<start>,<end>)" with no separator between arguments.
for role in roles:
    arg_spans = [
        "%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end)
        for arg in role.arguments
    ]
    print(role.index, "".join(arg_spans))

labeller.release()  # release the model

3 A0:(1,1)ADV:(2,2)


In [10]:
## Full pipeline: sentence split -> segmentation -> POS tagging -> NER
## -> dependency parsing -> semantic role labelling.
import sys, os
from contextlib import ExitStack

# ROOTDIR = os.path.join(os.path.dirname(__file__), os.pardir)
# sys.path = [os.path.join(ROOTDIR, "lib")] + sys.path

# Set your own model path
MODELDIR='C:/Users/Administrator/Downloads/ltp_data_v3.4.0/'

from pyltp import SentenceSplitter, Segmentor, Postagger, Parser, NamedEntityRecognizer, SementicRoleLabeller

paragraph = '中国进出口银行与中国银行加强合作。中国进出口银行与中国银行加强合作！'
# paragraph = '杜军平教授在四月七号参加了人工智能协会第五次大会'
# Only the first sentence of the paragraph is analysed below.
sentence = SentenceSplitter.split(paragraph)[0]

# Each model's release() is registered right after its load() succeeds, so
# every loaded model is released when the block exits -- including when a
# later step raises. Releases run in reverse order of registration.
with ExitStack() as loaded_models:
    segmentor = Segmentor()
    segmentor.load(os.path.join(MODELDIR, "cws.model"))
    loaded_models.callback(segmentor.release)
    words = segmentor.segment(sentence)
    print("\t".join(words))

    postagger = Postagger()
    postagger.load(os.path.join(MODELDIR, "pos.model"))
    loaded_models.callback(postagger.release)
    postags = postagger.postag(words)
    # list-of-string parameter is support in 0.1.5
    # postags = postagger.postag(["中国","进出口","银行","与","中国银行","加强","合作"])
    print("\t".join(postags))

    recognizer = NamedEntityRecognizer()
    recognizer.load(os.path.join(MODELDIR, "ner.model"))
    loaded_models.callback(recognizer.release)
    netags = recognizer.recognize(words, postags)
    print("\t".join(netags))
    print()
    print()

    parser = Parser()
    parser.load(os.path.join(MODELDIR, "parser.model"))
    loaded_models.callback(parser.release)
    arcs = parser.parse(words, postags)
    print("\t".join("%d:%s" % (arc.head, arc.relation) for arc in arcs))
    print()

    rely_id = [arc.head for arc in arcs]  # head (parent) id of every word
    relation = [arc.relation for arc in arcs]  # dependency relation of every word
    # Map each head id to its word: id 0 is rendered as 'Root', any other id
    # is a 1-based position into `words`.
    heads = ['Root' if head_id == 0 else words[head_id - 1] for head_id in rely_id]

    # One "RELATION(word, head)" line per word.
    for rel, word, head in zip(relation, words, heads):
        print("%s(%s, %s)" % (rel, word, head))
    print()
    print()

    labeller = SementicRoleLabeller()
    labeller.load(os.path.join(MODELDIR, "pisrl_win.model"))
    loaded_models.callback(labeller.release)
    roles = labeller.label(words, postags, arcs)

    # One line per role: its index followed by every argument rendered as
    # "<name>:(<start>,<end>)".
    for role in roles:
        print(role.index, "".join(
            ["%s:(%d,%d)" % (arg.name, arg.range.start, arg.range.end) for arg in role.arguments]))

中国	进出口	银行	与	中国银行	加强	合作	。
ns	v	n	p	ni	v	v	wp
B-Ni	I-Ni	E-Ni	O	S-Ni	O	O	O


3:ATT	3:ATT	6:SBV	6:ADV	4:POB	0:HED	6:VOB	6:WP

ATT(中国, 银行)
ATT(进出口, 银行)
SBV(银行, 加强)
ADV(与, 加强)
POB(中国银行, 与)
HED(加强, Root)
VOB(合作, 加强)
WP(。, 加强)


5 A0:(0,2)A1:(6,6)
