Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Whoa, MAJOR fix to NER NamedEntity object creation; MUCH simpler code as a result. Yay. Meh progress on coreference though.
  • Loading branch information...
commit 062f5f86d46057b21d7100414b1aa067d62e46d8 1 parent 7e83f57
@dmnapolitano dmnapolitano authored
View
5 corenlp.thrift
@@ -23,10 +23,11 @@ exception SerializedException
service StanfordCoreNLP
{
void ping(),
+ oneway void zip(),
list<ParseTree> parse_text(1:string text, 2:list<string> outputFormat),
ParseTree parse_tokens(1:list<string> tokens, 2:list<string> outputFormat),
- oneway void zip(),
list<NamedEntity> get_entities_from_text(1:string text),
list<NamedEntity> get_entities_from_tokens(1:list<string> tokens),
- list<NamedEntity> get_entities_from_trees(1:list<string> trees)
+ list<NamedEntity> get_entities_from_trees(1:list<string> trees),
+ list<string> resolve_coreferences_in_trees(1:list<string> trees)
}
View
21 gen-py/corenlp/StanfordCoreNLP-remote
@@ -24,12 +24,13 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help':
print ''
print 'Functions:'
print ' void ping()'
+ print ' void zip()'
print ' parse_text(string text, outputFormat)'
print ' ParseTree parse_tokens( tokens, outputFormat)'
- print ' void zip()'
print ' get_entities_from_text(string text)'
print ' get_entities_from_tokens( tokens)'
print ' get_entities_from_trees( trees)'
+ print ' resolve_coreferences_in_trees( trees)'
print ''
sys.exit(0)
@@ -87,6 +88,12 @@ if cmd == 'ping':
sys.exit(1)
pp.pprint(client.ping())
+elif cmd == 'zip':
+ if len(args) != 0:
+ print 'zip requires 0 args'
+ sys.exit(1)
+ pp.pprint(client.zip())
+
elif cmd == 'parse_text':
if len(args) != 2:
print 'parse_text requires 2 args'
@@ -99,12 +106,6 @@ elif cmd == 'parse_tokens':
sys.exit(1)
pp.pprint(client.parse_tokens(eval(args[0]),eval(args[1]),))
-elif cmd == 'zip':
- if len(args) != 0:
- print 'zip requires 0 args'
- sys.exit(1)
- pp.pprint(client.zip())
-
elif cmd == 'get_entities_from_text':
if len(args) != 1:
print 'get_entities_from_text requires 1 args'
@@ -123,6 +124,12 @@ elif cmd == 'get_entities_from_trees':
sys.exit(1)
pp.pprint(client.get_entities_from_trees(eval(args[0]),))
+elif cmd == 'resolve_coreferences_in_trees':
+ if len(args) != 1:
+ print 'resolve_coreferences_in_trees requires 1 args'
+ sys.exit(1)
+ pp.pprint(client.resolve_coreferences_in_trees(eval(args[0]),))
+
else:
print 'Unrecognized method %s' % cmd
sys.exit(1)
View
354 gen-py/corenlp/StanfordCoreNLP.py
@@ -21,6 +21,9 @@ class Iface(object):
def ping(self, ):
pass
+ def zip(self, ):
+ pass
+
def parse_text(self, text, outputFormat):
"""
Parameters:
@@ -37,9 +40,6 @@ def parse_tokens(self, tokens, outputFormat):
"""
pass
- def zip(self, ):
- pass
-
def get_entities_from_text(self, text):
"""
Parameters:
@@ -61,6 +61,13 @@ def get_entities_from_trees(self, trees):
"""
pass
+ def resolve_coreferences_in_trees(self, trees):
+ """
+ Parameters:
+ - trees
+ """
+ pass
+
class Client(Iface):
def __init__(self, iprot, oprot=None):
@@ -92,6 +99,15 @@ def recv_ping(self, ):
self._iprot.readMessageEnd()
return
+ def zip(self, ):
+ self.send_zip()
+
+ def send_zip(self, ):
+ self._oprot.writeMessageBegin('zip', TMessageType.CALL, self._seqid)
+ args = zip_args()
+ args.write(self._oprot)
+ self._oprot.writeMessageEnd()
+ self._oprot.trans.flush()
def parse_text(self, text, outputFormat):
"""
Parameters:
@@ -156,15 +172,6 @@ def recv_parse_tokens(self, ):
return result.success
raise TApplicationException(TApplicationException.MISSING_RESULT, "parse_tokens failed: unknown result");
- def zip(self, ):
- self.send_zip()
-
- def send_zip(self, ):
- self._oprot.writeMessageBegin('zip', TMessageType.CALL, self._seqid)
- args = zip_args()
- args.write(self._oprot)
- self._oprot.writeMessageEnd()
- self._oprot.trans.flush()
def get_entities_from_text(self, text):
"""
Parameters:
@@ -255,18 +262,49 @@ def recv_get_entities_from_trees(self, ):
return result.success
raise TApplicationException(TApplicationException.MISSING_RESULT, "get_entities_from_trees failed: unknown result");
+ def resolve_coreferences_in_trees(self, trees):
+ """
+ Parameters:
+ - trees
+ """
+ self.send_resolve_coreferences_in_trees(trees)
+ return self.recv_resolve_coreferences_in_trees()
+
+ def send_resolve_coreferences_in_trees(self, trees):
+ self._oprot.writeMessageBegin('resolve_coreferences_in_trees', TMessageType.CALL, self._seqid)
+ args = resolve_coreferences_in_trees_args()
+ args.trees = trees
+ args.write(self._oprot)
+ self._oprot.writeMessageEnd()
+ self._oprot.trans.flush()
+
+ def recv_resolve_coreferences_in_trees(self, ):
+ (fname, mtype, rseqid) = self._iprot.readMessageBegin()
+ if mtype == TMessageType.EXCEPTION:
+ x = TApplicationException()
+ x.read(self._iprot)
+ self._iprot.readMessageEnd()
+ raise x
+ result = resolve_coreferences_in_trees_result()
+ result.read(self._iprot)
+ self._iprot.readMessageEnd()
+ if result.success is not None:
+ return result.success
+ raise TApplicationException(TApplicationException.MISSING_RESULT, "resolve_coreferences_in_trees failed: unknown result");
+
class Processor(Iface, TProcessor):
def __init__(self, handler):
self._handler = handler
self._processMap = {}
self._processMap["ping"] = Processor.process_ping
+ self._processMap["zip"] = Processor.process_zip
self._processMap["parse_text"] = Processor.process_parse_text
self._processMap["parse_tokens"] = Processor.process_parse_tokens
- self._processMap["zip"] = Processor.process_zip
self._processMap["get_entities_from_text"] = Processor.process_get_entities_from_text
self._processMap["get_entities_from_tokens"] = Processor.process_get_entities_from_tokens
self._processMap["get_entities_from_trees"] = Processor.process_get_entities_from_trees
+ self._processMap["resolve_coreferences_in_trees"] = Processor.process_resolve_coreferences_in_trees
def process(self, iprot, oprot):
(name, type, seqid) = iprot.readMessageBegin()
@@ -294,6 +332,13 @@ def process_ping(self, seqid, iprot, oprot):
oprot.writeMessageEnd()
oprot.trans.flush()
+ def process_zip(self, seqid, iprot, oprot):
+ args = zip_args()
+ args.read(iprot)
+ iprot.readMessageEnd()
+ self._handler.zip()
+ return
+
def process_parse_text(self, seqid, iprot, oprot):
args = parse_text_args()
args.read(iprot)
@@ -316,13 +361,6 @@ def process_parse_tokens(self, seqid, iprot, oprot):
oprot.writeMessageEnd()
oprot.trans.flush()
- def process_zip(self, seqid, iprot, oprot):
- args = zip_args()
- args.read(iprot)
- iprot.readMessageEnd()
- self._handler.zip()
- return
-
def process_get_entities_from_text(self, seqid, iprot, oprot):
args = get_entities_from_text_args()
args.read(iprot)
@@ -356,6 +394,17 @@ def process_get_entities_from_trees(self, seqid, iprot, oprot):
oprot.writeMessageEnd()
oprot.trans.flush()
+ def process_resolve_coreferences_in_trees(self, seqid, iprot, oprot):
+ args = resolve_coreferences_in_trees_args()
+ args.read(iprot)
+ iprot.readMessageEnd()
+ result = resolve_coreferences_in_trees_result()
+ result.success = self._handler.resolve_coreferences_in_trees(args.trees)
+ oprot.writeMessageBegin("resolve_coreferences_in_trees", TMessageType.REPLY, seqid)
+ result.write(oprot)
+ oprot.writeMessageEnd()
+ oprot.trans.flush()
+
# HELPER FUNCTIONS AND STRUCTURES
@@ -465,6 +514,59 @@ def __ne__(self, other):
return not (self == other)
+class zip_args(object):
+
+ __slots__ = [
+ ]
+
+ thrift_spec = (
+ )
+
+ def read(self, iprot):
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
+ return
+ iprot.readStructBegin()
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:
+ break
+ else:
+ iprot.skip(ftype)
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
+ return
+ oprot.writeStructBegin('zip_args')
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):
+ return
+
+
+ def __repr__(self):
+ L = ['%s=%r' % (key, getattr(self, key))
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):
+ return not (self == other)
+
+
class parse_text_args(object):
"""
Attributes:
@@ -811,59 +913,6 @@ def __ne__(self, other):
return not (self == other)
-class zip_args(object):
-
- __slots__ = [
- ]
-
- thrift_spec = (
- )
-
- def read(self, iprot):
- if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
- fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
- return
- iprot.readStructBegin()
- while True:
- (fname, ftype, fid) = iprot.readFieldBegin()
- if ftype == TType.STOP:
- break
- else:
- iprot.skip(ftype)
- iprot.readFieldEnd()
- iprot.readStructEnd()
-
- def write(self, oprot):
- if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:
- oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
- return
- oprot.writeStructBegin('zip_args')
- oprot.writeFieldStop()
- oprot.writeStructEnd()
-
- def validate(self):
- return
-
-
- def __repr__(self):
- L = ['%s=%r' % (key, getattr(self, key))
- for key in self.__slots__]
- return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
-
- def __eq__(self, other):
- if not isinstance(other, self.__class__):
- return False
- for attr in self.__slots__:
- my_val = getattr(self, attr)
- other_val = getattr(other, attr)
- if my_val != other_val:
- return False
- return True
-
- def __ne__(self, other):
- return not (self == other)
-
-
class get_entities_from_text_args(object):
"""
Attributes:
@@ -1335,3 +1384,162 @@ def __eq__(self, other):
def __ne__(self, other):
return not (self == other)
+
+class resolve_coreferences_in_trees_args(object):
+ """
+ Attributes:
+ - trees
+ """
+
+ __slots__ = [
+ 'trees',
+ ]
+
+ thrift_spec = (
+ None, # 0
+ (1, TType.LIST, 'trees', (TType.STRING,None), None, ), # 1
+ )
+
+ def __init__(self, trees=None,):
+ self.trees = trees
+
+ def read(self, iprot):
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
+ return
+ iprot.readStructBegin()
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:
+ break
+ if fid == 1:
+ if ftype == TType.LIST:
+ self.trees = []
+ (_etype66, _size63) = iprot.readListBegin()
+ for _i67 in xrange(_size63):
+ _elem68 = iprot.readString().decode('utf-8')
+ self.trees.append(_elem68)
+ iprot.readListEnd()
+ else:
+ iprot.skip(ftype)
+ else:
+ iprot.skip(ftype)
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
+ return
+ oprot.writeStructBegin('resolve_coreferences_in_trees_args')
+ if self.trees is not None:
+ oprot.writeFieldBegin('trees', TType.LIST, 1)
+ oprot.writeListBegin(TType.STRING, len(self.trees))
+ for iter69 in self.trees:
+ oprot.writeString(iter69.encode('utf-8'))
+ oprot.writeListEnd()
+ oprot.writeFieldEnd()
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):
+ return
+
+
+ def __repr__(self):
+ L = ['%s=%r' % (key, getattr(self, key))
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):
+ return not (self == other)
+
+
+class resolve_coreferences_in_trees_result(object):
+ """
+ Attributes:
+ - success
+ """
+
+ __slots__ = [
+ 'success',
+ ]
+
+ thrift_spec = (
+ (0, TType.LIST, 'success', (TType.STRING,None), None, ), # 0
+ )
+
+ def __init__(self, success=None,):
+ self.success = success
+
+ def read(self, iprot):
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))
+ return
+ iprot.readStructBegin()
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:
+ break
+ if fid == 0:
+ if ftype == TType.LIST:
+ self.success = []
+ (_etype73, _size70) = iprot.readListBegin()
+ for _i74 in xrange(_size70):
+ _elem75 = iprot.readString().decode('utf-8')
+ self.success.append(_elem75)
+ iprot.readListEnd()
+ else:
+ iprot.skip(ftype)
+ else:
+ iprot.skip(ftype)
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))
+ return
+ oprot.writeStructBegin('resolve_coreferences_in_trees_result')
+ if self.success is not None:
+ oprot.writeFieldBegin('success', TType.LIST, 0)
+ oprot.writeListBegin(TType.STRING, len(self.success))
+ for iter76 in self.success:
+ oprot.writeString(iter76.encode('utf-8'))
+ oprot.writeListEnd()
+ oprot.writeFieldEnd()
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):
+ return
+
+
+ def __repr__(self):
+ L = ['%s=%r' % (key, getattr(self, key))
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):
+ return not (self == other)
+
View
60 scripts/coref_client.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python
+
+from corenlp import StanfordCoreNLP
+from corenlp.ttypes import *
+from thrift import Thrift
+from thrift.transport import TSocket, TTransport
+from thrift.protocol import TBinaryProtocol
+
+#from bs4 import UnicodeDammit
+#import re
+import sys
+
+
+# get command line arguments
+args = sys.argv[1:]
+if len(args) != 2:
+ sys.stderr.write('Usage: coref_client.py <server> <port>\n')
+ sys.exit(2)
+else:
+ server = args[0]
+ port = int(args[1])
+
+
+trees = ["(ROOT (S (NP (NNP Barack) (NNP Hussein) (NNP Obama) (NNP II)) (VP (VBZ is) (NP (NP (DT the) (JJ 44th) (CC and) (JJ current) (NN President)) (PP (IN of) (NP (DT the) (NNP United) (NNPS States)))) (, ,) (PP (IN in) (NP (NP (NN office)) (PP (IN since) (NP (CD 2009)))))) (. .)))"]
+# "(ROOT (S (NP (PRP He)) (VP (VBZ is) (NP (DT the) (JJ first) (NNP African) (NNP American)) (S (VP (TO to) (VP (VB hold) (NP (DT the) (NN office)))))) (. .)))",
+# "(ROOT (S (S (VP (VBN Born) (PP (IN in) (NP (NNP Honolulu) (, ,) (NNP Hawaii))))) (, ,) (NP (NNP Obama)) (VP (VBZ is) (NP (NP (DT a) (NN graduate)) (PP (IN of) (NP (NP (NNP Columbia) (NNP University)) (CC and) (NP (NNP Harvard) (NNP Law) (NNP School))))) (, ,) (SBAR (WHADVP (WRB where)) (S (NP (PRP he)) (VP (VBD was) (NP (NP (NN president)) (PP (IN of) (NP (DT the) (NNP Harvard) (NNP Law) (NNP Review)))))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VBD was) (NP (NP (DT a) (NN community) (NN organizer)) (PP (IN in) (NP (NNP Chicago)))) (PP (IN before) (S (VP (VBG earning) (NP (PRP$ his) (NN law) (NN degree)))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VP (VBD worked) (PP (IN as) (NP (NP (DT a) (JJ civil) (NNS rights) (NN attorney)) (PP (IN in) (NP (NNP Chicago)))))) (CC and) (VP (VBD taught) (NP (JJ constitutional) (NN law)) (PP (IN at) (NP (NP (DT the) (NNP University)) (PP (IN of) (NP (NP (NNP Chicago) (NNP Law) (NNP School)) (PP (IN from) (NP (CD 1992))))))) (PP (TO to) (NP (CD 2004))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VBD served) (NP (NP (CD three) (NNS terms)) (VP (VBG representing) (NP (NP (DT the) (NAC (JJ 13th) (NNP District) (PP (IN in) (NP (DT the) (NNP Illinois)))) (NNP Senate)) (PP (IN from) (NP (CD 1997) (TO to) (CD 2004)))))) (, ,) (S (VP (VBG running) (ADVP (RB unsuccessfully)) (PP (IN for) (NP (NP (DT the) (NNP United) (NNPS States) (NNP House)) (PP (IN of) (NP (NP (NNS Representatives)) (PP (IN in) (NP (CD 2000)))))))))) (. .)))"]
+
+#text = "My name is Diane and I live in New Jersey. I sometimes go to New York. The Food and Drug Administration is an organization."
+more_trees = ["(ROOT (S (S (NP (PRP$ My) (NN name)) (VP (VBZ is) (NP (NNP Diane)))) (CC and) (S (NP (PRP I)) (VP (VBP live) (PP (IN in) (NP (NNP New) (NNP Jersey))))) (. .)))",
+ "(ROOT (S (NP (PRP I)) (ADVP (RB sometimes)) (VP (VBP go) (PP (TO to) (NP (NNP New) (NNP York)))) (. .)))",
+ "(ROOT (S (NP (DT The) (NNP Food) (CC and) (NNP Drug) (NNP Administration)) (VP (VBZ is) (NP (DT an) (NN organization))) (. .)))"]
+#tokenized_sentences = [["My", "name", "is", "Diane", "and", "I", "live", "in", "New", "Jersey", "."],
+# ["I", "sometimes", "go", "to", "New", "York", "."],
+# ["The", "Food", "and", "Drug", "Administration", "is", "an", "organization", "."]]
+
+transport = TSocket.TSocket(server, port)
+transport = TTransport.TBufferedTransport(transport)
+protocol = TBinaryProtocol.TBinaryProtocol(transport)
+client = StanfordCoreNLP.Client(protocol)
+
+transport.open()
+
+try:
+ result = client.resolve_coreferences_in_trees(trees)
+ print result
+# print
+# result = client.resolve_coreferences_in_trees(more_trees)
+# print result
+# print
+# for sentence in tokenized_sentences:
+# result = client.get_entities_from_tokens(sentence)
+# print result
+
+except Exception as e:
+ print e
+
+transport.close()
View
27 scripts/ner_client.py
@@ -1,4 +1,5 @@
#!/usr/bin/env python
+# -*- coding: utf-8 -*-
from corenlp import StanfordCoreNLP
from corenlp.ttypes import *
@@ -6,8 +7,6 @@
from thrift.transport import TSocket, TTransport
from thrift.protocol import TBinaryProtocol
-from bs4 import UnicodeDammit
-import re
import sys
@@ -23,13 +22,21 @@
# for testing named entity systems
text = "My name is Diane and I live in New Jersey. I sometimes go to New York. The Food and Drug Administration is an organization."
-trees = ["(ROOT (S (S (NP (PRP$ My) (NN name)) (VP (VBZ is) (NP (NNP Diane)))) (CC and) (S (NP (PRP I)) (VP (VBP live) (PP (IN in) (NP (NNP New) (NNP Jersey))))) (. .)))",
- "(ROOT (S (NP (PRP I)) (ADVP (RB sometimes)) (VP (VBP go) (PP (TO to) (NP (NNP New) (NNP York)))) (. .)))",
- "(ROOT (S (NP (DT The) (NNP Food) (CC and) (NNP Drug) (NNP Administration)) (VP (VBZ is) (NP (DT an) (NN organization))) (. .)))"]
+#trees = ["(ROOT (S (S (NP (PRP$ My) (NN name)) (VP (VBZ is) (NP (NNP Diane)))) (CC and) (S (NP (PRP I)) (VP (VBP live) (PP (IN in) (NP (NNP New) (NNP Jersey))))) (. .)))",
+# "(ROOT (S (NP (PRP I)) (ADVP (RB sometimes)) (VP (VBP go) (PP (TO to) (NP (NNP New) (NNP York)))) (. .)))",
+# "(ROOT (S (NP (DT The) (NNP Food) (CC and) (NNP Drug) (NNP Administration)) (VP (VBZ is) (NP (DT an) (NN organization))) (. .)))"]
tokenized_sentences = [["My", "name", "is", "Diane", "and", "I", "live", "in", "New", "Jersey", "."],
["I", "sometimes", "go", "to", "New", "York", "."],
["The", "Food", "and", "Drug", "Administration", "is", "an", "organization", "."]]
+trees = ["(ROOT (S (NP (NNP Barack) (NNP Hussein) (NNP Obama) (NNP II)) (VP (VBZ is) (NP (NP (DT the) (JJ 44th) (CC and) (JJ current) (NN President)) (PP (IN of) (NP (DT the) (NNP United) (NNPS States)))) (, ,) (PP (IN in) (NP (NP (NN office)) (PP (IN since) (NP (CD 2009)))))) (. .)))",
+ "(ROOT (S (NP (PRP He)) (VP (VBZ is) (NP (DT the) (JJ first) (NNP African) (NNP American)) (S (VP (TO to) (VP (VB hold) (NP (DT the) (NN office)))))) (. .)))",
+ "(ROOT (S (S (VP (VBN Born) (PP (IN in) (NP (NNP Honolulu) (, ,) (NNP Hawaii))))) (, ,) (NP (NNP Obama)) (VP (VBZ is) (NP (NP (DT a) (NN graduate)) (PP (IN of) (NP (NP (NNP Columbia) (NNP University)) (CC and) (NP (NNP Harvard) (NNP Law) (NNP School))))) (, ,) (SBAR (WHADVP (WRB where)) (S (NP (PRP he)) (VP (VBD was) (NP (NP (NN president)) (PP (IN of) (NP (DT the) (NNP Harvard) (NNP Law) (NNP Review)))))))) (. .)))",
+ "(ROOT (S (NP (PRP He)) (VP (VBD was) (NP (NP (DT a) (NN community) (NN organizer)) (PP (IN in) (NP (NNP Chicago)))) (PP (IN before) (S (VP (VBG earning) (NP (PRP$ his) (NN law) (NN degree)))))) (. .)))",
+ "(ROOT (S (NP (PRP He)) (VP (VP (VBD worked) (PP (IN as) (NP (NP (DT a) (JJ civil) (NNS rights) (NN attorney)) (PP (IN in) (NP (NNP Chicago)))))) (CC and) (VP (VBD taught) (NP (JJ constitutional) (NN law)) (PP (IN at) (NP (NP (DT the) (NNP University)) (PP (IN of) (NP (NP (NNP Chicago) (NNP Law) (NNP School)) (PP (IN from) (NP (CD 1992))))))) (PP (TO to) (NP (CD 2004))))) (. .)))",
+ "(ROOT (S (NP (PRP He)) (VP (VBD served) (NP (NP (CD three) (NNS terms)) (VP (VBG representing) (NP (NP (DT the) (NAC (JJ 13th) (NNP District) (PP (IN in) (NP (DT the) (NNP Illinois)))) (NNP Senate)) (PP (IN from) (NP (CD 1997) (TO to) (CD 2004)))))) (, ,) (S (VP (VBG running) (ADVP (RB unsuccessfully)) (PP (IN for) (NP (NP (DT the) (NNP United) (NNPS States) (NNP House)) (PP (IN of) (NP (NP (NNS Representatives)) (PP (IN in) (NP (CD 2000)))))))))) (. .)))"]
+
+
transport = TSocket.TSocket(server, port)
transport = TTransport.TBufferedTransport(transport)
protocol = TBinaryProtocol.TBinaryProtocol(transport)
@@ -44,12 +51,12 @@
print
result = client.get_entities_from_trees(trees)
print result
- print
- for sentence in tokenized_sentences:
- result = client.get_entities_from_tokens(sentence)
- print result
+# print
+# for sentence in tokenized_sentences:
+# result = client.get_entities_from_tokens(sentence)
+# print result
except Exception as e:
print e
-transport.close()
+transport.close()
View
17 scripts/parser_client.py
@@ -34,6 +34,14 @@
u"The presence of fox-like carnivores all over the globe , together with their widespread reputation for cunning , has contributed to their appearance in popular culture and folklore in many societies around the world -LRB- see also Foxes in culture -RRB- .".split(" "),
u"The hunting of foxes with packs of hounds , long an established pursuit in Europe , especially the British Isles , was exported by European settlers to various parts of the New World .".split(" ")]
+# a particularly useful example for coreference
+more_tokenized_sentences = [u"Barack Hussein Obama II is the 44th and current President of the United States , in office since 2009 .".split(" "),
+ u"He is the first African American to hold the office .".split(" "),
+ u"Born in Honolulu , Hawaii , Obama is a graduate of Columbia University and Harvard Law School , where he was president of the Harvard Law Review .".split(" "),
+ u"He was a community organizer in Chicago before earning his law degree .".split(" "),
+ u"He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004 .".split(" "),
+ u"He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004 , running unsuccessfully for the United States House of Representatives in 2000 .".split(" ")]
+
# Make socket
transport = TSocket.TSocket(server, port)
@@ -73,5 +81,14 @@
except Exception as e:
print e
+print
+
+for sentence in more_tokenized_sentences:
+ try:
+ tree = client.parse_tokens(sentence, outputOptions)
+ sys.stdout.write(tree.tree.strip()+"\n")
+ except Exception as e:
+ print e
+
# All done
transport.close()
View
23 src/StanfordCoreNLPHandler.java
@@ -3,6 +3,8 @@
import org.apache.thrift.TApplicationException;
import org.apache.thrift.TException;
+import coref.StanfordCorefThrift;
+
import parser.StanfordParserThrift;
import CoreNLP.*;
@@ -17,11 +19,16 @@
{
private StanfordParserThrift parser;
private StanfordNERThrift ner;
+ private StanfordCorefThrift coref;
public StanfordCoreNLPHandler()
{
+ System.err.println("Initializing Parser...");
parser = new StanfordParserThrift("");
+ System.err.println("Initializing Named Entity Recognizer...");
ner = new StanfordNERThrift();
+ System.err.println("Initializing Coreference Resolver...");
+ coref = new StanfordCorefThrift();
}
/* Begin Stanford Parser methods */
@@ -67,7 +74,21 @@ public ParseTree parse_tokens(List<String> tokens, List<String> outputFormat) th
{
return ner.getNamedEntitiesFromTrees(trees);
}
- /* End Stanford NER Methods */
+ /* End Stanford NER methods */
+
+ /* Begin Stanford Coref methods */
+// public List<String> resolve_coreferences_in_text(String text)
+// {
+// List<ParseTree> parseTreeObjects = parser.parse_text(text, null);
+// List<String> parseTrees = CoreNLPThriftUtil.ParseTreeObjectsToString(parseTreeObjects);
+// return coref.getCoreferencesFromTrees(parseTrees, ner);
+// }
+
+ public List<String> resolve_coreferences_in_trees(List<String> trees)
+ {
+ return coref.getCoreferencesFromTrees(trees, ner);
+ }
+ /* End Stanford Coref methods */
public void ping()
{
View
130 src/coref/StanfordCorefThrift.java
@@ -6,11 +6,14 @@
import java.util.Map;
import java.util.Properties;
+import ner.StanfordNERThrift;
+
import edu.stanford.nlp.dcoref.CorefChain;
import edu.stanford.nlp.dcoref.CorefCoreAnnotations;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.pipeline.Annotation;
+import edu.stanford.nlp.pipeline.DeterministicCorefAnnotator;
import edu.stanford.nlp.pipeline.StanfordCoreNLP;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
@@ -19,25 +22,32 @@
public class StanfordCorefThrift
{
- public static List<String> getCoreferencesFromText(String text)
+// private StanfordCoreNLP pipeline;
+ private DeterministicCorefAnnotator coref;
+
+ public StanfordCorefThrift()
{
- Properties props = new Properties();
- props.put("annotators", "tokenize, ssplit, parse, lemma, ner, dcoref");
- StanfordCoreNLP pipeline = new StanfordCoreNLP(props, true);
- Annotation annotation = new Annotation(text);
- pipeline.annotate(annotation);
-
-// List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
-// for (CoreMap sentence : sentences)
-// {
-// System.out.println(sentences.indexOf(sentence)+1 + ": " + sentence);
-// }
-
-// newStyleCoreferenceGraphOutput(annotation);
+// Properties props = new Properties();
+// props.put("annotators", "tokenize, ssplit, parse, lemma, ner, dcoref");
+// pipeline = new StanfordCoreNLP(props, true);
+ coref = new DeterministicCorefAnnotator(new Properties());
+ }
+
+// public List<String> getCoreferencesFromText(String text)
+// {
+// Annotation annotation = new Annotation(text);
+// pipeline.annotate(annotation);
+// return MUCStyleOutput(annotation);
+// }
+
+ public List<String> getCoreferencesFromTrees(List<String> parseTrees, StanfordNERThrift ner)
+ {
+ Annotation annotation = ner.getNamedEntityAnnotationFromTrees(parseTrees);
+ coref.annotate(annotation);
return MUCStyleOutput(annotation);
}
- public static void newStyleCoreferenceGraphOutput(Annotation annotation)
+ private void newStyleCoreferenceGraphOutput(Annotation annotation)
{
// display the new-style coreference graph
//List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
@@ -70,7 +80,7 @@ public static void newStyleCoreferenceGraphOutput(Annotation annotation)
}
}
- private static List<String> MUCStyleOutput(Annotation annotation)
+ private List<String> MUCStyleOutput(Annotation annotation)
{
Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
Map<Integer, Map<Integer, Pair<CorefChain.CorefMention, CorefChain.CorefMention>>> mentionMap =
@@ -159,66 +169,36 @@ public static void newStyleCoreferenceGraphOutput(Annotation annotation)
return mucOutput;
}
-
- // public static List<String> getCoreferencesFromTrees(List<String> parseTrees)
- // {
- // StanfordNERThrift ner = new StanfordNERThrift();
- // Annotation annotation = ner.getNamedEntityAnnotationFromTrees(parseTrees);
- //
- // DeterministicCorefAnnotator coref = new DeterministicCorefAnnotator(new Properties());
- // coref.annotate(annotation);
- //
- // // display the new-style coreference graph
- // Map<Integer, CorefChain> corefChains = annotation.get(CorefCoreAnnotations.CorefChainAnnotation.class);
- // List<CoreMap> sentences = annotation.get(CoreAnnotations.SentencesAnnotation.class);
- // if (corefChains != null && sentences != null)
- // {
- // for (CorefChain chain : corefChains.values())
- // {
- // CorefChain.CorefMention representative = chain.getRepresentativeMention();
- // for (CorefChain.CorefMention mention : chain.getMentionsInTextualOrder())
- // {
- // if (mention == representative)
- // continue;
- // // all offsets start at 1!
- // System.out.println("\t(" + mention.sentNum + "," +
- // mention.headIndex + ",[" +
- // mention.startIndex + "," +
- // mention.endIndex + ")) -> (" +
- // representative.sentNum + "," +
- // representative.headIndex + ",[" +
- // representative.startIndex + "," +
- // representative.endIndex + ")), that is: \"" +
- // mention.mentionSpan + "\" -> \"" +
- // representative.mentionSpan + "\"");
- // }
- // }
- // }
- // return null;
- // }
-
-
-
+
/**
* @param args
*/
- public static void main(String[] args)
- {
- //String testSentences = "By proposing a meeting date, Eastern moved one step closer toward reopening current high-cost contract agreements with its unions.";
- String testSentences = "Barack Hussein Obama II is the 44th and current President of the United States, in office since 2009. "
- + "He is the first African American to hold the office. "
- + "Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review. "
- + "He was a community organizer in Chicago before earning his law degree. "
- + "He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004. "
- + "He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000.";
- List<String> results = getCoreferencesFromText(testSentences);
- for (String s : results)
- {
- System.out.println(s);
- }
- //List<String> trees = new ArrayList<String>();
- //trees.add("(ROOT (S (NP (NP (NNS Members)) (PP (IN of) (NP (QP (RB about) (CD 37)) (NNS species)))) (VP (VBP are) (VP (VBN referred) (PP (TO to) (NP (NP (RB as) (NNS foxes)) (, ,) (SBAR (WHPP (IN of) (WHNP (WDT which))) (S (NP (QP (RB only) (CD 12)) (NNS species)) (ADVP (RB actually)) (VP (VBP belong) (PP (TO to) (NP (NP (DT the) (NNP Vulpes) (NNS genus)) (PP (IN of) (NP (`` ``) (JJ true) (NNS foxes) ('' '')))))))))))) (. .)))");
- //String[] tokensArr = "Members of about 37 species are referred to as foxes , of which only 12 species actually belong to the Vulpes genus of `` true foxes '' .".split(" ");
- //getCoreferencesFromTrees(trees);
- }
+// public static void main(String[] args)
+// {
+// StanfordCorefThrift coref = new StanfordCorefThrift();
+//
+// //String testSentences = "By proposing a meeting date, Eastern moved one step closer toward reopening current high-cost contract agreements with its unions.";
+// String testSentences = "Barack Hussein Obama II is the 44th and current President of the United States, in office since 2009. "
+// + "He is the first African American to hold the office. "
+// + "Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review. "
+// + "He was a community organizer in Chicago before earning his law degree. "
+// + "He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004. "
+// + "He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000.";
+// List<String> results = coref.getCoreferencesFromText(testSentences);
+// for (String s : results)
+// {
+// System.out.println(s);
+// }
+//
+// System.out.println();
+//
+// List<String> trees = new ArrayList<String>();
+// trees.add("(ROOT (S (NP (NP (NNS Members)) (PP (IN of) (NP (QP (RB about) (CD 37)) (NNS species)))) (VP (VBP are) (VP (VBN referred) (PP (TO to) (NP (NP (RB as) (NNS foxes)) (, ,) (SBAR (WHPP (IN of) (WHNP (WDT which))) (S (NP (QP (RB only) (CD 12)) (NNS species)) (ADVP (RB actually)) (VP (VBP belong) (PP (TO to) (NP (NP (DT the) (NNP Vulpes) (NNS genus)) (PP (IN of) (NP (`` ``) (JJ true) (NNS foxes) ('' '')))))))))))) (. .)))");
+// //String[] tokensArr = "Members of about 37 species are referred to as foxes , of which only 12 species actually belong to the Vulpes genus of `` true foxes '' .".split(" ");
+// results = coref.getCoreferencesFromTrees(trees);
+// for (String s : results)
+// {
+// System.out.println(s);
+// }
+// }
}
View
26 src/general/CoreNLPThriftUtil.java
@@ -1,6 +1,6 @@
package general;
-//import CoreNLP.ParseTree;
+import CoreNLP.ParseTree;
import java.util.ArrayList;
import java.util.List;
@@ -113,15 +113,15 @@ public static Annotation getAnnotationFromTokens(List<String> tokens)
return sentencesCopy;
}
-// public static List<String> ParseTreeObjectsToString(List<ParseTree> parseTrees)
-// {
-// List<String> trees = new ArrayList<String>();
-// for (ParseTree tree : parseTrees)
-// {
-// trees.add(tree.tree);
-// }
-// return trees;
-// }
+ public static List<String> ParseTreeObjectsToString(List<ParseTree> parseTrees)
+ {
+ List<String> trees = new ArrayList<String>();
+ for (ParseTree tree : parseTrees)
+ {
+ trees.add(tree.tree);
+ }
+ return trees;
+ }
public static String closeHTMLTags(String original)
{
@@ -140,10 +140,4 @@ public static String closeHTMLTags(String original)
}
return improved;
}
-
- public static void main(String[] args)
- {
- String test = "current President of <COREF ID=\"9\">the <COREF ID=\"4\" REF=\"9\">United States</COREF> ";
- System.out.println(CoreNLPThriftUtil.closeHTMLTags(test));
- }
}
View
124 src/ner/StanfordNERThrift.java
@@ -5,24 +5,14 @@
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
-import java.util.Properties;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
+import java.util.Stack;
import edu.stanford.nlp.ling.CoreAnnotations;
-import edu.stanford.nlp.ling.CoreAnnotations.TextAnnotation;
import edu.stanford.nlp.ling.CoreLabel;
-import edu.stanford.nlp.ling.Sentence;
-import edu.stanford.nlp.ling.CoreAnnotations.AfterAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetBeginAnnotation;
-import edu.stanford.nlp.ling.CoreAnnotations.CharacterOffsetEndAnnotation;
import edu.stanford.nlp.pipeline.Annotation;
import edu.stanford.nlp.pipeline.NERCombinerAnnotator;
-import edu.stanford.nlp.pipeline.ParserAnnotatorUtils;
-import edu.stanford.nlp.pipeline.StanfordCoreNLP;
-import edu.stanford.nlp.trees.Tree;
import edu.stanford.nlp.util.CoreMap;
-import edu.stanford.nlp.util.StringUtils;
+
import general.CoreNLPThriftUtil;
@@ -63,34 +53,18 @@ public StanfordNERThrift()
// {
// Annotation annotation = new Annotation(text);
// pipeline.annotate(annotation);
-//
-// List<NamedEntity> allFoundEntities = new ArrayList<NamedEntity>();
-//
+
// List<CoreMap> sentences = CoreNLPThriftUtil.adjustCharacterOffsets(annotation.get(CoreAnnotations.SentencesAnnotation.class), false);
-//
-// for (CoreMap sentence : sentences)
-// {
-// List<NamedEntity> thisSentencesEntities = toNamedEntityObjects(sentence);
-// //System.out.println(thisSentencesEntities);
-// allFoundEntities.addAll(thisSentencesEntities);
-// }
-// return allFoundEntities;
+// return toNamedEntityObjects(sentences);
// }
public List<NamedEntity> getNamedEntitiesFromTrees(List<String> parseTrees)
{
- List<NamedEntity> allFoundEntities = new ArrayList<NamedEntity>();
Annotation sentences = getNamedEntityAnnotationFromTrees(parseTrees);
- List<CoreMap> sentenceMap = sentences.get(CoreAnnotations.SentencesAnnotation.class);
- for (CoreMap sentence : sentenceMap)
- {
- List<NamedEntity> thisSentencesEntities = toNamedEntityObjects(sentence);
- allFoundEntities.addAll(thisSentencesEntities);
- }
-
- return allFoundEntities;
+ List<CoreMap> sentenceMap = sentences.get(CoreAnnotations.SentencesAnnotation.class);
+ return toNamedEntityObjects(sentenceMap);
}
public Annotation getNamedEntityAnnotationFromTrees(List<String> parseTrees)
@@ -101,72 +75,46 @@ public Annotation getNamedEntityAnnotationFromTrees(List<String> parseTrees)
return sentences;
}
- private List<NamedEntity> toNamedEntityObjects(CoreMap results)
+ private List<NamedEntity> toNamedEntityObjects(List<CoreMap> results)
{
List<NamedEntity> entities = new ArrayList<NamedEntity>();
- String inline = "";
-
- final String background = "O";
- String prevTag = background;
-
- List<CoreLabel> tokens = results.get(CoreAnnotations.TokensAnnotation.class);
- for (Iterator<CoreLabel> wordIter = tokens.iterator(); wordIter.hasNext();)
+
+ Stack<CoreLabel> namedEntityStack = new Stack<CoreLabel>();
+ for (CoreMap sentence : results)
{
- CoreLabel wi = wordIter.next();
- String tag = StringUtils.getNotNullString(wi.ner());
- String current = StringUtils.getNotNullString(wi.get(CoreAnnotations.OriginalTextAnnotation.class));
- Integer beginPosition = wi.get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
- Integer endPosition = wi.get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
- if (!tag.equals(prevTag))
+ List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
+ for (Iterator<CoreLabel> wordIter = tokens.iterator(); wordIter.hasNext();)
{
- if (!prevTag.equals(background) && !tag.equals(background))
- {
- inline += "," + endPosition + ")" + "(" + tag + ",";
- inline += current;
- }
- else if (!prevTag.equals(background))
+ CoreLabel wi = wordIter.next();
+ if (namedEntityStack.empty() || wi.ner().equals(namedEntityStack.peek().ner()))
{
- inline += "," + endPosition + ")" + "(";
- inline += current;
- }
- else if (!tag.equals(background))
- {
- inline += "(" + tag + "," + beginPosition + ",";
- inline += current;
+ namedEntityStack.push(wi);
}
- }
- else
- {
- if (!tag.equals(background))
+ else
{
- inline += current;
+ String tag = "";
+ String entity = "";
+ int startIndex = namedEntityStack.peek().beginPosition();
+ int endIndex = 0;
+ while (!namedEntityStack.empty())
+ {
+ CoreLabel popped = namedEntityStack.pop();
+ tag = popped.ner();
+ entity = popped.word() + " " + entity;
+ if (popped.endPosition() > endIndex)
+ {
+ endIndex = popped.endPosition();
+ }
+ }
+ if (!tag.equals("O"))
+ {
+ entities.add(new NamedEntity(entity.trim(), tag, startIndex, endIndex));
+ }
+ namedEntityStack.push(wi);
}
}
- if (!tag.equals(background) && !wordIter.hasNext())
- {
- inline += "(" + tag + "," + beginPosition + ",";
- inline += current;
- prevTag = background;
- }
- else
- {
- prevTag = tag;
- }
- inline += StringUtils.getNotNullString(wi.get(AfterAnnotation.class));
- }
-
- Pattern pattern = Pattern.compile("\\(([A-Z].+?)\\)");
- Matcher matches = pattern.matcher(inline);
- while (matches.find())
- {
- String[] info = matches.group(0).split("\\,");
- for (int i = 0; i < info.length; i++)
- {
- info[i] = info[i].replaceAll("(^\\()|(\\))$", "");
- }
- entities.add(new NamedEntity(info[2].trim(), info[0], Integer.parseInt(info[1]), Integer.parseInt(info[3])));
}
-
+
return entities;
}
}
View
BIN  stanford-corenlp-wrapper.jar
Binary file not shown
Please sign in to comment.
Something went wrong with that request. Please try again.