Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Now with limited Tregex-ability.

  • Loading branch information...
commit 0bd2e64e5b406df85e902b915d46e32b0265e353 1 parent 94591b1
Diane M. Napolitano dmnapolitano authored
1  README.md
View
@@ -9,6 +9,7 @@ Things you can do with it:
- Parse Trees **See README_parser.md**
- Named Entities **See README_ner.md**
- Resolved Coreferences **See README_coref.md**
+ - Evaluate Stanford Tregex patterns over parse trees **See README_tregex.md**
* Send unicode (optional), receive unicode (always).
* Do these things in a multithreaded way without having to think about it too much (Thrift provides ten threads).
* Communicate with the server using the language of your choice (with some additional coding if your choice isn't "Java" or "Python").
8 README_tregex.md
View
@@ -0,0 +1,8 @@
+How to Run Regular Expressions over Parse Trees with Stanford Tregex via this Apache Thrift Server
+==================================================================================================
+
+## How to Interact with the Methods and Data Structures
+
+Presently, there is only one method, `evaluate_tregex_pattern(parse_tree, tregex_pattern)` where `parse_tree` is a Java `String`/Python `str` or `unicode` containing a single sentence's parse tree (probably the output from the Stanford Parser), and `tregex_pattern` is a Java `String`/Python `str` or `unicode` containing a valid Tregex pattern that you wish to evaluate on this tree.
+The return value is a Java `ArrayList<String>`/Python list of `unicode` strings, where each element is one subtree of the parse tree that matched the specified Tregex pattern.
+I'm pretty sure `parse_tree` can be in ANY of the Stanford Parser output formats, although the only one I have tried is the `oneline` format.
3  corenlp.thrift
View
@@ -32,5 +32,6 @@ service StanfordCoreNLP
list<NamedEntity> get_entities_from_trees(1:list<string> trees),
list<string> resolve_coreferences_in_text(1:string text),
list<string> resolve_coreferences_in_tokenized_sentences(1:list<string> sentencesWithTokensSeparatedBySpace),
- list<string> resolve_coreferences_in_trees(1:list<string> trees)
+ list<string> resolve_coreferences_in_trees(1:list<string> trees),
+ list<string> evaluate_tregex_pattern(1:string parseTree, 2:string tregexPattern)
}
7 gen-py/corenlp/StanfordCoreNLP-remote
View
@@ -34,6 +34,7 @@ if len(sys.argv) <= 1 or sys.argv[1] == '--help':
print ' resolve_coreferences_in_text(string text)'
print ' resolve_coreferences_in_tokenized_sentences( sentencesWithTokensSeparatedBySpace)'
print ' resolve_coreferences_in_trees( trees)'
+ print ' evaluate_tregex_pattern(string parseTree, string tregexPattern)'
print ''
sys.exit(0)
@@ -151,6 +152,12 @@ elif cmd == 'resolve_coreferences_in_trees':
sys.exit(1)
pp.pprint(client.resolve_coreferences_in_trees(eval(args[0]),))
+elif cmd == 'evaluate_tregex_pattern':
+ if len(args) != 2:
+ print 'evaluate_tregex_pattern requires 2 args'
+ sys.exit(1)
+ pp.pprint(client.evaluate_tregex_pattern(args[0],args[1],))
+
else:
print 'Unrecognized method %s' % cmd
sys.exit(1)
216 gen-py/corenlp/StanfordCoreNLP.py
View
@@ -91,6 +91,14 @@ def resolve_coreferences_in_trees(self, trees):
"""
pass
+ def evaluate_tregex_pattern(self, parseTree, tregexPattern):  # Interface-level declaration of the evaluate_tregex_pattern call; the body below is only `pass`.
+ """
+ Parameters:
+ - parseTree
+ - tregexPattern
+ """
+ pass  # no behaviour here; presumably overridden by the client/handler implementations -- TODO confirm against the enclosing class
+
class Client(Iface):
def __init__(self, iprot, oprot=None):
@@ -409,6 +417,38 @@ def recv_resolve_coreferences_in_trees(self, ):
return result.success
raise TApplicationException(TApplicationException.MISSING_RESULT, "resolve_coreferences_in_trees failed: unknown result");
+ def evaluate_tregex_pattern(self, parseTree, tregexPattern):  # Client-side call: send the request, then return whatever the matching recv_ method returns.
+ """
+ Parameters:
+ - parseTree
+ - tregexPattern
+ """
+ self.send_evaluate_tregex_pattern(parseTree, tregexPattern)  # writes and flushes the request message
+ return self.recv_evaluate_tregex_pattern()  # reads the reply and returns its `success` value (or raises)
+
+ def send_evaluate_tregex_pattern(self, parseTree, tregexPattern):  # Serialize an 'evaluate_tregex_pattern' CALL message to self._oprot and flush it.
+ self._oprot.writeMessageBegin('evaluate_tregex_pattern', TMessageType.CALL, self._seqid)  # message header: method name, CALL type, current sequence id
+ args = evaluate_tregex_pattern_args()  # argument struct that carries both parameters
+ args.parseTree = parseTree
+ args.tregexPattern = tregexPattern
+ args.write(self._oprot)  # struct serializes itself onto the output protocol
+ self._oprot.writeMessageEnd()
+ self._oprot.trans.flush()  # flush the underlying transport so the request is actually sent
+
+ def recv_evaluate_tregex_pattern(self, ):  # Read the reply from self._iprot; return result.success, or raise a TApplicationException.
+ (fname, mtype, rseqid) = self._iprot.readMessageBegin()  # only mtype is used below
+ if mtype == TMessageType.EXCEPTION:  # reply is flagged as an exception message
+ x = TApplicationException()
+ x.read(self._iprot)  # the exception object populates itself from the input protocol
+ self._iprot.readMessageEnd()
+ raise x
+ result = evaluate_tregex_pattern_result()  # result struct whose only field is `success`
+ result.read(self._iprot)
+ self._iprot.readMessageEnd()
+ if result.success is not None:
+ return result.success
+ raise TApplicationException(TApplicationException.MISSING_RESULT, "evaluate_tregex_pattern failed: unknown result");  # reached when `success` was never set in the reply
+
class Processor(Iface, TProcessor):
def __init__(self, handler):
@@ -425,6 +465,7 @@ def __init__(self, handler):
self._processMap["resolve_coreferences_in_text"] = Processor.process_resolve_coreferences_in_text
self._processMap["resolve_coreferences_in_tokenized_sentences"] = Processor.process_resolve_coreferences_in_tokenized_sentences
self._processMap["resolve_coreferences_in_trees"] = Processor.process_resolve_coreferences_in_trees
+ self._processMap["evaluate_tregex_pattern"] = Processor.process_evaluate_tregex_pattern
def process(self, iprot, oprot):
(name, type, seqid) = iprot.readMessageBegin()
@@ -558,6 +599,17 @@ def process_resolve_coreferences_in_trees(self, seqid, iprot, oprot):
oprot.writeMessageEnd()
oprot.trans.flush()
+ def process_evaluate_tregex_pattern(self, seqid, iprot, oprot):  # Server side: read the call's arguments, invoke the handler, and write a REPLY message.
+ args = evaluate_tregex_pattern_args()
+ args.read(iprot)  # argument struct populates itself from the input protocol
+ iprot.readMessageEnd()
+ result = evaluate_tregex_pattern_result()
+ result.success = self._handler.evaluate_tregex_pattern(args.parseTree, args.tregexPattern)  # handler's return value becomes the reply's `success` field
+ oprot.writeMessageBegin("evaluate_tregex_pattern", TMessageType.REPLY, seqid)  # reply reuses the seqid passed in by the caller
+ result.write(oprot)
+ oprot.writeMessageEnd()
+ oprot.trans.flush()  # flush the transport so the reply is sent
+
# HELPER FUNCTIONS AND STRUCTURES
@@ -2184,3 +2236,167 @@ def __eq__(self, other):
def __ne__(self, other):
return not (self == other)
+
+class evaluate_tregex_pattern_args(object):  # Argument struct for the evaluate_tregex_pattern call: carries parseTree and tregexPattern.
+ """
+ Attributes:
+ - parseTree
+ - tregexPattern
+ """
+
+ __slots__ = [  # the two attribute names; also iterated by __repr__ and __eq__ below
+ 'parseTree',
+ 'tregexPattern',
+ ]
+
+ thrift_spec = (  # one entry per field; the tuple index matches the field id (see the # 0 / # 1 / # 2 markers)
+ None, # 0
+ (1, TType.STRING, 'parseTree', None, None, ), # 1
+ (2, TType.STRING, 'tregexPattern', None, None, ), # 2
+ )
+
+ def __init__(self, parseTree=None, tregexPattern=None,):  # both fields default to None, i.e. "not set"
+ self.parseTree = parseTree
+ self.tregexPattern = tregexPattern
+
+ def read(self, iprot):  # Populate this struct from the protocol object iprot.
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:  # accelerated protocol + readable C transport + fastbinary available
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))  # decode in one call driven by thrift_spec
+ return
+ iprot.readStructBegin()  # otherwise read field by field
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:  # STOP marks the end of the struct's fields
+ break
+ if fid == 1:  # field 1 -> parseTree
+ if ftype == TType.STRING:
+ self.parseTree = iprot.readString().decode('utf-8')  # value is decoded from UTF-8 after reading
+ else:
+ iprot.skip(ftype)  # unexpected type for this field id: skip the value
+ elif fid == 2:  # field 2 -> tregexPattern
+ if ftype == TType.STRING:
+ self.tregexPattern = iprot.readString().decode('utf-8')  # value is decoded from UTF-8 after reading
+ else:
+ iprot.skip(ftype)
+ else:
+ iprot.skip(ftype)  # unknown field id: skip the value
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):  # Serialize this struct to the protocol object oprot.
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:  # accelerated protocol + fastbinary available
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))  # encode in one call and write the bytes to the transport
+ return
+ oprot.writeStructBegin('evaluate_tregex_pattern_args')
+ if self.parseTree is not None:  # fields left as None are not written
+ oprot.writeFieldBegin('parseTree', TType.STRING, 1)
+ oprot.writeString(self.parseTree.encode('utf-8'))  # value is encoded to UTF-8 before writing
+ oprot.writeFieldEnd()
+ if self.tregexPattern is not None:
+ oprot.writeFieldBegin('tregexPattern', TType.STRING, 2)
+ oprot.writeString(self.tregexPattern.encode('utf-8'))  # value is encoded to UTF-8 before writing
+ oprot.writeFieldEnd()
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):  # performs no checks; returns None unconditionally
+ return
+
+
+ def __repr__(self):  # ClassName(attr=value, ...) built from __slots__
+ L = ['%s=%r' % (key, getattr(self, key))  # one "name=repr(value)" entry per slot
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):  # equal when `other` is an instance of this class and every slot value compares equal
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):  # defined explicitly as the negation of __eq__
+ return not (self == other)
+
+
+class evaluate_tregex_pattern_result(object):  # Result struct for the evaluate_tregex_pattern call: a single `success` field holding a list of strings.
+ """
+ Attributes:
+ - success
+ """
+
+ __slots__ = [  # the only attribute; also iterated by __repr__ and __eq__ below
+ 'success',
+ ]
+
+ thrift_spec = (  # single entry: field id 0, a LIST whose element type is STRING
+ (0, TType.LIST, 'success', (TType.STRING,None), None, ), # 0
+ )
+
+ def __init__(self, success=None,):  # None means "no result set"
+ self.success = success
+
+ def read(self, iprot):  # Populate this struct from the protocol object iprot.
+ if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:  # accelerated protocol + readable C transport + fastbinary available
+ fastbinary.decode_binary(self, iprot.trans, (self.__class__, self.thrift_spec))  # decode in one call driven by thrift_spec
+ return
+ iprot.readStructBegin()  # otherwise read field by field
+ while True:
+ (fname, ftype, fid) = iprot.readFieldBegin()
+ if ftype == TType.STOP:  # STOP marks the end of the struct's fields
+ break
+ if fid == 0:  # field 0 -> success
+ if ftype == TType.LIST:
+ self.success = []
+ (_etype108, _size105) = iprot.readListBegin()  # element type is read but not used; the size drives the loop
+ for _i109 in xrange(_size105):  # xrange: this generated module targets Python 2
+ _elem110 = iprot.readString().decode('utf-8')  # each element is decoded from UTF-8 after reading
+ self.success.append(_elem110)
+ iprot.readListEnd()
+ else:
+ iprot.skip(ftype)  # unexpected type for this field id: skip the value
+ else:
+ iprot.skip(ftype)  # unknown field id: skip the value
+ iprot.readFieldEnd()
+ iprot.readStructEnd()
+
+ def write(self, oprot):  # Serialize this struct to the protocol object oprot.
+ if oprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and self.thrift_spec is not None and fastbinary is not None:  # accelerated protocol + fastbinary available
+ oprot.trans.write(fastbinary.encode_binary(self, (self.__class__, self.thrift_spec)))  # encode in one call and write the bytes to the transport
+ return
+ oprot.writeStructBegin('evaluate_tregex_pattern_result')
+ if self.success is not None:  # a None result is not written at all
+ oprot.writeFieldBegin('success', TType.LIST, 0)
+ oprot.writeListBegin(TType.STRING, len(self.success))
+ for iter111 in self.success:
+ oprot.writeString(iter111.encode('utf-8'))  # each element is encoded to UTF-8 before writing
+ oprot.writeListEnd()
+ oprot.writeFieldEnd()
+ oprot.writeFieldStop()
+ oprot.writeStructEnd()
+
+ def validate(self):  # performs no checks; returns None unconditionally
+ return
+
+
+ def __repr__(self):  # ClassName(attr=value, ...) built from __slots__
+ L = ['%s=%r' % (key, getattr(self, key))  # one "name=repr(value)" entry per slot
+ for key in self.__slots__]
+ return '%s(%s)' % (self.__class__.__name__, ', '.join(L))
+
+ def __eq__(self, other):  # equal when `other` is an instance of this class and every slot value compares equal
+ if not isinstance(other, self.__class__):
+ return False
+ for attr in self.__slots__:
+ my_val = getattr(self, attr)
+ other_val = getattr(other, attr)
+ if my_val != other_val:
+ return False
+ return True
+
+ def __ne__(self, other):  # defined explicitly as the negation of __eq__
+ return not (self == other)
+
52 scripts/parser_client.py
View
@@ -68,32 +68,32 @@
#outputOptions = ["-outputFormat", "oneline"] # Same as specifying "None", as above.
-#try:
-# parse_trees = client.parse_text(arbitrary_text, outputOptions)
-# for result in parse_trees:
-# sys.stdout.write(result.tree.strip() + " [" + str(result.score) + "]\n")
-#except Exception as e:
-# print e
-
-#print
-
-#for sentence in tokenized_sentences:
-# try:
-# tree = client.parse_tokens(sentence, outputOptions)
-# sys.stdout.write(tree.tree.strip() + " [" + str(tree.score) + "]\n")
-# except Exception as e:
-# print e
-
-#print
-
-#for sentence in more_tokenized_sentences:
-# try:
-# tree = client.parse_tokens(sentence, outputOptions)
-# sys.stdout.write(tree.tree.strip()+"\n")
-# except Exception as e:
-# print e
-
-tree = client.parse_tokens(tokenized_sentence, outputOptions)
+try:
+ parse_trees = client.parse_text(arbitrary_text, outputOptions)
+ for result in parse_trees:
+ sys.stdout.write(result.tree.strip() + " [" + str(result.score) + "]\n")
+except Exception as e:
+ print e
+
+print
+
+for sentence in tokenized_sentences:
+ try:
+ tree = client.parse_tokens(sentence, outputOptions)
+ sys.stdout.write(tree.tree.strip() + " [" + str(tree.score) + "]\n")
+ except Exception as e:
+ print e
+
+print
+
+for sentence in more_tokenized_sentences:
+ try:
+ tree = client.parse_tokens(sentence, outputOptions)
+ sys.stdout.write(tree.tree.strip()+"\n")
+ except Exception as e:
+ print e
+
+tree = client.parse_tokens(tokenized_sentence, None)
sys.stdout.write(tree.tree.strip() + "\n")
tree = client.parse_tagged_sentence(tagged_sentence, outputOptions, "/")
12 src/StanfordCoreNLPHandler.java
View
@@ -7,6 +7,7 @@
import edu.stanford.nlp.pipeline.Annotation;
import parser.StanfordParserThrift;
+import tregex.StanfordTregexThrift;
import CoreNLP.*;
@@ -22,6 +23,7 @@
private StanfordParserThrift parser;
private StanfordNERThrift ner;
private StanfordCorefThrift coref;
+ private StanfordTregexThrift tregex;
public StanfordCoreNLPHandler()
{
@@ -31,6 +33,8 @@ public StanfordCoreNLPHandler()
ner = new StanfordNERThrift();
System.err.println("Initializing Coreference Resolver...");
coref = new StanfordCorefThrift();
+ System.err.println("Initializing Tregex...");
+ tregex = new StanfordTregexThrift();
}
@@ -123,6 +127,14 @@ public ParseTree parse_tagged_sentence(String taggedSentence, List<String> outpu
/* End Stanford Coref methods */
+ /* Begin Stanford Tregex methods */
+ public List<String> evaluate_tregex_pattern(String parseTree, String tregexPattern) // Handler entry point for the evaluate_tregex_pattern service method; pure delegation to the tregex helper.
+ {
+ return tregex.evaluateTregexPattern(parseTree, tregexPattern); // `tregex` is the StanfordTregexThrift instance created in the constructor
+ }
+ /* End Stanford Tregex methods */
+
+
public void ping()
{
System.out.println("ping()");
29 src/tregex/StanfordTregexThrift.java
View
@@ -0,0 +1,29 @@
+package tregex;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.tregex.TregexMatcher;
+import edu.stanford.nlp.trees.tregex.TregexPattern;
+
+/**
+ * Evaluates Stanford Tregex patterns against single parse trees on behalf
+ * of the Thrift handler.
+ */
+public class StanfordTregexThrift
+{
+    public StanfordTregexThrift()
+    {
+        // Nothing to initialize: every call compiles its own pattern.
+    }
+
+    /**
+     * Finds every match of a Tregex pattern within one parse tree.
+     *
+     * @param parseTree     textual parse tree of a single sentence
+     * @param tregexPattern Tregex pattern to evaluate against that tree
+     * @return the matched subtrees, each rendered with
+     *         {@code Tree.pennString()}, in the order the matcher finds
+     *         them; an empty list when nothing matches or when
+     *         {@code parseTree} is null or contains no readable tree
+     *         (never null, so the Thrift reply always has a result)
+     *
+     * Any exception raised by {@code TregexPattern.compile} for a bad
+     * pattern propagates to the caller unchanged.
+     */
+    public List<String> evaluateTregexPattern(String parseTree, String tregexPattern)
+    {
+        List<String> foundMatches = new ArrayList<String>();
+
+        // Compile first, as before, so an invalid pattern is still
+        // reported even when the tree text turns out to be unusable.
+        TregexPattern pattern = TregexPattern.compile(tregexPattern);
+
+        // Tree.valueOf can yield null when the text holds no tree (for
+        // example an empty string); matching against a null root would
+        // fail, so treat that case as "no matches".
+        Tree tree = (parseTree == null) ? null : Tree.valueOf(parseTree);
+        if (tree == null)
+        {
+            return foundMatches;
+        }
+
+        TregexMatcher matcher = pattern.matcher(tree);
+        while (matcher.find())
+        {
+            foundMatches.add(matcher.getMatch().pennString());
+        }
+        return foundMatches;
+    }
+}
BIN  stanford-corenlp-wrapper.jar
View
Binary file not shown
Please sign in to comment.
Something went wrong with that request. Please try again.