Skip to content
Browse files

NER now returns the index corresponding to the sentence each named en…

…tity was found in.
  • Loading branch information...
1 parent be489fb commit d7f63b5405c4cca85378b46a09a0a6bb462bfaa9 @dmnapolitano dmnapolitano committed
View
1 README_ner.md
@@ -9,6 +9,7 @@ The core return type here is a data structure called `NamedEntity` which has fou
* `tag`: A string containing the tag assigned to this named entity (PERSON, LOCATION, etc.). Should always be upper-case.
* `startOffset`: All named entities exist in some sentence. This integer represents the starting character offset of this named entity in its sentence.
* `endOffset`: Like `startOffset`, but gives the character offset of the last character of the named entity in its sentence.
+* `sentence_number`: An integer index (starting from 0) into the original list of sentences provided to Stanford NER, identifying the sentence in which this named entity occurred.
In order to get these `NamedEntity` objects, you have three choices, depending on what kind of data you'd like to recognize named entities in. The return type for ALL of these is a Java `ArrayList`/Python list containing `NamedEntity` objects corresponding to entities recognized across the ENTIRETY of your text, no matter how many sentences, parse trees, etc. were passed in. If you'd like to recognize named entities in:
View
3 corenlp.thrift
@@ -12,7 +12,8 @@ struct NamedEntity
1:string entity,
2:string tag,
3:i32 startOffset,
- 4:i32 endOffset
+ 4:i32 endOffset,
+ 5:i32 sentence_number
}
struct TaggedToken
View
102 gen-java/CoreNLP/NamedEntity.java
@@ -37,6 +37,7 @@
private static final org.apache.thrift.protocol.TField TAG_FIELD_DESC = new org.apache.thrift.protocol.TField("tag", org.apache.thrift.protocol.TType.STRING, (short)2);
private static final org.apache.thrift.protocol.TField START_OFFSET_FIELD_DESC = new org.apache.thrift.protocol.TField("startOffset", org.apache.thrift.protocol.TType.I32, (short)3);
private static final org.apache.thrift.protocol.TField END_OFFSET_FIELD_DESC = new org.apache.thrift.protocol.TField("endOffset", org.apache.thrift.protocol.TType.I32, (short)4);
+ private static final org.apache.thrift.protocol.TField SENTENCE_NUMBER_FIELD_DESC = new org.apache.thrift.protocol.TField("sentence_number", org.apache.thrift.protocol.TType.I32, (short)5);
private static final Map<Class<? extends IScheme>, SchemeFactory> schemes = new HashMap<Class<? extends IScheme>, SchemeFactory>();
static {
@@ -48,13 +49,15 @@
public String tag; // required
public int startOffset; // required
public int endOffset; // required
+ public int sentence_number; // required
/** The set of fields this struct contains, along with convenience methods for finding and manipulating them. */
public enum _Fields implements org.apache.thrift.TFieldIdEnum {
ENTITY((short)1, "entity"),
TAG((short)2, "tag"),
START_OFFSET((short)3, "startOffset"),
- END_OFFSET((short)4, "endOffset");
+ END_OFFSET((short)4, "endOffset"),
+ SENTENCE_NUMBER((short)5, "sentence_number");
private static final Map<String, _Fields> byName = new HashMap<String, _Fields>();
@@ -77,6 +80,8 @@ public static _Fields findByThriftId(int fieldId) {
return START_OFFSET;
case 4: // END_OFFSET
return END_OFFSET;
+ case 5: // SENTENCE_NUMBER
+ return SENTENCE_NUMBER;
default:
return null;
}
@@ -119,6 +124,7 @@ public String getFieldName() {
// isset id assignments
private static final int __STARTOFFSET_ISSET_ID = 0;
private static final int __ENDOFFSET_ISSET_ID = 1;
+ private static final int __SENTENCE_NUMBER_ISSET_ID = 2;
private byte __isset_bitfield = 0;
public static final Map<_Fields, org.apache.thrift.meta_data.FieldMetaData> metaDataMap;
static {
@@ -131,6 +137,8 @@ public String getFieldName() {
new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
tmpMap.put(_Fields.END_OFFSET, new org.apache.thrift.meta_data.FieldMetaData("endOffset", org.apache.thrift.TFieldRequirementType.DEFAULT,
new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
+ tmpMap.put(_Fields.SENTENCE_NUMBER, new org.apache.thrift.meta_data.FieldMetaData("sentence_number", org.apache.thrift.TFieldRequirementType.DEFAULT,
+ new org.apache.thrift.meta_data.FieldValueMetaData(org.apache.thrift.protocol.TType.I32)));
metaDataMap = Collections.unmodifiableMap(tmpMap);
org.apache.thrift.meta_data.FieldMetaData.addStructMetaDataMap(NamedEntity.class, metaDataMap);
}
@@ -142,7 +150,8 @@ public NamedEntity(
String entity,
String tag,
int startOffset,
- int endOffset)
+ int endOffset,
+ int sentence_number)
{
this();
this.entity = entity;
@@ -151,6 +160,8 @@ public NamedEntity(
setStartOffsetIsSet(true);
this.endOffset = endOffset;
setEndOffsetIsSet(true);
+ this.sentence_number = sentence_number;
+ setSentence_numberIsSet(true);
}
/**
@@ -166,6 +177,7 @@ public NamedEntity(NamedEntity other) {
}
this.startOffset = other.startOffset;
this.endOffset = other.endOffset;
+ this.sentence_number = other.sentence_number;
}
public NamedEntity deepCopy() {
@@ -180,6 +192,8 @@ public void clear() {
this.startOffset = 0;
setEndOffsetIsSet(false);
this.endOffset = 0;
+ setSentence_numberIsSet(false);
+ this.sentence_number = 0;
}
public String getEntity() {
@@ -276,6 +290,29 @@ public void setEndOffsetIsSet(boolean value) {
__isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __ENDOFFSET_ISSET_ID, value);
}
+ public int getSentence_number() {
+ return this.sentence_number;
+ }
+
+ public NamedEntity setSentence_number(int sentence_number) {
+ this.sentence_number = sentence_number;
+ setSentence_numberIsSet(true);
+ return this;
+ }
+
+ public void unsetSentence_number() {
+ __isset_bitfield = EncodingUtils.clearBit(__isset_bitfield, __SENTENCE_NUMBER_ISSET_ID);
+ }
+
+ /** Returns true if field sentence_number is set (has been assigned a value) and false otherwise */
+ public boolean isSetSentence_number() {
+ return EncodingUtils.testBit(__isset_bitfield, __SENTENCE_NUMBER_ISSET_ID);
+ }
+
+ public void setSentence_numberIsSet(boolean value) {
+ __isset_bitfield = EncodingUtils.setBit(__isset_bitfield, __SENTENCE_NUMBER_ISSET_ID, value);
+ }
+
public void setFieldValue(_Fields field, Object value) {
switch (field) {
case ENTITY:
@@ -310,6 +347,14 @@ public void setFieldValue(_Fields field, Object value) {
}
break;
+ case SENTENCE_NUMBER:
+ if (value == null) {
+ unsetSentence_number();
+ } else {
+ setSentence_number((Integer)value);
+ }
+ break;
+
}
}
@@ -327,6 +372,9 @@ public Object getFieldValue(_Fields field) {
case END_OFFSET:
return Integer.valueOf(getEndOffset());
+ case SENTENCE_NUMBER:
+ return Integer.valueOf(getSentence_number());
+
}
throw new IllegalStateException();
}
@@ -346,6 +394,8 @@ public boolean isSet(_Fields field) {
return isSetStartOffset();
case END_OFFSET:
return isSetEndOffset();
+ case SENTENCE_NUMBER:
+ return isSetSentence_number();
}
throw new IllegalStateException();
}
@@ -399,6 +449,15 @@ public boolean equals(NamedEntity that) {
return false;
}
+ boolean this_present_sentence_number = true;
+ boolean that_present_sentence_number = true;
+ if (this_present_sentence_number || that_present_sentence_number) {
+ if (!(this_present_sentence_number && that_present_sentence_number))
+ return false;
+ if (this.sentence_number != that.sentence_number)
+ return false;
+ }
+
return true;
}
@@ -455,6 +514,16 @@ public int compareTo(NamedEntity other) {
return lastComparison;
}
}
+ lastComparison = Boolean.valueOf(isSetSentence_number()).compareTo(typedOther.isSetSentence_number());
+ if (lastComparison != 0) {
+ return lastComparison;
+ }
+ if (isSetSentence_number()) {
+ lastComparison = org.apache.thrift.TBaseHelper.compareTo(this.sentence_number, typedOther.sentence_number);
+ if (lastComparison != 0) {
+ return lastComparison;
+ }
+ }
return 0;
}
@@ -498,6 +567,10 @@ public String toString() {
sb.append("endOffset:");
sb.append(this.endOffset);
first = false;
+ if (!first) sb.append(", ");
+ sb.append("sentence_number:");
+ sb.append(this.sentence_number);
+ first = false;
sb.append(")");
return sb.toString();
}
@@ -575,6 +648,14 @@ public void read(org.apache.thrift.protocol.TProtocol iprot, NamedEntity struct)
org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
}
break;
+ case 5: // SENTENCE_NUMBER
+ if (schemeField.type == org.apache.thrift.protocol.TType.I32) {
+ struct.sentence_number = iprot.readI32();
+ struct.setSentence_numberIsSet(true);
+ } else {
+ org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
+ }
+ break;
default:
org.apache.thrift.protocol.TProtocolUtil.skip(iprot, schemeField.type);
}
@@ -606,6 +687,9 @@ public void write(org.apache.thrift.protocol.TProtocol oprot, NamedEntity struct
oprot.writeFieldBegin(END_OFFSET_FIELD_DESC);
oprot.writeI32(struct.endOffset);
oprot.writeFieldEnd();
+ oprot.writeFieldBegin(SENTENCE_NUMBER_FIELD_DESC);
+ oprot.writeI32(struct.sentence_number);
+ oprot.writeFieldEnd();
oprot.writeFieldStop();
oprot.writeStructEnd();
}
@@ -636,7 +720,10 @@ public void write(org.apache.thrift.protocol.TProtocol prot, NamedEntity struct)
if (struct.isSetEndOffset()) {
optionals.set(3);
}
- oprot.writeBitSet(optionals, 4);
+ if (struct.isSetSentence_number()) {
+ optionals.set(4);
+ }
+ oprot.writeBitSet(optionals, 5);
if (struct.isSetEntity()) {
oprot.writeString(struct.entity);
}
@@ -649,12 +736,15 @@ public void write(org.apache.thrift.protocol.TProtocol prot, NamedEntity struct)
if (struct.isSetEndOffset()) {
oprot.writeI32(struct.endOffset);
}
+ if (struct.isSetSentence_number()) {
+ oprot.writeI32(struct.sentence_number);
+ }
}
@Override
public void read(org.apache.thrift.protocol.TProtocol prot, NamedEntity struct) throws org.apache.thrift.TException {
TTupleProtocol iprot = (TTupleProtocol) prot;
- BitSet incoming = iprot.readBitSet(4);
+ BitSet incoming = iprot.readBitSet(5);
if (incoming.get(0)) {
struct.entity = iprot.readString();
struct.setEntityIsSet(true);
@@ -671,6 +761,10 @@ public void read(org.apache.thrift.protocol.TProtocol prot, NamedEntity struct)
struct.endOffset = iprot.readI32();
struct.setEndOffsetIsSet(true);
}
+ if (incoming.get(4)) {
+ struct.sentence_number = iprot.readI32();
+ struct.setSentence_numberIsSet(true);
+ }
}
}
View
15 gen-py/corenlp/ttypes.py
@@ -109,6 +109,7 @@ class NamedEntity(object):
- tag
- startOffset
- endOffset
+ - sentence_number
"""
__slots__ = [
@@ -116,6 +117,7 @@ class NamedEntity(object):
'tag',
'startOffset',
'endOffset',
+ 'sentence_number',
]
thrift_spec = (
@@ -124,13 +126,15 @@ class NamedEntity(object):
(2, TType.STRING, 'tag', None, None, ), # 2
(3, TType.I32, 'startOffset', None, None, ), # 3
(4, TType.I32, 'endOffset', None, None, ), # 4
+ (5, TType.I32, 'sentence_number', None, None, ), # 5
)
- def __init__(self, entity=None, tag=None, startOffset=None, endOffset=None,):
+ def __init__(self, entity=None, tag=None, startOffset=None, endOffset=None, sentence_number=None,):
self.entity = entity
self.tag = tag
self.startOffset = startOffset
self.endOffset = endOffset
+ self.sentence_number = sentence_number
def read(self, iprot):
if iprot.__class__ == TBinaryProtocol.TBinaryProtocolAccelerated and isinstance(iprot.trans, TTransport.CReadableTransport) and self.thrift_spec is not None and fastbinary is not None:
@@ -161,6 +165,11 @@ def read(self, iprot):
self.endOffset = iprot.readI32();
else:
iprot.skip(ftype)
+ elif fid == 5:
+ if ftype == TType.I32:
+ self.sentence_number = iprot.readI32();
+ else:
+ iprot.skip(ftype)
else:
iprot.skip(ftype)
iprot.readFieldEnd()
@@ -187,6 +196,10 @@ def write(self, oprot):
oprot.writeFieldBegin('endOffset', TType.I32, 4)
oprot.writeI32(self.endOffset)
oprot.writeFieldEnd()
+ if self.sentence_number is not None:
+ oprot.writeFieldBegin('sentence_number', TType.I32, 5)
+ oprot.writeI32(self.sentence_number)
+ oprot.writeFieldEnd()
oprot.writeFieldStop()
oprot.writeStructEnd()
View
44 scripts/coref_client.py
@@ -21,21 +21,23 @@
port = int(args[1])
-trees = ["(ROOT (S (NP (NNP Barack) (NNP Hussein) (NNP Obama) (NNP II)) (VP (VBZ is) (NP (NP (DT the) (JJ 44th) (CC and) (JJ current) (NN President)) (PP (IN of) (NP (DT the) (NNP United) (NNPS States)))) (, ,) (PP (IN in) (NP (NP (NN office)) (PP (IN since) (NP (CD 2009)))))) (. .)))",
- "(ROOT (S (NP (PRP He)) (VP (VBZ is) (NP (DT the) (JJ first) (NNP African) (NNP American)) (S (VP (TO to) (VP (VB hold) (NP (DT the) (NN office)))))) (. .)))",
- "(ROOT (S (S (VP (VBN Born) (PP (IN in) (NP (NNP Honolulu) (, ,) (NNP Hawaii))))) (, ,) (NP (NNP Obama)) (VP (VBZ is) (NP (NP (DT a) (NN graduate)) (PP (IN of) (NP (NP (NNP Columbia) (NNP University)) (CC and) (NP (NNP Harvard) (NNP Law) (NNP School))))) (, ,) (SBAR (WHADVP (WRB where)) (S (NP (PRP he)) (VP (VBD was) (NP (NP (NN president)) (PP (IN of) (NP (DT the) (NNP Harvard) (NNP Law) (NNP Review)))))))) (. .)))",
- "(ROOT (S (NP (PRP He)) (VP (VBD was) (NP (NP (DT a) (NN community) (NN organizer)) (PP (IN in) (NP (NNP Chicago)))) (PP (IN before) (S (VP (VBG earning) (NP (PRP$ his) (NN law) (NN degree)))))) (. .)))",
- "(ROOT (S (NP (PRP He)) (VP (VP (VBD worked) (PP (IN as) (NP (NP (DT a) (JJ civil) (NNS rights) (NN attorney)) (PP (IN in) (NP (NNP Chicago)))))) (CC and) (VP (VBD taught) (NP (JJ constitutional) (NN law)) (PP (IN at) (NP (NP (DT the) (NNP University)) (PP (IN of) (NP (NP (NNP Chicago) (NNP Law) (NNP School)) (PP (IN from) (NP (CD 1992))))))) (PP (TO to) (NP (CD 2004))))) (. .)))",
- "(ROOT (S (NP (PRP He)) (VP (VBD served) (NP (NP (CD three) (NNS terms)) (VP (VBG representing) (NP (NP (DT the) (NAC (JJ 13th) (NNP District) (PP (IN in) (NP (DT the) (NNP Illinois)))) (NNP Senate)) (PP (IN from) (NP (CD 1997) (TO to) (CD 2004)))))) (, ,) (S (VP (VBG running) (ADVP (RB unsuccessfully)) (PP (IN for) (NP (NP (DT the) (NNP United) (NNPS States) (NNP House)) (PP (IN of) (NP (NP (NNS Representatives)) (PP (IN in) (NP (CD 2000)))))))))) (. .)))"]
+#trees = ["(ROOT (S (NP (NNP Barack) (NNP Hussein) (NNP Obama) (NNP II)) (VP (VBZ is) (NP (NP (DT the) (JJ 44th) (CC and) (JJ current) (NN President)) (PP (IN of) (NP (DT the) (NNP United) (NNPS States)))) (, ,) (PP (IN in) (NP (NP (NN office)) (PP (IN since) (NP (CD 2009)))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VBZ is) (NP (DT the) (JJ first) (NNP African) (NNP American)) (S (VP (TO to) (VP (VB hold) (NP (DT the) (NN office)))))) (. .)))",
+# "(ROOT (S (S (VP (VBN Born) (PP (IN in) (NP (NNP Honolulu) (, ,) (NNP Hawaii))))) (, ,) (NP (NNP Obama)) (VP (VBZ is) (NP (NP (DT a) (NN graduate)) (PP (IN of) (NP (NP (NNP Columbia) (NNP University)) (CC and) (NP (NNP Harvard) (NNP Law) (NNP School))))) (, ,) (SBAR (WHADVP (WRB where)) (S (NP (PRP he)) (VP (VBD was) (NP (NP (NN president)) (PP (IN of) (NP (DT the) (NNP Harvard) (NNP Law) (NNP Review)))))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VBD was) (NP (NP (DT a) (NN community) (NN organizer)) (PP (IN in) (NP (NNP Chicago)))) (PP (IN before) (S (VP (VBG earning) (NP (PRP$ his) (NN law) (NN degree)))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VP (VBD worked) (PP (IN as) (NP (NP (DT a) (JJ civil) (NNS rights) (NN attorney)) (PP (IN in) (NP (NNP Chicago)))))) (CC and) (VP (VBD taught) (NP (JJ constitutional) (NN law)) (PP (IN at) (NP (NP (DT the) (NNP University)) (PP (IN of) (NP (NP (NNP Chicago) (NNP Law) (NNP School)) (PP (IN from) (NP (CD 1992))))))) (PP (TO to) (NP (CD 2004))))) (. .)))",
+# "(ROOT (S (NP (PRP He)) (VP (VBD served) (NP (NP (CD three) (NNS terms)) (VP (VBG representing) (NP (NP (DT the) (NAC (JJ 13th) (NNP District) (PP (IN in) (NP (DT the) (NNP Illinois)))) (NNP Senate)) (PP (IN from) (NP (CD 1997) (TO to) (CD 2004)))))) (, ,) (S (VP (VBG running) (ADVP (RB unsuccessfully)) (PP (IN for) (NP (NP (DT the) (NNP United) (NNPS States) (NNP House)) (PP (IN of) (NP (NP (NNS Representatives)) (PP (IN in) (NP (CD 2000)))))))))) (. .)))"]
-tokenized_sentences = ["Barack Hussein Obama II is the 44th and current President of the United States , in office since 2009 .",
- u"He is the first African American to hold the office .",
- u"Born in Honolulu , Hawaii , Obama is a graduate of Columbia University and Harvard Law School , where he was president of the Harvard Law Review .",
- u"He was a community organizer in Chicago before earning his law degree .",
- u"He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004 .",
- u"He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004 , running unsuccessfully for the United States House of Representatives in 2000 ."]
+trees = ["(ROOT (S (NP (DT This)) (VP (MD will) (VP (VB help) (S (NP (DT the) (NN firm)) (VP (VBP determine) (NP (NP (NP (NP (DT the) (NN size)) (PP (IN of) (NP (DT the) (NN job)))) (VP (VBN based) (PP (IN on) (NP (NP (JJ numerical) (NNS data)) (PP (IN for) (NP (NN example))))))) (, ,) (NP (NP (DT the) (NN size)) (PP (IN of) (NP (NP (DT the) (NN revenue) (NN number)) (, ,) (NP (NP (DT the) (JJ net) (NN loss)) (CC or) (NP (NP (JJ net) (NN income) (NN trend)) (PP (IN from) (NP (JJ previous) (NNS years))) (, ,) (NP (FW etc)))))))))))) (. .)))"]
-arbitrary_text = u"Barack Hussein Obama II is the 44th and current President of the United States, in office since 2009. He is the first African American to hold the office. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004. He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000."
+#tokenized_sentences = ["Barack Hussein Obama II is the 44th and current President of the United States , in office since 2009 .",
+# u"He is the first African American to hold the office .",
+# u"Born in Honolulu , Hawaii , Obama is a graduate of Columbia University and Harvard Law School , where he was president of the Harvard Law Review .",
+# u"He was a community organizer in Chicago before earning his law degree .",
+# u"He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004 .",
+# u"He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004 , running unsuccessfully for the United States House of Representatives in 2000 ."]
+
+#arbitrary_text = u"Barack Hussein Obama II is the 44th and current President of the United States, in office since 2009. He is the first African American to hold the office. Born in Honolulu, Hawaii, Obama is a graduate of Columbia University and Harvard Law School, where he was president of the Harvard Law Review. He was a community organizer in Chicago before earning his law degree. He worked as a civil rights attorney in Chicago and taught constitutional law at the University of Chicago Law School from 1992 to 2004. He served three terms representing the 13th District in the Illinois Senate from 1997 to 2004, running unsuccessfully for the United States House of Representatives in 2000."
transport = TSocket.TSocket(server, port)
@@ -49,14 +51,14 @@
result = client.resolve_coreferences_in_trees(trees)
for r in result:
print r
- print
- result = client.resolve_coreferences_in_tokenized_sentences(tokenized_sentences)
- for r in result:
- print r
- print
- result = client.resolve_coreferences_in_text(arbitrary_text)
- for r in result:
- print r
+# print
+# result = client.resolve_coreferences_in_tokenized_sentences(tokenized_sentences)
+# for r in result:
+# print r
+# print
+# result = client.resolve_coreferences_in_text(arbitrary_text)
+# for r in result:
+# print r
except Exception as e:
print e
View
10 scripts/parser_client.py
@@ -41,6 +41,8 @@
tagged_sentence = u"Members/NNS of/IN about/IN 37/CD species/NNS are/VBP referred/VBN to/TO as/IN foxes/NNS ,/, of/IN which/WDT only/RB 12/CD species/NNS actually/RB belong/VBP to/TO the/DT Vulpes/NNP genus/NN of/IN ``/`` true/JJ foxes/NNS ''/'' ./."
+test_tagged_sentence = u"Jane's/DT dog/NN will/MD come/VB too/RB ./."
+
weird_sentence = [u'While', u'the', u'child', u'spends', u'about', u'five', u'hours', u'or', u'less', u'with', u'his', u'parents', u',', u'and', u'whenever', u'that', u'child', u'wants', u'to', u'go', u'out', u'he', u'will', u'most', u'probably', u'go', u'out', u'with', u'his', u'friends', u'which', u'are', u'his', u'classmates', u',', u'so', u'most', u'of', u'his', u'school', u'life', u'will', u'be', u'spent', u'with', u'his', u'classmates', u',', u'and', u'this', u'will', u'have', u'a', u'great', u'affect', u'on', u'his', u'personality', u'which', u'will', u'determine', u'the', u'way', u'the', u'child', u'will', u'react', u'towards', u'his', u'school', u'and', u'will', u'determine', u'how', u'he', u'will', u'use', u'his', u'life', u'.']
ahs_test = "And be it further enacted, That the seat of government of said Territory is hereby located temporarily at Fort Leavenworth; and that such portions of the public buildings as may not be actually used and needed for military purposes, may be occupied and used, under the direction of the Governor and Legislative Assembly, for such public purposes as may be required under the provisions of this act."
@@ -70,6 +72,7 @@
outputOptions = ["-outputFormat", "oneline"]
#outputOptions = ["-outputFormat", "typedDependencies"]
+'''
try:
parse_trees = client.parse_text(ahs_test, outputOptions)
for result in parse_trees:
@@ -89,7 +92,8 @@
print e
print
-
+'''
+'''
for sentence in tokenized_sentences:
try:
tree = client.parse_tokens(sentence, outputOptions)
@@ -98,7 +102,7 @@
print e
print
-
+'''
'''
try:
tree = client.parse_tokens(weird_sentence, outputOptions)
@@ -109,7 +113,7 @@
print
-tree = client.parse_tagged_sentence(tagged_sentence, outputOptions, "/")
+tree = client.parse_tagged_sentence(test_tagged_sentence, outputOptions, "/")
sys.stdout.write("\n" + tree.tree.strip() + "\n")
View
4 src/ner/StanfordNERThrift.java
@@ -93,6 +93,7 @@ public Annotation annotateForNamedEntities(Annotation annotation)
List<NamedEntity> entities = new ArrayList<NamedEntity>();
Stack<CoreLabel> namedEntityStack = new Stack<CoreLabel>();
+ int sentenceNum = 0;
for (CoreMap sentence : results)
{
List<CoreLabel> tokens = sentence.get(CoreAnnotations.TokensAnnotation.class);
@@ -121,11 +122,12 @@ public Annotation annotateForNamedEntities(Annotation annotation)
}
if (!tag.equals("O"))
{
- entities.add(new NamedEntity(entity.trim(), tag, startIndex, endIndex));
+ entities.add(new NamedEntity(entity.trim(), tag, startIndex, endIndex, sentenceNum));
}
namedEntityStack.push(wi);
}
}
+ sentenceNum++;
}
return entities;
View
BIN stanford-corenlp-wrapper.jar
Binary file not shown.

0 comments on commit d7f63b5

Please sign in to comment.
Something went wrong with that request. Please try again.