Skip to content

Commit 2032a50

Browse files
committed
Fix regression in tokenizer recognition of SGML character entities for dashes, etc.
- Fix regression so that the SGML entities `&MD;` and `&mdash;` are converted to `--` in the ASCII dashes style
- Add a unit test for that
- Turn off DEBUG in PTBLexer (mistake in last commit)
- Correct several paths to remove "projects/core/"
1 parent 9476a8e commit 2032a50

File tree

8 files changed

+14
-15
lines changed

8 files changed

+14
-15
lines changed

itest/src/edu/stanford/nlp/process/PTBTokenizerITest.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ private static BufferedReader getReaderFromInJavaNlp(String filename)
4747
reader = new BufferedReader(new InputStreamReader(PTBTokenizerITest.class.getResourceAsStream(filename), charset));
4848
} catch (NullPointerException npe) {
4949
Map<String,String> env = System.getenv();
50-
String path = "projects/core/data/edu/stanford/nlp/process" + File.separator + filename;
50+
String path = "data/edu/stanford/nlp/process" + File.separator + filename;
5151
String loc = env.get("JAVANLP_HOME");
5252
if (loc != null) {
5353
path = loc + File.separator + path;

src/edu/stanford/nlp/classify/mood.prop

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ useClassFeature=true
22
1.useSplitWords=true
33
1.splitWordsWithPTBTokenizer=true
44

5-
loadClassifier=projects/core/src/edu/stanford/nlp/classify/mood.classifier
5+
loadClassifier=src/edu/stanford/nlp/classify/mood.classifier
66

77
printClassifier=HighWeight
88
printClassifierParam=20
@@ -16,8 +16,8 @@ prior=no
1616

1717
# Training input
1818

19-
trainFile=projects/core/src/edu/stanford/nlp/classify/mood.train
20-
testFile=projects/core/src/edu/stanford/nlp/classify/mood.test
19+
trainFile=src/edu/stanford/nlp/classify/mood.train
20+
testFile=src/edu/stanford/nlp/classify/mood.test
2121

2222
# for the pipeline
2323
annotators=cdc

src/edu/stanford/nlp/examples/TokensRegexExample.java

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
package edu.stanford.nlp.examples;
22

3+
import edu.stanford.nlp.io.RuntimeIOException;
34
import edu.stanford.nlp.ling.*;
45
import edu.stanford.nlp.pipeline.*;
56

@@ -12,12 +13,9 @@ public static void main(String[] args) {
1213
Properties props = new Properties();
1314
props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,tokensregex");
1415
// The .../ling/tokensregex/demo directory has a larger example
15-
File rules = new File("projects/core/src/edu/stanford/nlp/ling/tokensregex/demo/rules/colors.rules.txt");
16-
if (!rules.isFile()) {
17-
rules = new File("src/edu/stanford/nlp/ling/tokensregex/demo/rules/colors.rules.txt");
18-
}
19-
if (!rules.isFile()) {
20-
throw new RuntimeException("Unable to find colors.rules.txt");
16+
File rules = new File("src/edu/stanford/nlp/ling/tokensregex/demo/rules/colors.rules.txt");
17+
if ( ! rules.isFile()) {
18+
throw new RuntimeIOException("Unable to find colors.rules.txt");
2119
}
2220
props.setProperty("tokensregex.rules", rules.getPath());
2321
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
@@ -27,4 +25,5 @@ public static void main(String[] args) {
2725
System.out.println(token.word() + " " + token.ner());
2826
}
2927
}
28+
3029
}

src/edu/stanford/nlp/patterns/GetPatternsFromDataMultiClass.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,7 +74,7 @@
7474
* For other flags, see individual comments for each flag.
7575
*
7676
* To use a properties file, see
77-
* projects/core/data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depends on which codebase you are using)
77+
* data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depends on which codebase you are using)
7878
* as an example for the flags and their brief descriptions. Run the code as:
7979
* {@code java -mx1000m -cp classpath edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass -props dir-as-above/example.properties}
8080
*

src/edu/stanford/nlp/pipeline/ColumnDataClassifierAnnotator.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ public Set<Class<? extends CoreAnnotation>> requires() {
7979
//test - run from your top javanlp directory to get the files etc.
8080
public static void main(String[] args) {
8181

82-
Properties props = StringUtils.propFileToProperties("projects/core/src/edu/stanford/nlp/classify/mood.prop");
82+
Properties props = StringUtils.propFileToProperties("src/edu/stanford/nlp/classify/mood.prop");
8383
StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
8484

8585
Annotation happyAnnotation = new Annotation("I am so glad this is awesome");

src/edu/stanford/nlp/process/LexerUtils.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ public static String handleDashes(final String tok, DashesEnum dashesStyle) {
271271
// hyphen-dash, underscore, Armenian hyphen, hyphen, non-break hyphen, figure dash
272272
String mid = tok.replaceAll("[-_\u058A\u2010\u2011\u2012]","-");
273273
// cp1252 en dash, cp1252 em dash, en dash, em dash, horizontal bar
274-
mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]", "--");
274+
mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]|&MD;|&[mn]dash;", "--");
275275
if ("---".equals(mid)) {
276276
mid = "--";
277277
}

src/edu/stanford/nlp/process/PTBLexer.flex

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ import edu.stanford.nlp.util.logging.Redwood;
266266

267267

268268
/** Turn on to find out how things were tokenized. */
269-
private static final boolean DEBUG = true;
269+
private static final boolean DEBUG = false;
270270

271271
/** A logger for this class */
272272
private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);

src/edu/stanford/nlp/process/PTBLexer.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61081,7 +61081,7 @@ public PTBLexer(Reader r, LexedTokenFactory<?> tf, String options) {
6108161081

6108261082

6108361083
/** Turn on to find out how things were tokenized. */
61084-
private static final boolean DEBUG = true;
61084+
private static final boolean DEBUG = false;
6108561085

6108661086
/** A logger for this class */
6108761087
private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);

0 commit comments

Comments
 (0)