Fix regression in tokenizer recognition of SGML character entities for dashes, etc.

manning · manning · commit 2032a503c11c · 2022-07-18T22:01:44.000-07:00
- Fix regression so &MD; and &mdash; will be converted to -- in ascii dashStyle
 - Add unit test for that
 - Turn off DEBUG in PTBLexer (mistake in last commit)
 - Correct several paths to remove "projects/core/"
diff --git a/itest/src/edu/stanford/nlp/process/PTBTokenizerITest.java b/itest/src/edu/stanford/nlp/process/PTBTokenizerITest.java
@@ -47,7 +47,7 @@ private static BufferedReader getReaderFromInJavaNlp(String filename)
       reader = new BufferedReader(new InputStreamReader(PTBTokenizerITest.class.getResourceAsStream(filename), charset));
     } catch (NullPointerException npe) {
       Map<String,String> env = System.getenv();
-      String path = "projects/core/data/edu/stanford/nlp/process" + File.separator + filename;
+      String path = "data/edu/stanford/nlp/process" + File.separator + filename;
       String loc = env.get("JAVANLP_HOME");
       if (loc != null) {
         path = loc + File.separator + path;
diff --git a/src/edu/stanford/nlp/classify/mood.prop b/src/edu/stanford/nlp/classify/mood.prop
@@ -2,7 +2,7 @@ useClassFeature=true
 1.useSplitWords=true
 1.splitWordsWithPTBTokenizer=true
 
-loadClassifier=projects/core/src/edu/stanford/nlp/classify/mood.classifier
+loadClassifier=src/edu/stanford/nlp/classify/mood.classifier
 
 printClassifier=HighWeight
 printClassifierParam=20
@@ -16,8 +16,8 @@ prior=no
 
 # Training input
 
-trainFile=projects/core/src/edu/stanford/nlp/classify/mood.train
-testFile=projects/core/src/edu/stanford/nlp/classify/mood.test
+trainFile=src/edu/stanford/nlp/classify/mood.train
+testFile=src/edu/stanford/nlp/classify/mood.test
 
 # for the pipeline
 annotators=cdc
diff --git a/src/edu/stanford/nlp/examples/TokensRegexExample.java b/src/edu/stanford/nlp/examples/TokensRegexExample.java
@@ -1,5 +1,6 @@
 package edu.stanford.nlp.examples;
 
+import edu.stanford.nlp.io.RuntimeIOException;
 import edu.stanford.nlp.ling.*;
 import edu.stanford.nlp.pipeline.*;
 
@@ -12,12 +13,9 @@ public static void main(String[] args) {
     Properties props = new Properties();
     props.setProperty("annotators", "tokenize,ssplit,pos,lemma,ner,tokensregex");
     // The .../ling/tokensregex/demo directory has a larger example
-    File rules = new File("projects/core/src/edu/stanford/nlp/ling/tokensregex/demo/rules/colors.rules.txt");
-    if (!rules.isFile()) {
-      rules = new File("src/edu/stanford/nlp/ling/tokensregex/demo/rules/colors.rules.txt");
-    }
-    if (!rules.isFile()) {
-      throw new RuntimeException("Unable to find colors.rules.txt");
+    File rules = new File("src/edu/stanford/nlp/ling/tokensregex/demo/rules/colors.rules.txt");
+    if ( ! rules.isFile()) {
+      throw new RuntimeIOException("Unable to find colors.rules.txt");
     }
     props.setProperty("tokensregex.rules", rules.getPath());
     StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
@@ -27,4 +25,5 @@ public static void main(String[] args) {
       System.out.println(token.word() + " " + token.ner());
     }
   }
+
 }
diff --git a/src/edu/stanford/nlp/patterns/GetPatternsFromDataMultiClass.java b/src/edu/stanford/nlp/patterns/GetPatternsFromDataMultiClass.java
@@ -74,7 +74,7 @@
  * For other flags, see individual comments for each flag.
  *
  * To use a properties file, see
- * projects/core/data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depends on which codebase you are using)
+ * data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depends on which codebase you are using)
  * as an example for the flags and their brief descriptions. Run the code as:
  * {@code java -mx1000m -cp classpath edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass -props dir-as-above/example.properties}
  *
diff --git a/src/edu/stanford/nlp/pipeline/ColumnDataClassifierAnnotator.java b/src/edu/stanford/nlp/pipeline/ColumnDataClassifierAnnotator.java
@@ -79,7 +79,7 @@ public Set<Class<? extends CoreAnnotation>> requires() {
   //test - run from your top javanlp directory to get the files etc.
   public static void main(String[] args) {
 
-    Properties props = StringUtils.propFileToProperties("projects/core/src/edu/stanford/nlp/classify/mood.prop");
+    Properties props = StringUtils.propFileToProperties("src/edu/stanford/nlp/classify/mood.prop");
     StanfordCoreNLP pipeline = new StanfordCoreNLP(props);
 
     Annotation happyAnnotation = new Annotation("I am so glad this is awesome");
diff --git a/src/edu/stanford/nlp/process/LexerUtils.java b/src/edu/stanford/nlp/process/LexerUtils.java
@@ -271,7 +271,7 @@ public static String handleDashes(final String tok, DashesEnum dashesStyle) {
         // hyphen-dash, underscore, Armenian hyphen, hyphen, non-break hyphen, figure dash
         String mid = tok.replaceAll("[-_\u058A\u2010\u2011\u2012]","-");
         // cp1252 en dash, cp1252 em dash, en dash, em dash, horizontal bar
-        mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]", "--");
+        mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]|&MD;|&[mn]dash;", "--");
         if ("---".equals(mid)) {
           mid = "--";
         }
diff --git a/src/edu/stanford/nlp/process/PTBLexer.flex b/src/edu/stanford/nlp/process/PTBLexer.flex
@@ -266,7 +266,7 @@ import edu.stanford.nlp.util.logging.Redwood;
 
 
       /** Turn on to find out how things were tokenized. */
-      private static final boolean DEBUG = true;
+      private static final boolean DEBUG = false;
 
       /** A logger for this class */
       private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);
diff --git a/src/edu/stanford/nlp/process/PTBLexer.java b/src/edu/stanford/nlp/process/PTBLexer.java
@@ -61081,7 +61081,7 @@ public PTBLexer(Reader r, LexedTokenFactory<?> tf, String options) {
 
 
       /** Turn on to find out how things were tokenized. */
-      private static final boolean DEBUG = true;
+      private static final boolean DEBUG = false;
 
       /** A logger for this class */
       private static final Redwood.RedwoodChannels logger = Redwood.channels(PTBLexer.class);

Original file line number	Diff line number	Diff line change
`@@ -74,7 +74,7 @@`
`74`	`74`	`* For other flags, see individual comments for each flag.`
`75`	`75`	`*`
`76`	`76`	`* To use a properties file, see`
`77`		`- * projects/core/data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depends on which codebase you are using)`
	`77`	`+ * data/edu/stanford/nlp/patterns/surface/example.properties or patterns/example.properties (depends on which codebase you are using)`
`78`	`78`	`* as an example for the flags and their brief descriptions. Run the code as:`
`79`	`79`	`* {@code java -mx1000m -cp classpath edu.stanford.nlp.patterns.GetPatternsFromDataMultiClass -props dir-as-above/example.properties}`
`80`	`80`	`*`
Original file line number	Diff line number	Diff line change
`@@ -271,7 +271,7 @@ public static String handleDashes(final String tok, DashesEnum dashesStyle) {`
`271`	`271`	`// hyphen-dash, underscore, Armenian hyphen, hyphen, non-break hyphen, figure dash`
`272`	`272`	`String mid = tok.replaceAll("[-_\u058A\u2010\u2011\u2012]","-");`
`273`	`273`	`// cp1252 en dash, cp1252 em dash, en dash, em dash, horizontal bar`
`274`		`- mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]", "--");`
	`274`	`+ mid = mid.replaceAll("[\u0096\u0097\u2013\u2014\u2015]|&MD;|&[mn]dash;", "--");`
`275`	`275`	`if ("---".equals(mid)) {`
`276`	`276`	`mid = "--";`
`277`	`277`	`}`