Remove editing bug (stray "k" at end of the truecase comment in StanfordCoreNLP.properties)

manning · Stanford NLP · commit 67328c0ab744 · 2016-10-09T12:28:38.000-07:00
diff --git a/itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java b/itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java
@@ -12,7 +12,6 @@
 import edu.stanford.nlp.util.Timing;
 
 public class TestThreadedCRFClassifier {
-
   TestThreadedCRFClassifier(Properties props) {
     inputEncoding = props.getProperty("inputEncoding", "UTF-8");
   }
@@ -24,8 +23,8 @@ public class TestThreadedCRFClassifier {
 
   private final String inputEncoding;
 
-  static CRFClassifier loadClassifier(String loadPath, Properties props) {
-    CRFClassifier crf = new CRFClassifier(props);
+  CRFClassifier loadClassifier(String loadPath, Properties props) {
+    CRFClassifier crf = new CRFClassifier(props);    
     crf.loadClassifierNoExceptions(loadPath, props);
     return crf;
   }
@@ -59,9 +58,9 @@ public void run() {
       Timing t = new Timing();
       resultsString = runClassifier(crf, filename);
       long millis = t.stop();
-      System.out.println("Thread " + threadName + " took " + millis +
+      System.out.println("Thread " + threadName + " took " + millis + 
                          "ms to tag file " + filename);
-    }
+    }        
   }
 
   /**
@@ -72,7 +71,7 @@ public void run() {
    * -crf2 ../stanford-releases/stanford-ner-models/dewac_175m_600.ser.gz
    * -testFile ../data/german-ner/deu.testa -inputEncoding iso-8859-1
    */
-  public static void main(String[] args) {
+  static public void main(String[] args) {
     try {
       System.setOut(new PrintStream(System.out, true, "UTF-8"));
       System.setErr(new PrintStream(System.err, true, "UTF-8"));
@@ -82,10 +81,10 @@ public static void main(String[] args) {
 
     runTest(StringUtils.argsToProperties(args));
   }
-
+  
   static public void runTest(Properties props) {
     TestThreadedCRFClassifier test = new TestThreadedCRFClassifier(props);
-    test.runThreadedTest(props);
+    test.runThreadedTest(props);    
   }
 
 
@@ -96,7 +95,7 @@ void runThreadedTest(Properties props) {
     ArrayList<String> modelNames = new ArrayList<String>();
     ArrayList<CRFClassifier> classifiers = new ArrayList<CRFClassifier>();
 
-    for (int i = 1;
+    for (int i = 1; 
          props.getProperty("crf" + Integer.toString(i)) != null; ++i) {
       String model = props.getProperty("crf" + Integer.toString(i));
       CRFClassifier crf = loadClassifier(model, props);
@@ -108,7 +107,7 @@ void runThreadedTest(Properties props) {
       // must run twice to account for "transductive learning"
       results = runClassifier(crf, testFile);
       baseResults.add(results);
-      System.out.println("Stored base results for " + model +
+      System.out.println("Stored base results for " + model + 
                          "; length " + results.length());
     }
 
@@ -122,13 +121,13 @@ void runThreadedTest(Properties props) {
       String repeated = runClassifier(crf, testFile);
       if (!base.equals(repeated)) {
         throw new RuntimeException("Repeated unthreaded results " +
-                                   "not the same for " + model +
+                                   "not the same for " + model + 
                                    " run on file " + testFile);
       }
     }
 
     // test the first classifier in several simultaneous threads
-    int numThreads = PropertiesUtils.getInt(props, "simThreads",
+    int numThreads = PropertiesUtils.getInt(props, "simThreads", 
                                             DEFAULT_SIM_THREADS);
 
     ArrayList<CRFThread> threads = new ArrayList<CRFThread>();
@@ -149,11 +148,11 @@ void runThreadedTest(Properties props) {
         System.out.println("Yay!");
       } else {
         throw new RuntimeException("Results not equal when running " +
-                                   modelNames.get(0) + " under " +
+                                   modelNames.get(0) + " under " + 
                                    numThreads + " simultaneous threads");
       }
     }
-
+    
     // test multiple classifiers (if given) in multiple threads each
     if (classifiers.size() > 1) {
       numThreads = PropertiesUtils.getInt(props, "multipleThreads",
@@ -163,11 +162,11 @@ void runThreadedTest(Properties props) {
         int classifierNum = i % classifiers.size();
         int repeatNum = i / classifiers.size();
         threads.add(new CRFThread(classifiers.get(classifierNum), testFile,
-                                  ("Simultaneous-" + classifierNum +
+                                  ("Simultaneous-" + classifierNum + 
                                    "-" + repeatNum)));
       }
-      for (CRFThread thread : threads) {
-        thread.start();
+      for (int i = 0; i < threads.size(); ++i) {
+        threads.get(i).start();
       }
       for (int i = 0; i < threads.size(); ++i) {
         int classifierNum = i % classifiers.size();
@@ -183,17 +182,16 @@ void runThreadedTest(Properties props) {
           System.out.println("Yay!");
         } else {
           throw new RuntimeException("Results not equal when running " +
-                                     modelNames.get(classifierNum) +
-                                     " under " + numThreads +
+                                     modelNames.get(classifierNum) + 
+                                     " under " + numThreads + 
                                      " threads with " +
-                                     classifiers.size() +
+                                     classifiers.size() + 
                                      " total classifiers");
         }
-      }
+      }      
     }
 
     // if no exceptions thrown, great success
     System.out.println("Everything worked!");
   }
-
 }
diff --git a/itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java b/itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java
@@ -4,33 +4,32 @@
 
 import java.util.Properties;
 
-/**
+/** 
  * Test that the CRFClassifier works when multiple classifiers are run
  * in multiple threads.
  *
  *  @author John Bauer
  */
 public class ThreadedCRFClassifierITest extends TestCase {
-
   Properties props;
 
-  private static final String german1 =
-    "edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
+  private String german1 = 
+    "/u/nlp/data/ner/goodClassifiers/german.hgc_175m_600.crf.ser.gz";
   /** -- We're no longer supporting this one
-  private String german2 =
+  private String german2 = 
     "/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
   */
-  private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
+  private String germanTestFile = "/u/nlp/data/german/ner/2016/deu.testa";
 
-  private static final String english1 =
+  private String english1 = 
     "/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
-  private static final String english2 =
+  private String english2 = 
     "/u/nlp/data/ner/goodClassifiers/english.conll.4class.distsim.crf.ser.gz";
-  private static final String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
-
-  private static final String germanEncoding = "utf-8";
-  private static final String englishEncoding = "utf-8";
+  private String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
 
+  private String germanEncoding = "iso-8859-1";
+  private String englishEncoding = "utf-8";
+  
   @Override
   public void setUp() {
     props = new Properties();
@@ -57,6 +56,5 @@ public void testTwoEnglishCRFs() {
     props.setProperty("inputEncoding", englishEncoding);
     TestThreadedCRFClassifier.runTest(props);
   }
-
 }
 
diff --git a/scripts/ner/spanish.ancora.distsim.s512.prop b/scripts/ner/spanish.ancora.distsim.s512.prop
@@ -1,6 +1,6 @@
 
-trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
-testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
+trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
+testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
 serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
 
 distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls
diff --git a/scripts/ner/spanish.ancora.prop b/scripts/ner/spanish.ancora.prop
@@ -1,6 +1,6 @@
 
-trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
-testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
+trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
+testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
 serializeTo = spanish.ancora.crf.ser.gz
 
 useDistSim = false
diff --git a/scripts/ner/spanish.ancora2.prop b/scripts/ner/spanish.ancora2.prop
@@ -1,6 +1,6 @@
 
-trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
-testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
+trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
+testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
 serializeTo = spanish.ancora2.crf.ser.gz
 
 useDistSim = false
diff --git a/scripts/pos-tagger/Makefile b/scripts/pos-tagger/Makefile
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
 
 GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
 
-SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
+SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
 
 .SECONDEXPANSION:
 
 all: arabic chinese english french german spanish testing wsj
 .PHONY: all arabic chinese english french german spanish testing wsj
 
-arabic: arabic.tagger  arabic-train.tagger
+arabic: arabic.tagger  arabic-train.tagger 
 
 # we release an arabic model trained on everything, with a
 # corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
 
-chinese: chinese-distsim.tagger chinese-nodistsim.tagger
+chinese: chinese-distsim.tagger chinese-nodistsim.tagger 
 
 chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(CHINESE_TEST)
+	@echo Will test on $(CHINESE_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST)  -verboseResults false >> $@.out 2>&1
 
 english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
 
 english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(ENGLISH_TEST)
+	@echo Will test on $(ENGLISH_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST)  -verboseResults false >> $@.out 2>&1
 
 french: french.tagger
 
 french.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(FRENCH_TEST)
+	@echo Will test on $(FRENCH_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST)  -verboseResults false >> $@.out 2>&1
 
 german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
 
 german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(GERMAN_TEST)
+	@echo Will test on $(GERMAN_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST)  -verboseResults false >> $@.out 2>&1
 
@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
 spanish.tagger spanish-distsim.tagger: $$@.props
 	@echo Training $@
 	@echo Will test on $(SPANISH_TEST)
-	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
-#	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
+#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
+	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
 
 testing: testing.tagger
 
 testing.tagger:
 	@echo Training $@
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 
-wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
+wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger 
 
 wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
 	@echo Training $@
diff --git a/scripts/pos-tagger/spanish.tagger.props b/scripts/pos-tagger/spanish.tagger.props
@@ -1,8 +1,8 @@
-## tagger training invoked at Sat Oct 08 12:21:50 PDT 2016 with arguments:
+## tagger training invoked at Wed Jul 30 08:33:18 PDT 2014 with arguments:
                    model = spanish.tagger
                     arch = left3words,naacl2003unknowns,allwordshapes(-1,1)
             wordFunction = 
-               trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-DF.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-NW.train
+               trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train
          closedClassTags = 
  closedClassTagThreshold = 40
  curWordMinFeatureThresh = 2
diff --git a/scripts/srparser/Makefile b/scripts/srparser/Makefile
@@ -14,9 +14,9 @@ ENGLISH_TAGGER = /u/nlp/data/pos-tagger/distrib/english-left3words-distsim.tagge
 ENGLISH_TLPP   = $(WSJ_TLPP)
 
 
-FRENCH_TRAIN  = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt
-FRENCH_DEV    = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt
-FRENCH_TEST   = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
+FRENCH_TRAIN  = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt 
+FRENCH_DEV    = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt 
+FRENCH_TEST   = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt 
 FRENCH_TAGGER = /u/nlp/data/pos-tagger/distrib-2014-06-09/french.tagger
 FRENCH_TLPP   = edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams
 
@@ -41,16 +41,10 @@ ARABIC_TEST   = /u/nlp/data/lexparser/trees/Arabic/2-Unvoc-Test.utf8.txt
 ARABIC_TAGGER = /u/nlp/data/pos-tagger/distrib/arabic-train.tagger
 ARABIC_TLPP   = edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams
 
+
 SPANISH_TRAIN = /u/nlp/data/spanish/ancora/ancora.train
-SPANISH_TRAIN2 = /u/nlp/data/spanish/ldc/ldc-NW.train
-SPANISH_TRAIN3 = /u/nlp/data/spanish/ldc/ldc-DF.train
 SPANISH_DEV   = /u/nlp/data/spanish/ancora/ancora.dev
-SPANISH_DEV2 = /u/nlp/data/spanish/ldc/ldc-NW.dev
-SPANISH_DEV3 = /u/nlp/data/spanish/ldc/ldc-DF.dev
-SPANISH_DEV_TMP = /u/nlp/data/spanish/all.dev.tmp
 SPANISH_TEST  = /u/nlp/data/spanish/ancora/ancora.test
-SPANISH_TEST2 = /u/nlp/data/spanish/ldc/ldc-NW.train
-SPANISH_TEST3 = /u/nlp/data/spanish/ldc/ldc-DF.train
 SPANISH_TAGGER= /u/nlp/data/pos-tagger/distrib/spanish-distsim.tagger
 SPANISH_TLPP  = edu.stanford.nlp.parser.lexparser.SpanishTreebankParserParams
 
@@ -118,14 +112,9 @@ arabicSR.ser.gz:
 
 spanishSR.ser.gz:
 	@echo Training $@
-	@echo Creating unified Spanish development data file $(SPANISH_DEV_TMP)
-	cat $(SPANISH_DEV) $(SPANISH_DEV2) $(SPANISH_DEV3) > $(SPANISH_DEV_TMP)
 	@echo Will test on $(SPANISH_TEST)
-	java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -trainTreebank $(SPANISH_TRAIN2) -trainTreebank $(SPANISH_TRAIN3) -devTreebank $(SPANISH_DEV_TMP) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
-	rm $(SPANISH_DEV_TMP)
+	java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -devTreebank $(SPANISH_DEV) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
 	java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
-	java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST2) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
-	java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST3) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
 
 spanishSR.beam.ser.gz:
 	@echo Training $@
diff --git a/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.properties b/src/edu/stanford/nlp/pipeline/StanfordCoreNLP.properties
@@ -4,7 +4,7 @@ annotators = tokenize, ssplit, pos, lemma, ner, parse, mention, coref
 # annotators = tokenize, ssplit, pos, lemma, truecase
 # annotators = tokenize, ssplit, regexner
 # These include:
-# - truecase: A true-casing annotator (for fixing lowercase or all caps text)k
+# - truecase: A true-casing annotator (for fixing lowercase or all caps text)
 # - regexner: Simple rule or regular-expression based NER (via TokensRegex)
 # - cleanxml: Removes XML from documents prior to processing
 # - entitymentions:
diff --git a/src/edu/stanford/nlp/simple/Document.java b/src/edu/stanford/nlp/simple/Document.java
diff --git a/src/edu/stanford/nlp/util/TSVUtils.java b/src/edu/stanford/nlp/util/TSVUtils.java