stanfordnlp
diff --git a/doc/tagger/README-Models.txt b/doc/tagger/README-Models.txt
Lines changed: 1 addition & 5 deletions
diff --git a/itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java b/itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java
Lines changed: 22 additions & 20 deletions
diff --git a/itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java b/itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java
Lines changed: 21 additions & 17 deletions
diff --git a/itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java b/itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java
Lines changed: 19 additions & 7 deletions
diff --git a/scripts/ner/spanish.ancora.distsim.s512.prop b/scripts/ner/spanish.ancora.distsim.s512.prop
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/ner/spanish.ancora.prop b/scripts/ner/spanish.ancora.prop
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/ner/spanish.ancora2.prop b/scripts/ner/spanish.ancora2.prop
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/pos-tagger/Makefile b/scripts/pos-tagger/Makefile
Lines changed: 10 additions & 10 deletions
@@ -105,15 +105,11 @@ University of Stuttgart and the Seminar für Sprachwissenschaft of the
 University of Tübingen. See: 
 http://www.ims.uni-stuttgart.de/projekte/CQPDemos/Bundestag/help-tagset.html
 This model uses features from the distributional similarity clusters
-built over the HGC.
+built over the HGC (Huge German Corpus).
 Performance:
 96.90% on the first half of the remaining 20% of the Negra corpus (dev set)
 (90.33% on unknown words)
 
-german-dewac.tagger
-This model uses features from the distributional similarity clusters
-built from the deWac web corpus.
-
 german-fast.tagger
 Lacks distributional similarity features, but is several times faster
 than the other alternatives.
 
@@ -12,6 +12,7 @@
 import edu.stanford.nlp.util.Timing;
 
 public class TestThreadedCRFClassifier {
+
   TestThreadedCRFClassifier(Properties props) {
     inputEncoding = props.getProperty("inputEncoding", "UTF-8");
   }
@@ -23,8 +24,8 @@ public class TestThreadedCRFClassifier {
 
   private final String inputEncoding;
 
-  CRFClassifier loadClassifier(String loadPath, Properties props) {
-    CRFClassifier crf = new CRFClassifier(props);    
+  static CRFClassifier loadClassifier(String loadPath, Properties props) {
+    CRFClassifier crf = new CRFClassifier(props);
     crf.loadClassifierNoExceptions(loadPath, props);
     return crf;
   }
@@ -58,9 +59,9 @@ public void run() {
       Timing t = new Timing();
       resultsString = runClassifier(crf, filename);
       long millis = t.stop();
-      System.out.println("Thread " + threadName + " took " + millis + 
+      System.out.println("Thread " + threadName + " took " + millis +
                          "ms to tag file " + filename);
-    }        
+    }
   }
 
   /**
@@ -71,7 +72,7 @@ public void run() {
    * -crf2 ../stanford-releases/stanford-ner-models/dewac_175m_600.ser.gz
    * -testFile ../data/german-ner/deu.testa -inputEncoding iso-8859-1
    */
-  static public void main(String[] args) {
+  public static void main(String[] args) {
     try {
       System.setOut(new PrintStream(System.out, true, "UTF-8"));
       System.setErr(new PrintStream(System.err, true, "UTF-8"));
@@ -81,10 +82,10 @@ static public void main(String[] args) {
 
     runTest(StringUtils.argsToProperties(args));
   }
-  
+
   static public void runTest(Properties props) {
     TestThreadedCRFClassifier test = new TestThreadedCRFClassifier(props);
-    test.runThreadedTest(props);    
+    test.runThreadedTest(props);
   }
 
 
@@ -95,7 +96,7 @@ void runThreadedTest(Properties props) {
     ArrayList<String> modelNames = new ArrayList<String>();
     ArrayList<CRFClassifier> classifiers = new ArrayList<CRFClassifier>();
 
-    for (int i = 1; 
+    for (int i = 1;
          props.getProperty("crf" + Integer.toString(i)) != null; ++i) {
       String model = props.getProperty("crf" + Integer.toString(i));
       CRFClassifier crf = loadClassifier(model, props);
@@ -107,7 +108,7 @@ void runThreadedTest(Properties props) {
       // must run twice to account for "transductive learning"
       results = runClassifier(crf, testFile);
       baseResults.add(results);
-      System.out.println("Stored base results for " + model + 
+      System.out.println("Stored base results for " + model +
                          "; length " + results.length());
     }
 
@@ -121,13 +122,13 @@ void runThreadedTest(Properties props) {
       String repeated = runClassifier(crf, testFile);
       if (!base.equals(repeated)) {
         throw new RuntimeException("Repeated unthreaded results " +
-                                   "not the same for " + model + 
+                                   "not the same for " + model +
                                    " run on file " + testFile);
       }
     }
 
     // test the first classifier in several simultaneous threads
-    int numThreads = PropertiesUtils.getInt(props, "simThreads", 
+    int numThreads = PropertiesUtils.getInt(props, "simThreads",
                                             DEFAULT_SIM_THREADS);
 
     ArrayList<CRFThread> threads = new ArrayList<CRFThread>();
@@ -148,11 +149,11 @@ void runThreadedTest(Properties props) {
         System.out.println("Yay!");
       } else {
         throw new RuntimeException("Results not equal when running " +
-                                   modelNames.get(0) + " under " + 
+                                   modelNames.get(0) + " under " +
                                    numThreads + " simultaneous threads");
       }
     }
-    
+
     // test multiple classifiers (if given) in multiple threads each
     if (classifiers.size() > 1) {
       numThreads = PropertiesUtils.getInt(props, "multipleThreads",
@@ -162,11 +163,11 @@ void runThreadedTest(Properties props) {
         int classifierNum = i % classifiers.size();
         int repeatNum = i / classifiers.size();
         threads.add(new CRFThread(classifiers.get(classifierNum), testFile,
-                                  ("Simultaneous-" + classifierNum + 
+                                  ("Simultaneous-" + classifierNum +
                                    "-" + repeatNum)));
       }
-      for (int i = 0; i < threads.size(); ++i) {
-        threads.get(i).start();
+      for (CRFThread thread : threads) {
+        thread.start();
       }
       for (int i = 0; i < threads.size(); ++i) {
         int classifierNum = i % classifiers.size();
@@ -182,16 +183,17 @@ void runThreadedTest(Properties props) {
           System.out.println("Yay!");
         } else {
           throw new RuntimeException("Results not equal when running " +
-                                     modelNames.get(classifierNum) + 
-                                     " under " + numThreads + 
+                                     modelNames.get(classifierNum) +
+                                     " under " + numThreads +
                                      " threads with " +
-                                     classifiers.size() + 
+                                     classifiers.size() +
                                      " total classifiers");
         }
-      }      
+      }
     }
 
     // if no exceptions thrown, great success
     System.out.println("Everything worked!");
   }
+
 }
@@ -4,30 +4,33 @@
 
 import java.util.Properties;
 
-/** 
+/**
  * Test that the CRFClassifier works when multiple classifiers are run
  * in multiple threads.
  *
  *  @author John Bauer
  */
 public class ThreadedCRFClassifierITest extends TestCase {
+
   Properties props;
 
-  private String german1 = 
-    "/u/nlp/data/ner/goodClassifiers/german.hgc_175m_600.crf.ser.gz";
-  private String german2 = 
+  private static final String german1 =
+    "edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
+  /** -- We're no longer supporting this one
+  private String german2 =
     "/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
-  private String germanTestFile = "/u/nlp/data/german/ner/deu.testa";
+  */
+  private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
 
-  private String english1 = 
+  private static final String english1 =
     "/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
-  private String english2 = 
-    "/u/nlp/data/ner/goodClassifiers/english.all.3class.distsim.crf.ser.gz";
-  private String englishTestFile = "/u/nlp/data/ner/column_data/conll.testa";
+  private static final String english2 =
+    "/u/nlp/data/ner/goodClassifiers/english.conll.4class.distsim.crf.ser.gz";
+  private static final String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
+
+  private static final String germanEncoding = "utf-8";
+  private static final String englishEncoding = "utf-8";
 
-  private String germanEncoding = "iso-8859-1";
-  private String englishEncoding = "utf-8";
-  
   @Override
   public void setUp() {
     props = new Properties();
@@ -47,12 +50,13 @@ public void testOneGermanCRF() {
     TestThreadedCRFClassifier.runTest(props);
   }
 
-  public void testTwoGermanCRFs() {
-    props.setProperty("crf1", german1);
-    props.setProperty("crf2", german2);
-    props.setProperty("testFile", germanTestFile);
-    props.setProperty("inputEncoding", germanEncoding);
+  public void testTwoEnglishCRFs() {
+    props.setProperty("crf1", english1);
+    props.setProperty("crf2", english2);
+    props.setProperty("testFile", englishTestFile);
+    props.setProperty("inputEncoding", englishEncoding);
     TestThreadedCRFClassifier.runTest(props);
   }
+
 }
 
@@ -2,12 +2,13 @@
 
 import java.util.Set;
 
-import edu.stanford.nlp.parser.nndep.DependencyParser;
-import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
 import junit.framework.TestCase;
 
 import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
+import edu.stanford.nlp.parser.nndep.DependencyParser;
+import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
 import edu.stanford.nlp.tagger.maxent.MaxentTagger;
+import edu.stanford.nlp.util.Sets;
 
 /** This test checks whether our trained POS tagger and parser models are using the identical POS tag set
  *  for the various languages that we support. It's a good idea if they are.
@@ -24,25 +25,34 @@ private static void testTagSet4(String[] lexParsers,
     Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
     for (String name : maxentTaggers) {
       MaxentTagger tagger = new MaxentTagger(name);
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch", tagSet, tagger.tagSet());
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
+                   "left - right: " + Sets.diff(tagSet, tagger.tagSet()) +
+                   "; right - left: " + Sets.diff(tagger.tagSet(), tagSet) + "\n",
+                   tagSet, tagger.tagSet());
     }
     for (String name : lexParsers) {
       LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
+                   "left - right: " + Sets.diff(tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction())) + 
+                   "; right - left: " + Sets.diff(lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()), tagSet) + "\n",
                    tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()));
     }
 
     for (String name : srParsers) {
       ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
 
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
+                   "left - right: " + Sets.diff(tagSet, srp.tagSet()) +
+                   "; right - left: " + Sets.diff(srp.tagSet(), tagSet) + "\n",
                    tagSet, srp.tagSet());
     }
 
     for (String name : nnDepParsers) {
       DependencyParser dp = DependencyParser.loadFromModelFile(name);
 
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
+                   "left - right: " + Sets.diff(tagSet, dp.getPosSet()) +
+                   "; right - left: " + Sets.diff(dp.getPosSet(), tagSet) + "\n",
                    tagSet, dp.getPosSet());
     }
 
@@ -80,7 +90,7 @@ public void testEnglishTagSet() {
   private static final String[] germanTaggers = {
     "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
     "edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
-    "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger",
+    // "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger", // No longer supported; always worse than hgc
     "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
   };
 
@@ -94,6 +104,8 @@ public void testEnglishTagSet() {
   };
 
   private static final String[] germanNnParsers = {
+    // This one uses UD tag set not fine-grained tags!
+    // "edu/stanford/nlp/models/parser/nndep/UD_German.gz",
   };
 
   public void testGermanTagSet() {
 
@@ -1,6 +1,6 @@
 
-trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
-testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
+trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
+testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
 serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
 
 distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls
 
@@ -1,6 +1,6 @@
 
-trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
-testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
+trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
+testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
 serializeTo = spanish.ancora.crf.ser.gz
 
 useDistSim = false
 
@@ -1,6 +1,6 @@
 
-trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
-testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
+trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
+testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
 serializeTo = spanish.ancora2.crf.ser.gz
 
 useDistSim = false
 
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
 
 GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
 
-SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
+SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
 
 .SECONDEXPANSION:
 
 all: arabic chinese english french german spanish testing wsj
 .PHONY: all arabic chinese english french german spanish testing wsj
 
-arabic: arabic.tagger  arabic-train.tagger 
+arabic: arabic.tagger  arabic-train.tagger
 
 # we release an arabic model trained on everything, with a
 # corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
 
-chinese: chinese-distsim.tagger chinese-nodistsim.tagger 
+chinese: chinese-distsim.tagger chinese-nodistsim.tagger
 
 chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(CHINESE_TEST) 
+	@echo Will test on $(CHINESE_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST)  -verboseResults false >> $@.out 2>&1
 
 english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
 
 english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(ENGLISH_TEST) 
+	@echo Will test on $(ENGLISH_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST)  -verboseResults false >> $@.out 2>&1
 
 french: french.tagger
 
 french.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(FRENCH_TEST) 
+	@echo Will test on $(FRENCH_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST)  -verboseResults false >> $@.out 2>&1
 
 german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
 
 german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(GERMAN_TEST) 
+	@echo Will test on $(GERMAN_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST)  -verboseResults false >> $@.out 2>&1
 
@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
 spanish.tagger spanish-distsim.tagger: $$@.props
 	@echo Training $@
 	@echo Will test on $(SPANISH_TEST)
-#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
-	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
+	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
+#	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
 
 testing: testing.tagger
 
 testing.tagger:
 	@echo Training $@
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 
-wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger 
+wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
 
 wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
 	@echo Training $@