stanfordnlp
diff --git a/doc/tagger/README-Models.txt b/doc/tagger/README-Models.txt
Lines changed: 5 additions & 1 deletion
diff --git a/itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java b/itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java
Lines changed: 20 additions & 22 deletions
diff --git a/itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java b/itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java
Lines changed: 17 additions & 21 deletions
diff --git a/itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java b/itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java
Lines changed: 7 additions & 19 deletions
diff --git a/scripts/ner/spanish.ancora.distsim.s512.prop b/scripts/ner/spanish.ancora.distsim.s512.prop
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/ner/spanish.ancora.prop b/scripts/ner/spanish.ancora.prop
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/ner/spanish.ancora2.prop b/scripts/ner/spanish.ancora2.prop
Lines changed: 2 additions & 2 deletions
diff --git a/scripts/pos-tagger/Makefile b/scripts/pos-tagger/Makefile
Lines changed: 10 additions & 10 deletions
@@ -105,11 +105,15 @@ University of Stuttgart and the Seminar für Sprachwissenschaft of the
 University of Tübingen. See: 
 http://www.ims.uni-stuttgart.de/projekte/CQPDemos/Bundestag/help-tagset.html
 This model uses features from the distributional similarity clusters
-built over the HGC (Huge German Corpus).
+built over the HGC.
 Performance:
 96.90% on the first half of the remaining 20% of the Negra corpus (dev set)
 (90.33% on unknown words)
 
+german-dewac.tagger
+This model uses features from the distributional similarity clusters
+built from the deWac web corpus.
+
 german-fast.tagger
 Lacks distributional similarity features, but is several times faster
 than the other alternatives.
 
@@ -12,7 +12,6 @@
 import edu.stanford.nlp.util.Timing;
 
 public class TestThreadedCRFClassifier {
-
   TestThreadedCRFClassifier(Properties props) {
     inputEncoding = props.getProperty("inputEncoding", "UTF-8");
   }
@@ -24,8 +23,8 @@ public class TestThreadedCRFClassifier {
 
   private final String inputEncoding;
 
-  static CRFClassifier loadClassifier(String loadPath, Properties props) {
-    CRFClassifier crf = new CRFClassifier(props);
+  CRFClassifier loadClassifier(String loadPath, Properties props) {
+    CRFClassifier crf = new CRFClassifier(props);    
     crf.loadClassifierNoExceptions(loadPath, props);
     return crf;
   }
@@ -59,9 +58,9 @@ public void run() {
       Timing t = new Timing();
       resultsString = runClassifier(crf, filename);
       long millis = t.stop();
-      System.out.println("Thread " + threadName + " took " + millis +
+      System.out.println("Thread " + threadName + " took " + millis + 
                          "ms to tag file " + filename);
-    }
+    }        
   }
 
   /**
@@ -72,7 +71,7 @@ public void run() {
    * -crf2 ../stanford-releases/stanford-ner-models/dewac_175m_600.ser.gz
    * -testFile ../data/german-ner/deu.testa -inputEncoding iso-8859-1
    */
-  public static void main(String[] args) {
+  static public void main(String[] args) {
     try {
       System.setOut(new PrintStream(System.out, true, "UTF-8"));
       System.setErr(new PrintStream(System.err, true, "UTF-8"));
@@ -82,10 +81,10 @@ public static void main(String[] args) {
 
     runTest(StringUtils.argsToProperties(args));
   }
-
+  
   static public void runTest(Properties props) {
     TestThreadedCRFClassifier test = new TestThreadedCRFClassifier(props);
-    test.runThreadedTest(props);
+    test.runThreadedTest(props);    
   }
 
 
@@ -96,7 +95,7 @@ void runThreadedTest(Properties props) {
     ArrayList<String> modelNames = new ArrayList<String>();
     ArrayList<CRFClassifier> classifiers = new ArrayList<CRFClassifier>();
 
-    for (int i = 1;
+    for (int i = 1; 
          props.getProperty("crf" + Integer.toString(i)) != null; ++i) {
       String model = props.getProperty("crf" + Integer.toString(i));
       CRFClassifier crf = loadClassifier(model, props);
@@ -108,7 +107,7 @@ void runThreadedTest(Properties props) {
       // must run twice to account for "transductive learning"
       results = runClassifier(crf, testFile);
       baseResults.add(results);
-      System.out.println("Stored base results for " + model +
+      System.out.println("Stored base results for " + model + 
                          "; length " + results.length());
     }
 
@@ -122,13 +121,13 @@ void runThreadedTest(Properties props) {
       String repeated = runClassifier(crf, testFile);
       if (!base.equals(repeated)) {
         throw new RuntimeException("Repeated unthreaded results " +
-                                   "not the same for " + model +
+                                   "not the same for " + model + 
                                    " run on file " + testFile);
       }
     }
 
     // test the first classifier in several simultaneous threads
-    int numThreads = PropertiesUtils.getInt(props, "simThreads",
+    int numThreads = PropertiesUtils.getInt(props, "simThreads", 
                                             DEFAULT_SIM_THREADS);
 
     ArrayList<CRFThread> threads = new ArrayList<CRFThread>();
@@ -149,11 +148,11 @@ void runThreadedTest(Properties props) {
         System.out.println("Yay!");
       } else {
         throw new RuntimeException("Results not equal when running " +
-                                   modelNames.get(0) + " under " +
+                                   modelNames.get(0) + " under " + 
                                    numThreads + " simultaneous threads");
       }
     }
-
+    
     // test multiple classifiers (if given) in multiple threads each
     if (classifiers.size() > 1) {
       numThreads = PropertiesUtils.getInt(props, "multipleThreads",
@@ -163,11 +162,11 @@ void runThreadedTest(Properties props) {
         int classifierNum = i % classifiers.size();
         int repeatNum = i / classifiers.size();
         threads.add(new CRFThread(classifiers.get(classifierNum), testFile,
-                                  ("Simultaneous-" + classifierNum +
+                                  ("Simultaneous-" + classifierNum + 
                                    "-" + repeatNum)));
       }
-      for (CRFThread thread : threads) {
-        thread.start();
+      for (int i = 0; i < threads.size(); ++i) {
+        threads.get(i).start();
       }
       for (int i = 0; i < threads.size(); ++i) {
         int classifierNum = i % classifiers.size();
@@ -183,17 +182,16 @@ void runThreadedTest(Properties props) {
           System.out.println("Yay!");
         } else {
           throw new RuntimeException("Results not equal when running " +
-                                     modelNames.get(classifierNum) +
-                                     " under " + numThreads +
+                                     modelNames.get(classifierNum) + 
+                                     " under " + numThreads + 
                                      " threads with " +
-                                     classifiers.size() +
+                                     classifiers.size() + 
                                      " total classifiers");
         }
-      }
+      }      
     }
 
     // if no exceptions thrown, great success
     System.out.println("Everything worked!");
   }
-
 }
@@ -4,33 +4,30 @@
 
 import java.util.Properties;
 
-/**
+/** 
  * Test that the CRFClassifier works when multiple classifiers are run
  * in multiple threads.
  *
  *  @author John Bauer
  */
 public class ThreadedCRFClassifierITest extends TestCase {
-
   Properties props;
 
-  private static final String german1 =
-    "edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
-  /** -- We're no longer supporting this one
-  private String german2 =
+  private String german1 = 
+    "/u/nlp/data/ner/goodClassifiers/german.hgc_175m_600.crf.ser.gz";
+  private String german2 = 
     "/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
-  */
-  private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
+  private String germanTestFile = "/u/nlp/data/german/ner/deu.testa";
 
-  private static final String english1 =
+  private String english1 = 
     "/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
-  private static final String english2 =
-    "/u/nlp/data/ner/goodClassifiers/english.conll.4class.distsim.crf.ser.gz";
-  private static final String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
-
-  private static final String germanEncoding = "utf-8";
-  private static final String englishEncoding = "utf-8";
+  private String english2 = 
+    "/u/nlp/data/ner/goodClassifiers/english.all.3class.distsim.crf.ser.gz";
+  private String englishTestFile = "/u/nlp/data/ner/column_data/conll.testa";
 
+  private String germanEncoding = "iso-8859-1";
+  private String englishEncoding = "utf-8";
+  
   @Override
   public void setUp() {
     props = new Properties();
@@ -50,13 +47,12 @@ public void testOneGermanCRF() {
     TestThreadedCRFClassifier.runTest(props);
   }
 
-  public void testTwoEnglishCRFs() {
-    props.setProperty("crf1", english1);
-    props.setProperty("crf2", english2);
-    props.setProperty("testFile", englishTestFile);
-    props.setProperty("inputEncoding", englishEncoding);
+  public void testTwoGermanCRFs() {
+    props.setProperty("crf1", german1);
+    props.setProperty("crf2", german2);
+    props.setProperty("testFile", germanTestFile);
+    props.setProperty("inputEncoding", germanEncoding);
     TestThreadedCRFClassifier.runTest(props);
   }
-
 }
 
@@ -2,13 +2,12 @@
 
 import java.util.Set;
 
+import edu.stanford.nlp.parser.nndep.DependencyParser;
+import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
 import junit.framework.TestCase;
 
 import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
-import edu.stanford.nlp.parser.nndep.DependencyParser;
-import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
 import edu.stanford.nlp.tagger.maxent.MaxentTagger;
-import edu.stanford.nlp.util.Sets;
 
 /** This test checks whether our trained POS tagger and parser models are using the identical POS tag set
  *  for the various languages that we support. It's a good idea if they are.
@@ -25,34 +24,25 @@ private static void testTagSet4(String[] lexParsers,
     Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
     for (String name : maxentTaggers) {
       MaxentTagger tagger = new MaxentTagger(name);
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
-                   "left - right: " + Sets.diff(tagSet, tagger.tagSet()) +
-                   "; right - left: " + Sets.diff(tagger.tagSet(), tagSet) + "\n",
-                   tagSet, tagger.tagSet());
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch", tagSet, tagger.tagSet());
     }
     for (String name : lexParsers) {
       LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
-                   "left - right: " + Sets.diff(tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction())) + 
-                   "; right - left: " + Sets.diff(lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()), tagSet) + "\n",
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
                    tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()));
     }
 
     for (String name : srParsers) {
       ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
 
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
-                   "left - right: " + Sets.diff(tagSet, srp.tagSet()) +
-                   "; right - left: " + Sets.diff(srp.tagSet(), tagSet) + "\n",
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
                    tagSet, srp.tagSet());
     }
 
     for (String name : nnDepParsers) {
       DependencyParser dp = DependencyParser.loadFromModelFile(name);
 
-      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
-                   "left - right: " + Sets.diff(tagSet, dp.getPosSet()) +
-                   "; right - left: " + Sets.diff(dp.getPosSet(), tagSet) + "\n",
+      assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
                    tagSet, dp.getPosSet());
     }
 
@@ -90,7 +80,7 @@ public void testEnglishTagSet() {
   private static final String[] germanTaggers = {
     "edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
     "edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
-    // "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger", // No longer supported; always worse than hgc
+    "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger",
     "edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
   };
 
@@ -104,8 +94,6 @@ public void testEnglishTagSet() {
   };
 
   private static final String[] germanNnParsers = {
-    // This one uses UD tag set not fine-grained tags!
-    // "edu/stanford/nlp/models/parser/nndep/UD_German.gz",
   };
 
   public void testGermanTagSet() {
 
@@ -1,6 +1,6 @@
 
-trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
-testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
+trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
+testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
 serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
 
 distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls
 
@@ -1,6 +1,6 @@
 
-trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
-testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
+trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
+testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
 serializeTo = spanish.ancora.crf.ser.gz
 
 useDistSim = false
 
@@ -1,6 +1,6 @@
 
-trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
-testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
+trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
+testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
 serializeTo = spanish.ancora2.crf.ser.gz
 
 useDistSim = false
 
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
 
 GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
 
-SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
+SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
 
 .SECONDEXPANSION:
 
 all: arabic chinese english french german spanish testing wsj
 .PHONY: all arabic chinese english french german spanish testing wsj
 
-arabic: arabic.tagger  arabic-train.tagger
+arabic: arabic.tagger  arabic-train.tagger 
 
 # we release an arabic model trained on everything, with a
 # corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
 
-chinese: chinese-distsim.tagger chinese-nodistsim.tagger
+chinese: chinese-distsim.tagger chinese-nodistsim.tagger 
 
 chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(CHINESE_TEST)
+	@echo Will test on $(CHINESE_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST)  -verboseResults false >> $@.out 2>&1
 
 english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
 
 english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(ENGLISH_TEST)
+	@echo Will test on $(ENGLISH_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST)  -verboseResults false >> $@.out 2>&1
 
 french: french.tagger
 
 french.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(FRENCH_TEST)
+	@echo Will test on $(FRENCH_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST)  -verboseResults false >> $@.out 2>&1
 
 german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
 
 german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(GERMAN_TEST)
+	@echo Will test on $(GERMAN_TEST) 
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST)  -verboseResults false >> $@.out 2>&1
 
@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
 spanish.tagger spanish-distsim.tagger: $$@.props
 	@echo Training $@
 	@echo Will test on $(SPANISH_TEST)
-	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
-#	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
+#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
+	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
 
 testing: testing.tagger
 
 testing.tagger:
 	@echo Training $@
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 
-wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
+wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger 
 
 wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
 	@echo Training $@