Skip to content

Commit 8e38a68

Browse files
Gabor Angeli authored and Stanford NLP committed
Add some logging messages to CoreNLP
1 parent 18964e6 commit 8e38a68

22 files changed

+169
-242
lines changed

doc/tagger/README-Models.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,11 +105,15 @@ University of Stuttgart and the Seminar für Sprachwissenschaft of the
105105
University of Tübingen. See:
106106
http://www.ims.uni-stuttgart.de/projekte/CQPDemos/Bundestag/help-tagset.html
107107
This model uses features from the distributional similarity clusters
108-
built over the HGC (Huge German Corpus).
108+
built over the HGC.
109109
Performance:
110110
96.90% on the first half of the remaining 20% of the Negra corpus (dev set)
111111
(90.33% on unknown words)
112112

113+
german-dewac.tagger
114+
This model uses features from the distributional similarity clusters
115+
built from the deWac web corpus.
116+
113117
german-fast.tagger
114118
Lacks distributional similarity features, but is several times faster
115119
than the other alternatives.

itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
import edu.stanford.nlp.util.Timing;
1313

1414
public class TestThreadedCRFClassifier {
15-
1615
TestThreadedCRFClassifier(Properties props) {
1716
inputEncoding = props.getProperty("inputEncoding", "UTF-8");
1817
}
@@ -24,8 +23,8 @@ public class TestThreadedCRFClassifier {
2423

2524
private final String inputEncoding;
2625

27-
static CRFClassifier loadClassifier(String loadPath, Properties props) {
28-
CRFClassifier crf = new CRFClassifier(props);
26+
CRFClassifier loadClassifier(String loadPath, Properties props) {
27+
CRFClassifier crf = new CRFClassifier(props);
2928
crf.loadClassifierNoExceptions(loadPath, props);
3029
return crf;
3130
}
@@ -59,9 +58,9 @@ public void run() {
5958
Timing t = new Timing();
6059
resultsString = runClassifier(crf, filename);
6160
long millis = t.stop();
62-
System.out.println("Thread " + threadName + " took " + millis +
61+
System.out.println("Thread " + threadName + " took " + millis +
6362
"ms to tag file " + filename);
64-
}
63+
}
6564
}
6665

6766
/**
@@ -72,7 +71,7 @@ public void run() {
7271
* -crf2 ../stanford-releases/stanford-ner-models/dewac_175m_600.ser.gz
7372
* -testFile ../data/german-ner/deu.testa -inputEncoding iso-8859-1
7473
*/
75-
public static void main(String[] args) {
74+
static public void main(String[] args) {
7675
try {
7776
System.setOut(new PrintStream(System.out, true, "UTF-8"));
7877
System.setErr(new PrintStream(System.err, true, "UTF-8"));
@@ -82,10 +81,10 @@ public static void main(String[] args) {
8281

8382
runTest(StringUtils.argsToProperties(args));
8483
}
85-
84+
8685
static public void runTest(Properties props) {
8786
TestThreadedCRFClassifier test = new TestThreadedCRFClassifier(props);
88-
test.runThreadedTest(props);
87+
test.runThreadedTest(props);
8988
}
9089

9190

@@ -96,7 +95,7 @@ void runThreadedTest(Properties props) {
9695
ArrayList<String> modelNames = new ArrayList<String>();
9796
ArrayList<CRFClassifier> classifiers = new ArrayList<CRFClassifier>();
9897

99-
for (int i = 1;
98+
for (int i = 1;
10099
props.getProperty("crf" + Integer.toString(i)) != null; ++i) {
101100
String model = props.getProperty("crf" + Integer.toString(i));
102101
CRFClassifier crf = loadClassifier(model, props);
@@ -108,7 +107,7 @@ void runThreadedTest(Properties props) {
108107
// must run twice to account for "transductive learning"
109108
results = runClassifier(crf, testFile);
110109
baseResults.add(results);
111-
System.out.println("Stored base results for " + model +
110+
System.out.println("Stored base results for " + model +
112111
"; length " + results.length());
113112
}
114113

@@ -122,13 +121,13 @@ void runThreadedTest(Properties props) {
122121
String repeated = runClassifier(crf, testFile);
123122
if (!base.equals(repeated)) {
124123
throw new RuntimeException("Repeated unthreaded results " +
125-
"not the same for " + model +
124+
"not the same for " + model +
126125
" run on file " + testFile);
127126
}
128127
}
129128

130129
// test the first classifier in several simultaneous threads
131-
int numThreads = PropertiesUtils.getInt(props, "simThreads",
130+
int numThreads = PropertiesUtils.getInt(props, "simThreads",
132131
DEFAULT_SIM_THREADS);
133132

134133
ArrayList<CRFThread> threads = new ArrayList<CRFThread>();
@@ -149,11 +148,11 @@ void runThreadedTest(Properties props) {
149148
System.out.println("Yay!");
150149
} else {
151150
throw new RuntimeException("Results not equal when running " +
152-
modelNames.get(0) + " under " +
151+
modelNames.get(0) + " under " +
153152
numThreads + " simultaneous threads");
154153
}
155154
}
156-
155+
157156
// test multiple classifiers (if given) in multiple threads each
158157
if (classifiers.size() > 1) {
159158
numThreads = PropertiesUtils.getInt(props, "multipleThreads",
@@ -163,11 +162,11 @@ void runThreadedTest(Properties props) {
163162
int classifierNum = i % classifiers.size();
164163
int repeatNum = i / classifiers.size();
165164
threads.add(new CRFThread(classifiers.get(classifierNum), testFile,
166-
("Simultaneous-" + classifierNum +
165+
("Simultaneous-" + classifierNum +
167166
"-" + repeatNum)));
168167
}
169-
for (CRFThread thread : threads) {
170-
thread.start();
168+
for (int i = 0; i < threads.size(); ++i) {
169+
threads.get(i).start();
171170
}
172171
for (int i = 0; i < threads.size(); ++i) {
173172
int classifierNum = i % classifiers.size();
@@ -183,17 +182,16 @@ void runThreadedTest(Properties props) {
183182
System.out.println("Yay!");
184183
} else {
185184
throw new RuntimeException("Results not equal when running " +
186-
modelNames.get(classifierNum) +
187-
" under " + numThreads +
185+
modelNames.get(classifierNum) +
186+
" under " + numThreads +
188187
" threads with " +
189-
classifiers.size() +
188+
classifiers.size() +
190189
" total classifiers");
191190
}
192-
}
191+
}
193192
}
194193

195194
// if no exceptions thrown, great success
196195
System.out.println("Everything worked!");
197196
}
198-
199197
}

itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -4,33 +4,30 @@
44

55
import java.util.Properties;
66

7-
/**
7+
/**
88
* Test that the CRFClassifier works when multiple classifiers are run
99
* in multiple threads.
1010
*
1111
* @author John Bauer
1212
*/
1313
public class ThreadedCRFClassifierITest extends TestCase {
14-
1514
Properties props;
1615

17-
private static final String german1 =
18-
"edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
19-
/** -- We're no longer supporting this one
20-
private String german2 =
16+
private String german1 =
17+
"/u/nlp/data/ner/goodClassifiers/german.hgc_175m_600.crf.ser.gz";
18+
private String german2 =
2119
"/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
22-
*/
23-
private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
20+
private String germanTestFile = "/u/nlp/data/german/ner/deu.testa";
2421

25-
private static final String english1 =
22+
private String english1 =
2623
"/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
27-
private static final String english2 =
28-
"/u/nlp/data/ner/goodClassifiers/english.conll.4class.distsim.crf.ser.gz";
29-
private static final String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
30-
31-
private static final String germanEncoding = "utf-8";
32-
private static final String englishEncoding = "utf-8";
24+
private String english2 =
25+
"/u/nlp/data/ner/goodClassifiers/english.all.3class.distsim.crf.ser.gz";
26+
private String englishTestFile = "/u/nlp/data/ner/column_data/conll.testa";
3327

28+
private String germanEncoding = "iso-8859-1";
29+
private String englishEncoding = "utf-8";
30+
3431
@Override
3532
public void setUp() {
3633
props = new Properties();
@@ -50,13 +47,12 @@ public void testOneGermanCRF() {
5047
TestThreadedCRFClassifier.runTest(props);
5148
}
5249

53-
public void testTwoEnglishCRFs() {
54-
props.setProperty("crf1", english1);
55-
props.setProperty("crf2", english2);
56-
props.setProperty("testFile", englishTestFile);
57-
props.setProperty("inputEncoding", englishEncoding);
50+
public void testTwoGermanCRFs() {
51+
props.setProperty("crf1", german1);
52+
props.setProperty("crf2", german2);
53+
props.setProperty("testFile", germanTestFile);
54+
props.setProperty("inputEncoding", germanEncoding);
5855
TestThreadedCRFClassifier.runTest(props);
5956
}
60-
6157
}
6258

itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java

Lines changed: 7 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,12 @@
22

33
import java.util.Set;
44

5+
import edu.stanford.nlp.parser.nndep.DependencyParser;
6+
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
57
import junit.framework.TestCase;
68

79
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
8-
import edu.stanford.nlp.parser.nndep.DependencyParser;
9-
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
1010
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
11-
import edu.stanford.nlp.util.Sets;
1211

1312
/** This test checks whether our trained POS tagger and parser models are using the identical POS tag set
1413
* for the various languages that we support. It's a good idea if they are.
@@ -25,34 +24,25 @@ private static void testTagSet4(String[] lexParsers,
2524
Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
2625
for (String name : maxentTaggers) {
2726
MaxentTagger tagger = new MaxentTagger(name);
28-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
29-
"left - right: " + Sets.diff(tagSet, tagger.tagSet()) +
30-
"; right - left: " + Sets.diff(tagger.tagSet(), tagSet) + "\n",
31-
tagSet, tagger.tagSet());
27+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch", tagSet, tagger.tagSet());
3228
}
3329
for (String name : lexParsers) {
3430
LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
35-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
36-
"left - right: " + Sets.diff(tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction())) +
37-
"; right - left: " + Sets.diff(lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()), tagSet) + "\n",
31+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
3832
tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()));
3933
}
4034

4135
for (String name : srParsers) {
4236
ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
4337

44-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
45-
"left - right: " + Sets.diff(tagSet, srp.tagSet()) +
46-
"; right - left: " + Sets.diff(srp.tagSet(), tagSet) + "\n",
38+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
4739
tagSet, srp.tagSet());
4840
}
4941

5042
for (String name : nnDepParsers) {
5143
DependencyParser dp = DependencyParser.loadFromModelFile(name);
5244

53-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
54-
"left - right: " + Sets.diff(tagSet, dp.getPosSet()) +
55-
"; right - left: " + Sets.diff(dp.getPosSet(), tagSet) + "\n",
45+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
5646
tagSet, dp.getPosSet());
5747
}
5848

@@ -90,7 +80,7 @@ public void testEnglishTagSet() {
9080
private static final String[] germanTaggers = {
9181
"edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
9282
"edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
93-
// "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger", // No longer supported; always worse than hgc
83+
"edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger",
9484
"edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
9585
};
9686

@@ -104,8 +94,6 @@ public void testEnglishTagSet() {
10494
};
10595

10696
private static final String[] germanNnParsers = {
107-
// This one uses UD tag set not fine-grained tags!
108-
// "edu/stanford/nlp/models/parser/nndep/UD_German.gz",
10997
};
11098

11199
public void testGermanTagSet() {

scripts/ner/spanish.ancora.distsim.s512.prop

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

2-
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3-
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
2+
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3+
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
44
serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
55

66
distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls

scripts/ner/spanish.ancora.prop

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

2-
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3-
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
2+
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3+
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
44
serializeTo = spanish.ancora.crf.ser.gz
55

66
useDistSim = false

scripts/ner/spanish.ancora2.prop

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

2-
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3-
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
2+
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3+
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
44
serializeTo = spanish.ancora2.crf.ser.gz
55

66
useDistSim = false

scripts/pos-tagger/Makefile

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
1010

1111
GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
1212

13-
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
13+
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
1414

1515
.SECONDEXPANSION:
1616

1717
all: arabic chinese english french german spanish testing wsj
1818
.PHONY: all arabic chinese english french german spanish testing wsj
1919

20-
arabic: arabic.tagger arabic-train.tagger
20+
arabic: arabic.tagger arabic-train.tagger
2121

2222
# we release an arabic model trained on everything, with a
2323
# corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
2727
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
2828
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
2929

30-
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
30+
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
3131

3232
chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
3333
@echo Training $@
34-
@echo Will test on $(CHINESE_TEST)
34+
@echo Will test on $(CHINESE_TEST)
3535
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
3636
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST) -verboseResults false >> $@.out 2>&1
3737

3838
english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
3939

4040
english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
4141
@echo Training $@
42-
@echo Will test on $(ENGLISH_TEST)
42+
@echo Will test on $(ENGLISH_TEST)
4343
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
4444
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST) -verboseResults false >> $@.out 2>&1
4545

4646
french: french.tagger
4747

4848
french.tagger: $$@.props
4949
@echo Training $@
50-
@echo Will test on $(FRENCH_TEST)
50+
@echo Will test on $(FRENCH_TEST)
5151
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
5252
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST) -verboseResults false >> $@.out 2>&1
5353

5454
german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
5555

5656
german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
5757
@echo Training $@
58-
@echo Will test on $(GERMAN_TEST)
58+
@echo Will test on $(GERMAN_TEST)
5959
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
6060
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST) -verboseResults false >> $@.out 2>&1
6161

@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
6464
spanish.tagger spanish-distsim.tagger: $$@.props
6565
@echo Training $@
6666
@echo Will test on $(SPANISH_TEST)
67-
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68-
# java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
67+
#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
6969

7070
testing: testing.tagger
7171

7272
testing.tagger:
7373
@echo Training $@
7474
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
7575

76-
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
76+
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
7777

7878
wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
7979
@echo Training $@

0 commit comments

Comments
 (0)