Skip to content

Commit 0790127

Browse files
Gabor Angeli authored and Stanford NLP committed
Merge branch 'master' of jamie.stanford.edu:/u/nlp/git/javanlp
1 parent 8e38a68 commit 0790127

22 files changed

+242
-160
lines changed

doc/tagger/README-Models.txt

Lines changed: 1 addition & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -105,15 +105,11 @@ University of Stuttgart and the Seminar für Sprachwissenschaft of the
105105
University of Tübingen. See:
106106
http://www.ims.uni-stuttgart.de/projekte/CQPDemos/Bundestag/help-tagset.html
107107
This model uses features from the distributional similarity clusters
108-
built over the HGC.
108+
built over the HGC (Huge German Corpus).
109109
Performance:
110110
96.90% on the first half of the remaining 20% of the Negra corpus (dev set)
111111
(90.33% on unknown words)
112112

113-
german-dewac.tagger
114-
This model uses features from the distributional similarity clusters
115-
built from the deWac web corpus.
116-
117113
german-fast.tagger
118114
Lacks distributional similarity features, but is several times faster
119115
than the other alternatives.

itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java

Lines changed: 22 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
import edu.stanford.nlp.util.Timing;
1313

1414
public class TestThreadedCRFClassifier {
15+
1516
TestThreadedCRFClassifier(Properties props) {
1617
inputEncoding = props.getProperty("inputEncoding", "UTF-8");
1718
}
@@ -23,8 +24,8 @@ public class TestThreadedCRFClassifier {
2324

2425
private final String inputEncoding;
2526

26-
CRFClassifier loadClassifier(String loadPath, Properties props) {
27-
CRFClassifier crf = new CRFClassifier(props);
27+
static CRFClassifier loadClassifier(String loadPath, Properties props) {
28+
CRFClassifier crf = new CRFClassifier(props);
2829
crf.loadClassifierNoExceptions(loadPath, props);
2930
return crf;
3031
}
@@ -58,9 +59,9 @@ public void run() {
5859
Timing t = new Timing();
5960
resultsString = runClassifier(crf, filename);
6061
long millis = t.stop();
61-
System.out.println("Thread " + threadName + " took " + millis +
62+
System.out.println("Thread " + threadName + " took " + millis +
6263
"ms to tag file " + filename);
63-
}
64+
}
6465
}
6566

6667
/**
@@ -71,7 +72,7 @@ public void run() {
7172
* -crf2 ../stanford-releases/stanford-ner-models/dewac_175m_600.ser.gz
7273
* -testFile ../data/german-ner/deu.testa -inputEncoding iso-8859-1
7374
*/
74-
static public void main(String[] args) {
75+
public static void main(String[] args) {
7576
try {
7677
System.setOut(new PrintStream(System.out, true, "UTF-8"));
7778
System.setErr(new PrintStream(System.err, true, "UTF-8"));
@@ -81,10 +82,10 @@ static public void main(String[] args) {
8182

8283
runTest(StringUtils.argsToProperties(args));
8384
}
84-
85+
8586
static public void runTest(Properties props) {
8687
TestThreadedCRFClassifier test = new TestThreadedCRFClassifier(props);
87-
test.runThreadedTest(props);
88+
test.runThreadedTest(props);
8889
}
8990

9091

@@ -95,7 +96,7 @@ void runThreadedTest(Properties props) {
9596
ArrayList<String> modelNames = new ArrayList<String>();
9697
ArrayList<CRFClassifier> classifiers = new ArrayList<CRFClassifier>();
9798

98-
for (int i = 1;
99+
for (int i = 1;
99100
props.getProperty("crf" + Integer.toString(i)) != null; ++i) {
100101
String model = props.getProperty("crf" + Integer.toString(i));
101102
CRFClassifier crf = loadClassifier(model, props);
@@ -107,7 +108,7 @@ void runThreadedTest(Properties props) {
107108
// must run twice to account for "transductive learning"
108109
results = runClassifier(crf, testFile);
109110
baseResults.add(results);
110-
System.out.println("Stored base results for " + model +
111+
System.out.println("Stored base results for " + model +
111112
"; length " + results.length());
112113
}
113114

@@ -121,13 +122,13 @@ void runThreadedTest(Properties props) {
121122
String repeated = runClassifier(crf, testFile);
122123
if (!base.equals(repeated)) {
123124
throw new RuntimeException("Repeated unthreaded results " +
124-
"not the same for " + model +
125+
"not the same for " + model +
125126
" run on file " + testFile);
126127
}
127128
}
128129

129130
// test the first classifier in several simultaneous threads
130-
int numThreads = PropertiesUtils.getInt(props, "simThreads",
131+
int numThreads = PropertiesUtils.getInt(props, "simThreads",
131132
DEFAULT_SIM_THREADS);
132133

133134
ArrayList<CRFThread> threads = new ArrayList<CRFThread>();
@@ -148,11 +149,11 @@ void runThreadedTest(Properties props) {
148149
System.out.println("Yay!");
149150
} else {
150151
throw new RuntimeException("Results not equal when running " +
151-
modelNames.get(0) + " under " +
152+
modelNames.get(0) + " under " +
152153
numThreads + " simultaneous threads");
153154
}
154155
}
155-
156+
156157
// test multiple classifiers (if given) in multiple threads each
157158
if (classifiers.size() > 1) {
158159
numThreads = PropertiesUtils.getInt(props, "multipleThreads",
@@ -162,11 +163,11 @@ void runThreadedTest(Properties props) {
162163
int classifierNum = i % classifiers.size();
163164
int repeatNum = i / classifiers.size();
164165
threads.add(new CRFThread(classifiers.get(classifierNum), testFile,
165-
("Simultaneous-" + classifierNum +
166+
("Simultaneous-" + classifierNum +
166167
"-" + repeatNum)));
167168
}
168-
for (int i = 0; i < threads.size(); ++i) {
169-
threads.get(i).start();
169+
for (CRFThread thread : threads) {
170+
thread.start();
170171
}
171172
for (int i = 0; i < threads.size(); ++i) {
172173
int classifierNum = i % classifiers.size();
@@ -182,16 +183,17 @@ void runThreadedTest(Properties props) {
182183
System.out.println("Yay!");
183184
} else {
184185
throw new RuntimeException("Results not equal when running " +
185-
modelNames.get(classifierNum) +
186-
" under " + numThreads +
186+
modelNames.get(classifierNum) +
187+
" under " + numThreads +
187188
" threads with " +
188-
classifiers.size() +
189+
classifiers.size() +
189190
" total classifiers");
190191
}
191-
}
192+
}
192193
}
193194

194195
// if no exceptions thrown, great success
195196
System.out.println("Everything worked!");
196197
}
198+
197199
}

itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java

Lines changed: 21 additions & 17 deletions
Original file line number | Diff line number | Diff line change
@@ -4,30 +4,33 @@
44

55
import java.util.Properties;
66

7-
/**
7+
/**
88
* Test that the CRFClassifier works when multiple classifiers are run
99
* in multiple threads.
1010
*
1111
* @author John Bauer
1212
*/
1313
public class ThreadedCRFClassifierITest extends TestCase {
14+
1415
Properties props;
1516

16-
private String german1 =
17-
"/u/nlp/data/ner/goodClassifiers/german.hgc_175m_600.crf.ser.gz";
18-
private String german2 =
17+
private static final String german1 =
18+
"edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
19+
/** -- We're no longer supporting this one
20+
private String german2 =
1921
"/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
20-
private String germanTestFile = "/u/nlp/data/german/ner/deu.testa";
22+
*/
23+
private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
2124

22-
private String english1 =
25+
private static final String english1 =
2326
"/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
24-
private String english2 =
25-
"/u/nlp/data/ner/goodClassifiers/english.all.3class.distsim.crf.ser.gz";
26-
private String englishTestFile = "/u/nlp/data/ner/column_data/conll.testa";
27+
private static final String english2 =
28+
"/u/nlp/data/ner/goodClassifiers/english.conll.4class.distsim.crf.ser.gz";
29+
private static final String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
30+
31+
private static final String germanEncoding = "utf-8";
32+
private static final String englishEncoding = "utf-8";
2733

28-
private String germanEncoding = "iso-8859-1";
29-
private String englishEncoding = "utf-8";
30-
3134
@Override
3235
public void setUp() {
3336
props = new Properties();
@@ -47,12 +50,13 @@ public void testOneGermanCRF() {
4750
TestThreadedCRFClassifier.runTest(props);
4851
}
4952

50-
public void testTwoGermanCRFs() {
51-
props.setProperty("crf1", german1);
52-
props.setProperty("crf2", german2);
53-
props.setProperty("testFile", germanTestFile);
54-
props.setProperty("inputEncoding", germanEncoding);
53+
public void testTwoEnglishCRFs() {
54+
props.setProperty("crf1", english1);
55+
props.setProperty("crf2", english2);
56+
props.setProperty("testFile", englishTestFile);
57+
props.setProperty("inputEncoding", englishEncoding);
5558
TestThreadedCRFClassifier.runTest(props);
5659
}
60+
5761
}
5862

itest/src/edu/stanford/nlp/pipeline/TaggerParserPosTagCompatibilityITest.java

Lines changed: 19 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -2,12 +2,13 @@
22

33
import java.util.Set;
44

5-
import edu.stanford.nlp.parser.nndep.DependencyParser;
6-
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
75
import junit.framework.TestCase;
86

97
import edu.stanford.nlp.parser.lexparser.LexicalizedParser;
8+
import edu.stanford.nlp.parser.nndep.DependencyParser;
9+
import edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser;
1010
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
11+
import edu.stanford.nlp.util.Sets;
1112

1213
/** This test checks whether our trained POS tagger and parser models are using the identical POS tag set
1314
* for the various languages that we support. It's a good idea if they are.
@@ -24,25 +25,34 @@ private static void testTagSet4(String[] lexParsers,
2425
Set<String> tagSet = lp.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction());
2526
for (String name : maxentTaggers) {
2627
MaxentTagger tagger = new MaxentTagger(name);
27-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch", tagSet, tagger.tagSet());
28+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
29+
"left - right: " + Sets.diff(tagSet, tagger.tagSet()) +
30+
"; right - left: " + Sets.diff(tagger.tagSet(), tagSet) + "\n",
31+
tagSet, tagger.tagSet());
2832
}
2933
for (String name : lexParsers) {
3034
LexicalizedParser lp2 = LexicalizedParser.loadModel(name);
31-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
35+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
36+
"left - right: " + Sets.diff(tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction())) +
37+
"; right - left: " + Sets.diff(lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()), tagSet) + "\n",
3238
tagSet, lp2.getLexicon().tagSet(lp.treebankLanguagePack().getBasicCategoryFunction()));
3339
}
3440

3541
for (String name : srParsers) {
3642
ShiftReduceParser srp = ShiftReduceParser.loadModel(name);
3743

38-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
44+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
45+
"left - right: " + Sets.diff(tagSet, srp.tagSet()) +
46+
"; right - left: " + Sets.diff(srp.tagSet(), tagSet) + "\n",
3947
tagSet, srp.tagSet());
4048
}
4149

4250
for (String name : nnDepParsers) {
4351
DependencyParser dp = DependencyParser.loadFromModelFile(name);
4452

45-
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch",
53+
assertEquals(lexParsers[0] + " vs. " + name + " tag set mismatch:\n" +
54+
"left - right: " + Sets.diff(tagSet, dp.getPosSet()) +
55+
"; right - left: " + Sets.diff(dp.getPosSet(), tagSet) + "\n",
4656
tagSet, dp.getPosSet());
4757
}
4858

@@ -80,7 +90,7 @@ public void testEnglishTagSet() {
8090
private static final String[] germanTaggers = {
8191
"edu/stanford/nlp/models/pos-tagger/german/german-fast.tagger",
8292
"edu/stanford/nlp/models/pos-tagger/german/german-fast-caseless.tagger",
83-
"edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger",
93+
// "edu/stanford/nlp/models/pos-tagger/german/german-dewac.tagger", // No longer supported; always worse than hgc
8494
"edu/stanford/nlp/models/pos-tagger/german/german-hgc.tagger"
8595
};
8696

@@ -94,6 +104,8 @@ public void testEnglishTagSet() {
94104
};
95105

96106
private static final String[] germanNnParsers = {
107+
// This one uses UD tag set not fine-grained tags!
108+
// "edu/stanford/nlp/models/parser/nndep/UD_German.gz",
97109
};
98110

99111
public void testGermanTagSet() {

scripts/ner/spanish.ancora.distsim.s512.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3-
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
2+
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3+
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
44
serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
55

66
distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls

scripts/ner/spanish.ancora.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3-
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
2+
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3+
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
44
serializeTo = spanish.ancora.crf.ser.gz
55

66
useDistSim = false

scripts/ner/spanish.ancora2.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3-
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
2+
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3+
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
44
serializeTo = spanish.ancora2.crf.ser.gz
55

66
useDistSim = false

scripts/pos-tagger/Makefile

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
1010

1111
GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
1212

13-
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
13+
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
1414

1515
.SECONDEXPANSION:
1616

1717
all: arabic chinese english french german spanish testing wsj
1818
.PHONY: all arabic chinese english french german spanish testing wsj
1919

20-
arabic: arabic.tagger arabic-train.tagger
20+
arabic: arabic.tagger arabic-train.tagger
2121

2222
# we release an arabic model trained on everything, with a
2323
# corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
2727
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
2828
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
2929

30-
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
30+
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
3131

3232
chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
3333
@echo Training $@
34-
@echo Will test on $(CHINESE_TEST)
34+
@echo Will test on $(CHINESE_TEST)
3535
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
3636
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST) -verboseResults false >> $@.out 2>&1
3737

3838
english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
3939

4040
english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
4141
@echo Training $@
42-
@echo Will test on $(ENGLISH_TEST)
42+
@echo Will test on $(ENGLISH_TEST)
4343
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
4444
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST) -verboseResults false >> $@.out 2>&1
4545

4646
french: french.tagger
4747

4848
french.tagger: $$@.props
4949
@echo Training $@
50-
@echo Will test on $(FRENCH_TEST)
50+
@echo Will test on $(FRENCH_TEST)
5151
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
5252
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST) -verboseResults false >> $@.out 2>&1
5353

5454
german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
5555

5656
german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
5757
@echo Training $@
58-
@echo Will test on $(GERMAN_TEST)
58+
@echo Will test on $(GERMAN_TEST)
5959
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
6060
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST) -verboseResults false >> $@.out 2>&1
6161

@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
6464
spanish.tagger spanish-distsim.tagger: $$@.props
6565
@echo Training $@
6666
@echo Will test on $(SPANISH_TEST)
67-
#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68-
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
67+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68+
# java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
6969

7070
testing: testing.tagger
7171

7272
testing.tagger:
7373
@echo Training $@
7474
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
7575

76-
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
76+
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
7777

7878
wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
7979
@echo Training $@

0 commit comments

Comments
 (0)