Skip to content

Commit 67328c0

Browse files
manning authored and Stanford NLP committed
Remove editing bug
1 parent ee74a0a commit 67328c0

File tree

11 files changed

+77
-94
lines changed

11 files changed

+77
-94
lines changed

itest/src/edu/stanford/nlp/ie/crf/TestThreadedCRFClassifier.java

Lines changed: 20 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,6 @@
1212
import edu.stanford.nlp.util.Timing;
1313

1414
public class TestThreadedCRFClassifier {
15-
1615
TestThreadedCRFClassifier(Properties props) {
1716
inputEncoding = props.getProperty("inputEncoding", "UTF-8");
1817
}
@@ -24,8 +23,8 @@ public class TestThreadedCRFClassifier {
2423

2524
private final String inputEncoding;
2625

27-
static CRFClassifier loadClassifier(String loadPath, Properties props) {
28-
CRFClassifier crf = new CRFClassifier(props);
26+
CRFClassifier loadClassifier(String loadPath, Properties props) {
27+
CRFClassifier crf = new CRFClassifier(props);
2928
crf.loadClassifierNoExceptions(loadPath, props);
3029
return crf;
3130
}
@@ -59,9 +58,9 @@ public void run() {
5958
Timing t = new Timing();
6059
resultsString = runClassifier(crf, filename);
6160
long millis = t.stop();
62-
System.out.println("Thread " + threadName + " took " + millis +
61+
System.out.println("Thread " + threadName + " took " + millis +
6362
"ms to tag file " + filename);
64-
}
63+
}
6564
}
6665

6766
/**
@@ -72,7 +71,7 @@ public void run() {
7271
* -crf2 ../stanford-releases/stanford-ner-models/dewac_175m_600.ser.gz
7372
* -testFile ../data/german-ner/deu.testa -inputEncoding iso-8859-1
7473
*/
75-
public static void main(String[] args) {
74+
static public void main(String[] args) {
7675
try {
7776
System.setOut(new PrintStream(System.out, true, "UTF-8"));
7877
System.setErr(new PrintStream(System.err, true, "UTF-8"));
@@ -82,10 +81,10 @@ public static void main(String[] args) {
8281

8382
runTest(StringUtils.argsToProperties(args));
8483
}
85-
84+
8685
static public void runTest(Properties props) {
8786
TestThreadedCRFClassifier test = new TestThreadedCRFClassifier(props);
88-
test.runThreadedTest(props);
87+
test.runThreadedTest(props);
8988
}
9089

9190

@@ -96,7 +95,7 @@ void runThreadedTest(Properties props) {
9695
ArrayList<String> modelNames = new ArrayList<String>();
9796
ArrayList<CRFClassifier> classifiers = new ArrayList<CRFClassifier>();
9897

99-
for (int i = 1;
98+
for (int i = 1;
10099
props.getProperty("crf" + Integer.toString(i)) != null; ++i) {
101100
String model = props.getProperty("crf" + Integer.toString(i));
102101
CRFClassifier crf = loadClassifier(model, props);
@@ -108,7 +107,7 @@ void runThreadedTest(Properties props) {
108107
// must run twice to account for "transductive learning"
109108
results = runClassifier(crf, testFile);
110109
baseResults.add(results);
111-
System.out.println("Stored base results for " + model +
110+
System.out.println("Stored base results for " + model +
112111
"; length " + results.length());
113112
}
114113

@@ -122,13 +121,13 @@ void runThreadedTest(Properties props) {
122121
String repeated = runClassifier(crf, testFile);
123122
if (!base.equals(repeated)) {
124123
throw new RuntimeException("Repeated unthreaded results " +
125-
"not the same for " + model +
124+
"not the same for " + model +
126125
" run on file " + testFile);
127126
}
128127
}
129128

130129
// test the first classifier in several simultaneous threads
131-
int numThreads = PropertiesUtils.getInt(props, "simThreads",
130+
int numThreads = PropertiesUtils.getInt(props, "simThreads",
132131
DEFAULT_SIM_THREADS);
133132

134133
ArrayList<CRFThread> threads = new ArrayList<CRFThread>();
@@ -149,11 +148,11 @@ void runThreadedTest(Properties props) {
149148
System.out.println("Yay!");
150149
} else {
151150
throw new RuntimeException("Results not equal when running " +
152-
modelNames.get(0) + " under " +
151+
modelNames.get(0) + " under " +
153152
numThreads + " simultaneous threads");
154153
}
155154
}
156-
155+
157156
// test multiple classifiers (if given) in multiple threads each
158157
if (classifiers.size() > 1) {
159158
numThreads = PropertiesUtils.getInt(props, "multipleThreads",
@@ -163,11 +162,11 @@ void runThreadedTest(Properties props) {
163162
int classifierNum = i % classifiers.size();
164163
int repeatNum = i / classifiers.size();
165164
threads.add(new CRFThread(classifiers.get(classifierNum), testFile,
166-
("Simultaneous-" + classifierNum +
165+
("Simultaneous-" + classifierNum +
167166
"-" + repeatNum)));
168167
}
169-
for (CRFThread thread : threads) {
170-
thread.start();
168+
for (int i = 0; i < threads.size(); ++i) {
169+
threads.get(i).start();
171170
}
172171
for (int i = 0; i < threads.size(); ++i) {
173172
int classifierNum = i % classifiers.size();
@@ -183,17 +182,16 @@ void runThreadedTest(Properties props) {
183182
System.out.println("Yay!");
184183
} else {
185184
throw new RuntimeException("Results not equal when running " +
186-
modelNames.get(classifierNum) +
187-
" under " + numThreads +
185+
modelNames.get(classifierNum) +
186+
" under " + numThreads +
188187
" threads with " +
189-
classifiers.size() +
188+
classifiers.size() +
190189
" total classifiers");
191190
}
192-
}
191+
}
193192
}
194193

195194
// if no exceptions thrown, great success
196195
System.out.println("Everything worked!");
197196
}
198-
199197
}

itest/src/edu/stanford/nlp/ie/crf/ThreadedCRFClassifierITest.java

Lines changed: 11 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -4,33 +4,32 @@
44

55
import java.util.Properties;
66

7-
/**
7+
/**
88
* Test that the CRFClassifier works when multiple classifiers are run
99
* in multiple threads.
1010
*
1111
* @author John Bauer
1212
*/
1313
public class ThreadedCRFClassifierITest extends TestCase {
14-
1514
Properties props;
1615

17-
private static final String german1 =
18-
"edu/stanford/nlp/models/ner/german.conll.hgc_175m_600.crf.ser.gz";
16+
private String german1 =
17+
"/u/nlp/data/ner/goodClassifiers/german.hgc_175m_600.crf.ser.gz";
1918
/** -- We're no longer supporting this one
20-
private String german2 =
19+
private String german2 =
2120
"/u/nlp/data/ner/goodClassifiers/german.dewac_175m_600.crf.ser.gz";
2221
*/
23-
private static final String germanTestFile = "/u/nlp/data/german/ner/2016/deu.utf8.testa";
22+
private String germanTestFile = "/u/nlp/data/german/ner/2016/deu.testa";
2423

25-
private static final String english1 =
24+
private String english1 =
2625
"/u/nlp/data/ner/goodClassifiers/english.all.3class.nodistsim.crf.ser.gz";
27-
private static final String english2 =
26+
private String english2 =
2827
"/u/nlp/data/ner/goodClassifiers/english.conll.4class.distsim.crf.ser.gz";
29-
private static final String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
30-
31-
private static final String germanEncoding = "utf-8";
32-
private static final String englishEncoding = "utf-8";
28+
private String englishTestFile = "/u/nlp/data/ner/column_data/conll.4class.testa";
3329

30+
private String germanEncoding = "iso-8859-1";
31+
private String englishEncoding = "utf-8";
32+
3433
@Override
3534
public void setUp() {
3635
props = new Properties();
@@ -57,6 +56,5 @@ public void testTwoEnglishCRFs() {
5756
props.setProperty("inputEncoding", englishEncoding);
5857
TestThreadedCRFClassifier.runTest(props);
5958
}
60-
6159
}
6260

scripts/ner/spanish.ancora.distsim.s512.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3-
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
2+
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3+
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
44
serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
55

66
distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls

scripts/ner/spanish.ancora.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3-
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
2+
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3+
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
44
serializeTo = spanish.ancora.crf.ser.gz
55

66
useDistSim = false

scripts/ner/spanish.ancora2.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3-
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
2+
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3+
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
44
serializeTo = spanish.ancora2.crf.ser.gz
55

66
useDistSim = false

scripts/pos-tagger/Makefile

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
1010

1111
GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
1212

13-
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
13+
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
1414

1515
.SECONDEXPANSION:
1616

1717
all: arabic chinese english french german spanish testing wsj
1818
.PHONY: all arabic chinese english french german spanish testing wsj
1919

20-
arabic: arabic.tagger arabic-train.tagger
20+
arabic: arabic.tagger arabic-train.tagger
2121

2222
# we release an arabic model trained on everything, with a
2323
# corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
2727
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
2828
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
2929

30-
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
30+
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
3131

3232
chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
3333
@echo Training $@
34-
@echo Will test on $(CHINESE_TEST)
34+
@echo Will test on $(CHINESE_TEST)
3535
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
3636
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST) -verboseResults false >> $@.out 2>&1
3737

3838
english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
3939

4040
english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
4141
@echo Training $@
42-
@echo Will test on $(ENGLISH_TEST)
42+
@echo Will test on $(ENGLISH_TEST)
4343
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
4444
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST) -verboseResults false >> $@.out 2>&1
4545

4646
french: french.tagger
4747

4848
french.tagger: $$@.props
4949
@echo Training $@
50-
@echo Will test on $(FRENCH_TEST)
50+
@echo Will test on $(FRENCH_TEST)
5151
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
5252
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST) -verboseResults false >> $@.out 2>&1
5353

5454
german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
5555

5656
german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
5757
@echo Training $@
58-
@echo Will test on $(GERMAN_TEST)
58+
@echo Will test on $(GERMAN_TEST)
5959
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
6060
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST) -verboseResults false >> $@.out 2>&1
6161

@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
6464
spanish.tagger spanish-distsim.tagger: $$@.props
6565
@echo Training $@
6666
@echo Will test on $(SPANISH_TEST)
67-
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68-
# java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
67+
#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
6969

7070
testing: testing.tagger
7171

7272
testing.tagger:
7373
@echo Training $@
7474
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
7575

76-
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
76+
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
7777

7878
wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
7979
@echo Training $@

scripts/pos-tagger/spanish.tagger.props

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
1-
## tagger training invoked at Sat Oct 08 12:21:50 PDT 2016 with arguments:
1+
## tagger training invoked at Wed Jul 30 08:33:18 PDT 2014 with arguments:
22
model = spanish.tagger
33
arch = left3words,naacl2003unknowns,allwordshapes(-1,1)
44
wordFunction =
5-
trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-DF.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-NW.train
5+
trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train
66
closedClassTags =
77
closedClassTagThreshold = 40
88
curWordMinFeatureThresh = 2

scripts/srparser/Makefile

Lines changed: 5 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,9 @@ ENGLISH_TAGGER = /u/nlp/data/pos-tagger/distrib/english-left3words-distsim.tagge
1414
ENGLISH_TLPP = $(WSJ_TLPP)
1515

1616

17-
FRENCH_TRAIN = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt
18-
FRENCH_DEV = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt
19-
FRENCH_TEST = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
17+
FRENCH_TRAIN = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt
18+
FRENCH_DEV = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt
19+
FRENCH_TEST = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
2020
FRENCH_TAGGER = /u/nlp/data/pos-tagger/distrib-2014-06-09/french.tagger
2121
FRENCH_TLPP = edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams
2222

@@ -41,16 +41,10 @@ ARABIC_TEST = /u/nlp/data/lexparser/trees/Arabic/2-Unvoc-Test.utf8.txt
4141
ARABIC_TAGGER = /u/nlp/data/pos-tagger/distrib/arabic-train.tagger
4242
ARABIC_TLPP = edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams
4343

44+
4445
SPANISH_TRAIN = /u/nlp/data/spanish/ancora/ancora.train
45-
SPANISH_TRAIN2 = /u/nlp/data/spanish/ldc/ldc-NW.train
46-
SPANISH_TRAIN3 = /u/nlp/data/spanish/ldc/ldc-DF.train
4746
SPANISH_DEV = /u/nlp/data/spanish/ancora/ancora.dev
48-
SPANISH_DEV2 = /u/nlp/data/spanish/ldc/ldc-NW.dev
49-
SPANISH_DEV3 = /u/nlp/data/spanish/ldc/ldc-DF.dev
50-
SPANISH_DEV_TMP = /u/nlp/data/spanish/all.dev.tmp
5147
SPANISH_TEST = /u/nlp/data/spanish/ancora/ancora.test
52-
SPANISH_TEST2 = /u/nlp/data/spanish/ldc/ldc-NW.train
53-
SPANISH_TEST3 = /u/nlp/data/spanish/ldc/ldc-DF.train
5448
SPANISH_TAGGER= /u/nlp/data/pos-tagger/distrib/spanish-distsim.tagger
5549
SPANISH_TLPP = edu.stanford.nlp.parser.lexparser.SpanishTreebankParserParams
5650

@@ -118,14 +112,9 @@ arabicSR.ser.gz:
118112

119113
spanishSR.ser.gz:
120114
@echo Training $@
121-
@echo Creating unified Spanish development data file $(SPANISH_DEV_TMP)
122-
cat $(SPANISH_DEV) $(SPANISH_DEV2) $(SPANISH_DEV3) > $(SPANISH_DEV_TMP)
123115
@echo Will test on $(SPANISH_TEST)
124-
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -trainTreebank $(SPANISH_TRAIN2) -trainTreebank $(SPANISH_TRAIN3) -devTreebank $(SPANISH_DEV_TMP) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
125-
rm $(SPANISH_DEV_TMP)
116+
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -devTreebank $(SPANISH_DEV) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
126117
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
127-
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST2) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
128-
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST3) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
129118

130119
spanishSR.beam.ser.gz:
131120
@echo Training $@

src/edu/stanford/nlp/pipeline/StanfordCoreNLP.properties

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@ annotators = tokenize, ssplit, pos, lemma, ner, parse, mention, coref
44
# annotators = tokenize, ssplit, pos, lemma, truecase
55
# annotators = tokenize, ssplit, regexner
66
# These include:
7-
# - truecase: A true-casing annotator (for fixing lowercase or all caps text)k
7+
# - truecase: A true-casing annotator (for fixing lowercase or all caps text)
88
# - regexner: Simple rule or regular-expression based NER (via TokensRegex)
99
# - cleanxml: Removes XML from documents prior to processing
1010
# - entitymentions:

0 commit comments

Comments
 (0)