Skip to content

Commit ee74a0a

Browse files
hans authored and Stanford NLP committed
Add Spanish data to SR parser, POS tagger, NER model properties files
1 parent 470380e commit ee74a0a

File tree

6 files changed: 34 additions and 23 deletions

scripts/ner/spanish.ancora.distsim.s512.prop

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11

2-
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3-
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
2+
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3+
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
44
serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
55

66
distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls

scripts/ner/spanish.ancora.prop

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

2-
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3-
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
2+
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3+
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
44
serializeTo = spanish.ancora.crf.ser.gz
55

66
useDistSim = false

scripts/ner/spanish.ancora2.prop

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

2-
trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
3-
testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
2+
trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
3+
testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
44
serializeTo = spanish.ancora2.crf.ser.gz
55

66
useDistSim = false

scripts/pos-tagger/Makefile

Lines changed: 10 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
1010

1111
GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
1212

13-
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
13+
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
1414

1515
.SECONDEXPANSION:
1616

1717
all: arabic chinese english french german spanish testing wsj
1818
.PHONY: all arabic chinese english french german spanish testing wsj
1919

20-
arabic: arabic.tagger arabic-train.tagger
20+
arabic: arabic.tagger arabic-train.tagger
2121

2222
# we release an arabic model trained on everything, with a
2323
# corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
2727
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
2828
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
2929

30-
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
30+
chinese: chinese-distsim.tagger chinese-nodistsim.tagger
3131

3232
chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
3333
@echo Training $@
34-
@echo Will test on $(CHINESE_TEST)
34+
@echo Will test on $(CHINESE_TEST)
3535
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
3636
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST) -verboseResults false >> $@.out 2>&1
3737

3838
english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
3939

4040
english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
4141
@echo Training $@
42-
@echo Will test on $(ENGLISH_TEST)
42+
@echo Will test on $(ENGLISH_TEST)
4343
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
4444
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST) -verboseResults false >> $@.out 2>&1
4545

4646
french: french.tagger
4747

4848
french.tagger: $$@.props
4949
@echo Training $@
50-
@echo Will test on $(FRENCH_TEST)
50+
@echo Will test on $(FRENCH_TEST)
5151
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
5252
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST) -verboseResults false >> $@.out 2>&1
5353

5454
german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
5555

5656
german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
5757
@echo Training $@
58-
@echo Will test on $(GERMAN_TEST)
58+
@echo Will test on $(GERMAN_TEST)
5959
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
6060
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST) -verboseResults false >> $@.out 2>&1
6161

@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
6464
spanish.tagger spanish-distsim.tagger: $$@.props
6565
@echo Training $@
6666
@echo Will test on $(SPANISH_TEST)
67-
#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68-
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
67+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
68+
# java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
6969

7070
testing: testing.tagger
7171

7272
testing.tagger:
7373
@echo Training $@
7474
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
7575

76-
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
76+
wsj: wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
7777

7878
wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
7979
@echo Training $@

scripts/pos-tagger/spanish.tagger.props

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
1-
## tagger training invoked at Wed Jul 30 08:33:18 PDT 2014 with arguments:
1+
## tagger training invoked at Sat Oct 08 12:21:50 PDT 2016 with arguments:
22
model = spanish.tagger
33
arch = left3words,naacl2003unknowns,allwordshapes(-1,1)
44
wordFunction =
5-
trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train
5+
trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-DF.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-NW.train
66
closedClassTags =
77
closedClassTagThreshold = 40
88
curWordMinFeatureThresh = 2

scripts/srparser/Makefile

Lines changed: 16 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,9 @@ ENGLISH_TAGGER = /u/nlp/data/pos-tagger/distrib/english-left3words-distsim.tagge
1414
ENGLISH_TLPP = $(WSJ_TLPP)
1515

1616

17-
FRENCH_TRAIN = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt
18-
FRENCH_DEV = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt
19-
FRENCH_TEST = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
17+
FRENCH_TRAIN = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt
18+
FRENCH_DEV = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt
19+
FRENCH_TEST = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
2020
FRENCH_TAGGER = /u/nlp/data/pos-tagger/distrib-2014-06-09/french.tagger
2121
FRENCH_TLPP = edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams
2222

@@ -41,10 +41,16 @@ ARABIC_TEST = /u/nlp/data/lexparser/trees/Arabic/2-Unvoc-Test.utf8.txt
4141
ARABIC_TAGGER = /u/nlp/data/pos-tagger/distrib/arabic-train.tagger
4242
ARABIC_TLPP = edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams
4343

44-
4544
SPANISH_TRAIN = /u/nlp/data/spanish/ancora/ancora.train
45+
SPANISH_TRAIN2 = /u/nlp/data/spanish/ldc/ldc-NW.train
46+
SPANISH_TRAIN3 = /u/nlp/data/spanish/ldc/ldc-DF.train
4647
SPANISH_DEV = /u/nlp/data/spanish/ancora/ancora.dev
48+
SPANISH_DEV2 = /u/nlp/data/spanish/ldc/ldc-NW.dev
49+
SPANISH_DEV3 = /u/nlp/data/spanish/ldc/ldc-DF.dev
50+
SPANISH_DEV_TMP = /u/nlp/data/spanish/all.dev.tmp
4751
SPANISH_TEST = /u/nlp/data/spanish/ancora/ancora.test
52+
SPANISH_TEST2 = /u/nlp/data/spanish/ldc/ldc-NW.train
53+
SPANISH_TEST3 = /u/nlp/data/spanish/ldc/ldc-DF.train
4854
SPANISH_TAGGER= /u/nlp/data/pos-tagger/distrib/spanish-distsim.tagger
4955
SPANISH_TLPP = edu.stanford.nlp.parser.lexparser.SpanishTreebankParserParams
5056

@@ -112,9 +118,14 @@ arabicSR.ser.gz:
112118

113119
spanishSR.ser.gz:
114120
@echo Training $@
121+
@echo Creating unified Spanish development data file $(SPANISH_DEV_TMP)
122+
cat $(SPANISH_DEV) $(SPANISH_DEV2) $(SPANISH_DEV3) > $(SPANISH_DEV_TMP)
115123
@echo Will test on $(SPANISH_TEST)
116-
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -devTreebank $(SPANISH_DEV) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
124+
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -trainTreebank $(SPANISH_TRAIN2) -trainTreebank $(SPANISH_TRAIN3) -devTreebank $(SPANISH_DEV_TMP) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
125+
rm $(SPANISH_DEV_TMP)
117126
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
127+
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST2) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
128+
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST3) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
118129

119130
spanishSR.beam.ser.gz:
120131
@echo Training $@

0 commit comments

Comments
 (0)