Add Spanish data to SR parser, POS tagger, NER model properties files

hans · Stanford NLP · commit ee74a0a62f47 · 2016-10-09T12:28:37.000-07:00
diff --git a/scripts/ner/spanish.ancora.distsim.s512.prop b/scripts/ner/spanish.ancora.distsim.s512.prop
@@ -1,6 +1,6 @@
 
-trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
-testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
+trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
+testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
 serializeTo = spanish.ancora.distsim.s512.crf.ser.gz
 
 distSimLexicon = /u/nlp/data/spanish/distsim/spanish.spence512.cls
diff --git a/scripts/ner/spanish.ancora.prop b/scripts/ner/spanish.ancora.prop
@@ -1,6 +1,6 @@
 
-trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
-testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
+trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
+testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
 serializeTo = spanish.ancora.crf.ser.gz
 
 useDistSim = false
diff --git a/scripts/ner/spanish.ancora2.prop b/scripts/ner/spanish.ancora2.prop
@@ -1,6 +1,6 @@
 
-trainFile = /u/nlp/data/spanish/ner/ancora.ner.train.tsv
-testFile = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv
+trainFileList = /u/nlp/data/spanish/ner/ancora.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.train.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.train.tsv
+testFiles = /u/nlp/data/spanish/ner/ancora.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-DF.ner.dev.tsv,/u/nlp/data/spanish/ner/ldc-NW.ner.dev.tsv
 serializeTo = spanish.ancora2.crf.ser.gz
 
 useDistSim = false
diff --git a/scripts/pos-tagger/Makefile b/scripts/pos-tagger/Makefile
@@ -10,14 +10,14 @@ FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/French/FTB-Test.utf8.txt
 
 GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
 
-SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test
+SPANISH_TEST = "format=TREES,/u/nlp/data/spanish/ancora/ancora.test;format=TREES,/u/nlp/data/spanish/ldc/ldc-NW.test;format=TREES,/u/nlp/data/spanish/ldc/ldc-DF.test"
 
 .SECONDEXPANSION:
 
 all: arabic chinese english french german spanish testing wsj
 .PHONY: all arabic chinese english french german spanish testing wsj
 
-arabic: arabic.tagger  arabic-train.tagger 
+arabic: arabic.tagger  arabic-train.tagger
 
 # we release an arabic model trained on everything, with a
 # corresponding model on train only for testing purposes
@@ -27,35 +27,35 @@ arabic.tagger arabic-train.tagger: $$@.props
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ARABIC_TEST) -verboseResults false >> $@.out 2>&1
 
-chinese: chinese-distsim.tagger chinese-nodistsim.tagger 
+chinese: chinese-distsim.tagger chinese-nodistsim.tagger
 
 chinese-nodistsim.tagger chinese-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(CHINESE_TEST) 
+	@echo Will test on $(CHINESE_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(CHINESE_TEST)  -verboseResults false >> $@.out 2>&1
 
 english: english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger
 
 english-bidirectional-distsim.tagger english-caseless-left3words-distsim.tagger english-left3words-distsim.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(ENGLISH_TEST) 
+	@echo Will test on $(ENGLISH_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ENGLISH_TEST)  -verboseResults false >> $@.out 2>&1
 
 french: french.tagger
 
 french.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(FRENCH_TEST) 
+	@echo Will test on $(FRENCH_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST)  -verboseResults false >> $@.out 2>&1
 
 german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
 
 german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
 	@echo Training $@
-	@echo Will test on $(GERMAN_TEST) 
+	@echo Will test on $(GERMAN_TEST)
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(GERMAN_TEST)  -verboseResults false >> $@.out 2>&1
 
@@ -64,16 +64,16 @@ spanish: spanish.tagger spanish-distsim.tagger
 spanish.tagger spanish-distsim.tagger: $$@.props
 	@echo Training $@
 	@echo Will test on $(SPANISH_TEST)
-#java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
-	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
+	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
+#	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(SPANISH_TEST) -verboseResults false >> $@.out 2>&1
 
 testing: testing.tagger
 
 testing.tagger:
 	@echo Training $@
 	java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
 
-wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger 
+wsj: wsj-0-18-bidirectional-distsim.tagger  wsj-0-18-bidirectional-nodistsim.tagger  wsj-0-18-caseless-left3words-distsim.tagger  wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger
 
 wsj-0-18-bidirectional-distsim.tagger wsj-0-18-bidirectional-nodistsim.tagger wsj-0-18-caseless-left3words-distsim.tagger wsj-0-18-left3words-distsim.tagger wsj-0-18-left3words-nodistsim.tagger: $$@.props
 	@echo Training $@
diff --git a/scripts/pos-tagger/spanish.tagger.props b/scripts/pos-tagger/spanish.tagger.props
@@ -1,8 +1,8 @@
-## tagger training invoked at Wed Jul 30 08:33:18 PDT 2014 with arguments:
+## tagger training invoked at Sat Oct 08 12:21:50 PDT 2016 with arguments:
                    model = spanish.tagger
                     arch = left3words,naacl2003unknowns,allwordshapes(-1,1)
             wordFunction = 
-               trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train
+               trainFile = format=TREES,/u/nlp/data/spanish/ancora/ancora.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-DF.train;format=TREES,/u/nlp/data/spanish/ldc/ldc-NW.train
          closedClassTags = 
  closedClassTagThreshold = 40
  curWordMinFeatureThresh = 2
diff --git a/scripts/srparser/Makefile b/scripts/srparser/Makefile
@@ -14,9 +14,9 @@ ENGLISH_TAGGER = /u/nlp/data/pos-tagger/distrib/english-left3words-distsim.tagge
 ENGLISH_TLPP   = $(WSJ_TLPP)
 
 
-FRENCH_TRAIN  = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt 
-FRENCH_DEV    = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt 
-FRENCH_TEST   = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt 
+FRENCH_TRAIN  = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Train.utf8.txt
+FRENCH_DEV    = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Dev.utf8.txt
+FRENCH_TEST   = /u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
 FRENCH_TAGGER = /u/nlp/data/pos-tagger/distrib-2014-06-09/french.tagger
 FRENCH_TLPP   = edu.stanford.nlp.parser.lexparser.FrenchTreebankParserParams
 
@@ -41,10 +41,16 @@ ARABIC_TEST   = /u/nlp/data/lexparser/trees/Arabic/2-Unvoc-Test.utf8.txt
 ARABIC_TAGGER = /u/nlp/data/pos-tagger/distrib/arabic-train.tagger
 ARABIC_TLPP   = edu.stanford.nlp.parser.lexparser.ArabicTreebankParserParams
 
-
 SPANISH_TRAIN = /u/nlp/data/spanish/ancora/ancora.train
+SPANISH_TRAIN2 = /u/nlp/data/spanish/ldc/ldc-NW.train
+SPANISH_TRAIN3 = /u/nlp/data/spanish/ldc/ldc-DF.train
 SPANISH_DEV   = /u/nlp/data/spanish/ancora/ancora.dev
+SPANISH_DEV2 = /u/nlp/data/spanish/ldc/ldc-NW.dev
+SPANISH_DEV3 = /u/nlp/data/spanish/ldc/ldc-DF.dev
+SPANISH_DEV_TMP = $@.dev.tmp
 SPANISH_TEST  = /u/nlp/data/spanish/ancora/ancora.test
+SPANISH_TEST2 = /u/nlp/data/spanish/ldc/ldc-NW.test
+SPANISH_TEST3 = /u/nlp/data/spanish/ldc/ldc-DF.test
 SPANISH_TAGGER= /u/nlp/data/pos-tagger/distrib/spanish-distsim.tagger
 SPANISH_TLPP  = edu.stanford.nlp.parser.lexparser.SpanishTreebankParserParams
 
@@ -112,9 +118,14 @@ arabicSR.ser.gz:
 
 spanishSR.ser.gz:
 	@echo Training $@
+	@echo Creating unified Spanish development data file $(SPANISH_DEV_TMP)
+	cat $(SPANISH_DEV) $(SPANISH_DEV2) $(SPANISH_DEV3) > $(SPANISH_DEV_TMP)
 	@echo Will test on $(SPANISH_TEST)
-	java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -devTreebank $(SPANISH_DEV) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
+	java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(SPANISH_TRAIN) -trainTreebank $(SPANISH_TRAIN2) -trainTreebank $(SPANISH_TRAIN3) -devTreebank $(SPANISH_DEV_TMP) -serializedPath $@ -trainingThreads 4 -batchSize 12 -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 -tlpp $(SPANISH_TLPP) > $@.out 2>&1
+	rm $(SPANISH_DEV_TMP)
 	java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
+	java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST2) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
+	java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(SPANISH_TEST3) -serializedPath $@ -preTag -taggerSerializedFile $(SPANISH_TAGGER) -trainingIterations 100 -stalledIterationLimit 25 >> $@.out 2>&1
 
 spanishSR.beam.ser.gz:
 	@echo Training $@