Skip to content

Commit 2740c30

Browse files
committed
Add support for Italian tokenizer, NER, depparse, and tagger
1 parent 9c88ac2 commit 2740c30

File tree

6 files changed

+152
-9
lines changed

6 files changed

+152
-9
lines changed

scripts/cdc-tokenize/Makefile

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
PYTHON = python3
2-
EVAL_SCRIPT = /home/john/stanza/stanza/utils/conll18_ud_eval.py
2+
EVAL_SCRIPT = /u/horatio/stanza/stanza/utils/conll18_ud_eval.py
33

4-
HU_TRAINING = /home/john/extern_data/ud2/ud-treebanks-v2.8/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu
4+
HU_TRAINING = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-train.conllu
55

6-
HU_TEST_INPUT = /home/john/extern_data/ud2/ud-treebanks-v2.8/UD_Hungarian-Szeged/hu_szeged-ud-test.txt
7-
HU_TEST_GOLD = /home/john/extern_data/ud2/ud-treebanks-v2.8/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu
6+
HU_TEST_INPUT = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-test.txt
7+
HU_TEST_GOLD = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-test.conllu
8+
9+
10+
# ignoring twittiro and postwita because this model gets thrown off
11+
# quite a lot by the non-standard sentence endings
12+
IT_TRAINING = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_vit-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/italian.mwt
13+
14+
IT_TEST_INPUT = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.txt
15+
IT_TEST_GOLD = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.conllu
816

917
.SECONDEXPANSION:
1018

11-
all: hungarian
12-
.PHONY: all hungarian
19+
all: hungarian italian
20+
.PHONY: all hungarian italian
1321

1422
hungarian: hu-tokenizer.ser.gz
1523

@@ -18,3 +26,11 @@ hu-tokenizer.ser.gz:
1826
java edu.stanford.nlp.process.stattok.StatTokSentTrainer -trainFile $(HU_TRAINING) -serializeTo $@
1927
java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -file $(HU_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
2028
$(PYTHON) $(EVAL_SCRIPT) -v $(HU_TEST_GOLD) /tmp/$@.out/$(notdir $(HU_TEST_INPUT)).conllu
29+
30+
italian: it-tokenizer.ser.gz
31+
32+
it-tokenizer.ser.gz:
33+
@echo Training $@
34+
java edu.stanford.nlp.process.stattok.StatTokSentTrainer -inferMultiWordRules 1 -trainFile $(IT_TRAINING) -serializeTo $@
35+
java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -file $(IT_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
36+
$(PYTHON) $(EVAL_SCRIPT) -v $(IT_TEST_GOLD) /tmp/$@.out/$(notdir $(IT_TEST_INPUT)).conllu

scripts/ner/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
all: chinese genia german all.3class nowiki.3class conll.4class muc.7class spanish
2+
all: chinese genia german hungarian italian all.3class nowiki.3class conll.4class muc.7class spanish
33

44
chinese: chinese.misc.nodistsim.ser.gz chinese.misc.distsim.ser.gz
55

@@ -27,6 +27,12 @@ hungarian.crf.ser.gz:
2727
java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop hungarian.prop -serializeTo $@ > hungarian.out 2>&1
2828

2929

30+
italian: italian.crf.ser.gz
31+
32+
italian.crf.ser.gz:
33+
java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop italian.prop -serializeTo $@ > italian.out 2>&1
34+
35+
3036
# currently we exclude enp_DE.sbb.io, as the data has too many issues, but we could work to include it....
3137
# ,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.sbb.bio/enp_DE.sbb.io
3238

scripts/ner/italian.prop

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
trainFileList = /u/nlp/data/ner/italian/2021-09-15/it_fbk.train.io,/u/nlp/data/ner/italian/2021-09-15/it_fbk.dev.io
2+
testFile = /u/nlp/data/ner/italian/2021-09-15/it_fbk.test.io
3+
serializeTo = italian.crf.ser.gz
4+
5+
type=crf
6+
7+
# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
8+
# distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
9+
# right options for new hgc_175m_600
10+
useDistSim = false
11+
12+
# Now using stripped 2 column files so can add extra datasets!
13+
map = word=0,answer=1
14+
15+
encoding = utf-8
16+
# saveFeatureIndexToDisk = true # now buggy but unnecessary
17+
mergeTags = false
18+
useTitle = false
19+
20+
cleanGazette = True
21+
useGazettes = True
22+
gazette = it-LOC-wikipedia.txt,it-PER-wikipedia.txt,it-ORG-wikipedia.txt
23+
24+
useClassFeature=true
25+
useWord=true
26+
useNGrams=true
27+
noMidNGrams=true
28+
# Having no maxNGramLeng seemed to work marginally better, but omitted for efficiency
29+
maxNGramLeng=6
30+
usePrev=true
31+
useNext=true
32+
useLongSequences=true
33+
useSequences=true
34+
usePrevSequences=true
35+
useTypeSeqs=true
36+
useTypeSeqs2=true
37+
useTypeySequences=true
38+
# Including useOccurrencePatterns increased scores really marginally (could even disappear now we have weaker regularization)
39+
useOccurrencePatterns=true
40+
useLastRealWord=true
41+
useNextRealWord=true
42+
normalize=true
43+
# using chris4 instead hurts in most recent experiment. Earlier, an experiment had seemed to show the opposite.
44+
wordShape=chris2useLC
45+
useDisjunctive=true
46+
# Width 5 works a little better than 4
47+
disjunctionWidth=5
48+
49+
maxLeft=1
50+
readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
51+
useObservedSequencesOnly=true
52+
useQN = true
53+
QNsize = 15
54+
# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
55+
# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
56+
sigma = 20
57+
58+
# For making faster (less features); changing this to 0.025 doesn't improve performance
59+
featureDiffThresh=0.05
60+
61+
# evaluateIOB=true
62+
63+
# other notes
64+
# even though useTaggySequences will use distsim rather than POS sequences, turning it on didn't help
65+
# adding useWordPairs doesn't seem to help. (Getting them anyway in an edge feature.)

scripts/nndep/Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
ENGLISH_EMBEDDINGS=/u/nlp/data/depparser/nn/data/embeddings/en-cw.txt
22
CHINESE_EMBEDDINGS=/u/nlp/data/depparser/nn/data/embeddings/zh-word2vec.txt
3+
ITALIAN_EMBEDDINGS=/u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it.embedding.txt
34

45
DATA_DIR=/u/nlp/data/dependency_treebanks
56

@@ -28,6 +29,9 @@ UD_FRENCH_TRAIN=${DATA_DIR}/UD/1.1/fr/fr-ud-train-clean.conllu
2829
UD_FRENCH_DEV=${DATA_DIR}/UD/1.1/fr/fr-ud-dev-clean.conllu
2930
UD_FRENCH_TEST=${DATA_DIR}/UD/1.1/fr/fr-ud-test-clean.conllu
3031

32+
UD_ITALIAN_TRAIN = /u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it_isdt-ud-train.conllu
33+
UD_ITALIAN_DEV = /u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it_isdt-ud-dev.conllu
34+
UD_ITALIAN_TEST = /u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it_isdt-ud-test.conllu
3135

3236
PTB_Stanford:
3337
java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(PTB_STANFORD_TRAIN) -devFile $(PTB_STANFORD_DEV) -embedFile $(ENGLISH_EMBEDDINGS) -model $@.txt.gz > $@.log 2>&1
@@ -62,3 +66,9 @@ UD_FRENCH:
6266
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_FRENCH_DEV) -language French -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
6367
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_FRENCH_TEST) -language French -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
6468

69+
# the -language flag is only used for punctuation handling, which is why French is passed for these Italian models
70+
UD_ITALIAN:
71+
java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(UD_ITALIAN_TRAIN) -language French -devFile $(UD_ITALIAN_DEV) -embedFile $(ITALIAN_EMBEDDINGS) -embeddingSize 100 -model $@.txt.gz >> $@.log 2>&1
72+
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_DEV) -language French -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
73+
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_TEST) -language French -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
74+

scripts/pos-tagger/Makefile

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ GERMAN_TEST = format=TSV,wordColumn=1,tagColumn=3,comments=True,/u/nlp/software/
1313

1414
HUNGARIAN_TEST = format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/hu/pos/4.3.0/hu_szeged-ud-test.conllu
1515

16+
ITALIAN_TEST = format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_isdt-ud-test.conllu
17+
1618
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
1719

1820
.SECONDEXPANSION:
1921

20-
all: arabic chinese english french german hungarian spanish testing wsj
21-
.PHONY: all arabic chinese english french german hungarian spanish testing wsj
22+
all: arabic chinese english french german hungarian italian spanish testing wsj
23+
.PHONY: all arabic chinese english french german hungarian italian spanish testing wsj
2224

2325
arabic: arabic.tagger arabic-train.tagger
2426

@@ -71,6 +73,14 @@ hungarian.tagger: $$@.props
7173
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
7274
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(HUNGARIAN_TEST) -verboseResults false >> $@.out 2>&1
7375

76+
italian: italian.tagger
77+
78+
italian.tagger: $$@.props
79+
@echo Training $@
80+
@echo Will test on $(ITALIAN_TEST)
81+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
82+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ITALIAN_TEST) -verboseResults false >> $@.out 2>&1
83+
7484

7585
spanish: spanish.tagger spanish-distsim.tagger
7686

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
## tagger training invoked at Tue Sep 14 13:49:09 PDT 2021 with arguments:
2+
model = italian.tagger
3+
arch = left3words,naacl2003unknowns,suffix(10),prefix(4),words(-2,2),unicodeshapes(-2,2),unicodeshapeconjunction(-1,1)
4+
wordFunction =
5+
trainFile = format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_isdt-ud-train.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_vit-ud-train.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_isdt-ud-dev.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_vit-ud-dev.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/italian.mwt
6+
closedClassTags =
7+
closedClassTagThreshold = 40
8+
curWordMinFeatureThresh = 2
9+
debug = false
10+
debugPrefix =
11+
tagSeparator = _
12+
encoding = utf-8
13+
iterations = 100
14+
lang =
15+
learnClosedClassTags = false
16+
minFeatureThresh = 2
17+
openClassTags =
18+
rareWordMinFeatureThresh = 10
19+
rareWordThresh = 5
20+
search = owlqn2
21+
sgml = false
22+
sigmaSquared = 0.0
23+
regL1 = 0.75
24+
tagInside =
25+
tokenize = true
26+
tokenizerFactory =
27+
tokenizerOptions =
28+
verbose = false
29+
verboseResults = true
30+
veryCommonWordThresh = 250
31+
xmlInput = null
32+
outputFile =
33+
outputFormat = slashTags
34+
outputFormatOptions =
35+
nthreads = 1
36+
minWordsLockTags = 1

0 commit comments

Comments
 (0)