Skip to content

Commit 2740c30

Browse files
committed
Add support for Italian tokenizer, NER, depparse, and tagger
1 parent 9c88ac2 commit 2740c30

File tree

6 files changed

+152
-9
lines changed

6 files changed

+152
-9
lines changed

scripts/cdc-tokenize/Makefile

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,23 @@
11
PYTHON = python3
2-
EVAL_SCRIPT = /home/john/stanza/stanza/utils/conll18_ud_eval.py
2+
EVAL_SCRIPT = /u/horatio/stanza/stanza/utils/conll18_ud_eval.py
33

4-
HU_TRAINING = /home/john/extern_data/ud2/ud-treebanks-v2.8/UD_Hungarian-Szeged/hu_szeged-ud-train.conllu
4+
HU_TRAINING = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-train.conllu
55

6-
HU_TEST_INPUT = /home/john/extern_data/ud2/ud-treebanks-v2.8/UD_Hungarian-Szeged/hu_szeged-ud-test.txt
7-
HU_TEST_GOLD = /home/john/extern_data/ud2/ud-treebanks-v2.8/UD_Hungarian-Szeged/hu_szeged-ud-test.conllu
6+
HU_TEST_INPUT = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-test.txt
7+
HU_TEST_GOLD = /u/nlp/software/CoreNLP-models/hu/stattok/4.3.0/hu_szeged-ud-test.conllu
8+
9+
10+
# ignoring twittiro and postwita because this model gets thrown off
11+
# quite a lot by the non-standard sentence endings
12+
IT_TRAINING = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_vit-ud-train.conllu,/u/nlp/software/CoreNLP-models/it/stattok/4.3.0/italian.mwt
13+
14+
IT_TEST_INPUT = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.txt
15+
IT_TEST_GOLD = /u/nlp/software/CoreNLP-models/it/stattok/4.3.0/it_isdt-ud-test.conllu
816

917
.SECONDEXPANSION:
1018

11-
all: hungarian
12-
.PHONY: all hungarian
19+
all: hungarian italian
20+
.PHONY: all hungarian italian
1321

1422
hungarian: hu-tokenizer.ser.gz
1523

@@ -18,3 +26,11 @@ hu-tokenizer.ser.gz:
1826
java edu.stanford.nlp.process.stattok.StatTokSentTrainer -trainFile $(HU_TRAINING) -serializeTo $@
1927
java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -file $(HU_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
2028
$(PYTHON) $(EVAL_SCRIPT) -v $(HU_TEST_GOLD) /tmp/$@.out/$(notdir $(HU_TEST_INPUT)).conllu
29+
30+
italian: it-tokenizer.ser.gz
31+
32+
it-tokenizer.ser.gz:
33+
@echo Training $@
34+
java edu.stanford.nlp.process.stattok.StatTokSentTrainer -inferMultiWordRules 1 -trainFile $(IT_TRAINING) -serializeTo $@
35+
java edu.stanford.nlp.pipeline.StanfordCoreNLP -annotators cdc_tokenize -cdc_tokenize.model $@ -file $(IT_TEST_INPUT) -outputFormat conllu -output.printFakeDeps True -outputDirectory /tmp/$@.out
36+
$(PYTHON) $(EVAL_SCRIPT) -v $(IT_TEST_GOLD) /tmp/$@.out/$(notdir $(IT_TEST_INPUT)).conllu

scripts/ner/Makefile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11

2-
all: chinese genia german all.3class nowiki.3class conll.4class muc.7class spanish
2+
all: chinese genia german hungarian italian all.3class nowiki.3class conll.4class muc.7class spanish
33

44
chinese: chinese.misc.nodistsim.ser.gz chinese.misc.distsim.ser.gz
55

@@ -27,6 +27,12 @@ hungarian.crf.ser.gz:
2727
java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop hungarian.prop -serializeTo $@ > hungarian.out 2>&1
2828

2929

30+
italian: italian.crf.ser.gz
31+
32+
italian.crf.ser.gz:
33+
java -mx20g edu.stanford.nlp.ie.crf.CRFClassifier -prop italian.prop -serializeTo $@ > italian.out 2>&1
34+
35+
3036
# currently we exclude enp_DE.sbb.io, as the data has too many issues, but we could work to include it....
3137
# ,/u/nlp/data/german/ner/2016/Europeana-Newspapers-data/ner-corpora/enp_DE.sbb.bio/enp_DE.sbb.io
3238

scripts/ner/italian.prop

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
trainFileList = /u/nlp/data/ner/italian/2021-09-15/it_fbk.train.io,/u/nlp/data/ner/italian/2021-09-15/it_fbk.dev.io
2+
testFile = /u/nlp/data/ner/italian/2021-09-15/it_fbk.test.io
3+
serializeTo = italian.crf.ser.gz
4+
5+
type=crf
6+
7+
# distSimLexicon = /u/nlp/data/german/ner/hgc_175m_600
8+
# distSimLexicon = /u/nlp/data/german/ner/2016/hgc-175M-600
9+
# right options for new hgc_175m_600
10+
useDistSim = false
11+
12+
# Now using stripped 2 column files so can add extra datasets!
13+
map = word=0,answer=1
14+
15+
encoding = utf-8
16+
# saveFeatureIndexToDisk = true # now buggy but unnecessary
17+
mergeTags = false
18+
useTitle = false
19+
20+
cleanGazette = True
21+
useGazettes = True
22+
gazette = it-LOC-wikipedia.txt,it-PER-wikipedia.txt,it-ORG-wikipedia.txt
23+
24+
useClassFeature=true
25+
useWord=true
26+
useNGrams=true
27+
noMidNGrams=true
28+
# Having no maxNGramLeng seemed to work marginally better, but omitted for efficiency
29+
maxNGramLeng=6
30+
usePrev=true
31+
useNext=true
32+
useLongSequences=true
33+
useSequences=true
34+
usePrevSequences=true
35+
useTypeSeqs=true
36+
useTypeSeqs2=true
37+
useTypeySequences=true
38+
# Including useOccurrencePatterns increased scores really marginally (could even disappear now we have weaker regularization)
39+
useOccurrencePatterns=true
40+
useLastRealWord=true
41+
useNextRealWord=true
42+
normalize=true
43+
# using chris4 instead hurts in most recent experiment. Earlier, an experiment had seemed to show the opposite.
44+
wordShape=chris2useLC
45+
useDisjunctive=true
46+
# Width 5 works a little better than 4
47+
disjunctionWidth=5
48+
49+
maxLeft=1
50+
readerAndWriter=edu.stanford.nlp.sequences.ColumnDocumentReaderAndWriter
51+
useObservedSequencesOnly=true
52+
useQN = true
53+
QNsize = 15
54+
# sigma 20 works better than sigma 5, which is MUCH better than sigma 1; that was the limit of hyperparameter optimization
55+
# On the basic CoNLL dataset (no distsim, no extra data), sigma=50 is a bit better still (by 0.13 F1)
56+
sigma = 20
57+
58+
# For making faster (less features); changing this to 0.025 doesn't improve performance
59+
featureDiffThresh=0.05
60+
61+
# evaluateIOB=true
62+
63+
# other notes
64+
# even though useTaggySequences will use distsim rather than POS sequences, turning it on didn't help
65+
# adding useWordPairs doesn't seem to help. (Getting them anyway in an edge feature.)

scripts/nndep/Makefile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
ENGLISH_EMBEDDINGS=/u/nlp/data/depparser/nn/data/embeddings/en-cw.txt
22
CHINESE_EMBEDDINGS=/u/nlp/data/depparser/nn/data/embeddings/zh-word2vec.txt
3+
ITALIAN_EMBEDDINGS=/u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it.embedding.txt
34

45
DATA_DIR=/u/nlp/data/dependency_treebanks
56

@@ -28,6 +29,9 @@ UD_FRENCH_TRAIN=${DATA_DIR}/UD/1.1/fr/fr-ud-train-clean.conllu
2829
UD_FRENCH_DEV=${DATA_DIR}/UD/1.1/fr/fr-ud-dev-clean.conllu
2930
UD_FRENCH_TEST=${DATA_DIR}/UD/1.1/fr/fr-ud-test-clean.conllu
3031

32+
UD_ITALIAN_TRAIN = /u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it_isdt-ud-train.conllu
33+
UD_ITALIAN_DEV = /u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it_isdt-ud-dev.conllu
34+
UD_ITALIAN_TEST = /u/nlp/software/CoreNLP-models/it/depparse/4.3.0/it_isdt-ud-test.conllu
3135

3236
PTB_Stanford:
3337
java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(PTB_STANFORD_TRAIN) -devFile $(PTB_STANFORD_DEV) -embedFile $(ENGLISH_EMBEDDINGS) -model $@.txt.gz > $@.log 2>&1
@@ -62,3 +66,9 @@ UD_FRENCH:
6266
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_FRENCH_DEV) -language French -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
6367
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_FRENCH_TEST) -language French -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
6468

69+
# the -language flag is only used for punctuation handling, which is why French is passed for these Italian models
70+
UD_ITALIAN:
71+
java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(UD_ITALIAN_TRAIN) -language French -devFile $(UD_ITALIAN_DEV) -embedFile $(ITALIAN_EMBEDDINGS) -embeddingSize 100 -model $@.txt.gz >> $@.log 2>&1
72+
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_DEV) -language French -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
73+
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_TEST) -language French -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
74+

scripts/pos-tagger/Makefile

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,14 @@ GERMAN_TEST = format=TSV,wordColumn=1,tagColumn=3,comments=True,/u/nlp/software/
1313

1414
HUNGARIAN_TEST = format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/hu/pos/4.3.0/hu_szeged-ud-test.conllu
1515

16+
ITALIAN_TEST = format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_isdt-ud-test.conllu
17+
1618
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
1719

1820
.SECONDEXPANSION:
1921

20-
all: arabic chinese english french german hungarian spanish testing wsj
21-
.PHONY: all arabic chinese english french german hungarian spanish testing wsj
22+
all: arabic chinese english french german hungarian italian spanish testing wsj
23+
.PHONY: all arabic chinese english french german hungarian italian spanish testing wsj
2224

2325
arabic: arabic.tagger arabic-train.tagger
2426

@@ -71,6 +73,14 @@ hungarian.tagger: $$@.props
7173
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
7274
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(HUNGARIAN_TEST) -verboseResults false >> $@.out 2>&1
7375

76+
italian: italian.tagger
77+
78+
italian.tagger: $$@.props
79+
@echo Training $@
80+
@echo Will test on $(ITALIAN_TEST)
81+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
82+
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(ITALIAN_TEST) -verboseResults false >> $@.out 2>&1
83+
7484

7585
spanish: spanish.tagger spanish-distsim.tagger
7686

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
## tagger training invoked at Tue Sep 14 13:49:09 PDT 2021 with arguments:
2+
model = italian.tagger
3+
arch = left3words,naacl2003unknowns,suffix(10),prefix(4),words(-2,2),unicodeshapes(-2,2),unicodeshapeconjunction(-1,1)
4+
wordFunction =
5+
trainFile = format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_isdt-ud-train.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_vit-ud-train.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_isdt-ud-dev.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/it_vit-ud-dev.conllu;format=TSV,comments=true,wordColumn=1,tagColumn=3,/u/nlp/software/CoreNLP-models/it/pos/4.3.0/italian.mwt
6+
closedClassTags =
7+
closedClassTagThreshold = 40
8+
curWordMinFeatureThresh = 2
9+
debug = false
10+
debugPrefix =
11+
tagSeparator = _
12+
encoding = utf-8
13+
iterations = 100
14+
lang =
15+
learnClosedClassTags = false
16+
minFeatureThresh = 2
17+
openClassTags =
18+
rareWordMinFeatureThresh = 10
19+
rareWordThresh = 5
20+
search = owlqn2
21+
sgml = false
22+
sigmaSquared = 0.0
23+
regL1 = 0.75
24+
tagInside =
25+
tokenize = true
26+
tokenizerFactory =
27+
tokenizerOptions =
28+
verbose = false
29+
verboseResults = true
30+
veryCommonWordThresh = 250
31+
xmlInput = null
32+
outputFile =
33+
outputFormat = slashTags
34+
outputFormatOptions =
35+
nthreads = 1
36+
minWordsLockTags = 1

0 commit comments

Comments
 (0)