Skip to content

Commit 39635aa

Browse files
committed
Update tagger Makefile with the latest German UD model
1 parent 41075fb commit 39635aa

File tree

2 files changed

+40
-3
lines changed

2 files changed

+40
-3
lines changed

scripts/pos-tagger/Makefile

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ ENGLISH_TEST = /u/nlp/data/pos-tagger/english/test-wsj-19-21
99

1010
FRENCH_TEST = format=TREES,/u/nlp/data/lexparser/trees/FrenchCC/FTB-Test.utf8.txt
1111

12-
GERMAN_TEST = format=TREES,trf=edu.stanford.nlp.trees.international.negra.NegraPennTreeReaderFactory,/u/nlp/data/GermanACL08/negra/negra_3.mrg
12+
GERMAN_TEST = format=TSV,wordColumn=1,tagColumn=3,comments=True,/u/nlp/software/CoreNLP-models/de/pos/german-ud/4.3.0/de_gsd-ud-test.conllu
1313

1414
SPANISH_TEST = format=TREES,/u/nlp/data/spanish/ancora/ancora.test,/u/nlp/data/spanish/ldc/ldc-NW.test,/u/nlp/data/spanish/ldc/ldc-DF.test
1515

@@ -52,9 +52,10 @@ french.tagger: $$@.props
5252
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
5353
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -model $@ -testFile $(FRENCH_TEST) -verboseResults false >> $@.out 2>&1
5454

55-
german: german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger
55+
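# TODO: consider restoring a fast German tagger target (german-fast.tagger / german-fast-caseless.tagger were dropped here); some users may still want one.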
56+
german: german-ud.tagger
5657

57-
german-dewac.tagger german-fast.tagger german-fast-caseless.tagger german-hgc.tagger: $$@.props
58+
german-ud.tagger: $$@.props
5859
@echo Training $@
5960
@echo Will test on $(GERMAN_TEST)
6061
java -mx6g edu.stanford.nlp.tagger.maxent.MaxentTagger -props $@.props > $@.out 2>&1
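german-ud.tagger.props (file name inferred from the `$$@.props` prerequisite of the german-ud.tagger rule; its path header is missing from this capture)

Lines changed: 36 additions & 0 deletions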
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
## tagger training invoked at Mon Sep 13 00:20:27 PDT 2021 with arguments:
2+
model = german-ud.tagger
3+
arch = left3words,naacl2003unknowns,unicodeshapes(-2,2),distsim(/u/nlp/data/german/ner/hgc_175m_600,-1,1),distsimconjunction(/u/nlp/data/german/ner/hgc_175m_600,-1,1),unicodeshapeconjunction(-1,1)
4+
wordFunction =
5+
trainFile = format=TSV,wordColumn=1,tagColumn=3,comments=True,/u/nlp/software/CoreNLP-models/de/pos/german-ud/4.3.0/de_gsd-ud-train.conllu;format=TSV,wordColumn=1,tagColumn=3,comments=True,/u/nlp/software/CoreNLP-models/de/pos/german-ud/4.3.0/de_gsd-ud-dev.conllu
6+
closedClassTags =
7+
closedClassTagThreshold = 40
8+
curWordMinFeatureThresh = 2
9+
debug = false
10+
debugPrefix =
11+
tagSeparator = _
12+
encoding = utf-8
13+
iterations = 100
14+
lang = german
15+
learnClosedClassTags = false
16+
minFeatureThresh = 2
17+
openClassTags =
18+
rareWordMinFeatureThresh = 10
19+
rareWordThresh = 5
20+
search = owlqn2
21+
sgml = false
22+
sigmaSquared = 0.0
23+
regL1 = 0.625
24+
tagInside =
25+
tokenize = true
26+
tokenizerFactory =
27+
tokenizerOptions = asciiQuotes
28+
verbose = false
29+
verboseResults = true
30+
veryCommonWordThresh = 250
31+
xmlInput = null
32+
outputFile =
33+
outputFormat = slashTags
34+
outputFormatOptions =
35+
nthreads = 1
36+
minWordsLockTags = 1

0 commit comments

Comments
 (0)