Skip to content

Commit 4d0305f

Browse files
committed
Add skeletal tlp and tlpp for Italian
1 parent c7b0abc commit 4d0305f

File tree

4 files changed

+199
-5
lines changed

4 files changed

+199
-5
lines changed

scripts/nndep/Makefile

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,8 @@ UD_FRENCH:
6666
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_FRENCH_DEV) -language French -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
6767
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_FRENCH_TEST) -language French -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
6868

69-
# the ONE THING the language setting gets used for is punctuation, so...
7069
UD_ITALIAN:
71-
java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(UD_ITALIAN_TRAIN) -language French -devFile $(UD_ITALIAN_DEV) -embedFile $(ITALIAN_EMBEDDINGS) -embeddingSize 100 -model $@.txt.gz >> $@.log 2>&1
72-
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_DEV) -language French -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
73-
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_TEST) -language French -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
70+
java edu.stanford.nlp.parser.nndep.DependencyParser -props nndep.properties -trainFile $(UD_ITALIAN_TRAIN) -language Italian -devFile $(UD_ITALIAN_DEV) -embedFile $(ITALIAN_EMBEDDINGS) -embeddingSize 100 -model $@.txt.gz >> $@.log 2>&1
71+
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_DEV) -language Italian -model $@.txt.gz -outFile $@.out.dev >> $@.log 2>&1
72+
java edu.stanford.nlp.parser.nndep.DependencyParser -testFile $(UD_ITALIAN_TEST) -language Italian -model $@.txt.gz -outFile $@.out.test >> $@.log 2>&1
7473

src/edu/stanford/nlp/international/Language.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ public enum Language {
4242
Hindi( treebankForLanguage("Hindi")),
4343
Hungarian( new HungarianTreebankParserParams()),
4444
Indonesian( treebankForLanguage("Indonesian")),
45-
Italian( treebankForLanguage("Italian")),
45+
Italian( new ItalianTreebankParserParams()),
4646
Irish( treebankForLanguage("Irish")),
4747
Kazakh( treebankForLanguage("Kazakh")),
4848
Korean( treebankForLanguage("Korean")),
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package edu.stanford.nlp.parser.lexparser;
2+
3+
import java.io.Serializable;
4+
import java.util.ArrayList;
5+
import java.util.List;
6+
7+
8+
import edu.stanford.nlp.ling.HasWord;
9+
import edu.stanford.nlp.ling.Word;
10+
import edu.stanford.nlp.trees.DiskTreebank;
11+
import edu.stanford.nlp.trees.HeadFinder;
12+
import edu.stanford.nlp.trees.LeftHeadFinder;
13+
import edu.stanford.nlp.trees.MemoryTreebank;
14+
import edu.stanford.nlp.trees.PennTreeReaderFactory;
15+
import edu.stanford.nlp.trees.Tree;
16+
import edu.stanford.nlp.trees.TreeNormalizer;
17+
import edu.stanford.nlp.trees.TreeReaderFactory;
18+
import edu.stanford.nlp.trees.TreeTransformer;
19+
import edu.stanford.nlp.trees.international.italian.ItalianTreebankLanguagePack;
20+
import edu.stanford.nlp.util.StringUtils;
21+
import edu.stanford.nlp.util.logging.Redwood;
22+
23+
/**
24+
* Bare-bones implementation of a ParserParams for the Italian Turin treebank.
25+
* <br>
26+
* Suitable for use in the SR Parser. Will need additional work to function in the PCFG.
27+
* Also, would likely function better with a new headfinder.
28+
*/
29+
public class ItalianTreebankParserParams extends AbstractTreebankParserParams {
30+
/** A logger for this class */
31+
private static final Redwood.RedwoodChannels log = Redwood.channels(ItalianTreebankParserParams.class);
32+
33+
public ItalianTreebankParserParams() {
34+
super(new ItalianTreebankLanguagePack());
35+
// TODO: make a Italian specific HeadFinder or build one that can be learned
36+
headFinder = new LeftHeadFinder();
37+
}
38+
39+
private HeadFinder headFinder;
40+
41+
private TreeNormalizer normalizer = null;
42+
43+
static final String[] EMPTY_SISTERS = new String[0];
44+
45+
@Override
46+
public HeadFinder headFinder() {
47+
return headFinder;
48+
}
49+
50+
@Override
51+
public HeadFinder typedDependencyHeadFinder() {
52+
return headFinder;
53+
}
54+
55+
@Override
56+
public TreeTransformer collinizer() {
57+
return new TreeCollinizer(tlp, true, false, 0);
58+
}
59+
60+
@Override
61+
public TreeTransformer collinizerEvalb() {
62+
return collinizer();
63+
}
64+
65+
@Override
66+
public String[] sisterSplitters() {
67+
// TODO: the SR Parser does not use this code path, so it is not implemented
68+
return EMPTY_SISTERS;
69+
}
70+
71+
@Override
72+
public Tree transformTree(Tree t, Tree root) {
73+
// TODO: the SR Parser does not use this code path, so it is not implemented
74+
return t;
75+
}
76+
77+
public static class ItalianSubcategoryStripper extends TreeNormalizer {
78+
@Override
79+
public String normalizeNonterminal(String category) {
80+
// The stanza script leaves the fancy endings on the tags
81+
// but simplifies the constiituency tags
82+
List<String> pieces = StringUtils.split(category, "~");
83+
84+
return pieces.get(0);
85+
}
86+
}
87+
88+
TreeNormalizer buildNormalizer() {
89+
return new ItalianSubcategoryStripper();
90+
}
91+
92+
/** {@inheritDoc} */
93+
@Override
94+
public TreeReaderFactory treeReaderFactory() {
95+
if (normalizer == null) {
96+
normalizer = buildNormalizer();
97+
}
98+
return new PennTreeReaderFactory(normalizer);
99+
}
100+
101+
102+
@Override
103+
public void display() {
104+
String params = "Using ItalianTreebankParserParams";
105+
log.info(params);
106+
}
107+
108+
/** {@inheritDoc} */
109+
@Override
110+
public List<? extends HasWord> defaultTestSentence() {
111+
List<Word> ret = new ArrayList<>();
112+
String[] sent = {"Questo", "è", "un", "test", "."};
113+
for (String str : sent) {
114+
ret.add(new Word(str));
115+
}
116+
return ret;
117+
}
118+
119+
private static final long serialVersionUID = 9824524678L;
120+
}
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
package edu.stanford.nlp.trees.international.italian;
2+
3+
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
4+
import edu.stanford.nlp.trees.HeadFinder;
5+
import edu.stanford.nlp.trees.LeftHeadFinder;
6+
7+
/**
8+
* Treebank language pack suitable for the Italian Turin treebank.
9+
*<br>
10+
* Note that the original Turin dataset had quite a few oddities which
11+
* made it unsuitable for directly using it. Stanza has a treebank
12+
* prep script which greatly simplifies it, though
13+
*/
14+
public class ItalianTreebankLanguagePack extends AbstractTreebankLanguagePack {
15+
16+
private static final long serialVersionUID = -235378253615245L;
17+
18+
// original treebank has PUNCT for some things, like -, but in general
19+
// the tags are ,.:
20+
// the UD tagger will redo the tags to be PUNCT
21+
private static final String[] punctTags = { "-LRB-", "-RRB-", ",", ".", ":", "\"", "PUNCT" };
22+
23+
private static final String[] SFPunctTags = { ".", ":", "PUNCT" };
24+
25+
private static final String[] punctWords = { "!", "\"", "&", "'", "§", "(", ")", "[", "]", "+", ",", "-", ".", "...", "/", "—", ":", ";", "==", "?" };
26+
27+
private static final String[] startSymbols = { "ROOT" };
28+
29+
// weirdly ... doesn't end sentences
30+
private static final String[] SFPunctWords = {":", ".", "!", "?", ";" };
31+
32+
@Override
33+
public String[] punctuationTags() {
34+
return punctTags;
35+
}
36+
37+
@Override
38+
public String[] punctuationWords() {
39+
return punctWords;
40+
}
41+
42+
@Override
43+
public String[] sentenceFinalPunctuationTags() {
44+
return SFPunctTags;
45+
}
46+
47+
@Override
48+
public String[] sentenceFinalPunctuationWords() {
49+
return SFPunctWords;
50+
}
51+
52+
@Override
53+
public String[] startSymbols() {
54+
return startSymbols;
55+
}
56+
57+
/** {@inheritDoc} */
58+
@Override
59+
public String treebankFileExtension() {
60+
return "mrg";
61+
}
62+
63+
/** {@inheritDoc} */
64+
@Override
65+
public HeadFinder headFinder() {
66+
return new LeftHeadFinder();
67+
}
68+
69+
/** {@inheritDoc} */
70+
@Override
71+
public HeadFinder typedDependencyHeadFinder() {
72+
return new LeftHeadFinder();
73+
}
74+
75+
}

0 commit comments

Comments
 (0)