Skip to content

Commit a863b6c

Browse files
committed
TLP & TBPP for Hungarian
Includes a TreeNormalizer which completely strips the subcategories when reading the SPMRL treebank. Uses a right headfinder instead of a left.
1 parent edbd4be commit a863b6c

File tree

3 files changed

+249
-0
lines changed

3 files changed

+249
-0
lines changed
Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,149 @@
1+
package edu.stanford.nlp.parser.lexparser;
2+
3+
import java.io.Serializable;
4+
import java.util.ArrayList;
5+
import java.util.List;
6+
7+
8+
import edu.stanford.nlp.ling.HasWord;
9+
import edu.stanford.nlp.ling.Word;
10+
import edu.stanford.nlp.trees.DiskTreebank;
11+
import edu.stanford.nlp.trees.HeadFinder;
12+
import edu.stanford.nlp.trees.RightHeadFinder;
13+
import edu.stanford.nlp.trees.MemoryTreebank;
14+
import edu.stanford.nlp.trees.PennTreeReaderFactory;
15+
import edu.stanford.nlp.trees.Tree;
16+
import edu.stanford.nlp.trees.TreeNormalizer;
17+
import edu.stanford.nlp.trees.TreeReaderFactory;
18+
import edu.stanford.nlp.trees.TreeTransformer;
19+
import edu.stanford.nlp.trees.international.hungarian.HungarianTreebankLanguagePack;
20+
import edu.stanford.nlp.util.StringUtils;
21+
import edu.stanford.nlp.util.logging.Redwood;
22+
23+
/**
24+
* Bare-bones implementation of a ParserParams for the Hungarian SPMRL treebank.
25+
* <br>
26+
* Suitable for use in the SR Parser. Will need additional work to function in the PCFG.
27+
* Also, would likely function better with a new headfinder.
28+
*/
29+
public class HungarianTreebankParserParams extends AbstractTreebankParserParams {
30+
/** A logger for this class */
31+
private static final Redwood.RedwoodChannels log = Redwood.channels(HungarianTreebankParserParams.class);
32+
33+
public HungarianTreebankParserParams() {
34+
super(new HungarianTreebankLanguagePack());
35+
// TODO: make a Hungarian specific HeadFinder or build one that can be learned
36+
headFinder = new RightHeadFinder();
37+
}
38+
39+
private HeadFinder headFinder;
40+
41+
private TreeNormalizer normalizer = null;
42+
43+
static final String[] EMPTY_SISTERS = new String[0];
44+
45+
@Override
46+
public HeadFinder headFinder() {
47+
return headFinder;
48+
}
49+
50+
@Override
51+
public HeadFinder typedDependencyHeadFinder() {
52+
return headFinder;
53+
}
54+
55+
/**
56+
* Allows you to read in trees from the source you want. It's the
57+
* responsibility of treeReaderFactory() to deal properly with character-set
58+
* encoding of the input. It also is the responsibility of tr to properly
59+
* normalize trees.
60+
*/
61+
@Override
62+
public DiskTreebank diskTreebank() {
63+
return new DiskTreebank(treeReaderFactory());
64+
}
65+
66+
67+
/**
68+
* Allows you to read in trees from the source you want. It's the
69+
* responsibility of treeReaderFactory() to deal properly with character-set
70+
* encoding of the input. It also is the responsibility of tr to properly
71+
* normalize trees.
72+
*/
73+
@Override
74+
public MemoryTreebank memoryTreebank() {
75+
return new MemoryTreebank(treeReaderFactory());
76+
}
77+
78+
@Override
79+
public TreeTransformer collinizer() {
80+
return new TreeCollinizer(tlp, true, false, 0);
81+
}
82+
83+
@Override
84+
public TreeTransformer collinizerEvalb() {
85+
return collinizer();
86+
}
87+
88+
@Override
89+
public String[] sisterSplitters() {
90+
// TODO: the SR Parser does not use this code path, so it is not implemented
91+
return EMPTY_SISTERS;
92+
}
93+
94+
@Override
95+
public Tree transformTree(Tree t, Tree root) {
96+
// TODO: the SR Parser does not use this code path, so it is not implemented
97+
return t;
98+
}
99+
100+
public static class HungarianSubcategoryStripper extends TreeNormalizer {
101+
@Override
102+
public String normalizeNonterminal(String category) {
103+
List<String> pieces = StringUtils.split(category, ":");
104+
category = pieces.get(0);
105+
if (category.equals("PP-locy")) {
106+
category = "PP-LOCY";
107+
}
108+
109+
// TODO: maybe some categories should be kept?
110+
pieces = StringUtils.split(category, "-");
111+
category = pieces.get(0);
112+
113+
return pieces.get(0);
114+
}
115+
}
116+
117+
TreeNormalizer buildNormalizer() {
118+
return new HungarianSubcategoryStripper();
119+
}
120+
121+
/** {@inheritDoc} */
122+
@Override
123+
public TreeReaderFactory treeReaderFactory() {
124+
if (normalizer == null) {
125+
normalizer = buildNormalizer();
126+
}
127+
return new PennTreeReaderFactory(normalizer);
128+
}
129+
130+
131+
@Override
132+
public void display() {
133+
String hungarianParams = "Using HungarianTreebankParserParams";
134+
log.info(hungarianParams);
135+
}
136+
137+
/** {@inheritDoc} */
138+
@Override
139+
public List<? extends HasWord> defaultTestSentence() {
140+
List<Word> ret = new ArrayList<>();
141+
String[] sent = {"Ez", "egy", "teszt", "."};
142+
for (String str : sent) {
143+
ret.add(new Word(str));
144+
}
145+
return ret;
146+
}
147+
148+
private static final long serialVersionUID = 5652324513L;
149+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package edu.stanford.nlp.trees;
2+
3+
4+
/**
5+
* HeadFinder that always returns the rightmost daughter as head.
6+
*<br>
7+
* Useful for languages which have a mostly right branching structure
8+
* where we haven't done a ton of work figuring out how to find heads.
9+
*<br>
10+
* In particular, a conversation with Dora Demszky made it sound like
11+
* Hungarian would get better results with a RightHeadFinder instead
12+
* of LeftHeadFinder
13+
*
14+
* @author John Bauer
15+
*/
16+
public class RightHeadFinder implements HeadFinder {
17+
18+
private static final long serialVersionUID = 127638412457653L;
19+
20+
public Tree determineHead(Tree t) {
21+
if (t.isLeaf()) {
22+
return null;
23+
} else {
24+
int child = t.numChildren() - 1;
25+
return t.children()[child];
26+
}
27+
}
28+
29+
public Tree determineHead(Tree t, Tree parent) {
30+
return determineHead(t);
31+
}
32+
33+
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
package edu.stanford.nlp.trees.international.hungarian;
2+
3+
import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
4+
import edu.stanford.nlp.trees.HeadFinder;
5+
import edu.stanford.nlp.trees.LeftHeadFinder;
6+
7+
/**
8+
* Treebank language pack suitable for the Hungarian section of SPMRL
9+
*/
10+
public class HungarianTreebankLanguagePack extends AbstractTreebankLanguagePack {
11+
12+
private static final long serialVersionUID = -7982635612452142L;
13+
14+
// both sentence final and mid-sentence punctuation use PUNC
15+
// the UD tagger will redo the tags to be PUNCT
16+
private static final String[] punctTags = { "PUNC", "PUNCT" };
17+
18+
private static final String[] punctWords = { "!", "\"", "&", "'", "§", "(", ")", "+", ",", "-", ".", "...", "/", "—", ":", ";", "==", "?" };
19+
20+
private static final String[] startSymbols = { "ROOT" };
21+
22+
private static final String[] SFPunctWords = {".", "!", "?"};
23+
24+
@Override
25+
public String[] punctuationTags() {
26+
return punctTags;
27+
}
28+
29+
@Override
30+
public String[] punctuationWords() {
31+
return punctWords;
32+
}
33+
34+
@Override
35+
public String[] sentenceFinalPunctuationTags() {
36+
return punctTags;
37+
}
38+
39+
@Override
40+
public String[] sentenceFinalPunctuationWords() {
41+
return SFPunctWords;
42+
}
43+
44+
@Override
45+
public String[] startSymbols() {
46+
return startSymbols;
47+
}
48+
49+
/** {@inheritDoc} */
50+
@Override
51+
public String treebankFileExtension() {
52+
return "ptb";
53+
}
54+
55+
/** {@inheritDoc} */
56+
@Override
57+
public HeadFinder headFinder() {
58+
return new LeftHeadFinder();
59+
}
60+
61+
/** {@inheritDoc} */
62+
@Override
63+
public HeadFinder typedDependencyHeadFinder() {
64+
return new LeftHeadFinder();
65+
}
66+
67+
}

0 commit comments

Comments
 (0)