TLP & TBPP for Hungarian

AngledLuffa · AngledLuffa · commit a863b6ca2d8e · 2021-09-30T12:22:01.000-07:00
Includes a TreeNormalizer which completely strips the subcategories when reading the SPMRL treebank

Uses a right headfinder instead of a left
diff --git a/src/edu/stanford/nlp/parser/lexparser/HungarianTreebankParserParams.java b/src/edu/stanford/nlp/parser/lexparser/HungarianTreebankParserParams.java
@@ -0,0 +1,149 @@
+package edu.stanford.nlp.parser.lexparser;
+
+import java.io.Serializable;
+import java.util.ArrayList;
+import java.util.List;
+
+
+import edu.stanford.nlp.ling.HasWord;
+import edu.stanford.nlp.ling.Word;
+import edu.stanford.nlp.trees.DiskTreebank;
+import edu.stanford.nlp.trees.HeadFinder;
+import edu.stanford.nlp.trees.RightHeadFinder;
+import edu.stanford.nlp.trees.MemoryTreebank;
+import edu.stanford.nlp.trees.PennTreeReaderFactory;
+import edu.stanford.nlp.trees.Tree;
+import edu.stanford.nlp.trees.TreeNormalizer;
+import edu.stanford.nlp.trees.TreeReaderFactory;
+import edu.stanford.nlp.trees.TreeTransformer;
+import edu.stanford.nlp.trees.international.hungarian.HungarianTreebankLanguagePack;
+import edu.stanford.nlp.util.StringUtils;
+import edu.stanford.nlp.util.logging.Redwood;
+
+/**
+ * Bare-bones implementation of a ParserParams for the Hungarian SPMRL treebank.
+ * <br>
+ * Suitable for use in the SR Parser.  Will need additional work to function in the PCFG.
+ * Also, would likely function better with a new headfinder.
+ */
+public class HungarianTreebankParserParams extends AbstractTreebankParserParams  {
+  /** A logger for this class */
+  private static final Redwood.RedwoodChannels log = Redwood.channels(HungarianTreebankParserParams.class);
+
+  public HungarianTreebankParserParams() {
+    super(new HungarianTreebankLanguagePack());
+    // TODO: make a Hungarian specific HeadFinder or build one that can be learned
+    headFinder = new RightHeadFinder();
+  }
+
+  private HeadFinder headFinder;
+
+  private TreeNormalizer normalizer = null;
+
+  static final String[] EMPTY_SISTERS = new String[0];
+
+  @Override
+  public HeadFinder headFinder() {
+    return headFinder;
+  }
+
+  @Override
+  public HeadFinder typedDependencyHeadFinder() {
+    return headFinder;
+  }
+
+  /**
+   * Allows you to read in trees from the source you want.  It's the
+   * responsibility of treeReaderFactory() to deal properly with character-set
+   * encoding of the input.  It also is the responsibility of tr to properly
+   * normalize trees.
+   */
+  @Override
+  public DiskTreebank diskTreebank() {
+    return new DiskTreebank(treeReaderFactory());
+  }
+
+
+  /**
+   * Allows you to read in trees from the source you want.  It's the
+   * responsibility of treeReaderFactory() to deal properly with character-set
+   * encoding of the input.  It also is the responsibility of tr to properly
+   * normalize trees.
+   */
+  @Override
+  public MemoryTreebank memoryTreebank() {
+    return new MemoryTreebank(treeReaderFactory());
+  }
+
+  @Override
+  public TreeTransformer collinizer() {
+    return new TreeCollinizer(tlp, true, false, 0);
+  }
+
+  @Override
+  public TreeTransformer collinizerEvalb() {
+    return collinizer();
+  }
+
+  @Override
+  public String[] sisterSplitters() {
+    // TODO: the SR Parser does not use this code path, so it is not implemented
+    return EMPTY_SISTERS;
+  }
+
+  @Override
+  public Tree transformTree(Tree t, Tree root) {
+    // TODO: the SR Parser does not use this code path, so it is not implemented
+    return t;
+  }
+
+  public static class HungarianSubcategoryStripper extends TreeNormalizer {
+    @Override
+    public String normalizeNonterminal(String category) {
+      List<String> pieces = StringUtils.split(category, ":");
+      category = pieces.get(0);
+      if (category.equals("PP-locy")) {
+        category = "PP-LOCY";
+      }
+
+      // TODO: maybe some categories should be kept?
+      pieces = StringUtils.split(category, "-");
+      category = pieces.get(0);
+
+      return pieces.get(0);
+    }
+  }
+
+  TreeNormalizer buildNormalizer() {
+    return new HungarianSubcategoryStripper();
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public TreeReaderFactory treeReaderFactory() {
+    if (normalizer == null) {
+      normalizer = buildNormalizer();
+    }
+    return new PennTreeReaderFactory(normalizer);
+  }
+
+
+  @Override
+  public void display() {
+    String hungarianParams = "Using HungarianTreebankParserParams";
+    log.info(hungarianParams);
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public List<? extends HasWord> defaultTestSentence() {
+    List<Word> ret = new ArrayList<>();
+    String[] sent = {"Ez", "egy", "teszt", "."};
+    for (String str : sent) {
+      ret.add(new Word(str));
+    }
+    return ret;
+  }
+
+  private static final long serialVersionUID = 5652324513L;
+}
diff --git a/src/edu/stanford/nlp/trees/RightHeadFinder.java b/src/edu/stanford/nlp/trees/RightHeadFinder.java
@@ -0,0 +1,33 @@
+package edu.stanford.nlp.trees;
+
+
+/**
+ * A {@link HeadFinder} which picks the last (rightmost) daughter of
+ * every node as its head.
+ *<br>
+ * Handy for languages whose structure is mostly right branching and
+ * for which no real effort has yet gone into proper head rules.
+ *<br>
+ * In particular, a conversation with Dora Demszky made it sound like
+ * Hungarian would get better results with a RightHeadFinder instead
+ * of LeftHeadFinder
+ *
+ * @author John Bauer
+ */
+public class RightHeadFinder implements HeadFinder {
+
+  private static final long serialVersionUID = 127638412457653L;
+
+  /**
+   * Finds the head daughter of a node.
+   *
+   * @param t the node to inspect
+   * @return the rightmost child of {@code t}, or null when {@code t} is a leaf
+   */
+  @Override
+  public Tree determineHead(Tree t) {
+    // a leaf has no daughters, hence no head
+    if (t.isLeaf()) {
+      return null;
+    }
+    final int rightmost = t.numChildren() - 1;
+    final Tree[] daughters = t.children();
+    return daughters[rightmost];
+  }
+
+  /**
+   * Same as {@link #determineHead(Tree)}; the parent plays no role
+   * in the decision.
+   *
+   * @param t the node to inspect
+   * @param parent ignored
+   * @return the rightmost child of {@code t}, or null when {@code t} is a leaf
+   */
+  @Override
+  public Tree determineHead(Tree t, Tree parent) {
+    return determineHead(t);
+  }
+
+}
diff --git a/src/edu/stanford/nlp/trees/international/hungarian/HungarianTreebankLanguagePack.java b/src/edu/stanford/nlp/trees/international/hungarian/HungarianTreebankLanguagePack.java
@@ -0,0 +1,67 @@
+package edu.stanford.nlp.trees.international.hungarian;
+
+import edu.stanford.nlp.trees.AbstractTreebankLanguagePack;
+import edu.stanford.nlp.trees.HeadFinder;
+import edu.stanford.nlp.trees.LeftHeadFinder;
+import edu.stanford.nlp.trees.RightHeadFinder;
+
+/**
+ * Treebank language pack suitable for the Hungarian section of SPMRL.
+ * <br>
+ * Supplies the punctuation tags and words, the start symbol, the treebank
+ * file extension and the head finders.  Heads are found with a
+ * {@link RightHeadFinder}, the same choice HungarianTreebankParserParams makes,
+ * so that code asking the language pack and code asking the parser params
+ * agree on the head of a constituent.
+ */
+public class HungarianTreebankLanguagePack extends AbstractTreebankLanguagePack {
+
+  private static final long serialVersionUID = -7982635612452142L;
+
+  // both sentence final and mid-sentence punctuation use PUNC
+  // the UD tagger will redo the tags to be PUNCT
+  private static final String[] punctTags = { "PUNC", "PUNCT" };
+
+  /** Word forms treated as punctuation. */
+  private static final String[] punctWords = { "!", "\"", "&", "'", "§", "(", ")", "+", ",", "-", ".", "...", "/", "—", ":", ";", "==", "?" };
+
+  /** The only start symbol of the treebank. */
+  private static final String[] startSymbols = { "ROOT" };
+
+  /** Word forms which can end a sentence. */
+  private static final String[] SFPunctWords = {".", "!", "?"};
+
+  /** Returns the tags used for punctuation ({@code PUNC} and {@code PUNCT}). */
+  @Override
+  public String[] punctuationTags() {
+    return punctTags;
+  }
+
+  /** Returns the word forms treated as punctuation. */
+  @Override
+  public String[] punctuationWords() {
+    return punctWords;
+  }
+
+  /**
+   * Returns the same tags as {@link #punctuationTags()}: sentence final
+   * punctuation has no tag of its own.
+   */
+  @Override
+  public String[] sentenceFinalPunctuationTags() {
+    return punctTags;
+  }
+
+  /** Returns the word forms which can end a sentence. */
+  @Override
+  public String[] sentenceFinalPunctuationWords() {
+    return SFPunctWords;
+  }
+
+  /** Returns the start symbols, which is just {@code ROOT}. */
+  @Override
+  public String[] startSymbols() {
+    return startSymbols;
+  }
+
+  /** {@inheritDoc} */
+  @Override
+  public String treebankFileExtension() {
+    return "ptb";
+  }
+
+  /**
+   * {@inheritDoc}
+   * <br>
+   * A new {@link RightHeadFinder} is returned on each call.
+   */
+  @Override
+  public HeadFinder headFinder() {
+    // keep in sync with HungarianTreebankParserParams, which also heads to the right
+    return new RightHeadFinder();
+  }
+
+  /**
+   * {@inheritDoc}
+   * <br>
+   * A new {@link RightHeadFinder} is returned on each call.
+   */
+  @Override
+  public HeadFinder typedDependencyHeadFinder() {
+    // keep in sync with HungarianTreebankParserParams, which also heads to the right
+    return new RightHeadFinder();
+  }
+
+}