Skip to content

Commit a1bddee

Browse files
authored
Merge pull request #1129 from stanfordnlp/segmenter_stuff
Segmenter stuff
2 parents 32e3b9f + a29abc5 commit a1bddee

File tree

5 files changed

+181
-33
lines changed

5 files changed

+181
-33
lines changed

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,14 @@ public DocumentReaderAndWriter<IN> makeReaderAndWriter() {
252252
* edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter for
253253
* the Chinese Segmenter.
254254
*/
255-
public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
255+
public static <INN extends CoreMap> DocumentReaderAndWriter<INN> makePlainTextReaderAndWriter(SeqClassifierFlags flags) {
256256
String readerClassName = flags.plainTextDocumentReaderAndWriter;
257257
// We set this default here if needed because there may be models
258258
// which don't have the reader flag set
259259
if (readerClassName == null) {
260260
readerClassName = SeqClassifierFlags.DEFAULT_PLAIN_TEXT_READER;
261261
}
262-
DocumentReaderAndWriter<IN> readerAndWriter;
262+
DocumentReaderAndWriter<INN> readerAndWriter;
263263
try {
264264
readerAndWriter = ReflectionLoading.loadByReflection(readerClassName);
265265
} catch (Exception e) {
@@ -269,6 +269,10 @@ public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
269269
return readerAndWriter;
270270
}
271271

272+
public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
273+
return makePlainTextReaderAndWriter(flags);
274+
}
275+
272276
/**
273277
* Returns the background class for the classifier.
274278
*

src/edu/stanford/nlp/sequences/SeqClassifierFlags.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,8 @@ public enum SlashHyphenEnum { NONE, WFRAG, WORD, BOTH };
10811081
*/
10821082
public boolean useMoreNeighborNGrams = false;
10831083

1084+
/** if using dict2 in a segmenter, load it with this filename */
1085+
public String dict2name = "";
10841086

10851087
// "ADD VARIABLES ABOVE HERE"
10861088

@@ -2634,6 +2636,8 @@ public void setProperties(Properties props, boolean printProps) {
26342636
} else if (key.equalsIgnoreCase("ner.model")) {
26352637
nerModel = val;
26362638
} else if (key.equalsIgnoreCase("sutime.language")) {
2639+
} else if (key.equalsIgnoreCase("dict2name")) {
2640+
dict2name = val;
26372641
// ADD VALUE ABOVE HERE
26382642
} else if ( ! key.isEmpty() && ! key.equals("prop")) {
26392643
log.info("Unknown property: |" + key + '|');

src/edu/stanford/nlp/wordseg/Gale2007ChineseSegmenterFeatureFactory.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public class Gale2007ChineseSegmenterFeatureFactory<IN extends CoreLabel> extend
5252

5353
private transient TagAffixDetector taDetector; // = null;
5454
private transient CorpusDictionary outDict; // = null;
55+
private transient NonDict2 nonDict; // = null;
5556

5657
@Override
5758
public void init(SeqClassifierFlags flags) {
@@ -71,6 +72,11 @@ private synchronized void createOutDict() {
7172
}
7273
}
7374

75+
private synchronized void createNonDict() {
76+
if (nonDict == null) {
77+
nonDict = new NonDict2(flags);
78+
}
79+
}
7480

7581
/**
7682
* Extracts all the features from the input data at a certain index.
@@ -479,8 +485,10 @@ protected Collection<String> featuresCpC(PaddedList<? extends CoreLabel> cInfo,
479485
* This is frickin' useful. I hadn't realized. CDM Oct 2007.
480486
*/
481487
if (flags.useDict2) {
482-
NonDict2 nd = new NonDict2(flags);
483-
features.add(nd.checkDic(charp+charc, flags)+"nondict");
488+
if (nonDict == null) {
489+
createNonDict();
490+
}
491+
features.add(nonDict.checkDic(charp+charc, flags)+"nondict");
484492
}
485493

486494
if (flags.useOutDict2) {
src/edu/stanford/nlp/wordseg/NonDict2.java

Lines changed: 98 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,54 @@
11
package edu.stanford.nlp.wordseg;
22

3+
import java.io.FileReader;
4+
import java.io.IOException;
5+
import java.io.PrintWriter;
6+
import java.util.HashSet;
7+
import java.util.Iterator;
8+
import java.util.List;
9+
import java.util.Properties;
10+
import java.util.Set;
311

4-
import edu.stanford.nlp.util.logging.Redwood;
5-
12+
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
13+
import edu.stanford.nlp.io.IOUtils;
14+
import edu.stanford.nlp.ling.CoreLabel;
15+
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
616
import edu.stanford.nlp.sequences.SeqClassifierFlags;
17+
import edu.stanford.nlp.util.StringUtils;
18+
import edu.stanford.nlp.util.logging.Redwood;
719

820
public class NonDict2 {
921

1022
//public String sighanCorporaDict = "/u/nlp/data/chinese-segmenter/";
11-
public String corporaDict = "/u/nlp/data/gale/segtool/stanford-seg/data/";
12-
private static CorpusDictionary cd = null;
23+
public static final String DEFAULT_HOME = "/u/nlp/data/gale/segtool/stanford-seg/data/";
24+
public final String corporaDict;
25+
private final CorpusDictionary cd;
1326

1427
private static Redwood.RedwoodChannels logger = Redwood.channels(NonDict2.class);
1528

1629
public NonDict2(SeqClassifierFlags flags) {
17-
if (cd == null) {
18-
19-
if (flags.sighanCorporaDict != null) {
20-
corporaDict = flags.sighanCorporaDict; // use the same flag for Sighan 2005,
21-
// but our list is extracted from ctb
22-
}
23-
String path;
24-
if (flags.useAs || flags.useHk || flags.useMsr) {
25-
throw new RuntimeException("only support settings for CTB and PKU now.");
26-
} else if ( flags.usePk ) {
27-
path = corporaDict+"/dict/pku.non";
28-
} else { // CTB
29-
path = corporaDict+"/dict/ctb.non";
30-
}
30+
if (flags.sighanCorporaDict != null) {
31+
corporaDict = flags.sighanCorporaDict; // use the same flag for Sighan 2005,
32+
// but our list is extracted from ctb
33+
} else {
34+
corporaDict = DEFAULT_HOME;
35+
}
3136

32-
cd = new CorpusDictionary(path);
33-
// just output the msg...
34-
if (flags.useAs || flags.useHk || flags.useMsr) {
35-
} else if ( flags.usePk ) {
36-
logger.info("INFO: flags.usePk=true | building NonDict2 from "+path);
37-
} else { // CTB
38-
logger.info("INFO: flags.usePk=false | building NonDict2 from "+path);
39-
}
37+
String path;
38+
if (flags.dict2name != null && !flags.dict2name.equals("")) {
39+
path = corporaDict + "/dict/" + flags.dict2name;
40+
logger.info("INFO: dict2name specified | building NonDict2 from "+path);
41+
} else if (flags.useAs || flags.useHk || flags.useMsr) {
42+
throw new RuntimeException("only support settings for CTB and PKU now.");
43+
} else if ( flags.usePk ) {
44+
path = corporaDict+"/dict/pku.non";
45+
logger.info("INFO: flags.usePk=true | building NonDict2 from "+path);
46+
} else { // CTB
47+
path = corporaDict+"/dict/ctb.non";
48+
logger.info("INFO: flags.usePk=false | building NonDict2 from "+path);
4049
}
50+
51+
cd = new CorpusDictionary(path);
4152
}
4253

4354
public String checkDic(String c2, SeqClassifierFlags flags) {
@@ -47,4 +58,65 @@ public String checkDic(String c2, SeqClassifierFlags flags) {
4758
return "0";
4859
}
4960

61+
/**
62+
* Rebuilds a non-dict. Use -textFile and -outputFile as appropriate.
63+
* Uses SeqClassifierFlags so that specific flags for the reader can be honored.
64+
*/
65+
public static void main(String[] args) throws IOException {
66+
Properties props = StringUtils.argsToProperties(args, SeqClassifierFlags.flagsToNumArgs());
67+
68+
/*
69+
// TODO: refactor this into a util?
70+
// TODO: whitespace reader
71+
boolean foundReader = false;
72+
for (String propKey : props.stringPropertyNames()) {
73+
if (propKey.equalsIgnoreCase("plainTextDocumentReaderAndWriter")) {
74+
foundReader = true;
75+
break;
76+
}
77+
}
78+
if (!foundReader) {
79+
// this doesn't exist
80+
props.setProperty("plainTextDocumentReaderAndWriter", "edu.stanford.nlp.sequences.WhitespaceDocumentReaderAndWriter");
81+
}
82+
*/
83+
84+
SeqClassifierFlags flags = new SeqClassifierFlags(props);
85+
86+
String inputFilename = flags.textFile;
87+
String outputFilename = flags.outputFile;
88+
89+
DocumentReaderAndWriter<CoreLabel> readerAndWriter = AbstractSequenceClassifier.makePlainTextReaderAndWriter(flags);
90+
readerAndWriter.init(flags);
91+
92+
Set<String> splitBigrams = new HashSet<>();
93+
94+
FileReader fin = new FileReader(inputFilename);
95+
// for some weird syntax reason this can't take the place of ': iterable'
96+
Iterable<List<CoreLabel>> iterable = () -> readerAndWriter.getIterator(fin);
97+
List<CoreLabel> prevSentence = null;
98+
for (List<CoreLabel> sentence : iterable) {
99+
for (int i = 0; i < sentence.size() - 1; ++i) {
100+
String prevWord = sentence.get(i).value();
101+
String nextWord = sentence.get(i+1).value();
102+
String bigram = prevWord.substring(prevWord.length() - 1) + nextWord.substring(0, 1);
103+
splitBigrams.add(bigram);
104+
}
105+
if (prevSentence != null) {
106+
String prevWord = prevSentence.get(prevSentence.size() - 1).value();
107+
String nextWord = sentence.get(0).value();
108+
String bigram = prevWord.substring(prevWord.length() - 1) + nextWord.substring(0, 1);
109+
splitBigrams.add(bigram);
110+
}
111+
prevSentence = sentence;
112+
}
113+
fin.close();
114+
115+
PrintWriter fout = IOUtils.getPrintWriter(outputFilename, "utf-8");
116+
for (String bigram : splitBigrams) {
117+
fout.print(bigram);
118+
fout.println();
119+
}
120+
fout.close();
121+
}
50122
}

src/edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.java

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import java.util.ArrayList;
88
import java.util.Iterator;
99
import java.util.List;
10+
import java.util.Locale;
1011
import java.util.Map;
1112
import java.util.Set;
1213
import java.util.regex.Pattern;
@@ -39,9 +40,13 @@
3940
/**
4041
* DocumentReader for Chinese segmentation task. (Sighan bakeoff 2005)
4142
* Reads in characters and labels them as 1 or 0 (word START or NONSTART).
42-
*
43+
* <br>
4344
* Note: maybe this can do less interning, since some is done in
4445
* ObjectBankWrapper, but this also calls trim() as it works....
46+
* <br>
47+
* Data can be output in two formats: plaintext, meaning whitespace
48+
* separated words, or a fake conllu document usable with the conllu
49+
* scoring script.
4550
*
4651
* @author Pi-Chuan Chang
4752
* @author Michel Galley (Viterbi search graph printing)
@@ -81,6 +86,12 @@ public class Sighan2005DocumentReaderAndWriter implements DocumentReaderAndWrite
8186
private SeqClassifierFlags flags;
8287
private IteratorFromReaderFactory<List<CoreLabel>> factory;
8388

89+
private enum OutputFormat {
90+
PLAINTEXT, CONLLU
91+
}
92+
93+
private OutputFormat outputFormat;
94+
8495
@Override
8596
public Iterator<List<CoreLabel>> getIterator(Reader r) {
8697
return factory.getIterator(r);
@@ -108,6 +119,13 @@ public void init(SeqClassifierFlags flags) {
108119
String[] dicts2 = flags.dictionary2.split(",");
109120
cdict2 = new ChineseDictionary(dicts2, cdtos, flags.expandMidDot);
110121
}
122+
123+
if (flags.outputFormat != null) {
124+
outputFormat = OutputFormat.valueOf(flags.outputFormat.toUpperCase(Locale.ROOT));
125+
logger.info("Output format: " + outputFormat);
126+
} else {
127+
outputFormat = OutputFormat.PLAINTEXT;
128+
}
111129
}
112130

113131

@@ -309,13 +327,55 @@ private static void addDictionaryFeatures(ChineseDictionary dict, Class<? extend
309327
}
310328
}
311329

312-
@Override
313-
public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
330+
private void printPlainTextAnswer(List<CoreLabel> doc, PrintWriter pw) {
314331
String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
315332
pw.print(ansStr);
316333
pw.println();
317334
}
318335

336+
/**
337+
* Prints a fake Conllu document for use in the conllu tokenization scoring scripts
338+
*/
339+
private void printConlluAnswer(List<CoreLabel> doc, PrintWriter pw) {
340+
String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
341+
pw.print("# text = " + ansStr);
342+
pw.println();
343+
344+
List<String> words = StringUtils.split(ansStr);
345+
int idx = 0;
346+
for (String word : words) {
347+
idx = idx + 1;
348+
pw.print(idx + "\t" + word);
349+
// 4 _ - print blanks for lemma & tags
350+
pw.print("\t_\t_\t_\t_\t");
351+
pw.print(idx - 1);
352+
pw.print("\t");
353+
if (idx == 1) {
354+
pw.print("root");
355+
} else {
356+
pw.print("dep");
357+
}
358+
pw.print("\t_\t_");
359+
pw.println();
360+
}
361+
362+
pw.println();
363+
}
364+
365+
@Override
366+
public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
367+
switch (outputFormat) {
368+
case PLAINTEXT:
369+
printPlainTextAnswer(doc, pw);
370+
break;
371+
case CONLLU:
372+
printConlluAnswer(doc, pw);
373+
break;
374+
default:
375+
throw new IllegalArgumentException("Unknown outputFormat: " + outputFormat);
376+
}
377+
}
378+
319379

320380
private static String intern(String s) {
321381
return s.trim().intern();

0 commit comments

Comments (0)