Skip to content

Commit a1bddee

Browse files
authored
Merge pull request #1129 from stanfordnlp/segmenter_stuff
Segmenter stuff
2 parents 32e3b9f + a29abc5 commit a1bddee

File tree

5 files changed

+181
-33
lines changed

5 files changed

+181
-33
lines changed

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,14 @@ public DocumentReaderAndWriter<IN> makeReaderAndWriter() {
252252
* edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter for
253253
* the Chinese Segmenter.
254254
*/
255-
public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
255+
public static <INN extends CoreMap> DocumentReaderAndWriter<INN> makePlainTextReaderAndWriter(SeqClassifierFlags flags) {
256256
String readerClassName = flags.plainTextDocumentReaderAndWriter;
257257
// We set this default here if needed because there may be models
258258
// which don't have the reader flag set
259259
if (readerClassName == null) {
260260
readerClassName = SeqClassifierFlags.DEFAULT_PLAIN_TEXT_READER;
261261
}
262-
DocumentReaderAndWriter<IN> readerAndWriter;
262+
DocumentReaderAndWriter<INN> readerAndWriter;
263263
try {
264264
readerAndWriter = ReflectionLoading.loadByReflection(readerClassName);
265265
} catch (Exception e) {
@@ -269,6 +269,10 @@ public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
269269
return readerAndWriter;
270270
}
271271

272+
public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
273+
return makePlainTextReaderAndWriter(flags);
274+
}
275+
272276
/**
273277
* Returns the background class for the classifier.
274278
*

src/edu/stanford/nlp/sequences/SeqClassifierFlags.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,8 @@ public enum SlashHyphenEnum { NONE, WFRAG, WORD, BOTH };
10811081
*/
10821082
public boolean useMoreNeighborNGrams = false;
10831083

1084+
/** if using dict2 in a segmenter, load it with this filename */
1085+
public String dict2name = "";
10841086

10851087
// "ADD VARIABLES ABOVE HERE"
10861088

@@ -2634,6 +2636,8 @@ public void setProperties(Properties props, boolean printProps) {
26342636
} else if (key.equalsIgnoreCase("ner.model")) {
26352637
nerModel = val;
26362638
} else if (key.equalsIgnoreCase("sutime.language")) {
2639+
} else if (key.equalsIgnoreCase("dict2name")) {
2640+
dict2name = val;
26372641
// ADD VALUE ABOVE HERE
26382642
} else if ( ! key.isEmpty() && ! key.equals("prop")) {
26392643
log.info("Unknown property: |" + key + '|');

src/edu/stanford/nlp/wordseg/Gale2007ChineseSegmenterFeatureFactory.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ public class Gale2007ChineseSegmenterFeatureFactory<IN extends CoreLabel> extend
5252

5353
private transient TagAffixDetector taDetector; // = null;
5454
private transient CorpusDictionary outDict; // = null;
55+
private transient NonDict2 nonDict; // = null;
5556

5657
@Override
5758
public void init(SeqClassifierFlags flags) {
@@ -71,6 +72,11 @@ private synchronized void createOutDict() {
7172
}
7273
}
7374

75+
private synchronized void createNonDict() {
76+
if (nonDict == null) {
77+
nonDict = new NonDict2(flags);
78+
}
79+
}
7480

7581
/**
7682
* Extracts all the features from the input data at a certain index.
@@ -479,8 +485,10 @@ protected Collection<String> featuresCpC(PaddedList<? extends CoreLabel> cInfo,
479485
* This is frickin' useful. I hadn't realized. CDM Oct 2007.
480486
*/
481487
if (flags.useDict2) {
482-
NonDict2 nd = new NonDict2(flags);
483-
features.add(nd.checkDic(charp+charc, flags)+"nondict");
488+
if (nonDict == null) {
489+
createNonDict();
490+
}
491+
features.add(nonDict.checkDic(charp+charc, flags)+"nondict");
484492
}
485493

486494
if (flags.useOutDict2) {
src/edu/stanford/nlp/wordseg/NonDict2.java

Lines changed: 98 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,43 +1,54 @@
11
package edu.stanford.nlp.wordseg;
22

3+
import java.io.FileReader;
4+
import java.io.IOException;
5+
import java.io.PrintWriter;
6+
import java.util.HashSet;
7+
import java.util.Iterator;
8+
import java.util.List;
9+
import java.util.Properties;
10+
import java.util.Set;
311

4-
import edu.stanford.nlp.util.logging.Redwood;
5-
12+
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
13+
import edu.stanford.nlp.io.IOUtils;
14+
import edu.stanford.nlp.ling.CoreLabel;
15+
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
616
import edu.stanford.nlp.sequences.SeqClassifierFlags;
17+
import edu.stanford.nlp.util.StringUtils;
18+
import edu.stanford.nlp.util.logging.Redwood;
719

820
public class NonDict2 {
921

1022
//public String sighanCorporaDict = "/u/nlp/data/chinese-segmenter/";
11-
public String corporaDict = "/u/nlp/data/gale/segtool/stanford-seg/data/";
12-
private static CorpusDictionary cd = null;
23+
public static final String DEFAULT_HOME = "/u/nlp/data/gale/segtool/stanford-seg/data/";
24+
public final String corporaDict;
25+
private final CorpusDictionary cd;
1326

1427
private static Redwood.RedwoodChannels logger = Redwood.channels(NonDict2.class);
1528

1629
public NonDict2(SeqClassifierFlags flags) {
17-
if (cd == null) {
18-
19-
if (flags.sighanCorporaDict != null) {
20-
corporaDict = flags.sighanCorporaDict; // use the same flag for Sighan 2005,
21-
// but our list is extracted from ctb
22-
}
23-
String path;
24-
if (flags.useAs || flags.useHk || flags.useMsr) {
25-
throw new RuntimeException("only support settings for CTB and PKU now.");
26-
} else if ( flags.usePk ) {
27-
path = corporaDict+"/dict/pku.non";
28-
} else { // CTB
29-
path = corporaDict+"/dict/ctb.non";
30-
}
30+
if (flags.sighanCorporaDict != null) {
31+
corporaDict = flags.sighanCorporaDict; // use the same flag for Sighan 2005,
32+
// but our list is extracted from ctb
33+
} else {
34+
corporaDict = DEFAULT_HOME;
35+
}
3136

32-
cd = new CorpusDictionary(path);
33-
// just output the msg...
34-
if (flags.useAs || flags.useHk || flags.useMsr) {
35-
} else if ( flags.usePk ) {
36-
logger.info("INFO: flags.usePk=true | building NonDict2 from "+path);
37-
} else { // CTB
38-
logger.info("INFO: flags.usePk=false | building NonDict2 from "+path);
39-
}
37+
String path;
38+
if (flags.dict2name != null && !flags.dict2name.equals("")) {
39+
path = corporaDict + "/dict/" + flags.dict2name;
40+
logger.info("INFO: dict2name specified | building NonDict2 from "+path);
41+
} else if (flags.useAs || flags.useHk || flags.useMsr) {
42+
throw new RuntimeException("only support settings for CTB and PKU now.");
43+
} else if ( flags.usePk ) {
44+
path = corporaDict+"/dict/pku.non";
45+
logger.info("INFO: flags.usePk=true | building NonDict2 from "+path);
46+
} else { // CTB
47+
path = corporaDict+"/dict/ctb.non";
48+
logger.info("INFO: flags.usePk=false | building NonDict2 from "+path);
4049
}
50+
51+
cd = new CorpusDictionary(path);
4152
}
4253

4354
public String checkDic(String c2, SeqClassifierFlags flags) {
@@ -47,4 +58,65 @@ public String checkDic(String c2, SeqClassifierFlags flags) {
4758
return "0";
4859
}
4960

61+
/**
62+
* Rebuilds a non-dict. Use -textFile and -outputFile as appropriate.
63+
* Uses SeqClassifierFlags so that specific flags for the reader can be honored.
64+
*/
65+
public static void main(String[] args) throws IOException {
66+
Properties props = StringUtils.argsToProperties(args, SeqClassifierFlags.flagsToNumArgs());
67+
68+
/*
69+
// TODO: refactor this into a util?
70+
// TODO: whitespace reader
71+
boolean foundReader = false;
72+
for (String propKey : props.stringPropertyNames()) {
73+
if (propKey.equalsIgnoreCase("plainTextDocumentReaderAndWriter")) {
74+
foundReader = true;
75+
break;
76+
}
77+
}
78+
if (!foundReader) {
79+
// this doesn't exist
80+
props.setProperty("plainTextDocumentReaderAndWriter", "edu.stanford.nlp.sequences.WhitespaceDocumentReaderAndWriter");
81+
}
82+
*/
83+
84+
SeqClassifierFlags flags = new SeqClassifierFlags(props);
85+
86+
String inputFilename = flags.textFile;
87+
String outputFilename = flags.outputFile;
88+
89+
DocumentReaderAndWriter<CoreLabel> readerAndWriter = AbstractSequenceClassifier.makePlainTextReaderAndWriter(flags);
90+
readerAndWriter.init(flags);
91+
92+
Set<String> splitBigrams = new HashSet<>();
93+
94+
FileReader fin = new FileReader(inputFilename);
95+
// for some weird syntax reason this can't take the place of ': iterable'
96+
Iterable<List<CoreLabel>> iterable = () -> readerAndWriter.getIterator(fin);
97+
List<CoreLabel> prevSentence = null;
98+
for (List<CoreLabel> sentence : iterable) {
99+
for (int i = 0; i < sentence.size() - 1; ++i) {
100+
String prevWord = sentence.get(i).value();
101+
String nextWord = sentence.get(i+1).value();
102+
String bigram = prevWord.substring(prevWord.length() - 1) + nextWord.substring(0, 1);
103+
splitBigrams.add(bigram);
104+
}
105+
if (prevSentence != null) {
106+
String prevWord = prevSentence.get(prevSentence.size() - 1).value();
107+
String nextWord = sentence.get(0).value();
108+
String bigram = prevWord.substring(prevWord.length() - 1) + nextWord.substring(0, 1);
109+
splitBigrams.add(bigram);
110+
}
111+
prevSentence = sentence;
112+
}
113+
fin.close();
114+
115+
PrintWriter fout = IOUtils.getPrintWriter(outputFilename, "utf-8");
116+
for (String bigram : splitBigrams) {
117+
fout.print(bigram);
118+
fout.println();
119+
}
120+
fout.close();
121+
}
50122
}

src/edu/stanford/nlp/wordseg/Sighan2005DocumentReaderAndWriter.java

Lines changed: 63 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
import java.util.ArrayList;
88
import java.util.Iterator;
99
import java.util.List;
10+
import java.util.Locale;
1011
import java.util.Map;
1112
import java.util.Set;
1213
import java.util.regex.Pattern;
@@ -39,9 +40,13 @@
3940
/**
4041
* DocumentReader for Chinese segmentation task. (Sighan bakeoff 2005)
4142
* Reads in characters and labels them as 1 or 0 (word START or NONSTART).
42-
*
43+
* <br>
4344
* Note: maybe this can do less interning, since some is done in
4445
* ObjectBankWrapper, but this also calls trim() as it works....
46+
* <br>
47+
* Data can be output in two formats: plaintext, meaning whitespace
48+
* separated words, or a fake conllu document usable with the conllu
49+
* scoring script.
4550
*
4651
* @author Pi-Chuan Chang
4752
* @author Michel Galley (Viterbi search graph printing)
@@ -81,6 +86,12 @@ public class Sighan2005DocumentReaderAndWriter implements DocumentReaderAndWrite
8186
private SeqClassifierFlags flags;
8287
private IteratorFromReaderFactory<List<CoreLabel>> factory;
8388

89+
private enum OutputFormat {
90+
PLAINTEXT, CONLLU
91+
}
92+
93+
private OutputFormat outputFormat;
94+
8495
@Override
8596
public Iterator<List<CoreLabel>> getIterator(Reader r) {
8697
return factory.getIterator(r);
@@ -108,6 +119,13 @@ public void init(SeqClassifierFlags flags) {
108119
String[] dicts2 = flags.dictionary2.split(",");
109120
cdict2 = new ChineseDictionary(dicts2, cdtos, flags.expandMidDot);
110121
}
122+
123+
if (flags.outputFormat != null) {
124+
outputFormat = OutputFormat.valueOf(flags.outputFormat.toUpperCase(Locale.ROOT));
125+
logger.info("Output format: " + outputFormat);
126+
} else {
127+
outputFormat = OutputFormat.PLAINTEXT;
128+
}
111129
}
112130

113131

@@ -309,13 +327,55 @@ private static void addDictionaryFeatures(ChineseDictionary dict, Class<? extend
309327
}
310328
}
311329

312-
@Override
313-
public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
330+
private void printPlainTextAnswer(List<CoreLabel> doc, PrintWriter pw) {
314331
String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
315332
pw.print(ansStr);
316333
pw.println();
317334
}
318335

336+
/**
337+
* Prints a fake Conllu document for use in the conllu tokenization scoring scripts
338+
*/
339+
private void printConlluAnswer(List<CoreLabel> doc, PrintWriter pw) {
340+
String ansStr = ChineseStringUtils.combineSegmentedSentence(doc, flags);
341+
pw.print("# text = " + ansStr);
342+
pw.println();
343+
344+
List<String> words = StringUtils.split(ansStr);
345+
int idx = 0;
346+
for (String word : words) {
347+
idx = idx + 1;
348+
pw.print(idx + "\t" + word);
349+
// 4 _ - print blanks for lemma & tags
350+
pw.print("\t_\t_\t_\t_\t");
351+
pw.print(idx - 1);
352+
pw.print("\t");
353+
if (idx == 1) {
354+
pw.print("root");
355+
} else {
356+
pw.print("dep");
357+
}
358+
pw.print("\t_\t_");
359+
pw.println();
360+
}
361+
362+
pw.println();
363+
}
364+
365+
@Override
366+
public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
367+
switch (outputFormat) {
368+
case PLAINTEXT:
369+
printPlainTextAnswer(doc, pw);
370+
break;
371+
case CONLLU:
372+
printConlluAnswer(doc, pw);
373+
break;
374+
default:
375+
throw new IllegalArgumentException("Unknown outputFormat: " + outputFormat);
376+
}
377+
}
378+
319379

320380
private static String intern(String s) {
321381
return s.trim().intern();

0 commit comments

Comments (0)