Skip to content

Commit a29abc5

Browse files
committed
Add a main program to NonDict2 which rebuilds the dictionary from a segmenter training file
1 parent c70ddec commit a29abc5

File tree

3 files changed

+89
-5
lines changed

3 files changed

+89
-5
lines changed

src/edu/stanford/nlp/ie/AbstractSequenceClassifier.java

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -252,14 +252,14 @@ public DocumentReaderAndWriter<IN> makeReaderAndWriter() {
252252
* edu.stanford.nlp.wordseg.Sighan2005DocumentReaderAndWriter for
253253
* the Chinese Segmenter.
254254
*/
255-
public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
255+
public static <INN extends CoreMap> DocumentReaderAndWriter<INN> makePlainTextReaderAndWriter(SeqClassifierFlags flags) {
256256
String readerClassName = flags.plainTextDocumentReaderAndWriter;
257257
// We set this default here if needed because there may be models
258258
// which don't have the reader flag set
259259
if (readerClassName == null) {
260260
readerClassName = SeqClassifierFlags.DEFAULT_PLAIN_TEXT_READER;
261261
}
262-
DocumentReaderAndWriter<IN> readerAndWriter;
262+
DocumentReaderAndWriter<INN> readerAndWriter;
263263
try {
264264
readerAndWriter = ReflectionLoading.loadByReflection(readerClassName);
265265
} catch (Exception e) {
@@ -269,6 +269,10 @@ public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
269269
return readerAndWriter;
270270
}
271271

272+
public DocumentReaderAndWriter<IN> makePlainTextReaderAndWriter() {
273+
return makePlainTextReaderAndWriter(flags);
274+
}
275+
272276
/**
273277
* Returns the background class for the classifier.
274278
*

src/edu/stanford/nlp/sequences/SeqClassifierFlags.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,6 +1081,8 @@ public enum SlashHyphenEnum { NONE, WFRAG, WORD, BOTH };
10811081
*/
10821082
public boolean useMoreNeighborNGrams = false;
10831083

1084+
/** If using dict2 in a segmenter, load it from this filename (NonDict2 resolves it under {@code corporaDict + "/dict/"}). */
1085+
public String dict2name = "";
10841086

10851087
// "ADD VARIABLES ABOVE HERE"
10861088

@@ -2634,6 +2636,8 @@ public void setProperties(Properties props, boolean printProps) {
26342636
} else if (key.equalsIgnoreCase("ner.model")) {
26352637
nerModel = val;
26362638
} else if (key.equalsIgnoreCase("sutime.language")) {
2639+
} else if (key.equalsIgnoreCase("dict2name")) {
2640+
dict2name = val;
26372641
// ADD VALUE ABOVE HERE
26382642
} else if ( ! key.isEmpty() && ! key.equals("prop")) {
26392643
log.info("Unknown property: |" + key + '|');

src/edu/stanford/nlp/wordseg/NonDict2.java

Lines changed: 79 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,21 @@
11
package edu.stanford.nlp.wordseg;
22

3+
import java.io.FileReader;
4+
import java.io.IOException;
5+
import java.io.PrintWriter;
6+
import java.util.HashSet;
7+
import java.util.Iterator;
8+
import java.util.List;
9+
import java.util.Properties;
10+
import java.util.Set;
311

4-
import edu.stanford.nlp.util.logging.Redwood;
5-
12+
import edu.stanford.nlp.ie.AbstractSequenceClassifier;
13+
import edu.stanford.nlp.io.IOUtils;
14+
import edu.stanford.nlp.ling.CoreLabel;
15+
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
616
import edu.stanford.nlp.sequences.SeqClassifierFlags;
17+
import edu.stanford.nlp.util.StringUtils;
18+
import edu.stanford.nlp.util.logging.Redwood;
719

820
public class NonDict2 {
921

@@ -23,7 +35,10 @@ public NonDict2(SeqClassifierFlags flags) {
2335
}
2436

2537
String path;
26-
if (flags.useAs || flags.useHk || flags.useMsr) {
38+
if (flags.dict2name != null && !flags.dict2name.equals("")) {
39+
path = corporaDict + "/dict/" + flags.dict2name;
40+
logger.info("INFO: dict2name specified | building NonDict2 from "+path);
41+
} else if (flags.useAs || flags.useHk || flags.useMsr) {
2742
throw new RuntimeException("only support settings for CTB and PKU now.");
2843
} else if ( flags.usePk ) {
2944
path = corporaDict+"/dict/pku.non";
@@ -43,4 +58,65 @@ public String checkDic(String c2, SeqClassifierFlags flags) {
4358
return "0";
4459
}
4560

61+
/**
62+
* Rebuilds a non-dict. Use -textFile and -outputFile as appropriate.
63+
* Uses SeqClassifierFlags so that specific flags for the reader can be honored.
64+
*/
65+
public static void main(String[] args) throws IOException {
66+
Properties props = StringUtils.argsToProperties(args, SeqClassifierFlags.flagsToNumArgs());
67+
68+
/*
69+
// TODO: refactor this into a util?
70+
// TODO: whitespace reader
71+
boolean foundReader = false;
72+
for (String propKey : props.stringPropertyNames()) {
73+
if (propKey.equalsIgnoreCase("plainTextDocumentReaderAndWriter")) {
74+
foundReader = true;
75+
break;
76+
}
77+
}
78+
if (!foundReader) {
79+
// this doesn't exist
80+
props.setProperty("plainTextDocumentReaderAndWriter", "edu.stanford.nlp.sequences.WhitespaceDocumentReaderAndWriter");
81+
}
82+
*/
83+
84+
SeqClassifierFlags flags = new SeqClassifierFlags(props);
85+
86+
String inputFilename = flags.textFile;
87+
String outputFilename = flags.outputFile;
88+
89+
DocumentReaderAndWriter<CoreLabel> readerAndWriter = AbstractSequenceClassifier.makePlainTextReaderAndWriter(flags);
90+
readerAndWriter.init(flags);
91+
92+
Set<String> splitBigrams = new HashSet<>();
93+
94+
FileReader fin = new FileReader(inputFilename);
95+
// for some weird syntax reason this can't take the place of ': iterable'
96+
Iterable<List<CoreLabel>> iterable = () -> readerAndWriter.getIterator(fin);
97+
List<CoreLabel> prevSentence = null;
98+
for (List<CoreLabel> sentence : iterable) {
99+
for (int i = 0; i < sentence.size() - 1; ++i) {
100+
String prevWord = sentence.get(i).value();
101+
String nextWord = sentence.get(i+1).value();
102+
String bigram = prevWord.substring(prevWord.length() - 1) + nextWord.substring(0, 1);
103+
splitBigrams.add(bigram);
104+
}
105+
if (prevSentence != null) {
106+
String prevWord = prevSentence.get(prevSentence.size() - 1).value();
107+
String nextWord = sentence.get(0).value();
108+
String bigram = prevWord.substring(prevWord.length() - 1) + nextWord.substring(0, 1);
109+
splitBigrams.add(bigram);
110+
}
111+
prevSentence = sentence;
112+
}
113+
fin.close();
114+
115+
PrintWriter fout = IOUtils.getPrintWriter(outputFilename, "utf-8");
116+
for (String bigram : splitBigrams) {
117+
fout.print(bigram);
118+
fout.println();
119+
}
120+
fout.close();
121+
}
46122
}

0 commit comments

Comments
 (0)