1
1
package edu .stanford .nlp .wordseg ;
2
2
3
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;
import java.io.Reader;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Properties;
import java.util.Set;

import edu.stanford.nlp.ie.AbstractSequenceClassifier;
import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.logging.Redwood;
7
19
8
20
public class NonDict2 {
9
21
10
22
//public String sighanCorporaDict = "/u/nlp/data/chinese-segmenter/";
11
- public String corporaDict = "/u/nlp/data/gale/segtool/stanford-seg/data/" ;
12
- private static CorpusDictionary cd = null ;
23
+ public static final String DEFAULT_HOME = "/u/nlp/data/gale/segtool/stanford-seg/data/" ;
24
+ public final String corporaDict ;
25
+ private final CorpusDictionary cd ;
13
26
14
27
private static Redwood .RedwoodChannels logger = Redwood .channels (NonDict2 .class );
15
28
16
29
public NonDict2 (SeqClassifierFlags flags ) {
17
- if (cd == null ) {
18
-
19
- if (flags .sighanCorporaDict != null ) {
20
- corporaDict = flags .sighanCorporaDict ; // use the same flag for Sighan 2005,
21
- // but our list is extracted from ctb
22
- }
23
- String path ;
24
- if (flags .useAs || flags .useHk || flags .useMsr ) {
25
- throw new RuntimeException ("only support settings for CTB and PKU now." );
26
- } else if ( flags .usePk ) {
27
- path = corporaDict +"/dict/pku.non" ;
28
- } else { // CTB
29
- path = corporaDict +"/dict/ctb.non" ;
30
- }
30
+ if (flags .sighanCorporaDict != null ) {
31
+ corporaDict = flags .sighanCorporaDict ; // use the same flag for Sighan 2005,
32
+ // but our list is extracted from ctb
33
+ } else {
34
+ corporaDict = DEFAULT_HOME ;
35
+ }
31
36
32
- cd = new CorpusDictionary (path );
33
- // just output the msg...
34
- if (flags .useAs || flags .useHk || flags .useMsr ) {
35
- } else if ( flags .usePk ) {
36
- logger .info ("INFO: flags.usePk=true | building NonDict2 from " +path );
37
- } else { // CTB
38
- logger .info ("INFO: flags.usePk=false | building NonDict2 from " +path );
39
- }
37
+ String path ;
38
+ if (flags .dict2name != null && !flags .dict2name .equals ("" )) {
39
+ path = corporaDict + "/dict/" + flags .dict2name ;
40
+ logger .info ("INFO: dict2name specified | building NonDict2 from " +path );
41
+ } else if (flags .useAs || flags .useHk || flags .useMsr ) {
42
+ throw new RuntimeException ("only support settings for CTB and PKU now." );
43
+ } else if ( flags .usePk ) {
44
+ path = corporaDict +"/dict/pku.non" ;
45
+ logger .info ("INFO: flags.usePk=true | building NonDict2 from " +path );
46
+ } else { // CTB
47
+ path = corporaDict +"/dict/ctb.non" ;
48
+ logger .info ("INFO: flags.usePk=false | building NonDict2 from " +path );
40
49
}
50
+
51
+ cd = new CorpusDictionary (path );
41
52
}
42
53
43
54
public String checkDic (String c2 , SeqClassifierFlags flags ) {
@@ -47,4 +58,65 @@ public String checkDic(String c2, SeqClassifierFlags flags) {
47
58
return "0" ;
48
59
}
49
60
61
+ /**
62
+ * Rebuilds a non-dict. Use -textFile and -outputFile as appropriate.
63
+ * Uses SeqClassifierFlags so that specific flags for the reader can be honored.
64
+ */
65
+ public static void main (String [] args ) throws IOException {
66
+ Properties props = StringUtils .argsToProperties (args , SeqClassifierFlags .flagsToNumArgs ());
67
+
68
+ /*
69
+ // TODO: refactor this into a util?
70
+ // TODO: whitespace reader
71
+ boolean foundReader = false;
72
+ for (String propKey : props.stringPropertyNames()) {
73
+ if (propKey.equalsIgnoreCase("plainTextDocumentReaderAndWriter")) {
74
+ foundReader = true;
75
+ break;
76
+ }
77
+ }
78
+ if (!foundReader) {
79
+ // this doesn't exist
80
+ props.setProperty("plainTextDocumentReaderAndWriter", "edu.stanford.nlp.sequences.WhitespaceDocumentReaderAndWriter");
81
+ }
82
+ */
83
+
84
+ SeqClassifierFlags flags = new SeqClassifierFlags (props );
85
+
86
+ String inputFilename = flags .textFile ;
87
+ String outputFilename = flags .outputFile ;
88
+
89
+ DocumentReaderAndWriter <CoreLabel > readerAndWriter = AbstractSequenceClassifier .makePlainTextReaderAndWriter (flags );
90
+ readerAndWriter .init (flags );
91
+
92
+ Set <String > splitBigrams = new HashSet <>();
93
+
94
+ FileReader fin = new FileReader (inputFilename );
95
+ // for some weird syntax reason this can't take the place of ': iterable'
96
+ Iterable <List <CoreLabel >> iterable = () -> readerAndWriter .getIterator (fin );
97
+ List <CoreLabel > prevSentence = null ;
98
+ for (List <CoreLabel > sentence : iterable ) {
99
+ for (int i = 0 ; i < sentence .size () - 1 ; ++i ) {
100
+ String prevWord = sentence .get (i ).value ();
101
+ String nextWord = sentence .get (i +1 ).value ();
102
+ String bigram = prevWord .substring (prevWord .length () - 1 ) + nextWord .substring (0 , 1 );
103
+ splitBigrams .add (bigram );
104
+ }
105
+ if (prevSentence != null ) {
106
+ String prevWord = prevSentence .get (prevSentence .size () - 1 ).value ();
107
+ String nextWord = sentence .get (0 ).value ();
108
+ String bigram = prevWord .substring (prevWord .length () - 1 ) + nextWord .substring (0 , 1 );
109
+ splitBigrams .add (bigram );
110
+ }
111
+ prevSentence = sentence ;
112
+ }
113
+ fin .close ();
114
+
115
+ PrintWriter fout = IOUtils .getPrintWriter (outputFilename , "utf-8" );
116
+ for (String bigram : splitBigrams ) {
117
+ fout .print (bigram );
118
+ fout .println ();
119
+ }
120
+ fout .close ();
121
+ }
50
122
}
0 commit comments