1
1
package edu .stanford .nlp .wordseg ;
2
2
3
+ import java .io .FileReader ;
4
+ import java .io .IOException ;
5
+ import java .io .PrintWriter ;
6
+ import java .util .HashSet ;
7
+ import java .util .Iterator ;
8
+ import java .util .List ;
9
+ import java .util .Properties ;
10
+ import java .util .Set ;
3
11
4
- import edu .stanford .nlp .util .logging .Redwood ;
5
-
12
+ import edu .stanford .nlp .ie .AbstractSequenceClassifier ;
13
+ import edu .stanford .nlp .io .IOUtils ;
14
+ import edu .stanford .nlp .ling .CoreLabel ;
15
+ import edu .stanford .nlp .sequences .DocumentReaderAndWriter ;
6
16
import edu .stanford .nlp .sequences .SeqClassifierFlags ;
17
+ import edu .stanford .nlp .util .StringUtils ;
18
+ import edu .stanford .nlp .util .logging .Redwood ;
7
19
8
20
public class NonDict2 {
9
21
@@ -23,7 +35,10 @@ public NonDict2(SeqClassifierFlags flags) {
23
35
}
24
36
25
37
String path ;
26
- if (flags .useAs || flags .useHk || flags .useMsr ) {
38
+ if (flags .dict2name != null && !flags .dict2name .equals ("" )) {
39
+ path = corporaDict + "/dict/" + flags .dict2name ;
40
+ logger .info ("INFO: dict2name specified | building NonDict2 from " +path );
41
+ } else if (flags .useAs || flags .useHk || flags .useMsr ) {
27
42
throw new RuntimeException ("only support settings for CTB and PKU now." );
28
43
} else if ( flags .usePk ) {
29
44
path = corporaDict +"/dict/pku.non" ;
@@ -43,4 +58,65 @@ public String checkDic(String c2, SeqClassifierFlags flags) {
43
58
return "0" ;
44
59
}
45
60
61
+ /**
62
+ * Rebuilds a non-dict. Use -textFile and -outputFile as appropriate.
63
+ * Uses SeqClassifierFlags so that specific flags for the reader can be honored.
64
+ */
65
+ public static void main (String [] args ) throws IOException {
66
+ Properties props = StringUtils .argsToProperties (args , SeqClassifierFlags .flagsToNumArgs ());
67
+
68
+ /*
69
+ // TODO: refactor this into a util?
70
+ // TODO: whitespace reader
71
+ boolean foundReader = false;
72
+ for (String propKey : props.stringPropertyNames()) {
73
+ if (propKey.equalsIgnoreCase("plainTextDocumentReaderAndWriter")) {
74
+ foundReader = true;
75
+ break;
76
+ }
77
+ }
78
+ if (!foundReader) {
79
+ // this doesn't exist
80
+ props.setProperty("plainTextDocumentReaderAndWriter", "edu.stanford.nlp.sequences.WhitespaceDocumentReaderAndWriter");
81
+ }
82
+ */
83
+
84
+ SeqClassifierFlags flags = new SeqClassifierFlags (props );
85
+
86
+ String inputFilename = flags .textFile ;
87
+ String outputFilename = flags .outputFile ;
88
+
89
+ DocumentReaderAndWriter <CoreLabel > readerAndWriter = AbstractSequenceClassifier .makePlainTextReaderAndWriter (flags );
90
+ readerAndWriter .init (flags );
91
+
92
+ Set <String > splitBigrams = new HashSet <>();
93
+
94
+ FileReader fin = new FileReader (inputFilename );
95
+ // for some weird syntax reason this can't take the place of ': iterable'
96
+ Iterable <List <CoreLabel >> iterable = () -> readerAndWriter .getIterator (fin );
97
+ List <CoreLabel > prevSentence = null ;
98
+ for (List <CoreLabel > sentence : iterable ) {
99
+ for (int i = 0 ; i < sentence .size () - 1 ; ++i ) {
100
+ String prevWord = sentence .get (i ).value ();
101
+ String nextWord = sentence .get (i +1 ).value ();
102
+ String bigram = prevWord .substring (prevWord .length () - 1 ) + nextWord .substring (0 , 1 );
103
+ splitBigrams .add (bigram );
104
+ }
105
+ if (prevSentence != null ) {
106
+ String prevWord = prevSentence .get (prevSentence .size () - 1 ).value ();
107
+ String nextWord = sentence .get (0 ).value ();
108
+ String bigram = prevWord .substring (prevWord .length () - 1 ) + nextWord .substring (0 , 1 );
109
+ splitBigrams .add (bigram );
110
+ }
111
+ prevSentence = sentence ;
112
+ }
113
+ fin .close ();
114
+
115
+ PrintWriter fout = IOUtils .getPrintWriter (outputFilename , "utf-8" );
116
+ for (String bigram : splitBigrams ) {
117
+ fout .print (bigram );
118
+ fout .println ();
119
+ }
120
+ fout .close ();
121
+ }
46
122
}
0 commit comments