Skip to content

Commit a0ff942

Browse files
committed
Add a line for testing the ctb9 segmenters
1 parent 33386ee commit a0ff942

File tree

1 file changed: 8 additions (+8) and 4 deletions (-4)

scripts/chinese-segmenter/Makefile

Lines changed: 8 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -31,9 +31,11 @@ CTB7_ALL=/u/nlp/data/chinese/ctb7/seg/ctb7-seg-with-extra.txt
3131
CTB7_TRAIN=/u/nlp/data/chinese/ctb7/seg/ctb7-seg.train.txt
3232

3333
ifndef CHINESE_SEGMENTER_HOME
34-
CTB9_ALL=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
34+
CTB9_TRAIN=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
35+
CTB9_TEST=/u/nlp/data/chinese/ctb9/seg/ctb9.test.txt
3536
else
36-
CTB9_ALL=$(CHINESE_SEGMENTER_HOME)/ctb9-seg-with-extra.txt
37+
CTB9_TRAIN=$(CHINESE_SEGMENTER_HOME)/ctb9-seg-with-extra.txt
38+
CTB9_TEST=$(CHINESE_SEGMENTER_HOME)/ctb9.test.txt
3739
endif
3840

3941
# Special prerelease segmentation data from Bolt. Do not release publicly!
@@ -71,11 +73,13 @@ ctb7.train.chris6.ser.gz: dict-chris6.ser.gz
7173

7274
# train on train CTB9 + extras, with all external lexicons, without training lexicon
7375
ctb9.train.chris6.ser.gz: dict-chris6.ser.gz
74-
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -trainFile $(CTB9_ALL) -serializeTo $@ > $@.log 2> $@.err
76+
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -trainFile $(CTB9_TRAIN) -serializeTo $@ > $@.log 2> $@.err
77+
time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -loadClassifier $@ -testFile $(CTB9_TEST) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@.log 2>> $@.err
7578

7679
# train on train CTB9 + extras, with all external lexicons, without training lexicon, use the threshold to make it smaller
7780
ctb9.train-small.chris6.ser.gz: dict-chris6.ser.gz
78-
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -featureDiffThresh 0.005 -trainFile $(CTB9_ALL) -serializeTo $@ > $@.log 2> $@.err
81+
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -featureDiffThresh 0.005 -trainFile $(CTB9_TRAIN) -serializeTo $@ > $@.log 2> $@.err
82+
time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -loadClassifier $@ -testFile $(CTB9_TEST) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@.log 2>> $@.err
7983

8084
# train on all CTB7, with all external lexicons, without training lexicon
8185
bolt.chris6.ser.gz: dict-chris6.ser.gz

0 commit comments

Comments (0)