Skip to content

Commit eee6259

Browse files
committed
Add a command line for a GSD segmenter
1 parent a0ff942 commit eee6259

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

scripts/chinese-segmenter/Makefile

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,15 @@ CTB7_TRAIN=/u/nlp/data/chinese/ctb7/seg/ctb7-seg.train.txt
3333
# Select where the CTB9 and GSD segmentation train/test files are read from.
# When CHINESE_SEGMENTER_HOME is unset (or empty), fall back to the fixed
# paths under /u/nlp/data/chinese/ctb9/seg; otherwise look for the same file
# names directly inside $(CHINESE_SEGMENTER_HOME).
ifndef CHINESE_SEGMENTER_HOME
3434
CTB9_TRAIN=/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
3535
CTB9_TEST=/u/nlp/data/chinese/ctb9/seg/ctb9.test.txt
36+
37+
# NOTE(review): the default GSD files sit in the ctb9/seg directory -- confirm
# this is the intended location rather than a copy/paste of the CTB9 path.
GSD_TRAIN=/u/nlp/data/chinese/ctb9/seg/zh_gsdsimp.train.seg.txt
38+
GSD_TEST=/u/nlp/data/chinese/ctb9/seg/zh_gsdsimp.test.seg.txt
3639
else
3740
CTB9_TRAIN=$(CHINESE_SEGMENTER_HOME)/ctb9-seg-with-extra.txt
3841
CTB9_TEST=$(CHINESE_SEGMENTER_HOME)/ctb9.test.txt
42+
43+
GSD_TRAIN=$(CHINESE_SEGMENTER_HOME)/zh_gsdsimp.train.seg.txt
44+
GSD_TEST=$(CHINESE_SEGMENTER_HOME)/zh_gsdsimp.test.seg.txt
3945
endif
4046

4147
# Special prerelease segmentation data from Bolt. Do not release publicly!
@@ -81,6 +87,11 @@ ctb9.train-small.chris6.ser.gz: dict-chris6.ser.gz
8187
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -featureDiffThresh 0.005 -trainFile $(CTB9_TRAIN) -serializeTo $@ > $@.log 2> $@.err
8288
time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -loadClassifier $@ -testFile $(CTB9_TEST) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@.log 2>> $@.err
8389

90+
# train on the GSD training set, with all external lexicons, without training lexicon
#
# Two steps, both wrapped in `time`:
#   1. Train a CRFClassifier from $(GSD_TRAIN) using ctb9-chris6.prop and
#      serialize the model to $@ (gsd.ser.gz).
#   2. Load that model and run it on $(GSD_TEST) with SIGHAN post-processing.
# $+ expands to the rule's prerequisites, i.e. dict-chris6.ser.gz, which is
# passed as -serDictionary in both steps.
# stdout/stderr go to $@.log / $@.err: step 1 truncates them (>, 2>) and
# step 2 appends (>>, 2>>), so each file holds the output of both steps.
# NOTE(review): the ctb9.train-small.chris6 recipe passes
# -featureDiffThresh 0.005 when training; this rule does not -- confirm that
# the omission is intentional.
91+
gsd.ser.gz: dict-chris6.ser.gz
92+
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -trainFile $(GSD_TRAIN) -serializeTo $@ > $@.log 2> $@.err
93+
time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -loadClassifier $@ -testFile $(GSD_TEST) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@.log 2>> $@.err
94+
8495
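# train on the $(BOLT) data (NOTE(review): this comment previously said "all CTB7", but the recipe passes -trainFile $(BOLT) -- confirm), with all external lexicons, without training lexicon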
8596
bolt.chris6.ser.gz: dict-chris6.ser.gz
8697
time java -mx15g edu.stanford.nlp.ie.crf.CRFClassifier -prop $(DIR)/ctb6-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -trainFile $(BOLT) -serializeTo $@ > $@.log 2> $@.err

0 commit comments

Comments
 (0)