Skip to content

Commit 2df7c92

Browse files
committed
Add a comment on how to make a GSD segmenter
1 parent 1fff2cb commit 2df7c92

File tree

1 file changed

+2
-0
lines changed

1 file changed

+2
-0
lines changed

scripts/chinese-segmenter/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,8 @@ ctb9.train-small.chris6.ser.gz: dict-chris6.ser.gz
8888
time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -loadClassifier $@ -testFile $(CTB9_TEST) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@.log 2>> $@.err
8989

9090
# Train the segmenter on the GSD training split, using all of the external
# lexicons but without a lexicon built from the training data.
#
# Stanza ships a script that converts the UD GSD treebank into a segmenter
# training file:
#   stanza/utils/datasets/corenlp_segmenter_dataset.py
#
# Step 1 trains the CRF and serializes it to the target, starting fresh
# .log/.err files next to it.  Step 2 reloads the serialized model, runs it
# over the GSD test file, and appends its output to those same .log/.err files.
gsd.ser.gz: dict-chris6.ser.gz
	time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier \
		-prop ctb9-chris6.prop \
		-serDictionary $+ \
		-sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) \
		-trainFile $(GSD_TRAIN) \
		-serializeTo $@ \
		> $@.log 2> $@.err
	time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier \
		-sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) \
		-loadClassifier $@ \
		-testFile $(GSD_TEST) \
		-inputEncoding UTF-8 \
		-sighanPostProcessing true \
		-serDictionary $+ \
		-keepAllWhitespaces false \
		>> $@.log 2>> $@.err

0 commit comments

Comments
 (0)