@@ -31,9 +31,11 @@ CTB7_ALL=/u/nlp/data/chinese/ctb7/seg/ctb7-seg-with-extra.txt
31
31
CTB7_TRAIN =/u/nlp/data/chinese/ctb7/seg/ctb7-seg.train.txt
32
32
33
33
# Locations of the CTB9 segmentation data used by the ctb9.* rules.
# This change replaces the single CTB9_ALL variable with a CTB9_TRAIN /
# CTB9_TEST pair. CTB9_TRAIN keeps pointing at the same
# ctb9-seg-with-extra.txt file that CTB9_ALL named; CTB9_TEST names
# ctb9.test.txt.
# When CHINESE_SEGMENTER_HOME is not set, the fixed /u/nlp/data paths are used;
# otherwise both files are looked up under $(CHINESE_SEGMENTER_HOME).
# NOTE(review): the spaces inside "$(CHINESE_SEGMENTER_HOME ) /..." look like an
# artefact of how this diff was extracted -- confirm against the real Makefile,
# since a space there would split the path.
ifndef CHINESE_SEGMENTER_HOME
34
- CTB9_ALL =/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
34
+ CTB9_TRAIN =/u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
35
+ CTB9_TEST =/u/nlp/data/chinese/ctb9/seg/ctb9.test.txt
35
36
else
36
- CTB9_ALL =$(CHINESE_SEGMENTER_HOME ) /ctb9-seg-with-extra.txt
37
+ CTB9_TRAIN =$(CHINESE_SEGMENTER_HOME ) /ctb9-seg-with-extra.txt
38
+ CTB9_TEST =$(CHINESE_SEGMENTER_HOME ) /ctb9.test.txt
37
39
endif
38
40
39
41
# Special prerelease segmentation data from Bolt. Do not release publicly!
@@ -71,11 +73,13 @@ ctb7.train.chris6.ser.gz: dict-chris6.ser.gz
71
73
72
74
# train on the CTB9 training data + extras, with all external lexicons, without training lexicon; then run the model on the CTB9 test file
73
75
# Builds the serialized CTB9 segmenter model from dict-chris6.ser.gz.
# Step 1 (train): runs CRFClassifier with ctb9-chris6.prop, passes the
#   prerequisite as -serDictionary ($+ expands to all prerequisites, here only
#   dict-chris6.ser.gz), reads -trainFile from $(CTB9_TRAIN) (previously
#   $(CTB9_ALL)), and serializes the model to the target ($@).
#   stdout goes to <target>.log and stderr to <target>.err, truncating both.
# Step 2 (added by this change): loads the model just written ($@) and runs it
#   on -testFile $(CTB9_TEST) with SIGHAN post-processing; its output is
#   appended (>> / 2>>) to the same .log/.err files so the training output is kept.
# NOTE(review): a failure in step 2 fails this target even though the model
#   file has already been written -- confirm that is the intended behaviour.
ctb9.train.chris6.ser.gz : dict-chris6.ser.gz
74
- time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -trainFile $(CTB9_ALL ) -serializeTo $@ > $@ .log 2> $@ .err
76
+ time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -trainFile $(CTB9_TRAIN ) -serializeTo $@ > $@ .log 2> $@ .err
77
+ time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -loadClassifier $@ -testFile $(CTB9_TEST ) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@ .log 2>> $@ .err
75
78
76
79
# train on the CTB9 training data + extras, with all external lexicons, without training lexicon; use -featureDiffThresh to make the model smaller, then run it on the CTB9 test file
77
80
# Builds the smaller CTB9 segmenter model from dict-chris6.ser.gz.
# Step 1 (train): the same CRFClassifier training command as the
#   ctb9.train.chris6.ser.gz rule, with the extra option -featureDiffThresh 0.005;
#   -trainFile now reads $(CTB9_TRAIN) instead of $(CTB9_ALL). The model is
#   serialized to the target ($@); stdout/stderr go to <target>.log / <target>.err.
# Step 2 (added by this change): loads the model just written and runs it on
#   -testFile $(CTB9_TEST), appending (>> / 2>>) to the same .log/.err files.
# $+ expands to all prerequisites, which here is only dict-chris6.ser.gz.
ctb9.train-small.chris6.ser.gz : dict-chris6.ser.gz
78
- time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -featureDiffThresh 0.005 -trainFile $(CTB9_ALL ) -serializeTo $@ > $@ .log 2> $@ .err
81
+ time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -featureDiffThresh 0.005 -trainFile $(CTB9_TRAIN ) -serializeTo $@ > $@ .log 2> $@ .err
82
+ time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -loadClassifier $@ -testFile $(CTB9_TEST ) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@ .log 2>> $@ .err
79
83
80
84
# train on all CTB7, with all external lexicons, without training lexicon
81
85
bolt.chris6.ser.gz : dict-chris6.ser.gz
0 commit comments