@@ -33,9 +33,15 @@ CTB7_TRAIN=/u/nlp/data/chinese/ctb7/seg/ctb7-seg.train.txt
33
33
# Locations of the CTB9 and GSD segmenter training/test files.
# If CHINESE_SEGMENTER_HOME is unset or empty, fall back to the default
# /u/nlp paths; otherwise look for the same file names directly under
# $(CHINESE_SEGMENTER_HOME).
# NOTE(review): the default GSD files are read from the ctb9/seg directory --
# confirm that is where they are actually stored.
ifndef CHINESE_SEGMENTER_HOME
  CTB9_TRAIN = /u/nlp/data/chinese/ctb9/seg/ctb9-seg-with-extra.txt
  CTB9_TEST = /u/nlp/data/chinese/ctb9/seg/ctb9.test.txt

  GSD_TRAIN = /u/nlp/data/chinese/ctb9/seg/zh_gsdsimp.train.seg.txt
  GSD_TEST = /u/nlp/data/chinese/ctb9/seg/zh_gsdsimp.test.seg.txt
else
  CTB9_TRAIN = $(CHINESE_SEGMENTER_HOME)/ctb9-seg-with-extra.txt
  CTB9_TEST = $(CHINESE_SEGMENTER_HOME)/ctb9.test.txt

  GSD_TRAIN = $(CHINESE_SEGMENTER_HOME)/zh_gsdsimp.train.seg.txt
  GSD_TEST = $(CHINESE_SEGMENTER_HOME)/zh_gsdsimp.test.seg.txt
endif
40
46
41
47
# Special prerelease segmentation data from Bolt. Do not release publicly!
@@ -81,6 +87,11 @@ ctb9.train-small.chris6.ser.gz: dict-chris6.ser.gz
81
87
time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -featureDiffThresh 0.005 -trainFile $(CTB9_TRAIN ) -serializeTo $@ > $@ .log 2> $@ .err
82
88
time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -loadClassifier $@ -testFile $(CTB9_TEST ) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@ .log 2>> $@ .err
83
89
90
# Train on the GSD training file (GSD_TRAIN), with all external lexicons and
# without the training lexicon, then run the saved model on GSD_TEST.
# The training run creates $@.log (stdout) and $@.err (stderr); the test run
# appends to the same two files.
gsd.ser.gz: dict-chris6.ser.gz
	time java -mx60g edu.stanford.nlp.ie.crf.CRFClassifier -prop ctb9-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -trainFile $(GSD_TRAIN) -serializeTo $@ > $@.log 2> $@.err
	time java -mx5g edu.stanford.nlp.ie.crf.CRFClassifier -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT) -loadClassifier $@ -testFile $(GSD_TEST) -inputEncoding UTF-8 -sighanPostProcessing true -serDictionary $+ -keepAllWhitespaces false >> $@.log 2>> $@.err
94
+
84
95
# train on all CTB7, with all external lexicons, without training lexicon
85
96
bolt.chris6.ser.gz : dict-chris6.ser.gz
86
97
time java -mx15g edu.stanford.nlp.ie.crf.CRFClassifier -prop $(DIR ) /ctb6-chris6.prop -serDictionary $+ -sighanCorporaDict $(SIGHAN2007_CORPORA_DICT ) -trainFile $(BOLT ) -serializeTo $@ > $@ .log 2> $@ .err
0 commit comments