Skip to content

Commit d8ce461

Browse files
committed
Add an argument for augmenting fewer sentences. Also, change the name to something more understandable
1 parent 79dfde2 commit d8ce461

File tree

4 files changed

+18
-7
lines changed

4 files changed

+18
-7
lines changed

scripts/srparser/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ TRAIN_ORACLE_10 = -l1Reg 0.10 -featureFrequencyCutoff 10 -trainingMethod REORDER_ORACLE
114114
TRAIN_ORACLE_25 = -l1Reg 0.25 -featureFrequencyCutoff 25 -trainingMethod REORDER_ORACLE
115115

116116
SHARDS_5 = -retrainShards 5
117+
AUGMENT_LESS = -augmentSubsentences 0.1
117118

118119
all-beam: wsjSR.beam.ser.gz englishSR.beam.ser.gz frenchSR.beam.ser.gz chineseSR.beam.ser.gz germanSR.beam.ser.gz arabicSR.beam.ser.gz spanishSR.beam.ser.gz
119120
all-nobeam: wsjSR.ser.gz englishSR.ser.gz frenchSR.ser.gz chineseSR.ser.gz germanSR.ser.gz arabicSR.ser.gz spanishSR.ser.gz
@@ -135,13 +136,13 @@ wsjSR.beam.ser.gz:
135136
englishSR.ser.gz:
136137
@echo Training $@
137138
@echo Will test on $(ENGLISH_TEST)
138-
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_ORACLE_25) > $@.out 2>&1
139+
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_ORACLE_25) $(AUGMENT_LESS) > $@.out 2>&1
139140
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(ENGLISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(ENGLISH_TAGGER) >> $@.out 2>&1
140141

141142
englishSR.beam.ser.gz:
142143
@echo Training $@
143144
@echo Will test on $(ENGLISH_TEST)
144-
java -mx50g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_BEAM) > $@.out 2>&1
145+
java -mx50g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_BEAM) $(AUGMENT_LESS) > $@.out 2>&1
145146
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(ENGLISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(ENGLISH_TAGGER) >> $@.out 2>&1
146147

147148
frenchSR.ser.gz:

src/edu/stanford/nlp/parser/shiftreduce/PerceptronModel.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -476,12 +476,10 @@ private double evaluate(Tagger tagger, Treebank devTreebank, String message) {
476476
/**
477477
* This increases f1 slightly, probably by letting the parser know
478478
* what to do in situations it doesn't get to during the training.
479-
* <br>
480-
* TODO: make constants out of 10, 0.5, etc
481479
*/
482-
static void augmentData(List<TrainingExample> augmentedData, List<TrainingExample> trainingData, Random random) {
480+
static void augmentSubsentences(List<TrainingExample> augmentedData, List<TrainingExample> trainingData, Random random, float augmentFraction) {
483481
for (TrainingExample example : trainingData) {
484-
if (example.transitions.size() > 10 && random.nextDouble() < 0.5) {
482+
if (example.transitions.size() > 10 && random.nextDouble() < augmentFraction) {
485483
int pivot = random.nextInt(example.transitions.size() - 10) + 7;
486484
augmentedData.add(new TrainingExample(example.binarizedTree, example.transitions, pivot));
487485
}
@@ -540,9 +538,10 @@ private void trainModel(String serializedPath, Tagger tagger, Random random, Lis
540538
IntCounter<Pair<Integer, Integer>> firstErrors = new IntCounter<>();
541539

542540
List<TrainingExample> augmentedData = new ArrayList<TrainingExample>(trainingData);
543-
augmentData(augmentedData, trainingData, random);
541+
augmentSubsentences(augmentedData, trainingData, random, op.trainOptions().augmentSubsentences);
544542
Collections.shuffle(augmentedData, random);
545543
log.info("Original list " + trainingData.size() + "; augmented " + augmentedData.size());
544+
546545
for (int start = 0; start < augmentedData.size(); start += op.trainOptions.batchSize) {
547546
int end = Math.min(start + op.trainOptions.batchSize, augmentedData.size());
548547
TrainingResult result = trainBatch(augmentedData.subList(start, end), wrapper);

src/edu/stanford/nlp/parser/shiftreduce/ShiftReduceOptions.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ protected int setOptionFlag(String[] args, int i) {
114114
} else if (args[i].equalsIgnoreCase("-retrainShardFeatureDrop")) {
115115
trainOptions().retrainShardFeatureDrop = Double.parseDouble(args[i + 1]);
116116
i += 2;
117+
} else if (args[i].equalsIgnoreCase("-augmentSubsentences")) {
118+
trainOptions().augmentSubsentences = Float.parseFloat(args[i + 1]);
119+
i += 2;
117120
}
118121
return i;
119122
}

src/edu/stanford/nlp/parser/shiftreduce/ShiftReduceTrainOptions.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,14 @@ public enum TrainingMethod {
6060
*/
6161
public double retrainShardFeatureDrop = 0.25;
6262

63+
/**
64+
* Some training trees will be repeated, with gold transitions given
65+
* for the first several steps to ensure the parser starts from a
66+
* good place. For some datasets, such as the English training set,
67+
* 0.5 is excessively large.
68+
*/
69+
public float augmentSubsentences = 0.5f;
70+
6371
// version id randomly chosen by forgetting to set the version id when serializing models
6472
private static final long serialVersionUID = -8158249539308373819L;
6573
}

0 commit comments

Comments (0)