Skip to content

Commit d8ce461

Browse files
committed
Add an argument for augmenting fewer sentences. Also, change the name to something more understandable
1 parent 79dfde2 commit d8ce461

File tree

4 files changed

+18
-7
lines changed

4 files changed

+18
-7
lines changed

scripts/srparser/Makefile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,7 @@ TRAIN_ORACLE_10 = -l1Reg 0.10 -featureFrequencyCutoff 10 -trainingMethod REORDER_ORACLE
114114
TRAIN_ORACLE_25 = -l1Reg 0.25 -featureFrequencyCutoff 25 -trainingMethod REORDER_ORACLE
115115

116116
SHARDS_5 = -retrainShards 5
117+
AUGMENT_LESS = -augmentSubsentences 0.1
117118

118119
all-beam: wsjSR.beam.ser.gz englishSR.beam.ser.gz frenchSR.beam.ser.gz chineseSR.beam.ser.gz germanSR.beam.ser.gz arabicSR.beam.ser.gz spanishSR.beam.ser.gz
119120
all-nobeam: wsjSR.ser.gz englishSR.ser.gz frenchSR.ser.gz chineseSR.ser.gz germanSR.ser.gz arabicSR.ser.gz spanishSR.ser.gz
@@ -135,13 +136,13 @@ wsjSR.beam.ser.gz:
135136
englishSR.ser.gz:
136137
@echo Training $@
137138
@echo Will test on $(ENGLISH_TEST)
138-
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_ORACLE_25) > $@.out 2>&1
139+
java -mx10g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_ORACLE_25) $(AUGMENT_LESS) > $@.out 2>&1
139140
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(ENGLISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(ENGLISH_TAGGER) >> $@.out 2>&1
140141

141142
englishSR.beam.ser.gz:
142143
@echo Training $@
143144
@echo Will test on $(ENGLISH_TEST)
144-
java -mx50g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_BEAM) > $@.out 2>&1
145+
java -mx50g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -trainTreebank $(ENGLISH_TRAIN) -devTreebank $(ENGLISH_DEV) -serializedPath $@ $(DEFAULT_OPTIONS) -preTag -taggerSerializedFile $(ENGLISH_TAGGER) -tlpp $(ENGLISH_TLPP) $(TRAIN_BEAM) $(AUGMENT_LESS) > $@.out 2>&1
145146
java -mx5g edu.stanford.nlp.parser.shiftreduce.ShiftReduceParser -testTreebank $(ENGLISH_TEST) -serializedPath $@ -preTag -taggerSerializedFile $(ENGLISH_TAGGER) >> $@.out 2>&1
146147

147148
frenchSR.ser.gz:

src/edu/stanford/nlp/parser/shiftreduce/PerceptronModel.java

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -476,12 +476,10 @@ private double evaluate(Tagger tagger, Treebank devTreebank, String message) {
476476
/**
477477
* This increases f1 slightly, probably by letting the parser know
478478
* what to do in situations it doesn't get to during the training.
479-
* <br>
480-
* TODO: make constants out of 10, 0.5, etc
481479
*/
482-
static void augmentData(List<TrainingExample> augmentedData, List<TrainingExample> trainingData, Random random) {
480+
static void augmentSubsentences(List<TrainingExample> augmentedData, List<TrainingExample> trainingData, Random random, float augmentFraction) {
483481
for (TrainingExample example : trainingData) {
484-
if (example.transitions.size() > 10 && random.nextDouble() < 0.5) {
482+
if (example.transitions.size() > 10 && random.nextDouble() < augmentFraction) {
485483
int pivot = random.nextInt(example.transitions.size() - 10) + 7;
486484
augmentedData.add(new TrainingExample(example.binarizedTree, example.transitions, pivot));
487485
}
@@ -540,9 +538,10 @@ private void trainModel(String serializedPath, Tagger tagger, Random random, Lis
540538
IntCounter<Pair<Integer, Integer>> firstErrors = new IntCounter<>();
541539

542540
List<TrainingExample> augmentedData = new ArrayList<TrainingExample>(trainingData);
543-
augmentData(augmentedData, trainingData, random);
541+
augmentSubsentences(augmentedData, trainingData, random, op.trainOptions().augmentSubsentences);
544542
Collections.shuffle(augmentedData, random);
545543
log.info("Original list " + trainingData.size() + "; augmented " + augmentedData.size());
544+
546545
for (int start = 0; start < augmentedData.size(); start += op.trainOptions.batchSize) {
547546
int end = Math.min(start + op.trainOptions.batchSize, augmentedData.size());
548547
TrainingResult result = trainBatch(augmentedData.subList(start, end), wrapper);

src/edu/stanford/nlp/parser/shiftreduce/ShiftReduceOptions.java

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,9 @@ protected int setOptionFlag(String[] args, int i) {
114114
} else if (args[i].equalsIgnoreCase("-retrainShardFeatureDrop")) {
115115
trainOptions().retrainShardFeatureDrop = Double.parseDouble(args[i + 1]);
116116
i += 2;
117+
} else if (args[i].equalsIgnoreCase("-augmentSubsentences")) {
118+
trainOptions().augmentSubsentences = Float.parseFloat(args[i + 1]);
119+
i += 2;
117120
}
118121
return i;
119122
}

src/edu/stanford/nlp/parser/shiftreduce/ShiftReduceTrainOptions.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,14 @@ public enum TrainingMethod {
6060
*/
6161
public double retrainShardFeatureDrop = 0.25;
6262

63+
/**
64+
* Some training trees will be repeated, with gold transitions given
65+
* for the first several steps to ensure the parser starts from a
66+
* good place. For some datasets, such as the English training set,
67+
* 0.5 is excessively large.
68+
*/
69+
public float augmentSubsentences = 0.5f;
70+
6371
// version id randomly chosen by forgetting to set the version id when serializing models
6472
private static final long serialVersionUID = -8158249539308373819L;
6573
}

0 commit comments

Comments (0)