Fixed problems with shifted baselines so recognition can recover from layout analysis errors.
tesseract-ocr · May 12, 2015 · b6d0184 · b6d0184
1 parent 4a3caef
commit b6d0184
Show file tree

Hide file tree

Showing 8 changed files with 186 additions and 97 deletions.
diff --git a/ccmain/control.cpp b/ccmain/control.cpp
@@ -1044,45 +1044,77 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
   int original_misfits = CountMisfitTops(word);
   if (original_misfits == 0)
     return false;
-  float new_x_ht = ComputeCompatibleXheight(word);
-  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
-    WERD_RES new_x_ht_word(word->word);
-    if (word->blamer_bundle != NULL) {
-      new_x_ht_word.blamer_bundle = new BlamerBundle();
-      new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
-    }
-    new_x_ht_word.x_height = new_x_ht;
-    new_x_ht_word.caps_height = 0.0;
-    new_x_ht_word.SetupForRecognition(
-          unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
-          classify_bln_numeric_mode, textord_use_cjk_fp_model,
-          poly_allow_detailed_fx, row, block);
-    match_word_pass_n(2, &new_x_ht_word, row, block);
-    if (!new_x_ht_word.tess_failed) {
-      int new_misfits = CountMisfitTops(&new_x_ht_word);
-      if (debug_x_ht_level >= 1) {
-        tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
-                original_misfits, word->x_height,
-                new_misfits, new_x_ht);
-        tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
-                word->best_choice->rating(), word->best_choice->certainty(),
-                new_x_ht_word.best_choice->rating(),
-                new_x_ht_word.best_choice->certainty());
-      }
-      // The misfits must improve and either the rating or certainty.
-      accept_new_x_ht = new_misfits < original_misfits &&
-                        (new_x_ht_word.best_choice->certainty() >
-                            word->best_choice->certainty() ||
-                         new_x_ht_word.best_choice->rating() <
-                            word->best_choice->rating());
-      if (debug_x_ht_level >= 1) {
-        ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
+  float baseline_shift = 0.0f;
+  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
+  if (baseline_shift != 0.0f) {
+    // Try the shift on its own first.
+    if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
+                              word, block, row))
+      return false;
+    original_misfits = CountMisfitTops(word);
+    if (original_misfits > 0) {
+      float new_baseline_shift;
+      // Now recompute the new x_height.
+      new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
+      if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
+        // No test of return value here, as we are definitely making a change
+        // to the word by shifting the baseline.
+        TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
+                             word, block, row);
       }
     }
-    if (accept_new_x_ht) {
-      word->ConsumeWordResults(&new_x_ht_word);
-      return true;
+    return true;
+  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
+    return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
+                                word, block, row);
+  } else {
+    return false;
+  }
+}
+
+// Runs recognition with the test baseline shift and x-height and returns true
+// if there was an improvement in recognition result.
+bool Tesseract::TestNewNormalization(int original_misfits,
+                                     float baseline_shift, float new_x_ht,
+                                     WERD_RES *word, BLOCK* block, ROW *row) {
+  bool accept_new_x_ht = false;
+  WERD_RES new_x_ht_word(word->word);
+  if (word->blamer_bundle != NULL) {
+    new_x_ht_word.blamer_bundle = new BlamerBundle();
+    new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
+  }
+  new_x_ht_word.x_height = new_x_ht;
+  new_x_ht_word.baseline_shift = baseline_shift;
+  new_x_ht_word.caps_height = 0.0;
+  new_x_ht_word.SetupForRecognition(
+        unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
+        classify_bln_numeric_mode, textord_use_cjk_fp_model,
+      poly_allow_detailed_fx, row, block);
+  match_word_pass_n(2, &new_x_ht_word, row, block);
+  if (!new_x_ht_word.tess_failed) {
+    int new_misfits = CountMisfitTops(&new_x_ht_word);
+    if (debug_x_ht_level >= 1) {
+      tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
+              original_misfits, word->x_height,
+              new_misfits, new_x_ht);
+      tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
+              word->best_choice->rating(), word->best_choice->certainty(),
+              new_x_ht_word.best_choice->rating(),
+              new_x_ht_word.best_choice->certainty());
     }
+    // The misfits must improve and either the rating or certainty.
+    accept_new_x_ht = new_misfits < original_misfits &&
+                      (new_x_ht_word.best_choice->certainty() >
+                          word->best_choice->certainty() ||
+                       new_x_ht_word.best_choice->rating() <
+                          word->best_choice->rating());
+    if (debug_x_ht_level >= 1) {
+      ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
+    }
+  }
+  if (accept_new_x_ht) {
+    word->ConsumeWordResults(&new_x_ht_word);
+    return true;
   }
   return false;
 }
@@ -1380,13 +1412,13 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) {
     return FALSE;
 
   tessedit_rejection_debug.set_value (FALSE);
-  debug_x_ht_level.set_value (0);
+  debug_x_ht_level.set_value(0);
 
   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
     if (location < 0)
       return TRUE;               // For breakpoint use
     tessedit_rejection_debug.set_value (TRUE);
-    debug_x_ht_level.set_value (20);
+    debug_x_ht_level.set_value(2);
     tprintf ("\n\nTESTWD::");
     switch (location) {
       case 0:

diff --git a/ccmain/fixxht.cpp b/ccmain/fixxht.cpp
@@ -35,6 +35,8 @@ namespace tesseract {
 // guessed that the blob tops are caps and will have placed the xheight too low.
 // 3. Noise/logos beside words, or changes in font size on a line. Such
 // things can blow the statistics and cause an incorrect estimate.
+// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
+// In this case the x-height is often still correct.
 //
 // Algorithm.
 // Compare the vertical position (top only) of alphnumerics in a word with
@@ -54,6 +56,10 @@ namespace tesseract {
 // even if the x-height is incorrect. This is not a terrible assumption, but
 // it is not great. An improvement would be to use a classifier that does
 // not care about vertical position or scaling at all.
+// Separately collect stats on shifted baselines and apply the same logic to
+// computing a best-fit shift to fix the error. If the baseline needs to be
+// shifted, but the x-height is OK, returns the original x-height along with
+// the baseline shift to indicate that recognition needs to re-run.
 
 // If the max-min top of a unicharset char is bigger than kMaxCharTopRange
 // then the char top cannot be used to judge misfits or suggest a new top.
@@ -92,65 +98,108 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) {
 
 // Returns a new x-height maximally compatible with the result in word_res.
 // See comment above for overall algorithm.
-float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
+float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res,
+                                          float* baseline_shift) {
   STATS top_stats(0, MAX_UINT8);
+  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
+  int bottom_shift = 0;
   int num_blobs = word_res->rebuild_word->NumBlobs();
-  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
-    TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
-    UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
-    if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
-      int top = blob->bounding_box().top();
-      // Clip the top to the limit of normalized feature space.
-      if (top >= INT_FEAT_RANGE)
-        top = INT_FEAT_RANGE - 1;
-      int bottom = blob->bounding_box().bottom();
-      int min_bottom, max_bottom, min_top, max_top;
-      unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
-                                &min_top, &max_top);
-      // Chars with a wild top range would mess up the result so ignore them.
-      if (max_top - min_top > kMaxCharTopRange)
-        continue;
-      int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
-                          top - (max_top + x_ht_acceptance_tolerance));
-      int height = top - kBlnBaselineOffset;
-      if (debug_x_ht_level >= 20) {
-        tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ",
-                unicharset.id_to_unichar(class_id),
-                height, min_bottom, max_bottom, min_top, max_top,
-                bottom, top);
-      }
-      // Use only chars that fit in the expected bottom range, and where
-      // the range of tops is sensibly near the xheight.
-      if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
-          bottom - x_ht_acceptance_tolerance <= max_bottom &&
-          min_top > kBlnBaselineOffset &&
-          max_top - kBlnBaselineOffset >= kBlnXHeight &&
-          misfit_dist > 0) {
-        // Compute the x-height position using proportionality between the
-        // actual height and expected height.
-        int min_xht = DivRounded(height * kBlnXHeight,
-                                 max_top - kBlnBaselineOffset);
-        int max_xht = DivRounded(height * kBlnXHeight,
-                                 min_top - kBlnBaselineOffset);
-        if (debug_x_ht_level >= 20) {
-          tprintf(" xht range min=%d, max=%d\n",
-                  min_xht, max_xht);
+  do {
+    top_stats.clear();
+    shift_stats.clear();
+    for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
+      TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
+      UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
+      if (unicharset.get_isalpha(class_id) ||
+          unicharset.get_isdigit(class_id)) {
+        int top = blob->bounding_box().top() + bottom_shift;
+        // Clip the top to the limit of normalized feature space.
+        if (top >= INT_FEAT_RANGE)
+          top = INT_FEAT_RANGE - 1;
+        int bottom = blob->bounding_box().bottom() + bottom_shift;
+        int min_bottom, max_bottom, min_top, max_top;
+        unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
+                                  &min_top, &max_top);
+        // Chars with a wild top range would mess up the result so ignore them.
+        if (max_top - min_top > kMaxCharTopRange)
+          continue;
+        int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
+                            top - (max_top + x_ht_acceptance_tolerance));
+        int height = top - kBlnBaselineOffset;
+        if (debug_x_ht_level >= 2) {
+          tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
+                  unicharset.id_to_unichar(class_id),
+                  height, min_bottom, max_bottom, min_top, max_top,
+                  bottom, top);
+        }
+        // Use only chars that fit in the expected bottom range, and where
+        // the range of tops is sensibly near the xheight.
+        if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
+            bottom - x_ht_acceptance_tolerance <= max_bottom &&
+            min_top > kBlnBaselineOffset &&
+            max_top - kBlnBaselineOffset >= kBlnXHeight &&
+            misfit_dist > 0) {
+          // Compute the x-height position using proportionality between the
+          // actual height and expected height.
+          int min_xht = DivRounded(height * kBlnXHeight,
+                                   max_top - kBlnBaselineOffset);
+          int max_xht = DivRounded(height * kBlnXHeight,
+                                   min_top - kBlnBaselineOffset);
+          if (debug_x_ht_level >= 2) {
+            tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
+          }
+          // The range of expected heights gets a vote equal to the distance
+          // of the actual top from the expected top.
+          for (int y = min_xht; y <= max_xht; ++y)
+            top_stats.add(y, misfit_dist);
+        } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
+                    bottom - x_ht_acceptance_tolerance > max_bottom) &&
+                   bottom_shift == 0) {
+          // Get the range of required bottom shift.
+          int min_shift = min_bottom - bottom;
+          int max_shift = max_bottom - bottom;
+          if (debug_x_ht_level >= 2) {
+            tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
+          }
+          // The range of expected shifts gets a vote equal to the min distance
+          // of the actual bottom from the expected bottom, spread over the
+          // range of its acceptance.
+          int misfit_weight = abs(min_shift);
+          if (max_shift > min_shift)
+            misfit_weight /= max_shift - min_shift;
+          for (int y = min_shift; y <= max_shift; ++y)
+            shift_stats.add(y, misfit_weight);
+        } else {
+          if (bottom_shift == 0) {
+            // Things with bottoms that are already ok need to say so, on the
+            // 1st iteration only.
+            shift_stats.add(0, kBlnBaselineOffset);
+          }
+          if (debug_x_ht_level >= 2) {
+            tprintf(" already OK\n");
+          }
         }
-        // The range of expected heights gets a vote equal to the distance
-        // of the actual top from the expected top.
-        for (int y = min_xht; y <= max_xht; ++y)
-          top_stats.add(y, misfit_dist);
-      } else if (debug_x_ht_level >= 20) {
-        tprintf(" already OK\n");
       }
     }
+    if (shift_stats.get_total() > top_stats.get_total()) {
+      bottom_shift = IntCastRounded(shift_stats.median());
+      if (debug_x_ht_level >= 2) {
+        tprintf("Applying bottom shift=%d\n", bottom_shift);
+      }
+    }
+  } while (bottom_shift != 0 &&
+           top_stats.get_total() < shift_stats.get_total());
+  // Baseline shift is opposite sign to the bottom shift.
+  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
+  if (debug_x_ht_level >= 2) {
+    tprintf("baseline shift=%g\n", *baseline_shift);
   }
   if (top_stats.get_total() == 0)
-    return 0.0f;
+    return bottom_shift != 0 ? word_res->x_height : 0.0f;
   // The new xheight is just the median vote, which is then scaled out
   // of BLN space back to pixel space to get the x-height in pixel space.
   float new_xht = top_stats.median();
-  if (debug_x_ht_level >= 20) {
+  if (debug_x_ht_level >= 2) {
     tprintf("Median xht=%f\n", new_xht);
     tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
             new_xht, new_xht / word_res->denorm.y_scale());
@@ -159,7 +208,7 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) {
   if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
     return new_xht / word_res->denorm.y_scale();
   else
-    return 0.0f;
+    return bottom_shift != 0 ? word_res->x_height : 0.0f;
 }
 
 }  // namespace tesseract
diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h
@@ -350,6 +350,11 @@ class Tesseract : public Wordrec {
                           WERD_RES* word, WERD_RES* new_word);
   bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row);
   bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row);
+  // Runs recognition with the test baseline shift and x-height and returns true
+  // if there was an improvement in recognition result.
+  bool TestNewNormalization(int original_misfits, float baseline_shift,
+                            float new_x_ht, WERD_RES *word, BLOCK* block,
+                            ROW *row);
   BOOL8 recog_interactive(PAGE_RES_IT* pr_it);
 
   // Set fonts of this word.
@@ -729,7 +734,7 @@ class Tesseract : public Wordrec {
   // maximally compatible with the result in word_res.
   // Returns 0.0f if no x-height is found that is better than the current
   // estimate.
-  float ComputeCompatibleXheight(WERD_RES *word_res);
+  float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift);
   //// Data members ///////////////////////////////////////////////////////
   // TODO(ocr-team): Find and remove obsolete parameters.
   BOOL_VAR_H(tessedit_resegment_from_boxes, false,

diff --git a/ccstruct/blobs.cpp b/ccstruct/blobs.cpp
@@ -805,8 +805,8 @@ TWERD* TWERD::PolygonalCopy(bool allow_detailed_fx, WERD* src) {
 // Baseline normalizes the blobs in-place, recording the normalization in the
 // DENORMs in the blobs.
 void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
-                        bool inverse, float x_height, bool numeric_mode,
-                        tesseract::OcrEngineMode hint,
+                        bool inverse, float x_height, float baseline_shift,
+                        bool numeric_mode, tesseract::OcrEngineMode hint,
                         const TBOX* norm_box,
                         DENORM* word_denorm) {
   TBOX word_box = bounding_box();
@@ -822,7 +822,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
     if (hint == tesseract::OEM_CUBE_ONLY)
       scale = 1.0f;
   } else {
-    input_y_offset = row->base_line(word_middle);
+    input_y_offset = row->base_line(word_middle) + baseline_shift;
   }
   for (int b = 0; b < blobs.size(); ++b) {
     TBLOB* blob = blobs[b];
@@ -835,7 +835,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix,
       blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()),
                                scale, scale * 1.5f);
     } else if (row != NULL && hint != tesseract::OEM_CUBE_ONLY) {
-      baseline = row->base_line(mid_x);
+      baseline = row->base_line(mid_x) + baseline_shift;
     }
     // The image will be 8-bit grey if the input was grey or color. Note that in
     // a grey image 0 is black and 255 is white. If the input was binary, then

diff --git a/ccstruct/blobs.h b/ccstruct/blobs.h
@@ -410,7 +410,7 @@ struct TWERD {
   // Baseline normalizes the blobs in-place, recording the normalization in the
   // DENORMs in the blobs.
   void BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, bool inverse,
-                   float x_height, bool numeric_mode,
+                   float x_height, float baseline_shift, bool numeric_mode,
                    tesseract::OcrEngineMode hint,
                    const TBOX* norm_box,
                    DENORM* word_denorm);

diff --git a/ccstruct/normalis.cpp b/ccstruct/normalis.cpp
@@ -487,7 +487,7 @@ void DENORM::XHeightRange(int unichar_id, const UNICHARSET& unicharset,
       top > kBlnCellHeight - kBlnBaselineOffset / 2)
     max_top += kBlnBaselineOffset;
   top -= bln_yshift;
-  int height = top - kBlnBaselineOffset - bottom_shift;
+  int height = top - kBlnBaselineOffset;
   double min_height = min_top - kBlnBaselineOffset - tolerance;
   double max_height = max_top - kBlnBaselineOffset + tolerance;