Fixed issue 1252: Refactored LearnBlob and its call hierarchy to make…

… it a member of Classify. Eliminated the flexfx scheme for calling global feature extractor functions through an array of function pointers. Deleted dead code I found as a by-product. This CL does not change BlobToTrainingSample or ExtractFeatures to be full members of Classify (the eventual goal) as that would make it even bigger, since there are a lot of callers to these functions. When ExtractFeatures and BlobToTrainingSample are members of Classify they will be able to access control parameters in Classify, which will greatly simplify developing variations to the feature extraction process.
tesseract-ocr · May 12, 2015 · 53fc445 · 53fc445
1 parent e735a90
commit 53fc445
Show file tree

Hide file tree

Showing 31 changed files with 220 additions and 745 deletions.
diff --git a/api/baseapi.cpp b/api/baseapi.cpp
@@ -51,6 +51,7 @@
 #include "allheaders.h"
 
 #include "baseapi.h"
+#include "blobclass.h"
 #include "resultiterator.h"
 #include "mutableiterator.h"
 #include "thresholder.h"
@@ -870,7 +871,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
     page_res_ = NULL;
     return -1;
   } else if (tesseract_->tessedit_train_from_boxes) {
-    tesseract_->ApplyBoxTraining(*output_file_, page_res_);
+    STRING fontname;
+    ExtractFontName(*output_file_, &fontname);
+    tesseract_->ApplyBoxTraining(fontname, page_res_);
   } else if (tesseract_->tessedit_ambigs_training) {
     FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
     // OCR the page segmented into words by tesseract.
@@ -1051,6 +1054,23 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
   return true;
 }
 
+// Master ProcessPages calls ProcessPagesInternal and then does any post-
+// processing required due to being in a training mode.
+bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
+                               int timeout_millisec,
+                               TessResultRenderer* renderer) {
+  bool result =
+      ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
+  if (result) {  // Post-processing only runs when page processing succeeded.
+    if (tesseract_->tessedit_train_from_boxes &&  // Box-training mode only.
+        !tesseract_->WriteTRFile(*output_file_)) {  // Not called unless the flag above is set.
+      tprintf("Write of TR file failed: %s\n", output_file_->string());  // NOTE(review): prints the base name; WriteTRFile writes to that name plus ".tr".
+      return false;  // A failed TR write turns an otherwise successful run into a failure.
+    }
+  }
+  return result;  // Otherwise the result of ProcessPagesInternal is passed through.
+}
+
 // In the ideal scenario, Tesseract will start working on data as soon
 // as it can. For example, if you stream a filelist through stdin, we
 // should start the OCR process as soon as the first filename is
@@ -1063,9 +1083,10 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
 // identify the scenario that really matters: filelists on
 // stdin. We'll still do our best if the user likes pipes.  That means
 // piling up any data coming into stdin into a memory buffer.
-bool TessBaseAPI::ProcessPages(const char* filename,
-                               const char* retry_config, int timeout_millisec,
-                               TessResultRenderer* renderer) {
+bool TessBaseAPI::ProcessPagesInternal(const char* filename,
+                                       const char* retry_config,
+                                       int timeout_millisec,
+                                       TessResultRenderer* renderer) {
   PERF_COUNT_START("ProcessPages")
   bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
   if (stdInput) {

diff --git a/api/baseapi.h b/api/baseapi.h
@@ -538,9 +538,11 @@ class TESS_API TessBaseAPI {
    *
    * Returns true if successful, false on error.
    */
-  bool ProcessPages(const char* filename,
-                    const char* retry_config, int timeout_millisec,
-                    TessResultRenderer* renderer);
+  bool ProcessPages(const char* filename, const char* retry_config,
+                    int timeout_millisec, TessResultRenderer* renderer);
+  // Does the real work of ProcessPages.
+  bool ProcessPagesInternal(const char* filename, const char* retry_config,
+                            int timeout_millisec, TessResultRenderer* renderer);
 
   /**
    * Turn a single image into symbolic text.

diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp
@@ -775,13 +775,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
 }
 
 // Calls LearnWord to extract features for labelled blobs within each word.
-// Features are written to the given filename.
-void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
+// Features are stored in an internal buffer.
+void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   int word_count = 0;
   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
        word_res = pr_it.forward()) {
-    LearnWord(filename.string(), word_res);
+    LearnWord(fontname.string(), word_res);
     ++word_count;
   }
   tprintf("Generated training data for %d words\n", word_count);

diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp
@@ -220,17 +220,15 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,
 
 // Learns the given word using its chopped_word, seam_array, denorm,
 // box_word, best_state, and correct_text to learn both correctly and
-// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
-// is called and the data will be written to a file for static training.
+// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
+// is called and the data will be saved in an internal buffer.
 // Otherwise AdaptToBlob is called for adaption within a document.
-// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
-// be learned, otherwise all chars with good correct_text are learned.
-void Classify::LearnWord(const char* filename, WERD_RES *word) {
+void Classify::LearnWord(const char* fontname, WERD_RES* word) {
   int word_len = word->correct_text.size();
   if (word_len == 0) return;
 
   float* thresholds = NULL;
-  if (filename == NULL) {
+  if (fontname == NULL) {
     // Adaption mode.
     if (!EnableLearning || word->best_choice == NULL)
       return;  // Can't or won't adapt.
@@ -267,8 +265,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
     if (word->correct_text[ch].length() > 0) {
       float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;
 
-      LearnPieces(filename, start_blob, word->best_state[ch],
-                  threshold, CST_WHOLE, word->correct_text[ch].string(), word);
+      LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
+                  CST_WHOLE, word->correct_text[ch].string(), word);
 
       if (word->best_state[ch] > 1 && !disable_character_fragments) {
         // Check that the character breaks into meaningful fragments
@@ -301,8 +299,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
                 if (i != tokens.size() - 1)
                   full_string += ' ';
               }
-              LearnPieces(filename, start_blob + frag, 1,
-                          threshold, CST_FRAGMENT, full_string.string(), word);
+              LearnPieces(fontname, start_blob + frag, 1, threshold,
+                          CST_FRAGMENT, full_string.string(), word);
             }
           }
         }
@@ -314,13 +312,13 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
       if (word->best_state[ch] > 1) {
         // If the next blob is good, make junk with the rightmost fragment.
         if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
-          LearnPieces(filename, start_blob + word->best_state[ch] - 1,
+          LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
                       word->best_state[ch + 1] + 1,
                       threshold, CST_IMPROPER, INVALID_UNICHAR, word);
         }
         // If the previous blob is good, make junk with the leftmost fragment.
         if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
-          LearnPieces(filename, start_blob - word->best_state[ch - 1],
+          LearnPieces(fontname, start_blob - word->best_state[ch - 1],
                       word->best_state[ch - 1] + 1,
                       threshold, CST_IMPROPER, INVALID_UNICHAR, word);
         }
@@ -329,7 +327,7 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
       if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
         STRING joined_text = word->correct_text[ch];
         joined_text += word->correct_text[ch + 1];
-        LearnPieces(filename, start_blob,
+        LearnPieces(fontname, start_blob,
                     word->best_state[ch] + word->best_state[ch + 1],
                     threshold, CST_NGRAM, joined_text.string(), word);
       }
@@ -342,16 +340,16 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
 
 // Builds a blob of length fragments, from the word, starting at start,
 // and then learns it, as having the given correct_text.
-// If filename is not NULL, then LearnBlob
-// is called and the data will be written to a file for static training.
+// If fontname is not NULL, then LearnBlob is called and the data will be
+// saved in an internal buffer for static training.
 // Otherwise AdaptToBlob is called for adaption within a document.
 // threshold is a magic number required by AdaptToChar and generated by
 // ComputeAdaptionThresholds.
 // Although it can be partly inferred from the string, segmentation is
 // provided to explicitly clarify the character segmentation.
-void Classify::LearnPieces(const char* filename, int start, int length,
+void Classify::LearnPieces(const char* fontname, int start, int length,
                            float threshold, CharSegmentationType segmentation,
-                           const char* correct_text, WERD_RES *word) {
+                           const char* correct_text, WERD_RES* word) {
   // TODO(daria) Remove/modify this if/when we want
   // to train and/or adapt to n-grams.
   if (segmentation != CST_WHOLE &&
@@ -385,16 +383,15 @@ void Classify::LearnPieces(const char* filename, int start, int length,
   }
   #endif  // GRAPHICS_DISABLED
 
-  if (filename != NULL) {
+  if (fontname != NULL) {
     classify_norm_method.set_value(character);  // force char norm spc 30/11/93
     tess_bn_matching.set_value(false);    // turn it off
     tess_cn_matching.set_value(false);
     DENORM bl_denorm, cn_denorm;
     INT_FX_RESULT_STRUCT fx_info;
     SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
                      &bl_denorm, &cn_denorm, &fx_info);
-    LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm,
-              fx_info, correct_text);
+    LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
   } else if (unicharset.contains_unichar(correct_text)) {
     UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
     int font_id = word->fontinfo != NULL

diff --git a/classify/blobclass.cpp b/classify/blobclass.cpp
@@ -20,111 +20,89 @@
       Include Files and Type Defines
 ----------------------------------------------------------------------------**/
 #include "blobclass.h"
-#include "extract.h"
-#include "efio.h"
-#include "featdefs.h"
-#include "callcpp.h"
 
-#include <math.h>
 #include <stdio.h>
-#include <signal.h>
 
-#define MAXFILENAME             80
-#define MAXMATCHES              10
+#include "classify.h"
+#include "efio.h"
+#include "featdefs.h"
+#include "mf.h"
+#include "normfeat.h"
 
 static const char kUnknownFontName[] = "UnknownFont";
 
 STRING_VAR(classify_font_name, kUnknownFontName,
            "Default font name to be used in training");
 
-/**----------------------------------------------------------------------------
-        Global Data Definitions and Declarations
-----------------------------------------------------------------------------**/
-/* name of current image file being processed */
-extern char imagefile[];
-
+namespace tesseract {
 /**----------------------------------------------------------------------------
             Public Code
 ----------------------------------------------------------------------------**/
-
-/*---------------------------------------------------------------------------*/
-// As all TBLOBs, Blob is in baseline normalized coords.
-// See SetupBLCNDenorms in intfx.cpp for other args.
-void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
-               TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
-               const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
-/*
- **      Parameters:
- **              Blob            blob whose micro-features are to be learned
- **              Row             row of text that blob came from
- **              BlobText        text that corresponds to blob
- **              TextLength      number of characters in blob
- **      Globals:
- **              imagefile       base filename of the page being learned
- **              classify_font_name
- **                              name of font currently being trained on
- **      Operation:
- **              Extract micro-features from the specified blob and append
- **              them to the appropriate file.
- **      Return: none
- **      Exceptions: none
- **      History: 7/28/89, DSJ, Created.
- */
-#define TRAIN_SUFFIX    ".tr"
-  static FILE *FeatureFile = NULL;
-  STRING Filename(filename);
-
-  // If no fontname was set, try to extract it from the filename
-  STRING CurrFontName = classify_font_name;
-  if (CurrFontName == kUnknownFontName) {
+// Finds the name of the training font and returns it in fontname, by cutting
+// it out based on the expectation that the filename is of the form:
+// /path/to/dir/[lang].[fontname].exp[num]
+// The [lang], [fontname] and [num] fields should not have '.' characters.
+// If the global parameter classify_font_name is set, its value is used instead.
+void ExtractFontName(const STRING& filename, STRING* fontname) {
+  *fontname = classify_font_name;  // Start from the classify_font_name parameter.
+  if (*fontname == kUnknownFontName) {  // Parameter still at its default: derive the name from filename.
     // filename is expected to be of the form [lang].[fontname].exp[num]
     // The [lang], [fontname] and [num] fields should not have '.' characters.
     const char *basename = strrchr(filename.string(), '/');
     const char *firstdot = strchr(basename ? basename : filename.string(), '.');
     const char *lastdot  = strrchr(filename.string(), '.');
     if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
       ++firstdot;
-      CurrFontName = firstdot;
-      CurrFontName[lastdot - firstdot] = '\0';
+      *fontname = firstdot;  // Copy everything after the first dot of the basename.
+      fontname->truncate_at(lastdot - firstdot);  // Cut at the last dot, leaving only the text between the two dots.
     }
   }
+}  // If the two-dot pattern is not found, fontname keeps the parameter value.
 
-  // if a feature file is not yet open, open it
-  // the name of the file is the name of the image plus TRAIN_SUFFIX
-  if (FeatureFile == NULL) {
-    Filename += TRAIN_SUFFIX;
-    FeatureFile = Efopen(Filename.string(), "wb");
-    cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
-  }
-
-  LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
-            BlobText, CurrFontName.string());
-}                                // LearnBlob
-
-void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
-               TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
-               const INT_FX_RESULT_STRUCT& fx_info,
-               const char* BlobText, const char* FontName) {
-  CHAR_DESC CharDesc;
-
-  ASSERT_HOST(FeatureFile != NULL);
-
-  CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
-                                 Blob);
-  if (CharDesc == NULL) {
-    cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
-    return;
-  }
-
-  if (ValidCharDescription(FeatureDefs, CharDesc)) {
-    // label the features with a class name and font name
-    fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
+/*---------------------------------------------------------------------------*/
+// Extracts features from the given blob and saves them in the tr_file_data_
+// member variable.
+// fontname:  Name of font that this blob was printed in.
+// cn_denorm: Character normalization transformation to apply to the blob.
+// fx_info:   Character normalization parameters computed with cn_denorm.
+// blob_text: Ground truth text for the blob.
+void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
+                         const DENORM& cn_denorm,
+                         const INT_FX_RESULT_STRUCT& fx_info,
+                         const char* blob_text) {
+  CHAR_DESC CharDesc = NewCharDescription(feature_defs_);  // NOTE(review): slots 0-3 below are hard-coded; presumably their order must match feature_defs_ - confirm.
+  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);  // Slot 0: micro-features.
+  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);  // Slot 1: char-norm features.
+  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);  // Slot 2: int CN features.
+  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);  // Slot 3: int geo features.
+
+  if (ValidCharDescription(feature_defs_, CharDesc)) {  // Invalid descriptions are reported and dropped, not stored.
+    // Label the features with a class name and font name.
+    tr_file_data_ += "\n";
+    tr_file_data_ += fontname;
+    tr_file_data_ += " ";
+    tr_file_data_ += blob_text;
+    tr_file_data_ += "\n";  // Header record appended so far: "\n<fontname> <blob_text>\n".
 
     // Pass the features to WriteCharDescription with tr_file_data_ as the destination; cleanup follows below.
-    WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
+    WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);  // Destination is the member buffer, not a FILE*.
   } else {
     tprintf("Blob learned was invalid!\n");
   }
   FreeCharDescription(CharDesc);
-
 }                                // LearnBlob
+
+// Writes stored training data to a .tr file based on the given filename.
+// Returns false on error.
+bool Classify::WriteTRFile(const STRING& filename) {
+  STRING tr_filename = filename + ".tr";  // Output path is the given name plus a ".tr" suffix.
+  FILE* fp = Efopen(tr_filename.string(), "wb");  // NOTE(review): fp is used unchecked below - confirm Efopen never returns NULL.
+  int len = tr_file_data_.length();  // NOTE(review): int length is compared against fwrite's size_t result below.
+  bool result =
+      fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;  // True only if all len elements were written.
+  fclose(fp);  // NOTE(review): the fclose result is ignored, so a failure at close is not reflected in result.
+  tr_file_data_.truncate_at(0);  // The buffer is emptied whether or not the write succeeded.
+  return result;
+}
+
+}  // namespace tesseract.