Skip to content

Commit

Permalink
Fixed issue 1252: Refactored LearnBlob and its call hierarchy to make…
Browse files Browse the repository at this point in the history
… it a member of Classify.

Eliminated the flexfx scheme for calling global feature extractor functions
through an array of function pointers.
Deleted dead code I found as a by-product.
This CL does not change BlobToTrainingSample or ExtractFeatures to be full
members of Classify (the eventual goal) as that would make it even bigger,
since there are a lot of callers to these functions.
When ExtractFeatures and BlobToTrainingSample are members of Classify they
will be able to access control parameters in Classify, which will greatly
simplify developing variations to the feature extraction process.
  • Loading branch information
theraysmith committed May 12, 2015
1 parent e735a90 commit 53fc445
Show file tree
Hide file tree
Showing 31 changed files with 220 additions and 745 deletions.
29 changes: 25 additions & 4 deletions api/baseapi.cpp
Expand Up @@ -51,6 +51,7 @@
#include "allheaders.h"

#include "baseapi.h"
#include "blobclass.h"
#include "resultiterator.h"
#include "mutableiterator.h"
#include "thresholder.h"
Expand Down Expand Up @@ -870,7 +871,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
page_res_ = NULL;
return -1;
} else if (tesseract_->tessedit_train_from_boxes) {
tesseract_->ApplyBoxTraining(*output_file_, page_res_);
STRING fontname;
ExtractFontName(*output_file_, &fontname);
tesseract_->ApplyBoxTraining(fontname, page_res_);
} else if (tesseract_->tessedit_ambigs_training) {
FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
// OCR the page segmented into words by tesseract.
Expand Down Expand Up @@ -1051,6 +1054,23 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
return true;
}

// Master ProcessPages calls ProcessPagesInternal and then does any post-
// processing required due to being in a training mode.
bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer) {
bool result =
ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
if (result) {
if (tesseract_->tessedit_train_from_boxes &&
!tesseract_->WriteTRFile(*output_file_)) {
tprintf("Write of TR file failed: %s\n", output_file_->string());
return false;
}
}
return result;
}

// In the ideal scenario, Tesseract will start working on data as soon
// as it can. For example, if you steam a filelist through stdin, we
// should start the OCR process as soon as the first filename is
Expand All @@ -1063,9 +1083,10 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
// identify the scenario that really matters: filelists on
// stdin. We'll still do our best if the user likes pipes. That means
// piling up any data coming into stdin into a memory buffer.
bool TessBaseAPI::ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer) {
bool TessBaseAPI::ProcessPagesInternal(const char* filename,
const char* retry_config,
int timeout_millisec,
TessResultRenderer* renderer) {
PERF_COUNT_START("ProcessPages")
bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
if (stdInput) {
Expand Down
8 changes: 5 additions & 3 deletions api/baseapi.h
Expand Up @@ -538,9 +538,11 @@ class TESS_API TessBaseAPI {
*
* Returns true if successful, false on error.
*/
bool ProcessPages(const char* filename,
const char* retry_config, int timeout_millisec,
TessResultRenderer* renderer);
bool ProcessPages(const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);
// Does the real work of ProcessPages.
bool ProcessPagesInternal(const char* filename, const char* retry_config,
int timeout_millisec, TessResultRenderer* renderer);

/**
* Turn a single image into symbolic text.
Expand Down
6 changes: 3 additions & 3 deletions ccmain/applybox.cpp
Expand Up @@ -775,13 +775,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
}

// Calls LearnWord to extract features for labelled blobs within each word.
// Features are written to the given filename.
void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) {
// Features are stored in an internal buffer.
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
int word_count = 0;
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
word_res = pr_it.forward()) {
LearnWord(filename.string(), word_res);
LearnWord(fontname.string(), word_res);
++word_count;
}
tprintf("Generated training data for %d words\n", word_count);
Expand Down
37 changes: 17 additions & 20 deletions classify/adaptmatch.cpp
Expand Up @@ -220,17 +220,15 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg,

// Learns the given word using its chopped_word, seam_array, denorm,
// box_word, best_state, and correct_text to learn both correctly and
// incorrectly segmented blobs. If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob
// is called and the data will be saved in an internal buffer.
// Otherwise AdaptToBlob is called for adaption within a document.
// If rejmap is not NULL, then only chars with a rejmap entry of '1' will
// be learned, otherwise all chars with good correct_text are learned.
void Classify::LearnWord(const char* filename, WERD_RES *word) {
void Classify::LearnWord(const char* fontname, WERD_RES* word) {
int word_len = word->correct_text.size();
if (word_len == 0) return;

float* thresholds = NULL;
if (filename == NULL) {
if (fontname == NULL) {
// Adaption mode.
if (!EnableLearning || word->best_choice == NULL)
return; // Can't or won't adapt.
Expand Down Expand Up @@ -267,8 +265,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (word->correct_text[ch].length() > 0) {
float threshold = thresholds != NULL ? thresholds[ch] : 0.0f;

LearnPieces(filename, start_blob, word->best_state[ch],
threshold, CST_WHOLE, word->correct_text[ch].string(), word);
LearnPieces(fontname, start_blob, word->best_state[ch], threshold,
CST_WHOLE, word->correct_text[ch].string(), word);

if (word->best_state[ch] > 1 && !disable_character_fragments) {
// Check that the character breaks into meaningful fragments
Expand Down Expand Up @@ -301,8 +299,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (i != tokens.size() - 1)
full_string += ' ';
}
LearnPieces(filename, start_blob + frag, 1,
threshold, CST_FRAGMENT, full_string.string(), word);
LearnPieces(fontname, start_blob + frag, 1, threshold,
CST_FRAGMENT, full_string.string(), word);
}
}
}
Expand All @@ -314,13 +312,13 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (word->best_state[ch] > 1) {
// If the next blob is good, make junk with the rightmost fragment.
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
LearnPieces(filename, start_blob + word->best_state[ch] - 1,
LearnPieces(fontname, start_blob + word->best_state[ch] - 1,
word->best_state[ch + 1] + 1,
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
}
// If the previous blob is good, make junk with the leftmost fragment.
if (ch > 0 && word->correct_text[ch - 1].length() > 0) {
LearnPieces(filename, start_blob - word->best_state[ch - 1],
LearnPieces(fontname, start_blob - word->best_state[ch - 1],
word->best_state[ch - 1] + 1,
threshold, CST_IMPROPER, INVALID_UNICHAR, word);
}
Expand All @@ -329,7 +327,7 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {
if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) {
STRING joined_text = word->correct_text[ch];
joined_text += word->correct_text[ch + 1];
LearnPieces(filename, start_blob,
LearnPieces(fontname, start_blob,
word->best_state[ch] + word->best_state[ch + 1],
threshold, CST_NGRAM, joined_text.string(), word);
}
Expand All @@ -342,16 +340,16 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) {

// Builds a blob of length fragments, from the word, starting at start,
// and then learns it, as having the given correct_text.
// If filename is not NULL, then LearnBlob
// is called and the data will be written to a file for static training.
// If fontname is not NULL, then LearnBlob is called and the data will be
// saved in an internal buffer for static training.
// Otherwise AdaptToBlob is called for adaption within a document.
// threshold is a magic number required by AdaptToChar and generated by
// ComputeAdaptionThresholds.
// Although it can be partly inferred from the string, segmentation is
// provided to explicitly clarify the character segmentation.
void Classify::LearnPieces(const char* filename, int start, int length,
void Classify::LearnPieces(const char* fontname, int start, int length,
float threshold, CharSegmentationType segmentation,
const char* correct_text, WERD_RES *word) {
const char* correct_text, WERD_RES* word) {
// TODO(daria) Remove/modify this if/when we want
// to train and/or adapt to n-grams.
if (segmentation != CST_WHOLE &&
Expand Down Expand Up @@ -385,16 +383,15 @@ void Classify::LearnPieces(const char* filename, int start, int length,
}
#endif // GRAPHICS_DISABLED

if (filename != NULL) {
if (fontname != NULL) {
classify_norm_method.set_value(character); // force char norm spc 30/11/93
tess_bn_matching.set_value(false); // turn it off
tess_cn_matching.set_value(false);
DENORM bl_denorm, cn_denorm;
INT_FX_RESULT_STRUCT fx_info;
SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm,
&bl_denorm, &cn_denorm, &fx_info);
LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm,
fx_info, correct_text);
LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text);
} else if (unicharset.contains_unichar(correct_text)) {
UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text);
int font_id = word->fontinfo != NULL
Expand Down
136 changes: 57 additions & 79 deletions classify/blobclass.cpp
Expand Up @@ -20,111 +20,89 @@
Include Files and Type Defines
----------------------------------------------------------------------------**/
#include "blobclass.h"
#include "extract.h"
#include "efio.h"
#include "featdefs.h"
#include "callcpp.h"

#include <math.h>
#include <stdio.h>
#include <signal.h>

#define MAXFILENAME 80
#define MAXMATCHES 10
#include "classify.h"
#include "efio.h"
#include "featdefs.h"
#include "mf.h"
#include "normfeat.h"

static const char kUnknownFontName[] = "UnknownFont";

STRING_VAR(classify_font_name, kUnknownFontName,
"Default font name to be used in training");

/**----------------------------------------------------------------------------
Global Data Definitions and Declarations
----------------------------------------------------------------------------**/
/* name of current image file being processed */
extern char imagefile[];

namespace tesseract {
/**----------------------------------------------------------------------------
Public Code
----------------------------------------------------------------------------**/

/*---------------------------------------------------------------------------*/
// As all TBLOBs, Blob is in baseline normalized coords.
// See SetupBLCNDenorms in intfx.cpp for other args.
void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename,
TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) {
/*
** Parameters:
** Blob blob whose micro-features are to be learned
** Row row of text that blob came from
** BlobText text that corresponds to blob
** TextLength number of characters in blob
** Globals:
** imagefile base filename of the page being learned
** classify_font_name
** name of font currently being trained on
** Operation:
** Extract micro-features from the specified blob and append
** them to the appropriate file.
** Return: none
** Exceptions: none
** History: 7/28/89, DSJ, Created.
*/
#define TRAIN_SUFFIX ".tr"
static FILE *FeatureFile = NULL;
STRING Filename(filename);

// If no fontname was set, try to extract it from the filename
STRING CurrFontName = classify_font_name;
if (CurrFontName == kUnknownFontName) {
// Finds the name of the training font and returns it in fontname, by cutting
// it out based on the expectation that the filename is of the form:
// /path/to/dir/[lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
// If the global parameter classify_font_name is set, its value is used instead.
void ExtractFontName(const STRING& filename, STRING* fontname) {
*fontname = classify_font_name;
if (*fontname == kUnknownFontName) {
// filename is expected to be of the form [lang].[fontname].exp[num]
// The [lang], [fontname] and [num] fields should not have '.' characters.
const char *basename = strrchr(filename.string(), '/');
const char *firstdot = strchr(basename ? basename : filename.string(), '.');
const char *lastdot = strrchr(filename.string(), '.');
if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
++firstdot;
CurrFontName = firstdot;
CurrFontName[lastdot - firstdot] = '\0';
*fontname = firstdot;
fontname->truncate_at(lastdot - firstdot);
}
}
}

// if a feature file is not yet open, open it
// the name of the file is the name of the image plus TRAIN_SUFFIX
if (FeatureFile == NULL) {
Filename += TRAIN_SUFFIX;
FeatureFile = Efopen(Filename.string(), "wb");
cprintf("TRAINING ... Font name = %s\n", CurrFontName.string());
}

LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info,
BlobText, CurrFontName.string());
} // LearnBlob

void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile,
TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* BlobText, const char* FontName) {
CHAR_DESC CharDesc;

ASSERT_HOST(FeatureFile != NULL);

CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info,
Blob);
if (CharDesc == NULL) {
cprintf("LearnBLob: CharDesc was NULL. Aborting.\n");
return;
}

if (ValidCharDescription(FeatureDefs, CharDesc)) {
// label the features with a class name and font name
fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText);
/*---------------------------------------------------------------------------*/
// Extracts features from the given blob and saves them in the tr_file_data_
// member variable.
// fontname: Name of font that this blob was printed in.
// cn_denorm: Character normalization transformation to apply to the blob.
// fx_info: Character normalization parameters computed with cn_denorm.
// blob_text: Ground truth text for the blob.
void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
const DENORM& cn_denorm,
const INT_FX_RESULT_STRUCT& fx_info,
const char* blob_text) {
CHAR_DESC CharDesc = NewCharDescription(feature_defs_);
CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);

if (ValidCharDescription(feature_defs_, CharDesc)) {
// Label the features with a class name and font name.
tr_file_data_ += "\n";
tr_file_data_ += fontname;
tr_file_data_ += " ";
tr_file_data_ += blob_text;
tr_file_data_ += "\n";

// write micro-features to file and clean up
WriteCharDescription(FeatureDefs, FeatureFile, CharDesc);
WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
} else {
tprintf("Blob learned was invalid!\n");
}
FreeCharDescription(CharDesc);

} // LearnBlob

// Writes stored training data to a .tr file based on the given filename.
// Returns false on error.
bool Classify::WriteTRFile(const STRING& filename) {
STRING tr_filename = filename + ".tr";
FILE* fp = Efopen(tr_filename.string(), "wb");
int len = tr_file_data_.length();
bool result =
fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
fclose(fp);
tr_file_data_.truncate_at(0);
return result;
}

} // namespace tesseract.

0 comments on commit 53fc445

Please sign in to comment.