Skip to content

Commit

Permalink
Doxygen
Browse files Browse the repository at this point in the history
Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen
closes #14

Commits:
6317305  doxygen
9f42f69  doxygen
0fc4d52  doxygen
37b4b55  fix typo
bded8f1  some more doxy
020eb00  slight tweak
524666d  doxygenify
2a36a3e  doxygenify
229d218  doxygenify
7fd28ae  doxygenify
a8c64bc  doxygenify
f5d21b6  fix
5d8ede8  doxygenify
a58a4e0  language_model.cpp
fa85709  lm_pain_points.cpp lm_state.cpp
6418da3  merge
06190ba  Merge branch 'old_doxygen_merge' into more-doxygen
84acf08  Merge branch 'master' into more-doxygen
50fe1ff  pagewalk.cpp cube_reco_context.cpp
2982583  change to relative
192a24a  applybox.cpp, take one
8eeb053  delete docs for obsolete params
52e4c77  modernise classify/ocrfeatures.cpp
2a1cba6  modernise cutil/emalloc.cpp
773e006  silence doxygen warning
aeb1731  silence doxygen warning
f18387f  silence doxygen; new params are unused?
15ad6bd  doxygenify cutil/efio.cpp
c8b5dad  doxygenify cutil/danerror.cpp
784450f  the globals and exceptions parts are obsolete; remove
8bca324  doxygen classify/normfeat.cpp
9bcbe16  doxygen classify/normmatch.cpp
aa9a971  doxygen ccmain/cube_control.cpp
c083ff2  doxygen ccmain/cube_reco_context.cpp
f842850  params changed
5c94f12  doxygen ccmain/cubeclassifier.cpp
15ba750  case sensitive
f5c71d4  case sensitive
f85655b  doxygen classify/intproto.cpp
4bbc7aa  partial doxygen classify/mfx.cpp
dbb6041  partial doxygen classify/intproto.cpp
2aa72db  finish doxygen classify/intproto.cpp
0b8de99  doxygen training/mftraining.cpp
0b5b35c  partial doxygen ccstruct/coutln.cpp
b81c766  partial doxygen ccstruct/coutln.cpp
40fc415  finished? doxygen ccstruct/coutln.cpp
6e4165c  doxygen classify/clusttool.cpp
0267dec  doxygen classify/cutoffs.cpp
7f0c70c  doxygen classify/fpoint.cpp
512f3bd  ignore ~ files
5668a52  doxygen classify/intmatcher.cpp
84788d4  doxygen classify/kdtree.cpp
29f36ca  doxygen classify/mfoutline.cpp
40b94b1  silence doxygen warnings
6c511b9  doxygen classify/mfx.cpp
f9b4080  doxygen classify/outfeat.cpp
aa1df05  doxygen classify/picofeat.cpp
cc5f466  doxygen training/cntraining.cpp
cce044f  doxygen training/commontraining.cpp
167e216  missing param
9498383  renamed params
37eeac2  renamed param
d87b5dd  case
c8ee174  renamed params
b858db8  typo
4c2a838  h2 context?
81a2c0c  fix some param names; add some missing params, no docs
bcf8a4c  add some missing params, no docs
af77f86  add some missing params, no docs; fix some param names
01df24e  fix some params
6161056  fix some params
68508b6  fix some params
285aeb6  doxygen complains here no matter what
529bcfa  rm some missing params, typos
cd21226  rm some missing params, add some new ones
48a4bc2  fix params
c844628  missing param
312ce37  missing param; rename one
ec2fdec  missing param
05e15e0  missing params
d515858  change "<" to &lt; to make doxygen happy
b476a28  wrong place
  • Loading branch information
jimregan committed Jul 20, 2015
1 parent 5414087 commit 524a614
Show file tree
Hide file tree
Showing 47 changed files with 2,910 additions and 3,124 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*~
# Windows
*.user
*.log
Expand Down Expand Up @@ -62,4 +63,4 @@ training/wordlist2dawg
# tessdata
*.cube.*
*.tesseract_cube.*
*.traineddata
*.traineddata
139 changes: 78 additions & 61 deletions ccmain/applybox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,18 +36,22 @@
#include "tesseractclass.h"
#include "genericvector.h"

// Max number of blobs to classify together in FindSegmentation.
/** Max number of blobs to classify together in FindSegmentation. */
const int kMaxGroupSize = 4;
// Max fraction of median allowed as deviation in xheight before switching
// to median.
/// Max fraction of median allowed as deviation in xheight before switching
/// to median.
const double kMaxXHeightDeviationFraction = 0.125;

/*************************************************************************
/**
* The box file is assumed to contain box definitions, one per line, of the
* following format for blob-level boxes:
* @verbatim
* <UTF8 str> <left> <bottom> <right> <top> <page id>
* @endverbatim
* and for word/line-level boxes:
* @verbatim
* WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
* @endverbatim
* NOTES:
* The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
*
Expand All @@ -62,13 +66,16 @@ const double kMaxXHeightDeviationFraction = 0.125;
* units in the word/line are listed after the # at the end of the line and
* are space delimited, ignoring any original spaces on the line.
* Eg.
* @verbatim
* word -> #w o r d
* multi word line -> #m u l t i w o r d l i n e
* @endverbatim
* The recognizable units must be space-delimited in order to allow multiple
* unicodes to be used for a single recognizable unit, eg Hindi.
*
* In this mode, the classifier must have been pre-trained with the desired
* character set, or it will not be able to find the character segmentations.
*************************************************************************/
*/

namespace tesseract {

Expand Down Expand Up @@ -181,8 +188,8 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
return xheights.median();
}

// Any row xheight that is significantly different from the median is set
// to the median.
/// Any row xheight that is significantly different from the median is set
/// to the median.
void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
double median_xheight = MedianXHeight(block_list);
double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
Expand All @@ -205,8 +212,8 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
}
}

// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
// All fuzzy spaces are removed, and all the words are maximally chopped.
/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
/// All fuzzy spaces are removed, and all the words are maximally chopped.
PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
BLOCK_LIST *block_list) {
PreenXHeights(block_list);
Expand Down Expand Up @@ -240,9 +247,9 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
return page_res;
}

// Tests the chopper by exhaustively running chop_one_blob.
// The word_res will contain filled chopped_word, seam_array, denorm,
// box_word and best_state for the maximally chopped word.
/// Tests the chopper by exhaustively running chop_one_blob.
/// The word_res will contain filled chopped_word, seam_array, denorm,
/// box_word and best_state for the maximally chopped word.
void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
BLOCK* block, ROW* row,
WERD_RES* word_res) {
Expand Down Expand Up @@ -300,17 +307,17 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
}

// Helper to compute the dispute resolution metric.
// Disputed blob resolution. The aim is to give the blob to the most
// appropriate boxfile box. Most of the time it is obvious, but if
// two boxfile boxes overlap significantly it is not. If a small boxfile
// box takes most of the blob, and a large boxfile box does too, then
// we want the small boxfile box to get it, but if the small box
// is much smaller than the blob, we don't want it to get it.
// Details of the disputed blob resolution:
// Given a box with area A, and a blob with area B, with overlap area C,
// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
// miss metric gets the blob.
/// Helper to compute the dispute resolution metric.
/// Disputed blob resolution. The aim is to give the blob to the most
/// appropriate boxfile box. Most of the time it is obvious, but if
/// two boxfile boxes overlap significantly it is not. If a small boxfile
/// box takes most of the blob, and a large boxfile box does too, then
/// we want the small boxfile box to get it, but if the small box
/// is much smaller than the blob, we don't want it to get it.
/// Details of the disputed blob resolution:
/// Given a box with area A, and a blob with area B, with overlap area C,
/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
/// miss metric gets the blob.
static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
int overlap_area = box1.intersection(box2).area();
double miss_metric = box1.area()- overlap_area;
Expand All @@ -320,14 +327,16 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
return miss_metric;
}

// Gather consecutive blobs that match the given box into the best_state
// and corresponding correct_text.
// Fights over which box owns which blobs are settled by pre-chopping and
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an appropriate blob for a box.
// This means that occasionally, blobs may be incorrectly segmented if the
// chopper fails to find a suitable chop point.
/// Gather consecutive blobs that match the given box into the best_state
/// and corresponding correct_text.
///
/// Fights over which box owns which blobs are settled by pre-chopping and
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an appropriate blob for a box.
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
Expand Down Expand Up @@ -420,12 +429,12 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
return false; // Failure.
}

// Consume all source blobs that strongly overlap the given box,
// putting them into a new word, with the correct_text label.
// Fights over which box owns which blobs are settled by
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an overlapping blob for a box.
/// Consume all source blobs that strongly overlap the given box,
/// putting them into a new word, with the correct_text label.
/// Fights over which box owns which blobs are settled by
/// applying the blobs to box or next_box with the least non-overlap.
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
const TBOX& box, const TBOX& next_box,
const char* correct_text) {
Expand Down Expand Up @@ -495,8 +504,8 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
return new_word != NULL;
}

// Resegments the words by running the classifier in an attempt to find the
// correct segmentation that produces the required string.
/// Resegments the words by running the classifier in an attempt to find the
/// correct segmentation that produces the required string.
void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
WERD_RES* word_res;
Expand All @@ -521,8 +530,8 @@ void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
}
}

// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
// Returns false if an invalid UNICHAR_ID is encountered.
/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
/// @return false if an invalid UNICHAR_ID is encountered.
bool Tesseract::ConvertStringToUnichars(const char* utf8,
GenericVector<UNICHAR_ID>* class_ids) {
for (int step = 0; *utf8 != '\0'; utf8 += step) {
Expand All @@ -541,12 +550,12 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
return true;
}

// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used.
/// Resegments the word to achieve the target_text from the classifier.
/// Returns false if the re-segmentation fails.
/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
/// applies a full search on the classifier results to find the best classified
/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
/// substitutions ARE used.
bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
WERD_RES* word_res) {
// Classify all required combinations of blobs and save results in choices.
Expand Down Expand Up @@ -603,12 +612,20 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
return true;
}

// Recursive helper to find a match to the target_text (from text_index
// position) in the choices (from choices_pos position).
// Choices is an array of GenericVectors, of length choices_length, with each
// element representing a starting position in the word, and the
// GenericVector holding classification results for a sequence of consecutive
// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// Recursive helper to find a match to the target_text (from text_index
/// position) in the choices (from choices_pos position).
/// @param choices is an array of GenericVectors, of length choices_length,
/// with each element representing a starting position in the word, and the
/// #GenericVector holding classification results for a sequence of consecutive
/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
/// @param choices_pos
/// @param choices_length
/// @param target_text
/// @param text_index
/// @param rating
/// @param segmentation
/// @param best_rating
/// @param best_segmentation
void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
int choices_pos, int choices_length,
const GenericVector<UNICHAR_ID>& target_text,
Expand Down Expand Up @@ -682,10 +699,10 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
}
}

// Counts up the labelled words and the blobs within.
// Deletes all unused or emptied words, counting the unused ones.
// Resets W_BOL and W_EOL flags correctly.
// Builds the rebuild_word and rebuilds the box_word and the best_choice.
/// - Counts up the labelled words and the blobs within.
/// - Deletes all unused or emptied words, counting the unused ones.
/// - Resets W_BOL and W_EOL flags correctly.
/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
void Tesseract::TidyUp(PAGE_RES* page_res) {
int ok_blob_count = 0;
int bad_blob_count = 0;
Expand Down Expand Up @@ -743,15 +760,15 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
}
}

// Logs a bad box by line in the box file and box coords.
/** Logs a bad box by line in the box file and box coords.*/
void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
const char *box_ch, const char *err_msg) {
tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
boxfile_lineno + 1, box_ch,
box.left(), box.bottom(), box.right(), box.top(), err_msg);
}

// Creates a fake best_choice entry in each WERD_RES with the correct text.
/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
Expand All @@ -774,8 +791,8 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
}
}

// Calls LearnWord to extract features for labelled blobs within each word.
// Features are stored in an internal buffer.
/// Calls #LearnWord to extract features for labelled blobs within each word.
/// Features are stored in an internal buffer.
void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
PAGE_RES_IT pr_it(page_res);
int word_count = 0;
Expand Down
10 changes: 2 additions & 8 deletions ccmain/control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,8 +59,6 @@ const double kMinRefitXHeightFraction = 0.5;


/**
* recog_pseudo_word
*
* Make a word from the selected blobs and run Tess on them.
*
* @param page_res recognise blobs
Expand All @@ -79,13 +77,9 @@ void Tesseract::recog_pseudo_word(PAGE_RES* page_res,


/**
* recog_interactive
*
* Recognize a single word in interactive mode.
*
* @param block block
* @param row row of word
* @param word_res word to recognise
* @param pr_it the page results iterator
*/
BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
inT16 char_qual;
Expand Down Expand Up @@ -150,7 +144,7 @@ bool Tesseract::ProcessTargetWord(const TBOX& word_box,
return true;
}

// If tesseract is to be run, sets the words up ready for it.
/** If tesseract is to be run, sets the words up ready for it. */
void Tesseract::SetupAllWordsPassN(int pass_n,
const TBOX* target_word_box,
const char* word_config,
Expand Down
Loading

0 comments on commit 524a614

Please sign in to comment.