Doxygen

Squashed commit from https://github.com/tesseract-ocr/tesseract/tree/more-doxygen closes #14 Commits: 6317305 doxygen 9f42f69 doxygen 0fc4d52 doxygen 37b4b55 fix typo bded8f1 some more doxy 020eb00 slight tweak 524666d doxygenify 2a36a3e doxygenify 229d218 doxygenify 7fd28ae doxygenify a8c64bc doxygenify f5d21b6 fix 5d8ede8 doxygenify a58a4e0 language_model.cpp fa85709 lm_pain_points.cpp lm_state.cpp 6418da3 merge 06190ba Merge branch 'old_doxygen_merge' into more-doxygen 84acf08 Merge branch 'master' into more-doxygen 50fe1ff pagewalk.cpp cube_reco_context.cpp 2982583 change to relative 192a24a applybox.cpp, take one 8eeb053 delete docs for obsolete params 52e4c77 modernise classify/ocrfeatures.cpp 2a1cba6 modernise cutil/emalloc.cpp 773e006 silence doxygen warning aeb1731 silence doxygen warning f18387f silence doxygen; new params are unused? 15ad6bd doxygenify cutil/efio.cpp c8b5dad doxygenify cutil/danerror.cpp 784450f the globals and exceptions parts are obsolete; remove 8bca324 doxygen classify/normfeat.cpp 9bcbe16 doxygen classify/normmatch.cpp aa9a971 doxygen ccmain/cube_control.cpp c083ff2 doxygen ccmain/cube_reco_context.cpp f842850 params changed 5c94f12 doxygen ccmain/cubeclassifier.cpp 15ba750 case sensitive f5c71d4 case sensitive f85655b doxygen classify/intproto.cpp 4bbc7aa partial doxygen classify/mfx.cpp dbb6041 partial doxygen classify/intproto.cpp 2aa72db finish doxygen classify/intproto.cpp 0b8de99 doxygen training/mftraining.cpp 0b5b35c partial doxygen ccstruct/coutln.cpp b81c766 partial doxygen ccstruct/coutln.cpp 40fc415 finished? 
doxygen ccstruct/coutln.cpp 6e4165c doxygen classify/clusttool.cpp 0267dec doxygen classify/cutoffs.cpp 7f0c70c doxygen classify/fpoint.cpp 512f3bd ignore ~ files 5668a52 doxygen classify/intmatcher.cpp 84788d4 doxygen classify/kdtree.cpp 29f36ca doxygen classify/mfoutline.cpp 40b94b1 silence doxygen warnings 6c511b9 doxygen classify/mfx.cpp f9b4080 doxygen classify/outfeat.cpp aa1df05 doxygen classify/picofeat.cpp cc5f466 doxygen training/cntraining.cpp cce044f doxygen training/commontraining.cpp 167e216 missing param 9498383 renamed params 37eeac2 renamed param d87b5dd case c8ee174 renamed params b858db8 typo 4c2a838 h2 context? 81a2c0c fix some param names; add some missing params, no docs bcf8a4c add some missing params, no docs af77f86 add some missing params, no docs; fix some param names 01df24e fix some params 6161056 fix some params 68508b6 fix some params 285aeb6 doxygen complains here no matter what 529bcfa rm some missing params, typos cd21226 rm some missing params, add some new ones 48a4bc2 fix params c844628 missing param 312ce37 missing param; rename one ec2fdec missing param 05e15e0 missing params d515858 change "<" to &lt; to make doxygen happy b476a28 wrong place
tesseract-ocr · Jul 20, 2015 · 524a614 · 524a614
1 parent 5414087
commit 524a614
Show file tree

Hide file tree

Showing 47 changed files with 2,910 additions and 3,124 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,4 @@
+*~
 # Windows
 *.user
 *.log
@@ -62,4 +63,4 @@ training/wordlist2dawg
 # tessdata
 *.cube.*
 *.tesseract_cube.*
-*.traineddata
+*.traineddata
diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp
@@ -36,18 +36,22 @@
 #include "tesseractclass.h"
 #include "genericvector.h"
 
-// Max number of blobs to classify together in FindSegmentation.
+/** Max number of blobs to classify together in FindSegmentation. */
 const int kMaxGroupSize = 4;
-// Max fraction of median allowed as deviation in xheight before switching
-// to median.
+/// Max fraction of median allowed as deviation in xheight before switching
+/// to median.
 const double kMaxXHeightDeviationFraction = 0.125;
 
-/*************************************************************************
+/**
  * The box file is assumed to contain box definitions, one per line, of the
  * following format for blob-level boxes:
+ * @verbatim
  *   <UTF8 str> <left> <bottom> <right> <top> <page id>
+ * @endverbatim
  * and for word/line-level boxes:
+ * @verbatim
  *   WordStr <left> <bottom> <right> <top> <page id> #<space-delimited word str>
+ * @endverbatim
  * NOTES:
  * The boxes use tesseract coordinates, i.e. 0,0 is at BOTTOM-LEFT.
  *
@@ -62,13 +66,16 @@ const double kMaxXHeightDeviationFraction = 0.125;
  * units in the word/line are listed after the # at the end of the line and
  * are space delimited, ignoring any original spaces on the line.
  * Eg.
+ * @verbatim
  * word -> #w o r d
  * multi word line -> #m u l t i w o r d l i n e
+ * @endverbatim
  * The recognizable units must be space-delimited in order to allow multiple
  * unicodes to be used for a single recognizable unit, eg Hindi.
+ *
  * In this mode, the classifier must have been pre-trained with the desired
  * character set, or it will not be able to find the character segmentations.
- *************************************************************************/
+ */
 
 namespace tesseract {
 
@@ -181,8 +188,8 @@ static double MedianXHeight(BLOCK_LIST *block_list) {
   return xheights.median();
 }
 
-// Any row xheight that is significantly different from the median is set
-// to the median.
+/// Any row xheight that is significantly different from the median is set
+/// to the median.
 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
   double median_xheight = MedianXHeight(block_list);
   double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
@@ -205,8 +212,8 @@ void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
   }
 }
 
-// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
-// All fuzzy spaces are removed, and all the words are maximally chopped.
+/// Builds a PAGE_RES from the block_list in the way required for ApplyBoxes:
+/// All fuzzy spaces are removed, and all the words are maximally chopped.
 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
                                      BLOCK_LIST *block_list) {
   PreenXHeights(block_list);
@@ -240,9 +247,9 @@ PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
   return page_res;
 }
 
-// Tests the chopper by exhaustively running chop_one_blob.
-// The word_res will contain filled chopped_word, seam_array, denorm,
-// box_word and best_state for the maximally chopped word.
+/// Tests the chopper by exhaustively running chop_one_blob.
+/// The word_res will contain filled chopped_word, seam_array, denorm,
+/// box_word and best_state for the maximally chopped word.
 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
                                   BLOCK* block, ROW* row,
                                   WERD_RES* word_res) {
@@ -300,17 +307,17 @@ void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
   word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
 }
 
-// Helper to compute the dispute resolution metric.
-// Disputed blob resolution. The aim is to give the blob to the most
-// appropriate boxfile box. Most of the time it is obvious, but if
-// two boxfile boxes overlap significantly it is not. If a small boxfile
-// box takes most of the blob, and a large boxfile box does too, then
-// we want the small boxfile box to get it, but if the small box
-// is much smaller than the blob, we don't want it to get it.
-// Details of the disputed blob resolution:
-// Given a box with area A, and a blob with area B, with overlap area C,
-// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
-// miss metric gets the blob.
+/// Helper to compute the dispute resolution metric.
+/// Disputed blob resolution. The aim is to give the blob to the most
+/// appropriate boxfile box. Most of the time it is obvious, but if
+/// two boxfile boxes overlap significantly it is not. If a small boxfile
+/// box takes most of the blob, and a large boxfile box does too, then
+/// we want the small boxfile box to get it, but if the small box
+/// is much smaller than the blob, we don't want it to get it.
+/// Details of the disputed blob resolution:
+/// Given a box with area A, and a blob with area B, with overlap area C,
+/// then the miss metric is (A-C)(B-C)/(AB) and the box with minimum
+/// miss metric gets the blob.
 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
   int overlap_area = box1.intersection(box2).area();
   double miss_metric = box1.area()- overlap_area;
@@ -320,14 +327,16 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
   return miss_metric;
 }
 
-// Gather consecutive blobs that match the given box into the best_state
-// and corresponding correct_text.
-// Fights over which box owns which blobs are settled by pre-chopping and
-// applying the blobs to box or next_box with the least non-overlap.
-// Returns false if the box was in error, which can only be caused by
-// failing to find an appropriate blob for a box.
-// This means that occasionally, blobs may be incorrectly segmented if the
-// chopper fails to find a suitable chop point.
+/// Gather consecutive blobs that match the given box into the best_state
+/// and corresponding correct_text.
+///
+/// Fights over which box owns which blobs are settled by pre-chopping and
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an appropriate blob for a box.
+///
+/// This means that occasionally, blobs may be incorrectly segmented if the
+/// chopper fails to find a suitable chop point.
 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
                                  const TBOX& box, const TBOX& next_box,
                                  const char* correct_text) {
@@ -420,12 +429,12 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
   return false;  // Failure.
 }
 
-// Consume all source blobs that strongly overlap the given box,
-// putting them into a new word, with the correct_text label.
-// Fights over which box owns which blobs are settled by
-// applying the blobs to box or next_box with the least non-overlap.
-// Returns false if the box was in error, which can only be caused by
-// failing to find an overlapping blob for a box.
+/// Consume all source blobs that strongly overlap the given box,
+/// putting them into a new word, with the correct_text label.
+/// Fights over which box owns which blobs are settled by
+/// applying the blobs to box or next_box with the least non-overlap.
+/// @return false if the box was in error, which can only be caused by
+/// failing to find an overlapping blob for a box.
 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                  const TBOX& box, const TBOX& next_box,
                                  const char* correct_text) {
@@ -495,8 +504,8 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
   return new_word != NULL;
 }
 
-// Resegments the words by running the classifier in an attempt to find the
-// correct segmentation that produces the required string.
+/// Resegments the words by running the classifier in an attempt to find the
+/// correct segmentation that produces the required string.
 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   WERD_RES* word_res;
@@ -521,8 +530,8 @@ void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
   }
 }
 
-// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
-// Returns false if an invalid UNICHAR_ID is encountered.
+/// Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
+/// @return false if an invalid UNICHAR_ID is encountered.
 bool Tesseract::ConvertStringToUnichars(const char* utf8,
                                         GenericVector<UNICHAR_ID>* class_ids) {
   for (int step = 0; *utf8 != '\0'; utf8 += step) {
@@ -541,12 +550,12 @@ bool Tesseract::ConvertStringToUnichars(const char* utf8,
   return true;
 }
 
-// Resegments the word to achieve the target_text from the classifier.
-// Returns false if the re-segmentation fails.
-// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
-// applies a full search on the classifier results to find the best classified
-// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
-// substitutions ARE used.
+/// Resegments the word to achieve the target_text from the classifier.
+/// Returns false if the re-segmentation fails.
+/// Uses brute-force combination of up to #kMaxGroupSize adjacent blobs, and
+/// applies a full search on the classifier results to find the best classified
+/// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
+/// substitutions ARE used.
 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                  WERD_RES* word_res) {
   // Classify all required combinations of blobs and save results in choices.
@@ -603,12 +612,20 @@ bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
   return true;
 }
 
-// Recursive helper to find a match to the target_text (from text_index
-// position) in the choices (from choices_pos position).
-// Choices is an array of GenericVectors, of length choices_length, with each
-// element representing a starting position in the word, and the
-// GenericVector holding classification results for a sequence of consecutive
-// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+/// Recursive helper to find a match to the target_text (from text_index
+/// position) in the choices (from choices_pos position).
+/// @param choices is an array of GenericVectors, of length choices_length,
+/// with each element representing a starting position in the word, and the
+/// #GenericVector holding classification results for a sequence of consecutive
+/// blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
+/// @param choices_pos
+/// @param choices_length
+/// @param target_text
+/// @param text_index
+/// @param rating
+/// @param segmentation
+/// @param best_rating
+/// @param best_segmentation
 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                               int choices_pos, int choices_length,
                               const GenericVector<UNICHAR_ID>& target_text,
@@ -682,10 +699,10 @@ void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
   }
 }
 
-// Counts up the labelled words and the blobs within.
-// Deletes all unused or emptied words, counting the unused ones.
-// Resets W_BOL and W_EOL flags correctly.
-// Builds the rebuild_word and rebuilds the box_word and the best_choice.
+/// - Counts up the labelled words and the blobs within.
+/// - Deletes all unused or emptied words, counting the unused ones.
+/// - Resets W_BOL and W_EOL flags correctly.
+/// - Builds the rebuild_word and rebuilds the box_word and the best_choice.
 void Tesseract::TidyUp(PAGE_RES* page_res) {
   int ok_blob_count = 0;
   int bad_blob_count = 0;
@@ -743,15 +760,15 @@ void Tesseract::TidyUp(PAGE_RES* page_res) {
   }
 }
 
-// Logs a bad box by line in the box file and box coords.
+/** Logs a bad box by line in the box file and box coords.*/
 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
                                 const char *box_ch, const char *err_msg) {
   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
           boxfile_lineno + 1, box_ch,
           box.left(), box.bottom(), box.right(), box.top(), err_msg);
 }
 
-// Creates a fake best_choice entry in each WERD_RES with the correct text.
+/** Creates a fake best_choice entry in each WERD_RES with the correct text.*/
 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
@@ -774,8 +791,8 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
   }
 }
 
-// Calls LearnWord to extract features for labelled blobs within each word.
-// Features are stored in an internal buffer.
+/// Calls #LearnWord to extract features for labelled blobs within each word.
+/// Features are stored in an internal buffer.
 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   int word_count = 0;

diff --git a/ccmain/control.cpp b/ccmain/control.cpp
@@ -59,8 +59,6 @@ const double kMinRefitXHeightFraction = 0.5;
 
 
 /**
- * recog_pseudo_word
- *
  * Make a word from the selected blobs and run Tess on them.
  *
  * @param page_res recognise blobs
@@ -79,13 +77,9 @@ void Tesseract::recog_pseudo_word(PAGE_RES* page_res,
 
 
 /**
- * recog_interactive
- *
  * Recognize a single word in interactive mode.
  *
- * @param block block
- * @param row row of word
- * @param word_res word to recognise
+ * @param pr_it the page results iterator
  */
 BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) {
   inT16 char_qual;
@@ -150,7 +144,7 @@ bool Tesseract::ProcessTargetWord(const TBOX& word_box,
   return true;
 }
 
-// If tesseract is to be run, sets the words up ready for it.
+/** If tesseract is to be run, sets the words up ready for it. */
 void Tesseract::SetupAllWordsPassN(int pass_n,
                                    const TBOX* target_word_box,
                                    const char* word_config,