Skip to content

Commit

Permalink
If there is no explicit renderer(s), default to TessTextRenderer
Browse files Browse the repository at this point in the history
Revert fd429c3, 43834da, 05de195.

See #49, #59.

The code in this commit solves the issue in a more elegant way, IMHO.

Now you can use:
  * `tesseract eurotext.tif eurotext txt pdf`
  * `tesseract eurotext.tif eurotext txt hocr`
  * `tesseract eurotext.tif eurotext txt hocr pdf`

NOTE:
  With `tesseract eurotext.tif eurotext`
  or `tesseract eurotext.tif eurotext txt`
  the psm will be set to '3', but...
  With `tesseract eurotext.tif eurotext txt pdf`
  or `tesseract eurotext.tif eurotext txt hocr`
  the psm will be set to '1'.
  • Loading branch information
amitdo committed Dec 11, 2015
1 parent d4e0c64 commit c2f5e9b
Show file tree
Hide file tree
Showing 8 changed files with 38 additions and 21 deletions.
48 changes: 33 additions & 15 deletions api/tesseractmain.cpp
Expand Up @@ -176,16 +176,16 @@ void PrintLangsList(tesseract::TessBaseAPI* api) {
/**
* We have 2 possible sources of pagesegmode: a config file and
* the command line. For backwards compatibility reasons, the
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
* default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
* default for this program is tesseract::PSM_AUTO. We will let
* the config file take priority, so the command-line default
* can take priority over the tesseract default, so we use the
* value from the command line only if the retrieved mode
* is still tesseract::PSM_SINGLE_BLOCK, indicating no change
* in any config file. Therefore the only way to force
* tesseract::PSM_SINGLE_BLOCK is from the command line.
* It would be simpler if we could set the value before Init,
* but that doesn't work.
*/
void FixPageSegMode(tesseract::TessBaseAPI* api,
tesseract::PageSegMode pagesegmode) {
Expand Down Expand Up @@ -295,19 +295,37 @@ void PreloadRenderers(tesseract::TessBaseAPI* api,
if (b) {
bool font_info;
api->GetBoolVariable("hocr_font_info", &font_info);
renderers->push_back(new tesseract::TessHOcrRenderer(outputbase, font_info));
renderers->push_back(
new tesseract::TessHOcrRenderer(outputbase, font_info));
}

api->GetBoolVariable("tessedit_create_pdf", &b);
if (b) {
renderers->push_back(new tesseract::TessPDFRenderer(outputbase,
api->GetDatapath()));
api->GetDatapath()));
}

api->GetBoolVariable("tessedit_write_unlv", &b);
if (b) renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
if (b) {
renderers->push_back(new tesseract::TessUnlvRenderer(outputbase));
}

api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
if (b) {
renderers->push_back(new tesseract::TessBoxTextRenderer(outputbase));
}

// disable text renderer when using one of these configs:
// ambigs.train, box.train, box.train.stderr, linebox, rebox
bool disable_text_renderer =
(api->GetBoolVariable("tessedit_ambigs_training", &b) && b) ||
(api->GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) ||
(api->GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b);

api->GetBoolVariable("tessedit_create_txt", &b);
if (b) renderers->push_back(new tesseract::TessTextRenderer(outputbase));
// Emit plain text when it was explicitly requested, or as the default when
// no other renderer was created and no training/box config disabled it.
if (b || (renderers->empty() && !disable_text_renderer)) {
  renderers->push_back(new tesseract::TessTextRenderer(outputbase));
}
}

if (!renderers->empty()) {
Expand Down
2 changes: 1 addition & 1 deletion ccmain/tesseractclass.cpp
Expand Up @@ -381,7 +381,7 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
this->params()),
BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file",
BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
this->params()),
BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
this->params()),
Expand Down
2 changes: 1 addition & 1 deletion ccmain/tesseractclass.h
Expand Up @@ -1001,7 +1001,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_write_rep_codes, false,
"Write repetition char code");
BOOL_VAR_H(tessedit_write_unlv, false, "Write .unlv output file");
BOOL_VAR_H(tessedit_create_txt, true, "Write .txt output file");
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
STRING_VAR_H(unrecognised_char, "|",
Expand Down
1 change: 0 additions & 1 deletion tessdata/configs/hocr
@@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_create_hocr 1
tessedit_pageseg_mode 1
1 change: 0 additions & 1 deletion tessdata/configs/makebox
@@ -1,2 +1 @@
tessedit_create_txt 0
tessedit_create_boxfile 1
1 change: 0 additions & 1 deletion tessdata/configs/pdf
@@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_create_pdf 1
tessedit_pageseg_mode 1
3 changes: 3 additions & 0 deletions tessdata/configs/txt
@@ -0,0 +1,3 @@
# This config file should be used with other config files that create renderers.
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
tessedit_create_txt 1
1 change: 0 additions & 1 deletion tessdata/configs/unlv
@@ -1,3 +1,2 @@
tessedit_create_txt 0
tessedit_write_unlv 1
tessedit_pageseg_mode 6

0 comments on commit c2f5e9b

Please sign in to comment.