Skip to content

Commit

Permalink
implement parameter min_characters_to_try for minimum characters to try to skip page entirely.
Browse files Browse the repository at this point in the history

fixes #1729
  • Loading branch information
zdenop committed Oct 5, 2018
1 parent 2cb609d commit 660dbaa
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 6 deletions.
11 changes: 5 additions & 6 deletions src/ccmain/osdetect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,6 @@
#include <algorithm>
#include <memory>

const int kMinCharactersToTry = 50;
const int kMaxCharactersToTry = 5 * kMinCharactersToTry;

const float kSizeRatioToReject = 2.0;
const int kMinAcceptableBlobHeight = 10;

Expand Down Expand Up @@ -278,6 +275,8 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
BLOBNBOX_CLIST* blob_list, OSResults* osr,
tesseract::Tesseract* tess) {
OSResults osr_;
int minCharactersToTry = tess->min_characters_to_try;
int maxCharactersToTry = 5 * minCharactersToTry;
if (osr == nullptr)
osr = &osr_;

Expand All @@ -286,13 +285,13 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
ScriptDetector s(allowed_scripts, osr, tess);

BLOBNBOX_C_IT filtered_it(blob_list);
int real_max = std::min(filtered_it.length(), kMaxCharactersToTry);
int real_max = std::min(filtered_it.length(), maxCharactersToTry);
// tprintf("Total blobs found = %d\n", blobs_total);
// tprintf("Number of blobs post-filtering = %d\n", filtered_it.length());
// tprintf("Number of blobs to try = %d\n", real_max);

// If there are too few characters, skip this page entirely.
if (real_max < kMinCharactersToTry / 2) {
if (real_max < minCharactersToTry / 2) {
tprintf("Too few characters. Skipping this page\n");
return 0;
}
Expand All @@ -307,7 +306,7 @@ int os_detect_blobs(const GenericVector<int>* allowed_scripts,
int num_blobs_evaluated = 0;
for (int i = 0; i < real_max; ++i) {
if (os_detect_blob(blobs[sequence.GetVal()], &o, &s, osr, tess)
&& i > kMinCharactersToTry) {
&& i > minCharactersToTry) {
break;
}
++num_blobs_evaluated;
Expand Down
3 changes: 3 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -397,6 +397,9 @@ Tesseract::Tesseract()
INT_MEMBER(jpg_quality, 85, "Set JPEG quality level", this->params()),
INT_MEMBER(user_defined_dpi, 0, "Specify DPI for input image",
this->params()),
INT_MEMBER(min_characters_to_try, 50,
"Specify minimum characters to try to skip page entirely",
this->params()),
STRING_MEMBER(unrecognised_char, "|",
"Output char for unidentified blobs", this->params()),
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
Expand Down
2 changes: 2 additions & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,8 @@ class Tesseract : public Wordrec {
"Create PDF with only one invisible text layer");
INT_VAR_H(jpg_quality, 85, "Set JPEG quality level");
INT_VAR_H(user_defined_dpi, 0, "Specify DPI for input image");
INT_VAR_H(min_characters_to_try, 50,
"Specify minimum characters to try to skip page entirely");

This comment has been minimized.

Copy link
@amitdo

amitdo Oct 6, 2018

Collaborator

"Specify minimum characters to try to skip page entirely")

Maybe change it to:

"Specify minimum characters to try during OSD")

This comment has been minimized.

Copy link
@zdenop

zdenop Oct 6, 2018

Author Contributor

Thanks. Changed in 345e5ee

STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs");
INT_VAR_H(suspect_level, 99, "Suspect marker level");
Expand Down

2 comments on commit 660dbaa

@amitdo
Copy link
Collaborator

@amitdo amitdo commented on 660dbaa Oct 6, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You meant 424dbd5dc733 :-)

@zdenop
Copy link
Contributor Author

@zdenop zdenop commented on 660dbaa Oct 6, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

=-( I am sure I clicked the "Copy the full SHA" button next to my commit...
Maybe I should go and take a rest...

Please sign in to comment.