Skip to content

Commit

Permalink
Fix assertion caused by access to default TBOX
Browse files Browse the repository at this point in the history
Instead of appending an empty TBOX to the end of the box list,
the last-box corner case is now handled by passing a nullptr as
next_box (as was already done with prev_box for the first box in the list).

This avoids calling BoxMissMetric with an empty (default-constructed) TBOX,
which triggers an assertion there (b == 0).

Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Aug 22, 2018
1 parent 97f6864 commit b08966a
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 43 deletions.
81 changes: 42 additions & 39 deletions src/ccmain/applybox.cpp
Expand Up @@ -122,29 +122,24 @@ PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,

const int box_count = boxes.size();
int box_failures = 0;
// Add an empty everything to the end.
boxes.push_back(TBOX());
texts.push_back(STRING());
full_texts.push_back(STRING());

// In word mode, we use the boxes to make a word for each box, but
// in blob mode we use the existing words and maximally chop them first.
PAGE_RES* page_res = find_segmentation ?
nullptr : SetupApplyBoxes(boxes, block_list);
clear_any_old_text(block_list);

for (int i = 0; i < boxes.size() - 1; i++) {
for (int i = 0; i < box_count; i++) {
bool foundit = false;
if (page_res != nullptr) {
if (i == 0) {
foundit = ResegmentCharBox(page_res, nullptr, boxes[i], boxes[i + 1],
full_texts[i].string());
} else {
foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
boxes[i + 1], full_texts[i].string());
}
foundit = ResegmentCharBox(page_res,
(i == 0) ? nullptr : &boxes[i - 1],
boxes[i],
(i == box_count - 1) ? nullptr : &boxes[i + 1],
full_texts[i].string());
} else {
foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
foundit = ResegmentWordBox(block_list, boxes[i],
(i == box_count - 1) ? nullptr : &boxes[i + 1],
texts[i].string());
}
if (!foundit) {
Expand Down Expand Up @@ -339,8 +334,8 @@ static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
///
/// This means that occasionally, blobs may be incorrectly segmented if the
/// chopper fails to find a suitable chop point.
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
const TBOX& box, const TBOX& next_box,
bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
const TBOX& box, const TBOX* next_box,
const char* correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
Expand All @@ -365,24 +360,26 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
break;
if (word_res->correct_text[i + blob_count].length() > 0)
break; // Blob is claimed already.
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
const double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
if (next_box != nullptr) {
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
break; // Blob is a better match for next box.
}
if (current_box_miss_metric > next_box_miss_metric)
break; // Blob is a better match for next box.
char_box += blob_box;
}
if (blob_count > 0) {
if (applybox_debug > 1) {
tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
}
if (!char_box.almost_equal(box, 3) &&
(box.x_gap(next_box) < -3 ||
((next_box != nullptr && box.x_gap(*next_box) < -3)||
(prev_box != nullptr && prev_box->x_gap(box) < -3))) {
return false;
}
Expand All @@ -398,8 +395,10 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
word_res->box_word->BlobBox(i).print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
if (next_box != nullptr) {
tprintf("With next box:");
next_box->print();
}
}
// Eliminated best_state and correct_text entries for the consumed
// blobs.
Expand Down Expand Up @@ -438,7 +437,7 @@ bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
/// @return false if the box was in error, which can only be caused by
/// failing to find an overlapping blob for a box.
bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
const TBOX& box, const TBOX& next_box,
const TBOX& box, const TBOX* next_box,
const char* correct_text) {
if (applybox_debug > 1) {
tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
Expand Down Expand Up @@ -472,23 +471,27 @@ bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
TBOX blob_box = blob->bounding_box();
if (!blob_box.major_overlap(box))
continue;
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
const double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
if (next_box != nullptr) {
const double current_box_miss_metric = BoxMissMetric(blob_box, box);
const double next_box_miss_metric = BoxMissMetric(blob_box, *next_box);
if (applybox_debug > 2) {
tprintf("Checking blob:");
blob_box.print();
tprintf("Current miss metric = %g, next = %g\n",
current_box_miss_metric, next_box_miss_metric);
}
if (current_box_miss_metric > next_box_miss_metric)
continue; // Blob is a better match for next box.
}
if (current_box_miss_metric > next_box_miss_metric)
continue; // Blob is a better match for next box.
if (applybox_debug > 2) {
tprintf("Blob match: blob:");
blob_box.print();
tprintf("Matches box:");
box.print();
tprintf("With next box:");
next_box.print();
if (next_box != nullptr) {
tprintf("With next box:");
next_box->print();
}
}
if (new_word == nullptr) {
// Make a new word with a single blob.
Expand Down
8 changes: 4 additions & 4 deletions src/ccmain/tesseractclass.h
Expand Up @@ -744,17 +744,17 @@ class Tesseract : public Wordrec {
// failing to find an appropriate blob for a box.
// This means that occasionally, blobs may be incorrectly segmented if the
// chopper fails to find a suitable chop point.
bool ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
const TBOX& box, const TBOX& next_box,
bool ResegmentCharBox(PAGE_RES* page_res, const TBOX* prev_box,
const TBOX& box, const TBOX* next_box,
const char* correct_text);
// Consume all source blobs that strongly overlap the given box,
// putting them into a new word, with the correct_text label.
// Fights over which box owns which blobs are settled by
// applying the blobs to box or next_box with the least non-overlap.
// Returns false if the box was in error, which can only be caused by
// failing to find an overlapping blob for a box.
bool ResegmentWordBox(BLOCK_LIST *block_list,
const TBOX& box, const TBOX& next_box,
bool ResegmentWordBox(BLOCK_LIST* block_list,
const TBOX& box, const TBOX* next_box,
const char* correct_text);
// Resegments the words by running the classifier in an attempt to find the
// correct segmentation that produces the required string.
Expand Down

0 comments on commit b08966a

Please sign in to comment.