From 9c89cd51cf7c409cdf1594173d9ece3defd3d037 Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Thu, 31 Jan 2019 17:30:59 +0000 Subject: [PATCH] Add a new renderer to create box files from images for LSTM training (cherry picked from commit 921da6be2bdbda2ddd64514f9b6bec40a336246a) fix typo (cherry picked from commit 7bd1a0c80393fce2f34e2845cb26760bcf3791cd) Add lstmboxrenderer to CMakeLists (cherry picked from commit cfef3a889aef830725921b5c0218d5e9c633b03e) fix formatting (cherry picked from commit 7ba2b01ede7940ed609a073364948ef8c838cd10) --- CMakeLists.txt | 1 + src/api/Makefile.am | 1 + src/api/baseapi.h | 8 +++ src/api/hocrrenderer.cpp | 2 +- src/api/lstmboxrenderer.cpp | 110 ++++++++++++++++++++++++++++++++++ src/api/renderer.h | 11 ++++ src/api/tesseractmain.cpp | 14 +++++ src/ccmain/tesseractclass.cpp | 2 + src/ccmain/tesseractclass.h | 1 + tessdata/configs/lstmbox | 1 + 10 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 src/api/lstmboxrenderer.cpp create mode 100644 tessdata/configs/lstmbox diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ec87f9816..62a6f5a363 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src} src/api/renderer.cpp src/api/altorenderer.cpp src/api/hocrrenderer.cpp + src/api/lstmboxrenderer.cpp src/api/pdfrenderer.cpp ) diff --git a/src/api/Makefile.am b/src/api/Makefile.am index ca2215fb62..894d957e56 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -35,6 +35,7 @@ endif libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp libtesseract_api_la_SOURCES += altorenderer.cpp libtesseract_api_la_SOURCES += hocrrenderer.cpp +libtesseract_api_la_SOURCES += lstmboxrenderer.cpp libtesseract_api_la_SOURCES += pdfrenderer.cpp libtesseract_api_la_SOURCES += renderer.cpp diff --git a/src/api/baseapi.h b/src/api/baseapi.h index efa97ecd8f..d32ded8161 100644 --- a/src/api/baseapi.h +++ b/src/api/baseapi.h @@ -613,6 +613,14 @@ class TESS_API TessBaseAPI { * Returned 
string must be freed with the delete [] operator. */ char* GetTSVText(int page_number); + + /** + * Make a box file for LSTM training from the internal data structures. + * Constructs coordinates in the original image - not just the rectangle. + * page_number is a 0-based page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. + */ + char* GetLSTMBOXText(int page_number); /** * The recognized text is returned as a char* which is coded in the same diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index dc3dc36ad4..5d2de9a38e 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (grapheme && grapheme[0] != 0) { if (hocr_boxes) { res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); - hocr_str << ""; } diff --git a/src/api/lstmboxrenderer.cpp b/src/api/lstmboxrenderer.cpp new file mode 100644 index 0000000000..ec747abd44 --- /dev/null +++ b/src/api/lstmboxrenderer.cpp @@ -0,0 +1,110 @@ +/********************************************************************** + * File: lstmboxrenderer.cpp + * Description: Renderer for creating box file for LSTM training. + * based on the tsv renderer. + * + * (C) Copyright 2006, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. 
+ *
+ **********************************************************************/
+
+#include <cstring>           // for strcpy
+#include <locale>            // for std::locale::classic
+#include <memory>            // for std::unique_ptr
+#include <sstream>           // for std::stringstream
+#include "baseapi.h"         // for TessBaseAPI
+#include "renderer.h"
+#include "tesseractclass.h"  // for Tesseract
+
+namespace tesseract {
+
+/**
+ * Create a UTF8 box file for LSTM training from the internal data structures.
+ * Rows are "text left bottom right top page", y flipped as image_height_ - y.
+ * page_number is a 0-based page index that will appear in the box file.
+ * Returns nullptr if recognition fails; otherwise free it with delete [].
+ */
+char* TessBaseAPI::GetLSTMBOXText(int page_number) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
+    return nullptr;
+
+  STRING lstm_box_str("");
+  // Last symbol box; kept outside the loop so the separator rows can reuse it.
+  int left = 0, top = 0, right = 0, bottom = 0;
+  bool first_word = true;
+
+  LTRResultIterator* res_it = GetLTRIterator();
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_SYMBOL)) {
+      res_it->Next(RIL_SYMBOL);
+      continue;
+    }
+
+    // Separator rows before a new word / text line are built from the box of
+    // the previous symbol. NOTE(review): confirm the " " / "\t " row prefixes.
+    if (!first_word) {
+      if (res_it->IsAtBeginningOf(RIL_WORD)) {
+        lstm_box_str.add_str_int(" ", left);
+        lstm_box_str.add_str_int(" ", image_height_ - bottom);
+        lstm_box_str.add_str_int(" ", right + 2);
+        lstm_box_str.add_str_int(" ", image_height_ - top);
+        lstm_box_str.add_str_int(" ", page_number);
+        lstm_box_str += "\n";  // end of row for word
+      }
+      if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+        lstm_box_str.add_str_int("\t ", left);
+        lstm_box_str.add_str_int(" ", image_height_ - bottom);
+        lstm_box_str.add_str_int(" ", right + 5);
+        lstm_box_str.add_str_int(" ", image_height_ - top);
+        lstm_box_str.add_str_int(" ", page_number);
+        lstm_box_str += "\n";  // end of row for line
+      }
+    }
+    first_word = false;
+    res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
+    // Append the symbol's UTF-8 text (released right after the copy) and advance.
+    do {
+      lstm_box_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
+
+    lstm_box_str.add_str_int(" ", left);
+    lstm_box_str.add_str_int(" ", image_height_ - bottom);
+    lstm_box_str.add_str_int(" ", right);
+    lstm_box_str.add_str_int(" ", image_height_ - top);
+    lstm_box_str.add_str_int(" ", page_number);
+    lstm_box_str += "\n";  // end of row for symbol
+
+  }
+
+  char* ret = new char[lstm_box_str.length() + 1];
+  strcpy(ret, lstm_box_str.string());
+  delete res_it;
+  return ret;
+}
+
+/**********************************************************************
+ * LSTMBOX Renderer interface implementation
+ **********************************************************************/
+// "box" is handed to the base renderer together with outputbase.
+TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "box") {}
+
+// Appends the box text for the current image; false if none was produced.
+bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
+  const std::unique_ptr<const char[]> lstmbox(api->GetLSTMBOXText(imagenum()));
+  if (lstmbox == nullptr) return false;
+
+  AppendString(lstmbox.get());
+  return true;
+}
+
+}  // namespace tesseract.
diff --git a/src/api/renderer.h b/src/api/renderer.h
index 7941029764..109a29d9ca 100644
--- a/src/api/renderer.h
+++ b/src/api/renderer.h
@@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
   virtual bool AddImageHandler(TessBaseAPI* api);
 };
 
+/**
+ * Renders tesseract output as UTF-8 box file text for LSTM training
+ */
+class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
+ public:
+  explicit TessLSTMBOXRenderer(const char *outputbase);
+
+ protected:
+  virtual bool AddImageHandler(TessBaseAPI* api);
+};
+
 /**
  * Renders tesseract output into a plain UTF-8 text string
  */
diff --git a/src/api/tesseractmain.cpp b/src/api/tesseractmain.cpp
index ce261b1015..40313b4d8f 100644
--- a/src/api/tesseractmain.cpp
+++ b/src/api/tesseractmain.cpp
@@ -494,6 +494,20 @@ static void PreloadRenderers(
       }
     }
 
+    api->GetBoolVariable("tessedit_create_lstmbox", &b);
+    if (b) {
+      tesseract::TessLSTMBOXRenderer* renderer =
+          new tesseract::TessLSTMBOXRenderer(outputbase);
+      if
(renderer->happy()) { + renderers->push_back(renderer); + } else { + delete renderer; + tprintf("Error, could not create LSTM BOX output file: %s\n", + strerror(errno)); + error = true; + } + } + api->GetBoolVariable("tessedit_create_boxfile", &b); if (b) { tesseract::TessBoxTextRenderer* renderer = diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp index 4dc65a1e7f..83ecbc2a4f 100644 --- a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -391,6 +391,8 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params()), + BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training", + this->params()), BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params()), BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 3fccd73483..0fc1d04aeb 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file"); + BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training"); BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); BOOL_VAR_H(textonly_pdf, false, diff --git a/tessdata/configs/lstmbox b/tessdata/configs/lstmbox new file mode 100644 index 0000000000..a6f2cedc50 --- /dev/null +++ b/tessdata/configs/lstmbox @@ -0,0 +1 @@ +tessedit_create_lstmbox 1