diff --git a/CMakeLists.txt b/CMakeLists.txt index 1ec87f9816..62a6f5a363 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -252,6 +252,7 @@ set(tesseract_src ${tesseract_src} src/api/renderer.cpp src/api/altorenderer.cpp src/api/hocrrenderer.cpp + src/api/lstmboxrenderer.cpp src/api/pdfrenderer.cpp ) diff --git a/src/api/Makefile.am b/src/api/Makefile.am index ca2215fb62..894d957e56 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -35,6 +35,7 @@ endif libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp libtesseract_api_la_SOURCES += altorenderer.cpp libtesseract_api_la_SOURCES += hocrrenderer.cpp +libtesseract_api_la_SOURCES += lstmboxrenderer.cpp libtesseract_api_la_SOURCES += pdfrenderer.cpp libtesseract_api_la_SOURCES += renderer.cpp diff --git a/src/api/baseapi.h b/src/api/baseapi.h index efa97ecd8f..d32ded8161 100644 --- a/src/api/baseapi.h +++ b/src/api/baseapi.h @@ -613,6 +613,14 @@ class TESS_API TessBaseAPI { * Returned string must be freed with the delete [] operator. */ char* GetTSVText(int page_number); + + /** + * Make a box file for LSTM training from the internal data structures. + * Constructs coordinates in the original image - not just the rectangle. + * page_number is a 0-based page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. 
+ */ + char* GetLSTMBOXText(int page_number); /** * The recognized text is returned as a char* which is coded in the same diff --git a/src/api/hocrrenderer.cpp b/src/api/hocrrenderer.cpp index dc3dc36ad4..5d2de9a38e 100644 --- a/src/api/hocrrenderer.cpp +++ b/src/api/hocrrenderer.cpp @@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (grapheme && grapheme[0] != 0) { if (hocr_boxes) { res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom); - hocr_str << ""; } diff --git a/src/api/lstmboxrenderer.cpp b/src/api/lstmboxrenderer.cpp new file mode 100644 index 0000000000..ec747abd44 --- /dev/null +++ b/src/api/lstmboxrenderer.cpp @@ -0,0 +1,110 @@ +/********************************************************************** + * File: lstmboxrenderer.cpp + * Description: Renderer for creating box file for LSTM training. + * based on the tsv renderer. + * + * (C) Copyright 2006, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + + +#include // for std::locale::classic +#include // for std::unique_ptr +#include // for std::stringstream +#include "baseapi.h" // for TessBaseAPI +#include "renderer.h" +#include "tesseractclass.h" // for Tesseract + +namespace tesseract { + +/** + * Create a UTF8 box file for LSTM training from the internal data structures. + * page_number is a 0-base page index that will appear in the box file. 
+ * Returned string must be freed with the delete [] operator.
+ * Returns nullptr if tesseract_ is null, Recognize() fails or no iterator exists.
+ * Every symbol is written as "<utf8> <left> <bottom> <right> <top> <page>",
+ * where the vertical values are image_height_ minus the iterator's value.
+ * Ahead of each word except the first, a separator row reuses the box of the
+ * previously written symbol: a space row for a new word, followed by a tab
+ * row when that word also starts a new text line.
+ */
+char* TessBaseAPI::GetLSTMBOXText(int page_number) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
+    return nullptr;
+
+  STRING lstm_box_str("");
+  bool first_word = true;
+  // Box of the last symbol written. Declared outside the loop and zeroed
+  // because the separator rows read it before BoundingBox() refreshes it.
+  int left = 0, top = 0, right = 0, bottom = 0;
+
+  // Appends `lead` and then "<left> <bottom> <right_edge> <top> <page>\n".
+  const auto add_box_row = [&](const char* lead, int right_edge) {
+    lstm_box_str.add_str_int(lead, left);
+    lstm_box_str.add_str_int(" ", image_height_ - bottom);
+    lstm_box_str.add_str_int(" ", right_edge);
+    lstm_box_str.add_str_int(" ", image_height_ - top);
+    lstm_box_str.add_str_int(" ", page_number);
+    lstm_box_str += "\n";
+  };
+
+  const std::unique_ptr<LTRResultIterator> res_it(GetLTRIterator());
+  if (res_it == nullptr) return nullptr;
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_SYMBOL)) {
+      res_it->Next(RIL_SYMBOL);
+      continue;
+    }
+
+    if (!first_word) {
+      // The lead holds the separator symbol itself plus the field delimiter.
+      if (res_it->IsAtBeginningOf(RIL_WORD)) {
+        add_box_row("  ", right + 2);  // space row ahead of a new word
+      }
+      if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+        add_box_row("\t ", right + 5);  // tab row ahead of a new text line
+      }
+    }
+    first_word = false;
+    res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
+
+    // Gather UTF-8 text until the iterator reaches the start of a symbol.
+    do {
+      lstm_box_str +=
+          std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
+      res_it->Next(RIL_SYMBOL);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));
+    add_box_row(" ", right);  // row for the symbol itself
+  }
+
+  char* ret = new char[lstm_box_str.length() + 1];
+  strcpy(ret, lstm_box_str.string());
+  return ret;
+}
+
+/**********************************************************************
+ * LSTMBOX Renderer interface implementation
+ **********************************************************************/
+TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char *outputbase)
+    : TessResultRenderer(outputbase, "box") {}
+
+// Appends the LSTM box text for page imagenum() to the output; returns
+// false when the API yields no text.
+bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
+  // GetLSTMBOXText() hands over a new[] buffer, so own it as an array.
+  const std::unique_ptr<const char[]> lstmbox(api->GetLSTMBOXText(imagenum()));
+  if (lstmbox == nullptr) return false;
+  AppendString(lstmbox.get());
+  return true;
+}
+
+}  // namespace tesseract.
diff --git a/src/api/renderer.h b/src/api/renderer.h
index 7941029764..109a29d9ca 100644
--- a/src/api/renderer.h
+++ b/src/api/renderer.h
@@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
   virtual bool AddImageHandler(TessBaseAPI* api);
 };
 
+/**
+ * Renders tesseract output into a UTF-8 box file for LSTM training
+ */
+class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
+ public:
+  explicit TessLSTMBOXRenderer(const char *outputbase);
+
+ protected:
+  virtual bool AddImageHandler(TessBaseAPI* api);
+};
+
 /**
  * Renders tesseract output into a plain UTF-8 text string
  */
diff --git a/src/api/tesseractmain.cpp b/src/api/tesseractmain.cpp
index ce261b1015..40313b4d8f 100644
--- a/src/api/tesseractmain.cpp
+++ b/src/api/tesseractmain.cpp
@@ -494,6 +494,20 @@ static void PreloadRenderers(
     }
   }
 
+  api->GetBoolVariable("tessedit_create_lstmbox", &b);
+  if (b) {
+    tesseract::TessLSTMBOXRenderer* renderer =
+        new tesseract::TessLSTMBOXRenderer(outputbase);
+    if (renderer->happy()) {
+      renderers->push_back(renderer);
+    } else {
+      delete renderer;
+      tprintf("Error, could not create LSTM BOX output file: %s\n",
+              strerror(errno));
+      error = true;
+    }
+  }
+
   api->GetBoolVariable("tessedit_create_boxfile", &b);
   if (b) {
     tesseract::TessBoxTextRenderer* renderer =
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index 4dc65a1e7f..83ecbc2a4f 100644
--- 
a/src/ccmain/tesseractclass.cpp +++ b/src/ccmain/tesseractclass.cpp @@ -391,6 +391,8 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file", this->params()), + BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training", + this->params()), BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file", this->params()), BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h index 3fccd73483..0fc1d04aeb 100644 --- a/src/ccmain/tesseractclass.h +++ b/src/ccmain/tesseractclass.h @@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file"); BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file"); + BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training"); BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); BOOL_VAR_H(textonly_pdf, false, diff --git a/tessdata/configs/lstmbox b/tessdata/configs/lstmbox new file mode 100644 index 0000000000..a6f2cedc50 --- /dev/null +++ b/tessdata/configs/lstmbox @@ -0,0 +1 @@ +tessedit_create_lstmbox 1