diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 1045468d74..991abf736e 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it, *hocr_str += "\">"; } +static void AddBoxTohOCRTSV(const PageIterator *it, + PageIteratorLevel level, + STRING* hocr_str) { + int left, top, right, bottom; + it->BoundingBox(level, &left, &top, &right, &bottom); + hocr_str->add_str_int("\t", left); + hocr_str->add_str_int("\t", top); + hocr_str->add_str_int("\t", right - left + 1); + hocr_str->add_str_int("\t", bottom - top + 1); +} + + + /** * Make a HTML-formatted string with hOCR markup from the internal * data structures. @@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) { delete[] utf8_str; #endif - hocr_str.add_str_int("
\n"; + int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0; + + hocr_str.add_str_int("1\t", page_num); + hocr_str.add_str_int("\t", block_num); + hocr_str.add_str_int("\t", par_num); + hocr_str.add_str_int("\t", line_num); + hocr_str.add_str_int("\t", word_num); + hocr_str.add_str_int("\t", rect_left_); + hocr_str.add_str_int("\t", rect_top_); + hocr_str.add_str_int("\t", rect_width_); + hocr_str.add_str_int("\t", rect_height_); + hocr_str += "\t-1\t\n"; ResultIterator *res_it = GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { @@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) { // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - hocr_str.add_str_int("
WordRecognitionLanguage()) { - hocr_str += " lang='"; - hocr_str += res_it->WordRecognitionLanguage(); - hocr_str += "'"; - } - switch (res_it->WordDirection()) { - case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break; - case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break; - default: // Do nothing. - break; - } - hocr_str += ">"; + word_num++; + hocr_str.add_str_int("5\t", page_num); + hocr_str.add_str_int("\t", block_num); + hocr_str.add_str_int("\t", par_num); + hocr_str.add_str_int("\t", line_num); + hocr_str.add_str_int("\t", word_num); + hocr_str.add_str_int("\t", left); + hocr_str.add_str_int("\t", top); + hocr_str.add_str_int("\t", right - left + 1); + hocr_str.add_str_int("\t", bottom - top + 1); + hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD)); bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD); bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); - if (bold) hocr_str += ""; - if (italic) hocr_str += ""; + hocr_str += "\t"; do { const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL); if (grapheme && grapheme[0] != 0) { @@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) { delete []grapheme; res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - if (italic) hocr_str += ""; - if (bold) hocr_str += ""; - hocr_str += " "; + hocr_str += "\n"; wcnt++; // Close any ending block/paragraph/textline. if (last_word_in_line) { - hocr_str += "\n "; lcnt++; } if (last_word_in_para) { - hocr_str += "\n

\n"; pcnt++; } if (last_word_in_block) { - hocr_str += "
\n"; bcnt++; } } - hocr_str += "
\n"; char *ret = new char[hocr_str.length() + 1]; strcpy(ret, hocr_str.string()); diff --git a/api/renderer.cpp b/api/renderer.cpp index 2aaa36992f..127d20053f 100644 --- a/api/renderer.cpp +++ b/api/renderer.cpp @@ -193,43 +193,20 @@ TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info) } bool TessHOcrTsvRenderer::BeginDocumentHandler() { - AppendString( - "\n" - "\n" - "\n \n \n"); - AppendString(title()); - AppendString( - "\n" - "\n" - " \n" - " \n" - "\n\n"); - + AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n"); return true; } bool TessHOcrTsvRenderer::EndDocumentHandler() { - AppendString(" \n\n"); - return true; } bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) { - char* hocr = api->GetHOCRText(imagenum()); - if (hocr == NULL) return false; + char* hocrtsv = api->GetHOCRTSVText(imagenum()); + if (hocrtsv == NULL) return false; - AppendString(hocr); - delete[] hocr; + AppendString(hocrtsv); + delete[] hocrtsv; return true; }