diff --git a/api/baseapi.cpp b/api/baseapi.cpp
index 1045468d74..991abf736e 100644
--- a/api/baseapi.cpp
+++ b/api/baseapi.cpp
@@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
*hocr_str += "\">";
}
+// Appends "\t<left>\t<top>\t<width>\t<height>" for the box of |it| at |level|.
+// Coordinates start at 0 so nothing indeterminate is emitted if BoundingBox() leaves them unset.
+static void AddBoxTohOCRTSV(const PageIterator *it,
+ PageIteratorLevel level,
+ STRING* hocr_str) {
+ int left = 0, top = 0, right = 0, bottom = 0;
+ it->BoundingBox(level, &left, &top, &right, &bottom);
+ hocr_str->add_str_int("\t", left);
+ hocr_str->add_str_int("\t", top);
+ hocr_str->add_str_int("\t", right - left + 1);  // NOTE(review): +1 presumes an inclusive right edge - confirm
+ hocr_str->add_str_int("\t", bottom - top + 1);  // NOTE(review): +1 presumes an inclusive bottom edge - confirm
+}
+
/**
* Make a HTML-formatted string with hOCR markup from the internal
* data structures.
@@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
delete[] utf8_str;
#endif
- hocr_str.add_str_int("
\n";
+ int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;
+
+ hocr_str.add_str_int("1\t", page_num);
+ hocr_str.add_str_int("\t", block_num);
+ hocr_str.add_str_int("\t", par_num);
+ hocr_str.add_str_int("\t", line_num);
+ hocr_str.add_str_int("\t", word_num);
+ hocr_str.add_str_int("\t", rect_left_);
+ hocr_str.add_str_int("\t", rect_top_);
+ hocr_str.add_str_int("\t", rect_width_);
+ hocr_str.add_str_int("\t", rect_height_);
+ hocr_str += "\t-1\t\n";
ResultIterator *res_it = GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
@@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
// Open any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
- hocr_str.add_str_int("
WordRecognitionLanguage()) {
- hocr_str += " lang='";
- hocr_str += res_it->WordRecognitionLanguage();
- hocr_str += "'";
- }
- switch (res_it->WordDirection()) {
- case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
- case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
- default: // Do nothing.
- break;
- }
- hocr_str += ">";
+ word_num++;
+ hocr_str.add_str_int("5\t", page_num);
+ hocr_str.add_str_int("\t", block_num);
+ hocr_str.add_str_int("\t", par_num);
+ hocr_str.add_str_int("\t", line_num);
+ hocr_str.add_str_int("\t", word_num);
+ hocr_str.add_str_int("\t", left);
+ hocr_str.add_str_int("\t", top);
+ hocr_str.add_str_int("\t", right - left + 1);
+ hocr_str.add_str_int("\t", bottom - top + 1);
+ hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
- if (bold) hocr_str += "";
- if (italic) hocr_str += "";
+ hocr_str += "\t";
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
@@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
- if (italic) hocr_str += "";
- if (bold) hocr_str += "";
- hocr_str += " ";
+ hocr_str += "\n";
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
- hocr_str += "\n ";
lcnt++;
}
if (last_word_in_para) {
- hocr_str += "\n \n";
pcnt++;
}
if (last_word_in_block) {
- hocr_str += "
\n";
bcnt++;
}
}
- hocr_str += "
\n";
char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
diff --git a/api/renderer.cpp b/api/renderer.cpp
index 2aaa36992f..127d20053f 100644
--- a/api/renderer.cpp
+++ b/api/renderer.cpp
@@ -193,43 +193,20 @@ TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
}
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
- AppendString(
- "\n"
- "\n"
- "\n \n