Skip to content

Commit

Permalink
Render output in TSV format.
Browse files Browse the repository at this point in the history
  • Loading branch information
sundarcf authored and tfmorris committed Mar 1, 2016
1 parent 738fe4f commit b1e4a82
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 91 deletions.
125 changes: 62 additions & 63 deletions api/baseapi.cpp
Expand Up @@ -1417,6 +1417,19 @@ static void AddBoxTohOCR(const ResultIterator *it,
*hocr_str += "\">";
}

/**
 * Appends the bounding box of the element at the iterator's current
 * position to a TSV row as four tab-prefixed integer fields, in the
 * order: left, top, width, height.
 *
 * Width and height are computed from the box edges as
 * (right - left + 1) and (bottom - top + 1).
 *
 * @param it        iterator positioned on the element to describe
 * @param level     page hierarchy level whose box is queried
 * @param hocr_str  output string the four fields are appended to
 */
static void AddBoxTohOCRTSV(const PageIterator *it,
                            PageIteratorLevel level,
                            STRING* hocr_str) {
  int x0, y0, x1, y1;
  it->BoundingBox(level, &x0, &y0, &x1, &y1);

  const int box_width = x1 - x0 + 1;
  const int box_height = y1 - y0 + 1;

  // Every field is preceded by a tab so it can follow the row's id columns.
  const int fields[] = {x0, y0, box_width, box_height};
  for (int i = 0; i < 4; ++i) {
    hocr_str->add_str_int("\t", fields[i]);
  }
}



/**
* Make an HTML-formatted string with hOCR markup from the internal
* data structures.
Expand Down Expand Up @@ -1641,19 +1654,18 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
delete[] utf8_str;
#endif

hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
hocr_str += "' title='image \"";
if (input_file_) {
hocr_str += HOcrEscape(input_file_->string());
} else {
hocr_str += "unknown";
}
hocr_str.add_str_int("\"; bbox ", rect_left_);
hocr_str.add_str_int(" ", rect_top_);
hocr_str.add_str_int(" ", rect_width_);
hocr_str.add_str_int(" ", rect_height_);
hocr_str.add_str_int("; ppageno ", page_number);
hocr_str += "'>\n";
int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0;

hocr_str.add_str_int("1\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
hocr_str.add_str_int("\t", rect_left_);
hocr_str.add_str_int("\t", rect_top_);
hocr_str.add_str_int("\t", rect_width_);
hocr_str.add_str_int("\t", rect_height_);
hocr_str += "\t-1\t\n";

ResultIterator *res_it = GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
Expand All @@ -1664,31 +1676,37 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {

// Open any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
hocr_str.add_str_int(" <div class='ocr_carea' id='block_", page_id);
hocr_str.add_str_int("_", bcnt);
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
block_num++, par_num = 0, line_num = 0, word_num = 0;
hocr_str.add_str_int("2\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_BLOCK, &hocr_str);
hocr_str += "\t-1\t\n";
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
if (res_it->ParagraphIsLtr()) {
hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_",
page_id);
hocr_str.add_str_int("_", pcnt);
} else {
hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_",
page_id);
hocr_str.add_str_int("_", pcnt);
}
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
par_num++, line_num = 0, word_num = 0;
hocr_str.add_str_int("3\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_PARA, &hocr_str);
hocr_str += "\t-1\t\n";
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
hocr_str.add_str_int("_", lcnt);
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
line_num++, word_num = 0;
hocr_str.add_str_int("4\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
AddBoxTohOCRTSV(res_it, RIL_TEXTLINE, &hocr_str);
hocr_str += "\t-1\t\n";
}

// Now, process the word...
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
hocr_str.add_str_int("_", wcnt);
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
Expand All @@ -1697,34 +1715,21 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
hocr_str.add_str_int("' title='bbox ", left);
hocr_str.add_str_int(" ", top);
hocr_str.add_str_int(" ", right);
hocr_str.add_str_int(" ", bottom);
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
if (font_info) {
hocr_str += "; x_font ";
hocr_str += HOcrEscape(font_name);
hocr_str.add_str_int("; x_fsize ", pointsize);
}
hocr_str += "'";
if (res_it->WordRecognitionLanguage()) {
hocr_str += " lang='";
hocr_str += res_it->WordRecognitionLanguage();
hocr_str += "'";
}
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
default: // Do nothing.
break;
}
hocr_str += ">";
word_num++;
hocr_str.add_str_int("5\t", page_num);
hocr_str.add_str_int("\t", block_num);
hocr_str.add_str_int("\t", par_num);
hocr_str.add_str_int("\t", line_num);
hocr_str.add_str_int("\t", word_num);
hocr_str.add_str_int("\t", left);
hocr_str.add_str_int("\t", top);
hocr_str.add_str_int("\t", right - left + 1);
hocr_str.add_str_int("\t", bottom - top + 1);
hocr_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>";
hocr_str += "\t";
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
Expand All @@ -1737,25 +1742,19 @@ char* TessBaseAPI::GetHOCRTSVText(int page_number) {
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>";
hocr_str += "</span> ";
hocr_str += "\n";
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
hocr_str += "\n </span>";
lcnt++;
}
if (last_word_in_para) {
hocr_str += "\n </p>\n";
pcnt++;
}
if (last_word_in_block) {
hocr_str += " </div>\n";
bcnt++;
}
}
hocr_str += " </div>\n";

char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
Expand Down
33 changes: 5 additions & 28 deletions api/renderer.cpp
Expand Up @@ -193,43 +193,20 @@ TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
}

/**
 * Starts a TSV document by writing the column header row.
 *
 * The output is tab-separated, so only the header line is emitted here;
 * any XHTML preamble would make the file unparseable as TSV.
 * The column names are: level, page_num, block_num, par_num, line_num,
 * word_num, left, top, width, height, conf, text.
 *
 * @return always true
 */
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
  AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num"
               "\tleft\ttop\twidth\theight\tconf\ttext\n");
  return true;
}

/**
 * Finishes a TSV document.
 *
 * Nothing is written: a tab-separated file has no trailer, and appending
 * closing HTML tags would add a malformed final row.
 *
 * @return always true
 */
bool TessHOcrTsvRenderer::EndDocumentHandler() {
  return true;
}

/**
 * Appends the TSV rows for the current image to the output.
 *
 * Only the TSV text is requested from the API. Fetching the hOCR text as
 * well would mix HTML markup into the tab-separated output and would leak
 * that buffer on the early-return path.
 *
 * @param api  recognizer holding the results for the current image
 * @return false if the API produced no TSV text, true otherwise
 */
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
  char* hocrtsv = api->GetHOCRTSVText(imagenum());
  if (hocrtsv == NULL) return false;

  AppendString(hocrtsv);
  // The caller owns the returned buffer and releases it as an array.
  delete[] hocrtsv;

  return true;
}
Expand Down

0 comments on commit b1e4a82

Please sign in to comment.