Skip to content

Commit

Permalink
Adds char* GetHOCRTSVText(int) as placeholder. Copy of char* GetHOCRT…
Browse files Browse the repository at this point in the history
…ext(int).
  • Loading branch information
sundarcf authored and tfmorris committed Mar 1, 2016
1 parent 2597296 commit d04e325
Show file tree
Hide file tree
Showing 2 changed files with 170 additions and 0 deletions.
162 changes: 162 additions & 0 deletions api/baseapi.cpp
Expand Up @@ -1601,6 +1601,168 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
return ret;
}

/**
* Make a TSV-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
* Image name/input_file_ can be set by SetInputName before calling
* GetHOCRText
* STL removed from original patch submission and refactored by rays.
*/
char* TessBaseAPI::GetHOCRTSVText(int page_number) {
if (tesseract_ == NULL ||
(page_res_ == NULL && Recognize(NULL) < 0))
return NULL;

int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool font_info = false;
GetBoolVariable("hocr_font_info", &font_info);

STRING hocr_str("");

if (input_file_ == NULL)
SetInputName(NULL);

#ifdef _WIN32
// convert input name from ANSI encoding to utf-8
int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
NULL, NULL);
wchar_t *uni16_str = new WCHAR[str16_len];
str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
uni16_str, str16_len);
int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL,
NULL, NULL, NULL);
char *utf8_str = new char[utf8_len];
WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
utf8_len, NULL, NULL);
*input_file_ = utf8_str;
delete[] uni16_str;
delete[] utf8_str;
#endif

hocr_str.add_str_int(" <div class='ocr_page' id='page_", page_id);
hocr_str += "' title='image \"";
if (input_file_) {
hocr_str += HOcrEscape(input_file_->string());
} else {
hocr_str += "unknown";
}
hocr_str.add_str_int("\"; bbox ", rect_left_);
hocr_str.add_str_int(" ", rect_top_);
hocr_str.add_str_int(" ", rect_width_);
hocr_str.add_str_int(" ", rect_height_);
hocr_str.add_str_int("; ppageno ", page_number);
hocr_str += "'>\n";

ResultIterator *res_it = GetIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}

// Open any new block/paragraph/textline.
if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
hocr_str.add_str_int(" <div class='ocr_carea' id='block_", page_id);
hocr_str.add_str_int("_", bcnt);
AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_PARA)) {
if (res_it->ParagraphIsLtr()) {
hocr_str.add_str_int("\n <p class='ocr_par' dir='ltr' id='par_",
page_id);
hocr_str.add_str_int("_", pcnt);
} else {
hocr_str.add_str_int("\n <p class='ocr_par' dir='rtl' id='par_",
page_id);
hocr_str.add_str_int("_", pcnt);
}
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
hocr_str.add_str_int("_", lcnt);
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
}

// Now, process the word...
hocr_str.add_str_int("<span class='ocrx_word' id='word_", page_id);
hocr_str.add_str_int("_", wcnt);
int left, top, right, bottom;
bool bold, italic, underlined, monospace, serif, smallcaps;
int pointsize, font_id;
const char *font_name;
res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
&monospace, &serif, &smallcaps,
&pointsize, &font_id);
hocr_str.add_str_int("' title='bbox ", left);
hocr_str.add_str_int(" ", top);
hocr_str.add_str_int(" ", right);
hocr_str.add_str_int(" ", bottom);
hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
if (font_info) {
hocr_str += "; x_font ";
hocr_str += HOcrEscape(font_name);
hocr_str.add_str_int("; x_fsize ", pointsize);
}
hocr_str += "'";
if (res_it->WordRecognitionLanguage()) {
hocr_str += " lang='";
hocr_str += res_it->WordRecognitionLanguage();
hocr_str += "'";
}
switch (res_it->WordDirection()) {
case DIR_LEFT_TO_RIGHT: hocr_str += " dir='ltr'"; break;
case DIR_RIGHT_TO_LEFT: hocr_str += " dir='rtl'"; break;
default: // Do nothing.
break;
}
hocr_str += ">";
bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
if (bold) hocr_str += "<strong>";
if (italic) hocr_str += "<em>";
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != 0) {
if (grapheme[1] == 0) {
hocr_str += HOcrEscape(grapheme);
} else {
hocr_str += grapheme;
}
}
delete []grapheme;
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
if (italic) hocr_str += "</em>";
if (bold) hocr_str += "</strong>";
hocr_str += "</span> ";
wcnt++;
// Close any ending block/paragraph/textline.
if (last_word_in_line) {
hocr_str += "\n </span>";
lcnt++;
}
if (last_word_in_para) {
hocr_str += "\n </p>\n";
pcnt++;
}
if (last_word_in_block) {
hocr_str += " </div>\n";
bcnt++;
}
}
hocr_str += " </div>\n";

char *ret = new char[hocr_str.length() + 1];
strcpy(ret, hocr_str.string());
delete res_it;
return ret;
}

/** The 5 numbers output for each box (the usual 4 and a page number.) */
const int kNumbersPerBlob = 5;
/**
Expand Down
8 changes: 8 additions & 0 deletions api/baseapi.h
Expand Up @@ -602,6 +602,14 @@ class TESS_API TessBaseAPI {
*/
char* GetHOCRText(int page_number);

/**
* Make a TSV-formatted string with hOCR markup from the internal
* data structures.
* page_number is 0-based but will appear in the output as 1-based.
*/
char* GetHOCRTSVText(int page_number);


/**
* The recognized text is returned as a char* which is coded in the same
* format as a box file used in training. Returned string must be freed with
Expand Down

0 comments on commit d04e325

Please sign in to comment.