Skip to content

Commit

Permalink
Emit fewer "lang" attributes
Browse files Browse the repository at this point in the history
Add a "lang" attribute to the paragraph markup, and only emit a
word-level "lang" attribute when it differs from the paragraph's value.
  • Loading branch information
tfmorris committed Feb 17, 2016
1 parent ea401c9 commit 6c44775
Showing 1 changed file with 10 additions and 2 deletions.
12 changes: 10 additions & 2 deletions api/baseapi.cpp
Expand Up @@ -1445,6 +1445,7 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
bool para_is_ltr = true; // Default direction is LTR
const char* paragraph_lang = NULL;
bool font_info = false;
GetBoolVariable("hocr_font_info", &font_info);

Expand Down Expand Up @@ -1506,6 +1507,12 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
hocr_str += " dir='rtl'";
}
AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
paragraph_lang = res_it->WordRecognitionLanguage();
if (paragraph_lang) {
hocr_str += " lang='";
hocr_str += paragraph_lang;
hocr_str += "'";
}
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
Expand Down Expand Up @@ -1538,9 +1545,10 @@ char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
hocr_str.add_str_int("; x_fsize ", pointsize);
}
hocr_str += "'";
if (res_it->WordRecognitionLanguage()) {
const char* lang = res_it->WordRecognitionLanguage();
if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
hocr_str += " lang='";
hocr_str += res_it->WordRecognitionLanguage();
hocr_str += lang;
hocr_str += "'";
}
switch (res_it->WordDirection()) {
Expand Down

0 comments on commit 6c44775

Please sign in to comment.