Skip to content

Commit

Permalink
check for punctuation and CJK characters
Browse files Browse the repository at this point in the history
that should be treated as words when loading a text page
  • Loading branch information
chrox committed Jul 19, 2013
1 parent 660b65a commit d45c0ac
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions pdf.c
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,18 @@ static void load_lua_text_page(lua_State *L, fz_text_page *page)
}
bbox = fz_union_rect(bbox, span->text[i].bbox);
linebbox = fz_union_rect(linebbox, span->text[i].bbox);
/* punctuation and CJK characters end the current word right after the character */
if ((span->text[i].c >= 0x4e00 && span->text[i].c <= 0x9FFF) || // CJK Unified Ideographs
(span->text[i].c >= 0x2000 && span->text[i].c <= 0x206F) || // General Punctuation
(span->text[i].c >= 0x3000 && span->text[i].c <= 0x303F) || // CJK Symbols and Punctuation
(span->text[i].c >= 0x3400 && span->text[i].c <= 0x4DBF) || // CJK Unified Ideographs Extension A
(span->text[i].c >= 0xF900 && span->text[i].c <= 0xFAFF) || // CJK Compatibility Ideographs
(span->text[i].c >= 0xFF01 && span->text[i].c <= 0xFFEE) || // Halfwidth and Fullwidth Forms
(span->text[i].c >= 0x20000 && span->text[i].c <= 0x2A6DF) // CJK Unified Ideographs Extension B
) {
i++;
break;
}
}
lua_pushstring(L, "word");
luaL_pushresult(&textbuf);
Expand Down

0 comments on commit d45c0ac

Please sign in to comment.