Skip to content

Commit

Permalink
replace CubeUtils::UTF8ToUTF32 in pdfrenderer
Browse files Browse the repository at this point in the history
  • Loading branch information
jbreiden authored and zdenop committed Aug 7, 2015
1 parent 66a76a9 commit 9dcf2c6
Showing 1 changed file with 18 additions and 6 deletions.
24 changes: 18 additions & 6 deletions api/pdfrenderer.cpp
Expand Up @@ -413,13 +413,25 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
do {
const char *grapheme = res_it->GetUTF8Text(RIL_SYMBOL);
if (grapheme && grapheme[0] != '\0') {
// TODO(jbreiden) Do a real UTF-16BE conversion
// http://en.wikipedia.org/wiki/UTF-16#Example_UTF-16_encoding_procedure
string_32 utf32;
CubeUtils::UTF8ToUTF32(grapheme, &utf32);
GenericVector<int> unicodes;
UNICHAR::UTF8ToUnicode(grapheme, &unicodes);
char utf16[20];
for (int i = 0; i < static_cast<int>(utf32.length()); i++) {
snprintf(utf16, sizeof(utf16), "<%04X>", utf32[i]);
for (int i = 0; i < unicodes.length(); i++) {
int code = unicodes[i];
// Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16
if ((code > 0xD7FF && code < 0xE0000) || code > 0x10FFFF) {
tprintf("Dropping invalid codepoint %d\n", code);
continue;
}
if (code < 0x10000) {
snprintf(utf16, sizeof(utf16), "<%04X>", code);
} else {
int a = code - 0x010000;
int high_surrogate = (0x03FF & (a >> 10)) + 0xD800;
int low_surrogate = (0x03FF & a) + 0xDC00;
snprintf(utf16, sizeof(utf16), "<%04X%04X>",
high_surrogate, low_surrogate);
}
pdf_word += utf16;
pdf_word_len++;
}
Expand Down

0 comments on commit 9dcf2c6

Please sign in to comment.