Skip to content

Commit

Permalink
Always use isascii() with isspace()
Browse files Browse the repository at this point in the history
isspace() must only be used with an unsigned char or EOF argument,
and even then its result can depend on the current locale settings.

While this is not a problem for C/C++ executables which use the default
"C" locale, it becomes a problem when the Tesseract API is called from
languages like Python or Java which don't use the "C" locale.

By calling isascii() before calling isspace() this uncertainty can be
avoided, because any locale will hopefully give identical results for
the basic ASCII character set.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Oct 8, 2018
1 parent 59ebd58 commit dcd0377
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 11 deletions.
2 changes: 1 addition & 1 deletion src/ccmain/paragraphs.cpp
Expand Up @@ -2455,7 +2455,7 @@ static void InitializeRowInfo(bool after_recognition,
int trailing_ws_idx = strlen(text.get()); // strip trailing space
while (trailing_ws_idx > 0 &&
// isspace() only takes ASCII
((text[trailing_ws_idx - 1] & 0x80) == 0) &&
isascii(text[trailing_ws_idx - 1]) &&
isspace(text[trailing_ws_idx - 1]))
trailing_ws_idx--;
if (trailing_ws_idx > 0) {
Expand Down
14 changes: 5 additions & 9 deletions src/ccutil/scanutils.cpp
Expand Up @@ -75,7 +75,7 @@ inline size_t LongBit() {
static inline int
SkipSpace(FILE *s) {
int p;
while (isspace(p = fgetc(s)));
while (isascii(p = fgetc(s)) && isspace(p));
ungetc(p, s); // Make sure next char is available for reading
return p;
}
Expand Down Expand Up @@ -108,9 +108,7 @@ static uintmax_t streamtoumax(FILE* s, int base) {
uintmax_t v = 0;
int d, c = 0;

for (c = fgetc(s);
isspace(static_cast<unsigned char>(c)) && (c != EOF);
c = fgetc(s)) {}
for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s));

// Single optional + or -
if (c == '-' || c == '+') {
Expand Down Expand Up @@ -151,9 +149,7 @@ static double streamtofloat(FILE* s) {
int k = 1;
int w = 0;

for (c = fgetc(s);
isspace(static_cast<unsigned char>(c)) && (c != EOF);
c = fgetc(s));
for (c = fgetc(s); isascii(c) && isspace(c); c = fgetc(s));

// Single optional + or -
if (c == '-' || c == '+') {
Expand Down Expand Up @@ -265,7 +261,7 @@ static int tvfscanf(FILE* stream, const char *format, va_list ap) {
if (ch == '%') {
state = ST_FLAGS;
flags = 0; rank = RANK_INT; width = UINT_MAX;
} else if (isspace(static_cast<unsigned char>(ch))) {
} else if (isascii(ch) && isspace(ch)) {
SkipSpace(stream);
} else {
if (fgetc(stream) != ch)
Expand Down Expand Up @@ -445,7 +441,7 @@ static int tvfscanf(FILE* stream, const char *format, va_list ap) {
unsigned length = 0;
while (width--) {
q = fgetc(stream);
if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
if (isascii(q) && isspace(q) || q <= 0) {
ungetc(q, stream);
break;
}
Expand Down
5 changes: 4 additions & 1 deletion src/wordrec/params_model.cpp
Expand Up @@ -58,7 +58,10 @@ bool ParamsModel::ParseLine(char *line, char** key, float *val) {
if (line[0] == '#')
return false;
int end_of_key = 0;
while (line[end_of_key] && !isspace(line[end_of_key])) end_of_key++;
while (line[end_of_key] &&
!(isascii(line[end_of_key]) && isspace(line[end_of_key]))) {
end_of_key++;
}
if (!line[end_of_key]) {
tprintf("ParamsModel::Incomplete line %s\n", line);
return false;
Expand Down

0 comments on commit dcd0377

Please sign in to comment.