Skip to content

Commit

Permalink
Added script-specific validation and normalization for virama-using s…
Browse files Browse the repository at this point in the history
…cripts and updated normalization for others
  • Loading branch information
theraysmith committed Jul 14, 2017
1 parent da03e4e commit df41eab
Show file tree
Hide file tree
Showing 15 changed files with 1,518 additions and 90 deletions.
16 changes: 16 additions & 0 deletions ccutil/unichar.cpp
Expand Up @@ -226,3 +226,19 @@ std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
return unicodes;
}

// Encodes a sequence of UTF-32 code points as a single UTF-8 string.
// If any code point fails to encode, the partial result is discarded and an
// empty string is returned.
string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
  string result;
  for (char32 code_point : str32) {
    UNICHAR encoded(code_point);
    // A non-positive length presumably means the constructor rejected the
    // code point.
    if (encoded.utf8_len() <= 0) return "";
    const char* bytes = encoded.utf8();
    // utf8_step gives the byte count to copy; non-positive means invalid.
    const int num_bytes = utf8_step(bytes);
    if (num_bytes <= 0) return "";
    result.append(bytes, num_bytes);
  }
  return result;
}

} // namespace tesseract
8 changes: 6 additions & 2 deletions training/Makefile.am
Expand Up @@ -21,7 +21,9 @@ noinst_HEADERS = \
boxchar.h commandlineflags.h commontraining.h degradeimage.h \
fileio.h icuerrorcode.h ligature_table.h lstmtester.h normstrngs.h \
mergenf.h pango_font_info.h stringrenderer.h \
tessopt.h tlog.h unicharset_training_utils.h util.h
tessopt.h tlog.h unicharset_training_utils.h util.h \
validate_grapheme.h validate_indic.h validate_khmer.h \
validate_myanmar.h validator.h

noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la

Expand All @@ -32,7 +34,9 @@ libtesseract_training_la_LIBADD = \
libtesseract_training_la_SOURCES = \
boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \
fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \
stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp
stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp \
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp \
validate_myanmar.cpp validator.cpp

libtesseract_tessopt_la_SOURCES = \
tessopt.cpp
Expand Down
195 changes: 133 additions & 62 deletions training/normstrngs.cpp
Expand Up @@ -21,6 +21,10 @@
#include "normstrngs.h"

#include <assert.h>
#include <string>
#include <unordered_map>
#include <vector>

#include "icuerrorcode.h"
#include "unichar.h"
#include "unicode/normalizer2.h" // From libicu
Expand All @@ -34,99 +38,166 @@ namespace tesseract {
// Returns true if ch equals one of the hyphen-like code points listed in
// kHyphenPuncUnicodes (ASCII '-' plus the dash/minus variants named in the
// per-entry comments), false otherwise.
// NOTE(review): the table entries and the comparison statement each appear
// twice below with different formatting; the array is declared with 13
// elements, so only one copy of the entries is expected to remain.
bool is_hyphen_punc(const char32 ch) {
static const int kNumHyphenPuncUnicodes = 13;
static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
'-',
0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
'-', 0x2010, 0x2011, 0x2012,
0x2013, 0x2014, 0x2015, // hyphen..horizontal bar
0x207b, // superscript minus
0x208b, // subscript minus
0x2212, // minus sign
0xfe58, // small em dash
0xfe63, // small hyphen-minus
0xff0d, // fullwidth hyphen-minus
};
// Linear scan of the small fixed table.
for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
if (kHyphenPuncUnicodes[i] == ch)
return true;
if (kHyphenPuncUnicodes[i] == ch) return true;
}
return false;
}

// Returns true if ch equals one of the single-quote-like code points listed
// in kSingleQuoteUnicodes (ASCII apostrophe and backtick plus the variants
// named in the per-entry comments), false otherwise.
// NOTE(review): the table entries and the comparison statement each appear
// twice below with different formatting; the array is declared with 8
// elements, so only one copy of the entries is expected to remain.
bool is_single_quote(const char32 ch) {
static const int kNumSingleQuoteUnicodes = 8;
static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
'\'',
'`',
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201B, // single high-reversed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
0xFF07, // fullwidth apostrophe
'\'', '`',
0x2018, // left single quotation mark (English, others)
0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.)
// We may have to introduce a comma set with 0x201a
0x201B, // single high-reversed-9 quotation mark (PropList.txt)
0x2032, // prime
0x300C, // left corner bracket (East Asian languages)
0xFF07, // fullwidth apostrophe
};
// Linear scan of the small fixed table.
for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
if (kSingleQuoteUnicodes[i] == ch)
return true;
if (kSingleQuoteUnicodes[i] == ch) return true;
}
return false;
}

// Returns true if ch equals one of the double-quote-like code points listed
// in kDoubleQuoteUnicodes (ASCII '"' plus the variants named in the per-entry
// comments), false otherwise.
// NOTE(review): the table entries and the comparison statement each appear
// twice below with different formatting; the array is declared with 8
// elements, so only one copy of the entries is expected to remain.
bool is_double_quote(const char32 ch) {
static const int kNumDoubleQuoteUnicodes = 8;
static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
'"',
0x201C, // left double quotation mark (English, others)
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x301D, // reversed double prime quotation mark (East Asian langs, horiz.)
0x301E, // close double prime (East Asian languages written horizontally)
0xFF02, // fullwidth quotation mark
'"',
0x201C, // left double quotation mark (English, others)
0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.)
0x201F, // double high-reversed-9 quotation mark (PropList.txt)
0x2033, // double prime
0x301D, // reversed double prime quotation mark (East Asian langs,
// horiz.)
0x301E, // close double prime (East Asian languages written horizontally)
0xFF02, // fullwidth quotation mark
};
// Linear scan of the small fixed table.
for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
if (kDoubleQuoteUnicodes[i] == ch)
return true;
if (kDoubleQuoteUnicodes[i] == ch) return true;
}
return false;
}

// Normalizes a UTF-8 string one code point at a time: the input is converted
// to UTF-32, each code point is passed through NormalizeChar32 with the given
// decompose flag, and the concatenated results are re-encoded as UTF-8.
STRING NormalizeUTF8String(bool decompose, const char* str8) {
  GenericVector<char32> input32;
  UTF8ToUTF32(str8, &input32);

  GenericVector<char32> output32;
  GenericVector<char32> expansion;  // Scratch buffer reused per code point.
  for (int src = 0; src < input32.length(); ++src) {
    expansion.clear();
    NormalizeChar32(input32[src], decompose, &expansion);
    // Append everything this code point normalized to.
    for (int k = 0; k < expansion.length(); ++k) {
      output32.push_back(expansion[k]);
    }
  }

  STRING result;
  UTF32ToUTF8(output32, &result);
  return result;
}

void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
// Helper runs a standard unicode normalization, optional OCR normalization,
// and leaves the result as char32 for subsequent processing.
//   u_mode:        which of NFD/NFC/NFKD/NFKC to apply.
//   ocr_normalize: if OCRNorm::kNormalize, each output code point is also
//                  passed through OCRNormalize.
//   str8:          input text, handed to ICU as UTF-8.
//   normed32:      output; normalized code points are appended to it (it is
//                  not cleared here).
// NOTE(review): this body also contains statements from an earlier
// per-character routine (the `nfkc` instance that reads `decompose`, the
// UnicodeString built from `ch`, and the normalize call through `nfkc`).
// They redeclare uch_str/norm_str and use names that are not parameters of
// this helper; confirm which statements are meant to remain.
static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
const char* str8,
std::vector<char32>* normed32) {
// Convert to ICU string for unicode normalization.
icu::UnicodeString uch_str(str8, "UTF-8");
IcuErrorCode error_code;
const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE,
error_code);
// Convert the enum to the new weird icu representation.
// The K (compatibility) modes select the "nfkc" data, the others "nfc".
const char* norm_type =
u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC
? "nfkc"
: "nfc";
// The C modes compose; the D modes decompose.
UNormalization2Mode compose =
u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
? UNORM2_COMPOSE
: UNORM2_DECOMPOSE;
// Pointer to singleton does not require deletion.
const icu::Normalizer2* normalizer =
icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
error_code.assertSuccess();
error_code.reset();

icu::UnicodeString uch_str(static_cast<UChar32>(ch));
icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
error_code.assertSuccess();
// Convert to char32 for output. OCR normalization if required.
// The reserve size is only a hint: some code points are skipped below, and
// length() presumably counts UTF-16 units rather than code points.
normed32->reserve(norm_str.length()); // An approximation.
// Walk the normalized string; moveIndex32(offset, 1) supplies the next offset.
for (int offset = 0; offset < norm_str.length();
offset = norm_str.moveIndex32(offset, 1)) {
char32 ch = norm_str.char32At(offset);
// Skip all ZWS, RTL and LTR marks.
if (Validator::IsZeroWidthMark(ch)) continue;
if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
normed32->push_back(ch);
}
}

// Helper removes joiners from strings that contain no letters.
static void StripJoiners(std::vector<char32>* str32) {
for (char32 ch : *str32) {
if (u_isalpha(ch)) return;
}
int len = 0;
for (char32 ch : *str32) {
if (ch != Validator::kZeroWidthJoiner &&
ch != Validator::kZeroWidthNonJoiner) {
(*str32)[len++] = ch;
}
}
str32->resize(len);
}

// Normalizes the UTF-8 string str8 using the requested unicode normalization
// mode and optional OCR normalization. When grapheme_normalize is kNormalize,
// joiners are stripped from letter-free strings and the text is validated and
// cleaned as a single grapheme string.
// Returns true on success. On false, some failure or invalidity was present:
// *normalized is left unwritten if validation produced no output, otherwise
// it holds a "best effort" result. normalized may be nullptr if only the
// status is wanted.
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                         GraphemeNorm grapheme_normalize, const char* str8,
                         string* normalized) {
  std::vector<char32> normed32;
  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);

  // Without grapheme normalization the unicode/OCR result is final.
  if (grapheme_normalize != GraphemeNorm::kNormalize) {
    if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
    return true;
  }

  StripJoiners(&normed32);
  std::vector<std::vector<char32>> graphemes;
  const bool valid = Validator::ValidateCleanAndSegment(
      GraphemeNormMode::kSingleString, false, normed32, &graphemes);
  // No usable output counts as failure regardless of the validator's verdict.
  if (graphemes.empty() || graphemes[0].empty()) return false;
  if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
  return valid;
}

str->clear();
for (int i = 0; i < norm_str.length(); ++i) {
// If any spaces were added by NFKC, pretend normalization is a nop.
if (norm_str[i] == ' ') {
str->clear();
str->push_back(ch);
break;
} else {
str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
// Normalizes the UTF-8 string str8 using the given unicode and OCR modes,
// then validates, cleans and splits it into graphemes as directed by g_mode.
// The resulting pieces replace the contents of *graphemes as UTF-8 strings.
// Returns true on success. If false is returned, some failure or invalidity
// was present, and the output is produced on a "best effort" basis.
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
                                  GraphemeNormMode g_mode, bool report_errors,
                                  const char* str8,
                                  std::vector<string>* graphemes) {
  std::vector<char32> normed32;
  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
  StripJoiners(&normed32);

  std::vector<std::vector<char32>> graphemes32;
  bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
                                                    normed32, &graphemes32);
  const bool check_segmentation =
      success && g_mode != GraphemeNormMode::kSingleString;
  if (check_segmentation) {
    // Cleaning may have altered the text, which can invalidate the
    // segmentation. Rebuild the cleaned string and, if it differs from the
    // input, segment it a second time.
    std::vector<char32> cleaned32;
    for (size_t i = 0; i < graphemes32.size(); ++i) {
      const std::vector<char32>& piece = graphemes32[i];
      cleaned32.insert(cleaned32.end(), piece.begin(), piece.end());
    }
    if (!(cleaned32 == normed32)) {
      graphemes32.clear();
      success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
                                                   cleaned32, &graphemes32);
    }
  }

  // Re-encode every grapheme as UTF-8 for the caller.
  graphemes->clear();
  graphemes->reserve(graphemes32.size());
  for (size_t i = 0; i < graphemes32.size(); ++i) {
    graphemes->push_back(UNICHAR::UTF32ToUTF8(graphemes32[i]));
  }
  return success;
}

// Apply just the OCR-specific normalizations and return the normalized char.
Expand Down
64 changes: 40 additions & 24 deletions training/normstrngs.h
Expand Up @@ -21,34 +21,50 @@
#ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
#define TESSERACT_CCUTIL_NORMSTRNGS_H_

#include "genericvector.h"
#include "strngs.h"
#include <string>
#include <vector>

typedef signed int char32;
#include "validator.h"

namespace tesseract {

// UTF-8 to UTF-32 conversion function.
void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);

// UTF-32 to UTF-8 conversion function.
void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);

// Normalize a single char32 using NFKC + OCR-specific transformations.
// NOTE that proper NFKC may require multiple characters as input. The
// assumption of this function is that the input is already as fully composed
// as it can be, but may require some compatibility normalizations or just
// OCR evaluation related normalizations.
void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str);

// Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
// can contain multiple UTF32 code points.
STRING NormalizeUTF8String(bool decompose, const char* str8);
// Convenience overload that composes rather than decomposes. Composition
// stays the default until decomposition is shown to benefit at least one
// language.
inline STRING NormalizeUTF8String(const char* str8) {
  const bool decompose = false;
  return NormalizeUTF8String(decompose, str8);
}
// The standard unicode normalizations.
// The "K" forms use compatibility mappings; the "C" forms compose and the
// "D" forms decompose.
enum class UnicodeNormMode {
kNFD,   // Canonical decomposition.
kNFC,   // Canonical composition.
kNFKD,  // Compatibility decomposition.
kNFKC,  // Compatibility composition.
};

// To normalize away differences in punctuation that are ambiguous, like
// curly quotes and different widths of dash.
enum class OCRNorm {
kNone,       // Leave code points as produced by unicode normalization.
kNormalize,  // Additionally apply OCRNormalize to each code point.
};

// To validate and normalize away some subtle differences that can occur in
// Indic scripts, eg ensuring that an explicit virama is always followed by
// a zero-width non-joiner.
enum class GraphemeNorm {
kNone,       // Skip grapheme validation/normalization.
kNormalize,  // Validate and clean the string as a single grapheme string.
};

// Normalizes a UTF8 string according to the given modes. Returns true on
// success. If false is returned, some failure or invalidity was present, and
// the result string is produced on a "best effort" basis.
bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
GraphemeNorm grapheme_normalize, const char* str8,
string* normalized);
// Normalizes a UTF8 string according to the given modes and splits into
// graphemes according to g_mode. Returns true on success. If false is returned,
// some failure or invalidity was present, and the result string is produced on
// a "best effort" basis.
bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
GraphemeNormMode g_mode, bool report_errors,
const char* str8,
std::vector<string>* graphemes);

// Applies just the OCR-specific normalizations and returns the normalized char.
char32 OCRNormalize(char32 ch);
Expand Down
10 changes: 8 additions & 2 deletions training/unicharset_training_utils.cpp
Expand Up @@ -122,8 +122,14 @@ void SetupBasicProperties(bool report_errors, bool decompose,
}

// Record normalized version of this unichar.
string normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str);
if (unichar_id != 0 && !normed_str.empty()) {
string normed_str;
if (unichar_id != 0 &&
tesseract::NormalizeUTF8String(
decompose ? tesseract::UnicodeNormMode::kNFKD
: tesseract::UnicodeNormMode::kNFKC,
tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
unichar_str, &normed_str) &&
!normed_str.empty()) {
unicharset->set_normed(unichar_id, normed_str.c_str());
} else {
unicharset->set_normed(unichar_id, unichar_str);
Expand Down

1 comment on commit df41eab

@hoangtocdo90
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi Ray!
Please add #include in validator.cpp
failed build in windows 10 Visual Studio 2015

Please sign in to comment.