From df41eab6aa1726dfe6725284c94693dff6fa1eda Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Fri, 14 Jul 2017 10:05:05 -0700 Subject: [PATCH] Added script-specific validation and normalization for virama-using scripts and updated normalization for others --- ccutil/unichar.cpp | 16 ++ training/Makefile.am | 8 +- training/normstrngs.cpp | 195 ++++++++++++------ training/normstrngs.h | 64 +++--- training/unicharset_training_utils.cpp | 10 +- training/validate_grapheme.cpp | 174 ++++++++++++++++ training/validate_grapheme.h | 35 ++++ training/validate_indic.cpp | 274 +++++++++++++++++++++++++ training/validate_indic.h | 44 ++++ training/validate_khmer.cpp | 106 ++++++++++ training/validate_khmer.h | 27 +++ training/validate_myanmar.cpp | 160 +++++++++++++++ training/validate_myanmar.h | 47 +++++ training/validator.cpp | 205 ++++++++++++++++++ training/validator.h | 243 ++++++++++++++++++++++ 15 files changed, 1518 insertions(+), 90 deletions(-) create mode 100644 training/validate_grapheme.cpp create mode 100644 training/validate_grapheme.h create mode 100644 training/validate_indic.cpp create mode 100644 training/validate_indic.h create mode 100644 training/validate_khmer.cpp create mode 100644 training/validate_khmer.h create mode 100644 training/validate_myanmar.cpp create mode 100644 training/validate_myanmar.h create mode 100644 training/validator.cpp create mode 100644 training/validator.h diff --git a/ccutil/unichar.cpp b/ccutil/unichar.cpp index 255136f3ff..b568e12a28 100644 --- a/ccutil/unichar.cpp +++ b/ccutil/unichar.cpp @@ -226,3 +226,19 @@ std::vector UNICHAR::UTF8ToUTF32(const char* utf8_str) { return unicodes; } +// Returns an empty string if the input contains an invalid unicode. 
+string UNICHAR::UTF32ToUTF8(const std::vector& str32) { + string utf8_str; + for (char32 ch : str32) { + UNICHAR uni_ch(ch); + int step; + if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) { + utf8_str.append(uni_ch.utf8(), step); + } else { + return ""; + } + } + return utf8_str; +} + +} // namespace tesseract diff --git a/training/Makefile.am b/training/Makefile.am index 654ca3db3e..8d06d9454d 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -21,7 +21,9 @@ noinst_HEADERS = \ boxchar.h commandlineflags.h commontraining.h degradeimage.h \ fileio.h icuerrorcode.h ligature_table.h lstmtester.h normstrngs.h \ mergenf.h pango_font_info.h stringrenderer.h \ - tessopt.h tlog.h unicharset_training_utils.h util.h + tessopt.h tlog.h unicharset_training_utils.h util.h \ + validate_grapheme.h validate_indic.h validate_khmer.h \ + validate_myanmar.h validator.h noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la @@ -32,7 +34,9 @@ libtesseract_training_la_LIBADD = \ libtesseract_training_la_SOURCES = \ boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \ fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \ - stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp + stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp \ + validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp \ + validate_myanmar.cpp validator.cpp libtesseract_tessopt_la_SOURCES = \ tessopt.cpp diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp index 17fe5cf8b7..3334f1dc51 100644 --- a/training/normstrngs.cpp +++ b/training/normstrngs.cpp @@ -21,6 +21,10 @@ #include "normstrngs.h" #include +#include +#include +#include + #include "icuerrorcode.h" #include "unichar.h" #include "unicode/normalizer2.h" // From libicu @@ -34,18 +38,17 @@ namespace tesseract { bool is_hyphen_punc(const char32 ch) { static const int kNumHyphenPuncUnicodes = 13; static const char32 
kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = { - '-', - 0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar - 0x207b, // superscript minus - 0x208b, // subscript minus - 0x2212, // minus sign - 0xfe58, // small em dash - 0xfe63, // small hyphen-minus - 0xff0d, // fullwidth hyphen-minus + '-', 0x2010, 0x2011, 0x2012, + 0x2013, 0x2014, 0x2015, // hyphen..horizontal bar + 0x207b, // superscript minus + 0x208b, // subscript minus + 0x2212, // minus sign + 0xfe58, // small em dash + 0xfe63, // small hyphen-minus + 0xff0d, // fullwidth hyphen-minus }; for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) { - if (kHyphenPuncUnicodes[i] == ch) - return true; + if (kHyphenPuncUnicodes[i] == ch) return true; } return false; } @@ -53,19 +56,17 @@ bool is_hyphen_punc(const char32 ch) { bool is_single_quote(const char32 ch) { static const int kNumSingleQuoteUnicodes = 8; static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = { - '\'', - '`', - 0x2018, // left single quotation mark (English, others) - 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) - // We may have to introduce a comma set with 0x201a - 0x201B, // single high-reveresed-9 quotation mark (PropList.txt) - 0x2032, // prime - 0x300C, // left corner bracket (East Asian languages) - 0xFF07, // fullwidth apostrophe + '\'', '`', + 0x2018, // left single quotation mark (English, others) + 0x2019, // right single quotation mark (Danish, Finnish, Swedish, Norw.) 
+ // We may have to introduce a comma set with 0x201a + 0x201B, // single high-reversed-9 quotation mark (PropList.txt) + 0x2032, // prime + 0x300C, // left corner bracket (East Asian languages) + 0xFF07, // fullwidth apostrophe }; for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) { - if (kSingleQuoteUnicodes[i] == ch) - return true; + if (kSingleQuoteUnicodes[i] == ch) return true; } return false; } @@ -73,60 +74,130 @@ bool is_single_quote(const char32 ch) { bool is_double_quote(const char32 ch) { static const int kNumDoubleQuoteUnicodes = 8; static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = { - '"', - 0x201C, // left double quotation mark (English, others) - 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) - 0x201F, // double high-reversed-9 quotation mark (PropList.txt) - 0x2033, // double prime - 0x301D, // reversed double prime quotation mark (East Asian langs, horiz.) - 0x301E, // close double prime (East Asian languages written horizontally) - 0xFF02, // fullwidth quotation mark + '"', + 0x201C, // left double quotation mark (English, others) + 0x201D, // right double quotation mark (Danish, Finnish, Swedish, Norw.) + 0x201F, // double high-reversed-9 quotation mark (PropList.txt) + 0x2033, // double prime + 0x301D, // reversed double prime quotation mark (East Asian langs, + // horiz.)
+ 0x301E, // close double prime (East Asian languages written horizontally) + 0xFF02, // fullwidth quotation mark }; for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) { - if (kDoubleQuoteUnicodes[i] == ch) - return true; + if (kDoubleQuoteUnicodes[i] == ch) return true; } return false; } -STRING NormalizeUTF8String(bool decompose, const char* str8) { - GenericVector str32, out_str32, norm_str; - UTF8ToUTF32(str8, &str32); - for (int i = 0; i < str32.length(); ++i) { - norm_str.clear(); - NormalizeChar32(str32[i], decompose, &norm_str); - for (int j = 0; j < norm_str.length(); ++j) { - out_str32.push_back(norm_str[j]); - } - } - STRING out_str8; - UTF32ToUTF8(out_str32, &out_str8); - return out_str8; -} - -void NormalizeChar32(char32 ch, bool decompose, GenericVector* str) { +// Helper runs a standard unicode normalization, optional OCR normalization, +// and leaves the result as char32 for subsequent processing. +static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + const char* str8, + std::vector* normed32) { + // Convert to ICU string for unicode normalization. + icu::UnicodeString uch_str(str8, "UTF-8"); IcuErrorCode error_code; - const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance( - nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE, - error_code); + // Convert the enum to the new weird icu representation. + const char* norm_type = + u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC + ? "nfkc" + : "nfc"; + UNormalization2Mode compose = + u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC + ? UNORM2_COMPOSE + : UNORM2_DECOMPOSE; + // Pointer to singleton does not require deletion. 
+ const icu::Normalizer2* normalizer = + icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code); error_code.assertSuccess(); error_code.reset(); - - icu::UnicodeString uch_str(static_cast(ch)); - icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code); + icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code); error_code.assertSuccess(); + // Convert to char32 for output. OCR normalization if required. + normed32->reserve(norm_str.length()); // An approximation. + for (int offset = 0; offset < norm_str.length(); + offset = norm_str.moveIndex32(offset, 1)) { + char32 ch = norm_str.char32At(offset); + // Skip all ZWS, RTL and LTR marks. + if (Validator::IsZeroWidthMark(ch)) continue; + if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch); + normed32->push_back(ch); + } +} + +// Helper removes joiners from strings that contain no letters. +static void StripJoiners(std::vector* str32) { + for (char32 ch : *str32) { + if (u_isalpha(ch)) return; + } + int len = 0; + for (char32 ch : *str32) { + if (ch != Validator::kZeroWidthJoiner && + ch != Validator::kZeroWidthNonJoiner) { + (*str32)[len++] = ch; + } + } + str32->resize(len); +} + +// Normalizes a UTF8 string according to the given modes. Returns true on +// success. If false is returned, some failure or invalidity was present, and +// the result string is produced on a "best effort" basis. 
+bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + GraphemeNorm grapheme_normalize, const char* str8, + string* normalized) { + std::vector normed32; + NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32); + if (grapheme_normalize == GraphemeNorm::kNormalize) { + StripJoiners(&normed32); + std::vector> graphemes; + bool success = Validator::ValidateCleanAndSegment( + GraphemeNormMode::kSingleString, false, normed32, &graphemes); + if (graphemes.empty() || graphemes[0].empty()) { + success = false; + } else if (normalized != nullptr) { + *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]); + } + return success; + } + if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32); + return true; +} - str->clear(); - for (int i = 0; i < norm_str.length(); ++i) { - // If any spaces were added by NFKC, pretend normalization is a nop. - if (norm_str[i] == ' ') { - str->clear(); - str->push_back(ch); - break; - } else { - str->push_back(OCRNormalize(static_cast(norm_str[i]))); +// Normalizes a UTF8 string according to the given modes and splits into +// graphemes according to g_mode. Returns true on success. If false is returned, +// some failure or invalidity was present, and the result string is produced on +// a "best effort" basis. +bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + GraphemeNormMode g_mode, bool report_errors, + const char* str8, + std::vector* graphemes) { + std::vector normed32; + NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32); + StripJoiners(&normed32); + std::vector> graphemes32; + bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors, + normed32, &graphemes32); + if (g_mode != GraphemeNormMode::kSingleString && success) { + // If we modified the string to clean it up, the segmentation may not be + // correct, so check for changes and do it again. 
+ std::vector cleaned32; + for (const auto& g : graphemes32) { + cleaned32.insert(cleaned32.end(), g.begin(), g.end()); } + if (cleaned32 != normed32) { + graphemes32.clear(); + success = Validator::ValidateCleanAndSegment(g_mode, report_errors, + cleaned32, &graphemes32); + } + } + graphemes->clear(); + graphemes->reserve(graphemes32.size()); + for (const auto& grapheme : graphemes32) { + graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme)); } + return success; } // Apply just the OCR-specific normalizations and return the normalized char. diff --git a/training/normstrngs.h b/training/normstrngs.h index 27f36e0981..4934f3f107 100644 --- a/training/normstrngs.h +++ b/training/normstrngs.h @@ -21,34 +21,50 @@ #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_ #define TESSERACT_CCUTIL_NORMSTRNGS_H_ -#include "genericvector.h" -#include "strngs.h" +#include +#include -typedef signed int char32; +#include "validator.h" namespace tesseract { -// UTF-8 to UTF-32 conversion function. -void UTF8ToUTF32(const char* utf8_str, GenericVector* str32); - -// UTF-32 to UTF-8 convesion function. -void UTF32ToUTF8(const GenericVector& str32, STRING* utf8_str); - -// Normalize a single char32 using NFKC + OCR-specific transformations. -// NOTE that proper NFKC may require multiple characters as input. The -// assumption of this function is that the input is already as fully composed -// as it can be, but may require some compatibility normalizations or just -// OCR evaluation related normalizations. -void NormalizeChar32(char32 ch, bool decompose, GenericVector* str); - -// Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that -// can contain multiple UTF32 code points. -STRING NormalizeUTF8String(bool decompose, const char* str8); -// Default behavior is to compose, until it is proven that decomposed benefits -// at least one language. 
-inline STRING NormalizeUTF8String(const char* str8) { - return NormalizeUTF8String(false, str8); -} +// The standard unicode normalizations. +enum class UnicodeNormMode { + kNFD, + kNFC, + kNFKD, + kNFKC, +}; + +// To normalize away differences in punctuation that are ambiguous, like +// curly quotes and different widths of dash. +enum class OCRNorm { + kNone, + kNormalize, +}; + +// To validate and normalize away some subtle differences that can occur in +// Indic scripts, eg ensuring that an explicit virama is always followed by +// a zero-width non-joiner. +enum class GraphemeNorm { + kNone, + kNormalize, +}; + +// Normalizes a UTF8 string according to the given modes. Returns true on +// success. If false is returned, some failure or invalidity was present, and +// the result string is produced on a "best effort" basis. +bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + GraphemeNorm grapheme_normalize, const char* str8, + string* normalized); +// Normalizes a UTF8 string according to the given modes and splits into +// graphemes according to g_mode. Returns true on success. If false is returned, +// some failure or invalidity was present, and the result string is produced on +// a "best effort" basis. +bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize, + GraphemeNormMode g_mode, bool report_errors, + const char* str8, + std::vector* graphemes); // Applies just the OCR-specific normalizations and return the normalized char. char32 OCRNormalize(char32 ch); diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp index d16e919af8..9a72032964 100644 --- a/training/unicharset_training_utils.cpp +++ b/training/unicharset_training_utils.cpp @@ -122,8 +122,14 @@ void SetupBasicProperties(bool report_errors, bool decompose, } // Record normalized version of this unichar. 
- string normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str); - if (unichar_id != 0 && !normed_str.empty()) { + string normed_str; + if (unichar_id != 0 && + tesseract::NormalizeUTF8String( + decompose ? tesseract::UnicodeNormMode::kNFKD + : tesseract::UnicodeNormMode::kNFKC, + tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone, + unichar_str, &normed_str) && + !normed_str.empty()) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { unicharset->set_normed(unichar_id, unichar_str); diff --git a/training/validate_grapheme.cpp b/training/validate_grapheme.cpp new file mode 100644 index 0000000000..aa2f08fe7b --- /dev/null +++ b/training/validate_grapheme.cpp @@ -0,0 +1,174 @@ +#include "validate_grapheme.h" +#include "tprintf.h" +#include "unicode/uchar.h" // From libicu + +namespace tesseract { + +bool ValidateGrapheme::ConsumeGraphemeIfValid() { + int num_codes = codes_.size(); + char32 prev_prev_ch = ' '; + char32 prev_ch = ' '; + CharClass prev_cc = CharClass::kWhitespace; + int num_codes_in_grapheme = 0; + while (codes_used_ < num_codes) { + CharClass cc = codes_[codes_used_].first; + char32 ch = codes_[codes_used_].second; + const bool is_combiner = + cc == CharClass::kCombiner || cc == CharClass::kVirama; + // Reject easily detected badly formed sequences. 
+ if (prev_cc == CharClass::kWhitespace && is_combiner) { + if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch); + return false; + } + if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) { + if (report_errors_) + tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch); + return false; + } + if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace && + IsBadlyFormed(prev_ch, ch)) { + return false; + } + bool prev_is_fwd_combiner = + prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama || + (prev_ch == kZeroWidthNonJoiner && + (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner)); + if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner) + break; + CodeOnlyToOutput(); + ++num_codes_in_grapheme; + prev_prev_ch = prev_ch; + prev_ch = ch; + prev_cc = cc; + } + if (num_codes_in_grapheme > 0) MultiCodePart(num_codes_in_grapheme); + return true; +} + +Validator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const { + if (IsVedicAccent(ch)) return CharClass::kVedicMark; + // The ZeroWidth[Non]Joiner characters are mapped to kCombiner as they + // always combine with the previous character. + if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) return CharClass::kVirama; + if (u_isUWhiteSpace(ch)) return CharClass::kWhitespace; + int char_type = u_charType(ch); + if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK || + char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner || + ch == kZeroWidthJoiner) + return CharClass::kCombiner; + return CharClass::kOther; +} + +// Helper returns true if the sequence prev_ch,ch is invalid. +bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) { + // Reject badly formed Indic vowels. 
+ if (IsBadlyFormedIndicVowel(prev_ch, ch)) { + if (report_errors_) + tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch); + return true; + } + if (IsBadlyFormedThai(prev_ch, ch)) { + if (report_errors_) tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch); + return true; + } + return false; +} + +// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. +// Some vowels in Indic scripts may be analytically decomposed into atomic pairs +// of components that are themselves valid unicode symbols. (See Table 12-1 in +// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf +// for examples in Devanagari). The Unicode standard discourages specifying +// vowels this way, but they are sometimes encountered in text, probably because +// some editors still permit it. Renderers however dislike such pairs, and so +// this function may be used to detect their occurrence for removal. +// TODO(rays) This function only covers a subset of Indic languages and doesn't +// include all rules. Add rules as appropriate to support other languages or +// find a way to generalize these existing rules that makes use of the +// regularity of the mapping from ISCII to Unicode. +/* static */ +bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) { + return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) || + (prev_ch == 0x909 && ch == 0x941) || + (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) || + (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) || + (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) || + // Illegal combinations of two dependent Devanagari vowels. + (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) || + // Dependent Devanagari vowels following a virama.
+ (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) || + // Bengali vowels (Table 9-5, pg 313) + (prev_ch == 0x985 && ch == 0x9BE) || + // Telugu vowels (Table 9-19, pg 331) + (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) || + // Kannada vowels (Table 9-20, pg 332) + (prev_ch == 0xC92 && ch == 0xCCC)); +} + +// Helper returns true if ch is a Thai consonant. +static bool IsThaiConsonant(char32 ch) { return 0xe01 <= ch && ch <= 0xe2e; } + +// Helper returns true if ch is a Thai before-consonant vowel. +static bool IsThaiBeforeConsonantVowel(char32 ch) { + return 0xe40 <= ch && ch <= 0xe44; +} + +// Helper returns true if ch is a Thai tone mark. +static bool IsThaiToneMark(char32 ch) { return 0xe48 <= ch && ch <= 0xe4b; } + +// Helper returns true if ch is a Thai vowel that may be followed by a tone +// mark. +static bool IsThaiTonableVowel(char32 ch) { + return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31; +} + +// Helper returns true if the sequence prev_ch,ch is invalid Thai. +// These rules come from a native Thai speaker, and are not covered by the +// Thai section in the unicode book: +// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf +// Comments below added by Ray interpreting the code ranges. +/* static */ +bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) { + // Tone marks must follow consonants or specific vowels. + if (IsThaiToneMark(ch) && + !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) { + return true; + } + // Tonable vowels must follow consonants. + if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) { + return true; + } + // Thanthakhat must follow consonant or specific vowels. + if (ch == 0xe4c && + !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) { + return true; + } + // Nikkhahit must follow a consonant ?or certain markers?.
+ // TODO(rays) confirm this, but there were so many in the ground truth of the + // validation set that it seems reasonable to assume it is valid. + if (ch == 0xe4d && + !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) { + return true; + } + // The vowels e30, e32, e33 can be used more liberally. + if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) && + !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) && + !(prev_ch == 0xe32 && ch == 0xe30) && + !(prev_ch == 0xe4d && ch == 0xe32)) { + return true; + } + // Some vowels come before consonants, and therefore cannot follow things + // that cannot end a syllable. + if (IsThaiBeforeConsonantVowel(ch) && + (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 || + prev_ch == 0xe37)) { + return true; + } + // Don't allow the standalone vowel U+0e24 to be followed by other vowels. + if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) { + return true; + } + return false; +} + +} // namespace tesseract diff --git a/training/validate_grapheme.h b/training/validate_grapheme.h new file mode 100644 index 0000000000..138ad57075 --- /dev/null +++ b/training/validate_grapheme.h @@ -0,0 +1,35 @@ +#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ +#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments generic unicode into +// grapheme clusters, including Latin with diacritics. +class ValidateGrapheme : public Validator { + public: + ValidateGrapheme(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateGrapheme() {} + + protected: + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch.
+ CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper returns true if the sequence prev_ch,ch is invalid. + bool IsBadlyFormed(char32 prev_ch, char32 ch); + // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel. + static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch); + // Helper returns true if the sequence prev_ch,ch is invalid Thai. + static bool IsBadlyFormedThai(char32 prev_ch, char32 ch); +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_ diff --git a/training/validate_indic.cpp b/training/validate_indic.cpp new file mode 100644 index 0000000000..0ff769c80f --- /dev/null +++ b/training/validate_indic.cpp @@ -0,0 +1,274 @@ +#include "validate_indic.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for an Indic Grapheme. +// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf +// has a BNF for valid syllables (Graphemes) which is modified slightly +// for Unicode. Notably U+200C and U+200D are used before/after the +// virama/virama to express explicit or soft viramas. +// Also the unicode v.9 Malayalam entry states that CZHC can be used in several +// Indic languages to request traditional ligatures, and CzHC is Malayalam- +// specific for requesting open conjuncts. +// +// + vowel Grapheme: V[D](v)* +// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)* +bool ValidateIndic::ConsumeGraphemeIfValid() { + switch (codes_[codes_used_].first) { + case CharClass::kConsonant: + return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid(); + case CharClass::kVowel: + return ConsumeVowelIfValid(); + case CharClass::kZeroWidthJoiner: + case CharClass::kZeroWidthNonJoiner: + // Apart from within an aksara, joiners are silently dropped. 
+ if (report_errors_) + tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second); + ++codes_used_; + return true; + case CharClass::kOther: + UseMultiCode(1); + return true; + default: + if (report_errors_) { + tprintf("Invalid start of grapheme sequence:%c=0x%x\n", + codes_[codes_used_].first, codes_[codes_used_].second); + } + return false; + } +} + +Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const { + if (IsVedicAccent(ch)) return CharClass::kVedicMark; + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int base = static_cast(script_); + int off = ch - base; + // Anything in another code block is other. + if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + // Exception for Tamil. The aytham character is considered a letter. + if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel; + if (off < 0x4) return CharClass::kVowelModifier; + if (script_ == ViramaScript::kSinhala) { + // Sinhala is an exception. + if (off <= 0x19) return CharClass::kVowel; + if (off <= 0x49) return CharClass::kConsonant; + if (off == 0x4a) return CharClass::kVirama; + if (off <= 0x5f) return CharClass::kMatra; + } else { + if (off <= 0x14 || off == 0x50) return CharClass::kVowel; + if (off <= 0x3b || (0x58 <= off && off <= 0x5f)) + return CharClass::kConsonant; + // Sinhala doesn't have Nukta or Avagraha. + if (off == 0x3c) return CharClass::kNukta; + if (off == 0x3d) return CharClass::kVowel; + if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra; + if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece; + if (off == 0x4d) return CharClass::kVirama; + } + if (off == 0x60 || off == 0x61) return CharClass::kVowel; + if (off == 0x62 || off == 0x63) return CharClass::kMatra; + // Danda and digits up to 6f are OK as other. 
+ // 70-7f are script-specific. + if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71)) + return CharClass::kConsonant; + if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73)) + return CharClass::kConsonant; + if (script_ == ViramaScript::kSinhala && off == 0x70) + return CharClass::kConsonant; + if (script_ == ViramaScript::kDevanagari && off == 0x70) + return CharClass::kOther; + if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier; + // Non Indic, Digits, Measures, danda, etc. + return CharClass::kOther; +} + +// Helper consumes/copies a virama and any associated post-virama joiners. +// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or +// no joiner at all) must be followed by a consonant. +// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non +// consonant, space, or character from a different script. We clean up the +// representation to make it consistent by adding a ZWNJ if missing from a +// non-linking virama. Returns false with an invalid sequence. +bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) { + int num_codes = codes_.size(); + if (joiner.first == CharClass::kOther) { + CodeOnlyToOutput(); + if (codes_used_ < num_codes && + codes_[codes_used_].second == kZeroWidthJoiner) { + // Post-matra viramas must be explicit, so no joiners allowed here. + if (post_matra) { + if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n"); + return false; + } + if (codes_used_ + 1 < num_codes && + codes_[codes_used_ - 2].second != kRayana && + (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner || + codes_[codes_used_ + 1].second == kYayana || + codes_[codes_used_ + 1].second == kRayana)) { + // This combination will be picked up later. + ASSERT_HOST(!CodeOnlyToOutput()); + } else { + // Half-form with optional Nukta. 
+ int len = output_.size() + 1 - output_used_; + if (UseMultiCode(len)) return true; + } + if (codes_used_ < num_codes && + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (output_used_ == output_.size() || + output_[output_used_] != kRayana) { + if (report_errors_) { + tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n", + static_cast(script_)); + } + return false; + } + // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z] + if (UseMultiCode(4)) return true; + } + } else if (codes_used_ == num_codes || + codes_[codes_used_].first != CharClass::kConsonant || + post_matra) { + if (codes_used_ == num_codes || + codes_[codes_used_].second != kZeroWidthNonJoiner) { + // It is valid to have an unterminated virama at the end of a word, but + // for consistency, we will always add ZWNJ if not present. + output_.push_back(kZeroWidthNonJoiner); + } else { + CodeOnlyToOutput(); + } + // Explicit virama [H z] + MultiCodePart(2); + } + } else { + // Pre-virama joiner [{Z|z} H] requests specific conjunct. + if (UseMultiCode(2)) { + if (report_errors_) + tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n"); + return false; + } + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (report_errors_) { + tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(), + codes_[codes_used_].second); + } + return false; + } + } + // It is good so far as it goes. + return true; +} + +// Helper consumes/copies a series of consonants separated by viramas while +// valid, but not any vowel or other modifiers. +bool ValidateIndic::ConsumeConsonantHeadIfValid() { + const int num_codes = codes_.size(); + // Consonant aksara + do { + CodeOnlyToOutput(); + // Special Sinhala case of [H Z Yayana/Rayana]. 
+ int index = output_.size() - 3; + if (output_used_ <= index && + (output_.back() == kYayana || output_.back() == kRayana) && + IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) { + MultiCodePart(3); + } + bool have_nukta = false; + if (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kNukta) { + have_nukta = true; + CodeOnlyToOutput(); + } + // Test for subscript conjunct. + index = output_.size() - 2 - have_nukta; + if (output_used_ <= index && IsSubscriptScript() && + IsVirama(output_[index])) { + // Output previous virama, consonant + optional nukta. + MultiCodePart(2 + have_nukta); + } + IndicPair joiner(CharClass::kOther, 0); + if (codes_used_ < num_codes && + (codes_[codes_used_].second == kZeroWidthJoiner || + (codes_[codes_used_].second == kZeroWidthNonJoiner && + script_ == ViramaScript::kMalayalam))) { + joiner = codes_[codes_used_]; + if (++codes_used_ == num_codes) { + if (report_errors_) { + tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(), + joiner.second); + } + return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + output_.push_back(joiner.second); + } else { + if (report_errors_) { + tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n", + output_.back(), joiner.second, codes_[codes_used_].second); + } + joiner = std::make_pair(CharClass::kOther, 0); + } + } + if (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(joiner, false)) return false; + } else { + break; // No virama, so the run of consonants is over. + } + } while (codes_used_ < num_codes && + codes_[codes_used_].first == CharClass::kConsonant); + if (output_used_ < output_.size()) MultiCodePart(1); + return true; +} + +// Helper consumes/copies a tail part of a consonant, comprising optional +// matra/piece, vowel modifier, vedic mark, terminating virama. 
+bool ValidateIndic::ConsumeConsonantTailIfValid() { + if (codes_used_ == codes_.size()) return true; + // No virama: Finish the grapheme. + // Accepts one matra, optionally followed by one matra piece. Are multiple matras allowed? + if (codes_[codes_used_].first == CharClass::kMatra) { + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].first == CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + } + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + // Modifiers may repeat only in Malayalam, and only after 0xd02. + if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break; + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) return true; + } + if (codes_[codes_used_].first == CharClass::kVirama) { + if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) { + return false; + } + } + // What we have consumed so far is a valid consonant cluster. + if (output_used_ < output_.size()) MultiCodePart(1); + + return true; +} + +// Helper consumes/copies a vowel and optional modifiers. Always returns true. +bool ValidateIndic::ConsumeVowelIfValid() { + if (UseMultiCode(1)) return true; + while (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + // Only Malayalam gets repeated modifiers here; other scripts stop after one. (Is that right?) + if (script_ != ViramaScript::kMalayalam) break; + } + while (codes_[codes_used_].first == CharClass::kVedicMark) { + if (UseMultiCode(1)) return true; + } + // What we have consumed so far is a valid vowel cluster.
+ return true; +} + +} // namespace tesseract diff --git a/training/validate_indic.h b/training/validate_indic.h new file mode 100644 index 0000000000..62dbcb23d1 --- /dev/null +++ b/training/validate_indic.h @@ -0,0 +1,44 @@ +#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_ +#define TESSERACT_TRAINING_VALIDATE_INDIC_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Indic scripts in the +// unicode range 0x900-0xdff (Devanagari-Sinhala). +class ValidateIndic : public Validator { + public: + ValidateIndic(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateIndic() {} + + protected: + // Returns whether codes matches the pattern for an Indic Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + Validator::CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper consumes/copies a virama and any associated post-virama joiners. + bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra); + // Helper consumes/copies a series of consonants separated by viramas while + // valid, but not any vowel or other modifiers. + bool ConsumeConsonantHeadIfValid(); + // Helper consumes/copies a tail part of a consonant, comprising optional + // matra/piece, vowel modifier, vedic mark, terminating virama. + bool ConsumeConsonantTailIfValid(); + // Helper consumes/copies a vowel and optional modifiers. + bool ConsumeVowelIfValid(); + + // Some special unicodes used only for Indic processing.
+ static const char32 kYayana = 0xdba; // Sinhala Ya + static const char32 kRayana = 0xdbb; // Sinhala Ra +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_INDIC_H_ diff --git a/training/validate_khmer.cpp b/training/validate_khmer.cpp new file mode 100644 index 0000000000..45c8f061de --- /dev/null +++ b/training/validate_khmer.cpp @@ -0,0 +1,106 @@ +#include "validate_khmer.h" +#include "errcode.h" +#include "tprintf.h" + +namespace tesseract { + +// Returns whether codes matches the pattern for a Khmer Grapheme. +// Taken from unicode standard: +// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf. +// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation +// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf. +// Translated to the codes used by the CharClass enum: +// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC} +// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter. +// Also the Consonant class here includes independent vowels, as they are +// treated the same anyway. +// In the split grapheme mode, the only characters that get grouped are the +// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in +// the BNF syntax, so their intended effect is unclear.
+bool ValidateKhmer::ConsumeGraphemeIfValid() { + int num_codes = codes_.size(); + if (codes_used_ == num_codes) return false; + if (codes_[codes_used_].first == CharClass::kOther) { + UseMultiCode(1); + return true; + } + if (codes_[codes_used_].first != CharClass::kConsonant) { + if (report_errors_) { + tprintf("Invalid start of Khmer syllable:0x%x\n", + codes_[codes_used_].second); + } + return false; + } + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].first == CharClass::kRobat || + codes_[codes_used_].first == CharClass::kNukta) { + if (UseMultiCode(1)) return true; + } + while (codes_used_ + 1 < num_codes && + codes_[codes_used_].first == CharClass::kVirama && + codes_[codes_used_ + 1].first == CharClass::kConsonant) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + if (codes_[codes_used_].first == CharClass::kRobat) { + if (UseMultiCode(1)) return true; + } + } + int num_matra_parts = 0; + if (codes_[codes_used_].second == kZeroWidthJoiner || + codes_[codes_used_].second == kZeroWidthNonJoiner) { + if (CodeOnlyToOutput()) { + if (report_errors_) { + tprintf("Unterminated joiner: 0x%x\n", output_.back()); + } + return false; + } + ++num_matra_parts; + } + // Not quite as shown by the BNF, the matra piece is allowed as a matra on its + // own or as an addition to other matras.
+ if (codes_[codes_used_].first == CharClass::kMatra || + codes_[codes_used_].first == CharClass::kMatraPiece) { + ++num_matra_parts; + if (UseMultiCode(num_matra_parts)) return true; + } else if (num_matra_parts) { + if (report_errors_) { + tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n", + output_.back(), codes_[codes_used_].second); + } + return false; + } + if (codes_[codes_used_].first == CharClass::kMatraPiece && + codes_[codes_used_ - 1].first != CharClass::kMatraPiece) { + if (UseMultiCode(1)) return true; + } + if (codes_[codes_used_].first == CharClass::kVowelModifier) { + if (UseMultiCode(1)) return true; + } + if (codes_used_ + 1 < num_codes && + codes_[codes_used_].first == CharClass::kVirama && + codes_[codes_used_ + 1].first == CharClass::kConsonant) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + } + return true; +} + +Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const { + if (IsVedicAccent(ch)) return CharClass::kVedicMark; + if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner; + if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner; + // Offset from the start of the relevant unicode code block aka code page. + int off = ch - static_cast(script_); + // Anything in another code block is other.
+ if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther; + if (off <= 0x33) return CharClass::kConsonant; + if (off <= 0x45) return CharClass::kMatra; + if (off == 0x46) return CharClass::kMatraPiece; + if (off == 0x4c) return CharClass::kRobat; + if (off == 0x49 || off == 0x4a) return CharClass::kNukta; + if (off <= 0x51) return CharClass::kVowelModifier; + if (off == 0x52) return CharClass::kVirama; + return CharClass::kOther; +} + +} // namespace tesseract diff --git a/training/validate_khmer.h b/training/validate_khmer.h new file mode 100644 index 0000000000..a2fe75c962 --- /dev/null +++ b/training/validate_khmer.h @@ -0,0 +1,27 @@ +#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_ +#define TESSERACT_TRAINING_VALIDATE_KHMER_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Khmer. +class ValidateKhmer : public Validator { + public: + ValidateKhmer(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateKhmer() {} + + protected: + // Returns whether codes matches the pattern for a Khmer Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_. + bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch.
+ CharClass UnicodeToCharClass(char32 ch) const override; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_KHMER_H_ diff --git a/training/validate_myanmar.cpp b/training/validate_myanmar.cpp new file mode 100644 index 0000000000..4493469023 --- /dev/null +++ b/training/validate_myanmar.cpp @@ -0,0 +1,160 @@ +#include "validate_myanmar.h" +#include "errcode.h" +#include "icuerrorcode.h" +#include "tprintf.h" +#include "unicode/uchar.h" // From libicu +#include "unicode/uscript.h" // From libicu + +namespace tesseract { + +// Returns whether codes matches the pattern for a Myanmar Grapheme. +// Taken directly from the unicode table 16-3. +// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf +bool ValidateMyanmar::ConsumeGraphemeIfValid() { + int num_codes = codes_.size(); + if (codes_used_ == num_codes) return true; + // Other. + if (IsMyanmarOther(codes_[codes_used_].second)) { + UseMultiCode(1); + return true; + } + // Kinzi: 0x1004 followed by asat and virama, grouped into a single part. + if (codes_used_ + 2 < num_codes && codes_[codes_used_].second == 0x1004 && + codes_[codes_used_ + 1].second == kMyanmarAsat && + codes_[codes_used_ + 2].second == kMyanmarVirama) { + ASSERT_HOST(!CodeOnlyToOutput()); + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(3)) return true; + } + // Base consonant/vowel. NOTE that since everything in Myanmar appears to be + // optional, except the base, this is the only place where invalid input can + // be detected and false returned. + if (IsMyanmarLetter(codes_[codes_used_].second)) { + if (UseMultiCode(1)) return true; + } else { + if (report_errors_) { + tprintf("Invalid start of Myanmar syllable:0x%x\n", + codes_[codes_used_].second); + } + return false; // One of these is required. + } + if (ConsumeSubscriptIfPresent()) return true; + ConsumeOptionalSignsIfPresent(); + // What we have consumed so far is a valid syllable.
+ return true; +} + +// TODO(rays) Doesn't use intermediate coding like the other scripts, as there +// is little correspondence between the content of table 16-3 and the char +// classes of the Indic languages. (Experts may disagree and improve!) +// In unicode table 16-3 there is basically a long list of optional characters, +// which can be coded quite easily. +// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!! +// The table also allows sequences that still result in dotted circles!! +// So with a lot of guesswork the rest have been added in a reasonable place. +Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const { + if (IsMyanmarLetter(ch)) return CharClass::kConsonant; + return CharClass::kOther; +} + +// Helper consumes/copies a virama and any subscript consonant. +// Returns true if the end of input is reached. +bool ValidateMyanmar::ConsumeSubscriptIfPresent() { + // Subscript consonant. It appears there can be only one. + int num_codes = codes_.size(); + if (codes_used_ + 1 < num_codes && + codes_[codes_used_].second == kMyanmarVirama) { + if (IsMyanmarLetter(codes_[codes_used_ + 1].second)) { + ASSERT_HOST(!CodeOnlyToOutput()); + if (UseMultiCode(2)) return true; + } + } + return false; +} + +// Helper consumes/copies a series of optional signs. +// Returns true if the end of input is reached. +bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() { + // The following characters are allowed, all optional, and in sequence. + // An exception is kMyanmarMedialYa, which may be followed by kMyanmarAsat. + const std::vector kMedials({kMyanmarAsat, kMyanmarMedialYa, 0x103c, + 0x103d, 0x103e, 0x105e, 0x105f, 0x1060, + 0x1081, 0x1031}); + for (char32 ch : kMedials) { + if (codes_[codes_used_].second == ch) { + if (UseMultiCode(1)) return true; + if (ch == kMyanmarMedialYa && + codes_[codes_used_].second == kMyanmarAsat) { + if (UseMultiCode(1)) return true; + } + } + } + // Vowel sign i, ii, ai.
+ char32 ch = codes_[codes_used_].second; + if (ch == 0x102d || ch == 0x102e || ch == 0x1032) { + if (UseMultiCode(1)) return true; + } + // Vowel sign u, uu, and extensions. + ch = codes_[codes_used_].second; + if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) || + ch == 0x1062 || ch == 0x1067 || ch == 0x1068 || + (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) || + ch == 0x109c || ch == 0x109d) { + if (UseMultiCode(1)) return true; + } + // Tall aa, aa with optional asat. + if (codes_[codes_used_].second == 0x102b || + codes_[codes_used_].second == 0x102c) { + if (UseMultiCode(1)) return true; + if (codes_[codes_used_].second == kMyanmarAsat) { + if (UseMultiCode(1)) return true; + } + } + // The following characters are allowed, all optional, and in sequence. (The loop variable below shadows the function-level ch.) + const std::vector kSigns({0x1036, 0x1037}); + for (char32 ch : kSigns) { + if (codes_[codes_used_].second == ch) { + if (UseMultiCode(1)) return true; + } + } + // Tone mark extensions. + ch = codes_[codes_used_].second; + if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 || + (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) || + ch == 0x108f || ch == 0x109a || ch == 0x109b || + (0xaa7b <= ch && ch <= 0xaa7d)) { + if (UseMultiCode(1)) return true; + } + return false; +} + +// Returns true if the unicode is a Myanmar "letter" including consonants +// and independent vowels. Although table 16-3 distinguishes between some +// base consonants and vowels, the extensions make no such distinction, so we +// put them all into a single bucket.
+/* static */ +bool ValidateMyanmar::IsMyanmarLetter(char32 ch) { + return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f || + (0x1050 <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) || + ch == 0x1061 || ch == 0x1065 || ch == 0x1066 || + (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1080) || + ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9ef) || + (0xa9fa <= ch && ch <= 0xa9ff) || (0xaa60 <= ch && ch <= 0xaa73) || + ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f; +} + +// Returns true if ch is a Myanmar digit or other symbol that does not take +// part in being a syllable. Any character that is neither in the Myanmar script nor a ZWJ/ZWNJ also counts as other. +/* static */ +bool ValidateMyanmar::IsMyanmarOther(char32 ch) { + IcuErrorCode err; + UScriptCode script_code = uscript_getScript(ch, err); + if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner && + ch != Validator::kZeroWidthNonJoiner) + return true; + return (0x1040 <= ch && ch <= 0x1049) || (0x1090 <= ch && ch <= 0x1099) || + (0x109c <= ch && ch <= 0x109d) || (0xa9f0 <= ch && ch <= 0xa9f9) || + (0xaa74 <= ch && ch <= 0xaa79); +} + +} // namespace tesseract diff --git a/training/validate_myanmar.h b/training/validate_myanmar.h new file mode 100644 index 0000000000..d2ada74505 --- /dev/null +++ b/training/validate_myanmar.h @@ -0,0 +1,47 @@ +#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ +#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ + +#include "validator.h" + +namespace tesseract { + +// Subclass of Validator that validates and segments Myanmar. +class ValidateMyanmar : public Validator { + public: + ValidateMyanmar(ViramaScript script, bool report_errors) + : Validator(script, report_errors) {} + ~ValidateMyanmar() {} + + protected: + // Returns whether codes matches the pattern for a Myanmar Grapheme. + // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to + // parts_ and output_. Returns true if a valid Grapheme was consumed, + // otherwise does not increment codes_used_.
+ bool ConsumeGraphemeIfValid() override; + // Returns the CharClass corresponding to the given Unicode ch. + Validator::CharClass UnicodeToCharClass(char32 ch) const override; + + private: + // Helper consumes/copies a virama and any subscript consonant. + // Returns true if the end of input is reached. + bool ConsumeSubscriptIfPresent(); + // Helper consumes/copies a series of optional signs. + // Returns true if the end of input is reached. + bool ConsumeOptionalSignsIfPresent(); + // Returns true if the unicode is a Myanmar "letter" including consonants + // and independent vowels. Although table 16-3 distinguishes between some + // base consonants and vowels, the extensions make no such distinction, so we + // put them all into a single bucket. + static bool IsMyanmarLetter(char32 ch); + // Returns true if ch is a Myanmar digit or other symbol that does not take + // part in being a syllable. Any character that is neither in the Myanmar script nor a ZWJ/ZWNJ also counts as other. + static bool IsMyanmarOther(char32 ch); + + // Some special unicodes used only for Myanmar processing. + static const char32 kMyanmarAsat = 0x103a; + static const char32 kMyanmarMedialYa = 0x103b; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_ diff --git a/training/validator.cpp b/training/validator.cpp new file mode 100644 index 0000000000..54ec4500e6 --- /dev/null +++ b/training/validator.cpp @@ -0,0 +1,205 @@ +#include "validator.h" + +#include +#include +#include + +#include "icuerrorcode.h" +#include "unicode/uchar.h" // From libicu +#include "unicode/uscript.h" // From libicu +#include "validate_grapheme.h" +#include "validate_indic.h" +#include "validate_khmer.h" +#include "validate_myanmar.h" + +namespace tesseract { + +// Some specific but universally useful unicodes.
+const char32 Validator::kZeroWidthSpace = 0x200B; +const char32 Validator::kZeroWidthNonJoiner = 0x200C; +const char32 Validator::kZeroWidthJoiner = 0x200D; +const char32 Validator::kLeftToRightMark = 0x200E; +const char32 Validator::kRightToLeftMark = 0x200F; +const char32 Validator::kInvalid = 0xfffd; + +// Validates and cleans the src vector of unicodes to the *dest, according to +// g_mode. In the case of kSingleString, a single vector containing the whole +// result is added to *dest. With kCombined, multiple vectors are added to +// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are +// added to *dest with a smaller unit representing a glyph in each. With kIndividualUnicodes, each unicode is added to *dest as its own vector. +// In case of validation error, returns false and as much as possible of the +// input, without discarding invalid text. +/* static */ +bool Validator::ValidateCleanAndSegment( + GraphemeNormMode g_mode, bool report_errors, const std::vector& src, + std::vector>* dest) { + ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors); + std::vector> graphemes; + ViramaScript script = MostFrequentViramaScript(src); + bool success = true; + if (script == ViramaScript::kNonVirama) { + // The grapheme segmenter's maximum segmentation is the grapheme unit, so + // up the mode by 1 to get the desired effect. + if (g_mode == GraphemeNormMode::kCombined) + g_mode = GraphemeNormMode::kGlyphSplit; + else if (g_mode == GraphemeNormMode::kGlyphSplit) + g_mode = GraphemeNormMode::kIndividualUnicodes; + // Just do grapheme segmentation.
+ success = g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest); + } else { + success = g_validator.ValidateCleanAndSegmentInternal( + GraphemeNormMode::kGlyphSplit, src, &graphemes); + std::unique_ptr validator( + ScriptValidator(script, report_errors)); + for (const auto& grapheme : graphemes) { + if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) { + success = false; + } + } + } + return success; +} + +// Factory method that understands how to map script to the right subclass. Any script without its own case gets a ValidateIndic. +std::unique_ptr Validator::ScriptValidator(ViramaScript script, + bool report_errors) { + switch (script) { + case ViramaScript::kNonVirama: + return std::unique_ptr( + new ValidateGrapheme(script, report_errors)); + case ViramaScript::kMyanmar: + return std::unique_ptr( + new ValidateMyanmar(script, report_errors)); + case ViramaScript::kKhmer: + return std::unique_ptr( + new ValidateKhmer(script, report_errors)); + default: + return std::unique_ptr( + new ValidateIndic(script, report_errors)); + } +} + +// Internal version of the public static ValidateCleanAndSegment. +// Validates and cleans the src vector of unicodes to the *dest, according to +// its type and the given g_mode. +// In case of validation error, returns false and returns as much as possible +// of the input, without discarding invalid text. NOTE(review): a code rejected by ConsumeGraphemeIfValid is stepped over below without being copied to output_ here - confirm this matches "without discarding". +bool Validator::ValidateCleanAndSegmentInternal( + GraphemeNormMode g_mode, const std::vector& src, + std::vector>* dest) { + Clear(); + ComputeClassCodes(src); + bool success = true; + for (codes_used_ = 0; codes_used_ < codes_.size();) { + if (!ConsumeGraphemeIfValid()) { + success = false; + ++codes_used_; + } + } + MoveResultsToDest(g_mode, dest); + return success; +} + +// Moves the results from parts_ or output_ to dest according to g_mode.
+void Validator::MoveResultsToDest(GraphemeNormMode g_mode, + std::vector>* dest) { + if (g_mode == GraphemeNormMode::kIndividualUnicodes) { + // Append each element of the combined output_ that we made as a new vector + // in dest. + dest->reserve(dest->size() + output_.size()); + for (char32 ch : output_) dest->push_back({ch}); + } else if (g_mode == GraphemeNormMode::kGlyphSplit) { + // Append all the parts_ that we made onto dest. + std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest)); + } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) { + // Append the combined output_ that we made onto dest as one new vector. + dest->push_back(std::vector()); + output_.swap(dest->back()); + } else { // kSingleString with a non-empty dest. + // Append the combined output_ that we made onto the last existing element + // of dest. + dest->back().insert(dest->back().end(), output_.begin(), output_.end()); + } +} + +bool CmpPairSecond(const std::pair& p1, + const std::pair& p2) { + return p1.second < p2.second; +} + +// Computes and returns the ViramaScript corresponding to the most frequent +// virama-using script in the input, or kNonVirama if none are present. +/* static */ +ViramaScript Validator::MostFrequentViramaScript( + const std::vector& utf32) { + std::unordered_map histogram; + for (char32 ch : utf32) { + // Determine the codepage base. For the Indic scripts, and Khmer, it is + // sufficient to divide by kIndicCodePageSize but Myanmar is all over the + // unicode code space, so use its script id.
+ int base = ch / kIndicCodePageSize; + IcuErrorCode err; + UScriptCode script_code = uscript_getScript(ch, err); + if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode && + script_code != USCRIPT_COMMON) || + script_code == USCRIPT_MYANMAR) { + if (script_code == USCRIPT_MYANMAR) + base = static_cast(ViramaScript::kMyanmar) / kIndicCodePageSize; + ++histogram[base]; + } + } + if (!histogram.empty()) { + int base = + std::max_element(histogram.begin(), histogram.end(), CmpPairSecond) + ->first; + char32 codebase = static_cast(base * kIndicCodePageSize); + // Check that the winning base is one of the known virama scripts. + if (codebase == static_cast(ViramaScript::kMyanmar) || + codebase == static_cast(ViramaScript::kKhmer) || + (static_cast(ViramaScript::kDevanagari) <= codebase && + codebase <= static_cast(ViramaScript::kSinhala))) { + return static_cast(codebase); + } + } + return ViramaScript::kNonVirama; +} + +// Returns true if the given UTF-32 unicode is a "virama" character. +/* static */ +bool Validator::IsVirama(char32 unicode) { + return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode && + (unicode & 0x7f) == 0x4d) || + unicode == kSinhalaVirama || unicode == kMyanmarVirama || + unicode == kKhmerVirama; +} + +// Returns true if the given UTF-32 unicode is a vedic accent, i.e. lies in [0x1cd0, 0x1d00). +/* static */ +bool Validator::IsVedicAccent(char32 unicode) { + return 0x1cd0 <= unicode && unicode < 0x1d00; +} + +// Returns true if the script is one that uses subscripts for conjuncts. +bool Validator::IsSubscriptScript() const { + return script_ == ViramaScript::kTelugu || + script_ == ViramaScript::kKannada || + script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer; +} + +void Validator::ComputeClassCodes(const std::vector& text) { + codes_.reserve(text.size()); + for (char32 c : text) { + codes_.push_back(std::make_pair(UnicodeToCharClass(c), c)); + } +} + +// Resets to the initial state.
+void Validator::Clear() { + codes_.clear(); + parts_.clear(); + output_.clear(); + codes_used_ = 0; + output_used_ = 0; +} + +} // namespace tesseract diff --git a/training/validator.h b/training/validator.h new file mode 100644 index 0000000000..6b21daa911 --- /dev/null +++ b/training/validator.h @@ -0,0 +1,243 @@ +/********************************************************************** + * File: validator.h + * Description: Base class for various text validators. Intended mainly for + * scripts that use a virama character. + * Author: Ray Smith + * Created: Tue May 23 2017 + * + * (C) Copyright 2017, Google Inc. + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * http://www.apache.org/licenses/LICENSE-2.0 + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **********************************************************************/ + +#ifndef TESSERACT_TRAINING_VALIDATOR_H_ +#define TESSERACT_TRAINING_VALIDATOR_H_ + +#include +#include +#include "unichar.h" + +namespace tesseract { + +// Different kinds of grapheme normalization - not just for Indic! +// A grapheme is a syllable unit in Indic and can be several unicodes. +// In other scripts, a grapheme is a base character and accent/diacritic +// combination, as not all accented characters have a single composed form. +enum class GraphemeNormMode { + // Validation result is a single string, even if input is multi-word. + kSingleString, + // Standard unicode graphemes are validated and output as grapheme units. + kCombined, + // Graphemes are validated and sub-divided. 
For virama-using scripts, units + // that correspond to repeatable glyphs are generated. (Mostly single unicodes + // but viramas and joiners are paired with the most sensible neighbor.) + // For non-virama scripts, this means that base/accent pairs are separated, + // ie the output is individual unicodes. + kGlyphSplit, + // The output is always single unicodes, regardless of the script. + kIndividualUnicodes, +}; + +// An enum representing the scripts that use a virama character. It is +// guaranteed that the value of any element, (except kNonVirama) can be cast +// to a unicode (char32) value that represents the start of the unicode range +// of the corresponding script. +enum class ViramaScript : char32 { + kNonVirama = 0, + kDevanagari = 0x900, + kBengali = 0x980, + kGurmukhi = 0xa00, + kGujarati = 0xa80, + kOriya = 0xb00, + kTamil = 0xb80, + kTelugu = 0xc00, + kKannada = 0xc80, + kMalayalam = 0xd00, + kSinhala = 0xd80, + kMyanmar = 0x1000, + kKhmer = 0x1780, +}; + +// Base class offers a validation API and protected methods to allow subclasses +// to easily build the validated/segmented output. +class Validator { + public: + // Validates and cleans the src vector of unicodes to the *dest, according to + // g_mode. In the case of kSingleString, a single vector containing the whole + // result is added to *dest. With kCombined, multiple vectors are added to + // *dest with one grapheme in each. With kGlyphSplit, multiple vectors are + // added to *dest with a smaller unit representing a glyph in each. + // In case of validation error, returns false and as much as possible of the + // input, without discarding invalid text. + static bool ValidateCleanAndSegment(GraphemeNormMode g_mode, + bool report_errors, + const std::vector& src, + std::vector>* dest); + + // Returns true if the unicode ch is a non-printing zero-width mark of no + // significance to OCR training or evaluation. 
+ static bool IsZeroWidthMark(char32 ch) { + return ch == kZeroWidthSpace || ch == kLeftToRightMark || + ch == kRightToLeftMark || ch == kInvalid; + } + virtual ~Validator() {} + + // Some specific but universally useful unicodes. + static const char32 kZeroWidthSpace; + static const char32 kZeroWidthNonJoiner; + static const char32 kZeroWidthJoiner; + static const char32 kLeftToRightMark; + static const char32 kRightToLeftMark; + static const char32 kInvalid; + + protected: + // These are more or less the character class identifiers in the ISCII + // standard, section 8. They have been augmented with the Unicode meta + // characters Zero Width Joiner and Zero Width Non Joiner, and the + // Unicode Vedic Marks. + // The best sources of information on Unicode and Indic scripts are: + // http://varamozhi.sourceforge.net/iscii91.pdf + // http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf + // http://unicode.org/faq/indic.html + // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx + enum class CharClass { + // NOTE: The values of the enum members are meaningless and arbitrary, ie + // they are not used for sorting, or any other risky application. + // The reason they are what they are is they are a single character + // abbreviation that can be used in a regexp/BNF definition of a grammar, + // IN A COMMENT, and still not relied upon in the code. + kConsonant = 'C', + kVowel = 'V', + kVirama = 'H', // (aka Halant) + kMatra = 'M', // (aka Dependent Vowel) + kMatraPiece = 'P', // unicode provides pieces of Matras. + kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks) + kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C + kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D + kVedicMark = 'v', // Modifiers can come modify any indic syllable. + kNukta = 'N', // Occurs only immediately after consonants. + kRobat = 'R', // Khmer only. 
+ kOther = 'O', // (digits, measures, non-Indic, etc) + // Additional classes used only by ValidateGrapheme. + kWhitespace = ' ', + kCombiner = 'c', // Combiners other than virama. + }; + typedef std::pair IndicPair; + + Validator(ViramaScript script, bool report_errors) + : script_(script), + codes_used_(0), + output_used_(0), + report_errors_(report_errors) {} + + // Factory method that understands how to map script to the right subclass. + static std::unique_ptr ScriptValidator(ViramaScript script, + bool report_errors); + + // Internal version of the public static ValidateCleanAndSegment. + // Validates and cleans the src vector of unicodes to the *dest, according to + // its type and the given g_mode. + // In case of validation error, returns false and returns as much as possible + // of the input, without discarding invalid text. + bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode, + const std::vector& src, + std::vector>* dest); + // Moves the results from parts_ or output_ to dest according to g_mode. + void MoveResultsToDest(GraphemeNormMode g_mode, + std::vector>* dest); + + // Computes and returns the ViramaScript corresponding to the most frequent + // virama-using script in the input, or kNonVirama if none are present. + static ViramaScript MostFrequentViramaScript( + const std::vector& utf32); + // Returns true if the given UTF-32 unicode is a "virama" character. + static bool IsVirama(char32 unicode); + // Returns true if the given UTF-32 unicode is a vedic accent. + static bool IsVedicAccent(char32 unicode); + // Returns true if the script is one that uses subscripts for conjuncts. + bool IsSubscriptScript() const; + + // Helper function appends the next element of codes_ only to output_, + // without touching parts_ + // Returns true at the end of codes_. 
+  bool CodeOnlyToOutput() {
+    output_.push_back(codes_[codes_used_].second);
+    return ++codes_used_ == codes_.size();
+  }
+
+  // Helper function adds a length-element vector to parts_ from the last length
+  // elements of output_. If there are more than length unused elements in
+  // output_, adds unicodes as single-element vectors to parts_ to catch
+  // output_used_ up to output_.size() - length before adding the length-element
+  // vector. If no unused elements remain after catching up (e.g. length == 0),
+  // no multi-element part is added.
+  void MultiCodePart(int length) {
+    // Emit any older unused unicodes as single-element parts.
+    while (output_used_ + length < output_.size()) {
+      parts_.emplace_back(
+          std::initializer_list<char32>{output_[output_used_++]});
+    }
+    // Nothing left to group: avoid reading past the end of output_.
+    if (static_cast<size_t>(output_used_) >= output_.size()) return;
+    // Start the multi-element part, then append the rest of output_ to it.
+    parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
+    while (++output_used_ < output_.size()) {
+      parts_.back().push_back(output_[output_used_]);
+    }
+  }
+
+  // Helper function appends the next element of codes_ to output_, and then
+  // calls MultiCodePart to add the appropriate components to parts_.
+  // Returns true at the end of codes_.
+  bool UseMultiCode(int length) {
+    output_.push_back(codes_[codes_used_].second);
+    MultiCodePart(length);
+    return ++codes_used_ == codes_.size();
+  }
+
+  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
+  // parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  virtual bool ConsumeGraphemeIfValid() = 0;
+  // Sets codes_ to the class codes for the given unicode text.
+  void ComputeClassCodes(const std::vector<char32>& text);
+  // Returns the CharClass corresponding to the given Unicode ch.
+  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
+  // Resets to the initial state.
+  void Clear();
+
+  // Number of unicodes in each Indic codepage.
+  static const int kIndicCodePageSize = 128;
+  // Lowest unicode value of any Indic script. (Devanagari).
+  static const char32 kMinIndicUnicode = 0x900;
+  // Highest unicode value of any consistent (ISCII-based) Indic script.
+  static const char32 kMaxSinhalaUnicode = 0xdff;
+  // Highest unicode value of any virama-using script. (Khmer).
+  static const char32 kMaxViramaScriptUnicode = 0x17ff;
+  // Some special unicodes.
+  static const char32 kSinhalaVirama = 0xdca;
+  static const char32 kMyanmarVirama = 0x1039;
+  static const char32 kKhmerVirama = 0x17d2;
+
+  // Script we are operating on.
+  ViramaScript script_;
+  // Input unicodes with assigned CharClass is the data to be validated.
+  std::vector<IndicPair> codes_;
+  // Glyph-like components of the input.
+  std::vector<std::vector<char32>> parts_;
+  // Copied validated unicodes from codes_ that are OK to output.
+  std::vector<char32> output_;
+  // The number of elements of codes_ that have been processed so far.
+  int codes_used_;
+  // The number of elements of output_ that have already been added to parts_.
+  int output_used_;
+  // Log error messages for reasons why text is invalid.
+  bool report_errors_;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATOR_H_