Skip to content

Commit

Permalink
Merge pull request #2244 from Shreeshrii/mya
Browse files Browse the repository at this point in the history
Fix Myanmar validation rules as per Unicode charts
  • Loading branch information
zdenop committed Mar 1, 2019
2 parents 0b354f2 + 2ba8e00 commit f5a7ca2
Showing 1 changed file with 17 additions and 9 deletions.
26 changes: 17 additions & 9 deletions src/training/validate_myanmar.cpp
Expand Up @@ -111,7 +111,8 @@ bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
}
}
// The following characters are allowed, all optional, and in sequence.
const std::vector<char32> kSigns({0x1036, 0x1037});
// Anusvar, Dot below, Visarga
const std::vector<char32> kSigns({0x1036, 0x1037, 0x1038});
for (char32 ch : kSigns) {
if (codes_[codes_used_].second == ch) {
if (UseMultiCode(1)) return true;
Expand All @@ -131,30 +132,37 @@ bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
// Returns true if the unicode is a Myanmar "letter" including consonants
// and independent vowels. Although table 16-3 distinguishes between some
// base consonants and vowels, the extensions make no such distinction, so we
// put them all into a single bucket.
// The MYANMAR LETTER ranges are based on the following charts:
// https://unicode.org/charts/PDF/U1000.pdf - Myanmar
// http://unicode.org/charts/PDF/UAA60.pdf - Myanmar Extended-A
// http://unicode.org/charts/PDF/UA9E0.pdf - Myanmar Extended-B
// Deliberately NOT letters: 0xa9e5 (a combining sign), 0xa9e6 and 0xaa70
// (reduplication marks) and 0xa9ff (unassigned), so the Extended-A/B ranges
// are split around them instead of being written as one contiguous span.
/* static */
bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
  return (0x1000 <= ch && ch <= 0x102a) || ch == 0x103f ||
         (0x104c <= ch && ch <= 0x1055) || (0x105a <= ch && ch <= 0x105d) ||
         ch == 0x1061 || ch == 0x1065 || ch == 0x1066 ||
         (0x106e <= ch && ch <= 0x1070) || (0x1075 <= ch && ch <= 0x1081) ||
         ch == 0x108e || (0xa9e0 <= ch && ch <= 0xa9e4) ||
         (0xa9e7 <= ch && ch <= 0xa9ef) || (0xa9fa <= ch && ch <= 0xa9fe) ||
         (0xaa60 <= ch && ch <= 0xaa6f) || (0xaa71 <= ch && ch <= 0xaa73) ||
         ch == 0xaa7a || ch == 0xaa7e || ch == 0xaa7f;
}

// Returns true if ch is a Myanmar digit or other symbol that does not take
// part in being a syllable, e.g. punctuation marks.
// Covers MYANMAR DIGIT, MYANMAR SYMBOL, MYANMAR LOGOGRAM and
// REDUPLICATION MARKS.
// Any character whose script is not Myanmar is also reported as "other",
// except for the zero-width joiner and non-joiner, which fall through to
// the explicit range checks below (and therefore return false).
/* static */
bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
  IcuErrorCode err;
  UScriptCode script_code = uscript_getScript(ch, err);
  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
      ch != Validator::kZeroWidthNonJoiner)
    return true;
  // Single authoritative range test. 0x109c-0x109d are vowel signs and are
  // intentionally absent; 0xa9e6 and 0xaa70 are the reduplication marks.
  return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
         (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
         (ch == 0xa9e6 || ch == 0xaa70) || (0xaa74 <= ch && ch <= 0xaa79);
}

} // namespace tesseract

0 comments on commit f5a7ca2

Please sign in to comment.