From df41eab6aa1726dfe6725284c94693dff6fa1eda Mon Sep 17 00:00:00 2001
From: Ray Smith <rays@google.com>
Date: Fri, 14 Jul 2017 10:05:05 -0700
Subject: [PATCH] Added script-specific validation and normalization for
 virama-using scripts and updated normalization for others

---
 ccutil/unichar.cpp                     |  16 ++
 training/Makefile.am                   |   8 +-
 training/normstrngs.cpp                | 195 ++++++++++++------
 training/normstrngs.h                  |  64 +++---
 training/unicharset_training_utils.cpp |  10 +-
 training/validate_grapheme.cpp         | 174 ++++++++++++++++
 training/validate_grapheme.h           |  35 ++++
 training/validate_indic.cpp            | 274 +++++++++++++++++++++++++
 training/validate_indic.h              |  44 ++++
 training/validate_khmer.cpp            | 106 ++++++++++
 training/validate_khmer.h              |  27 +++
 training/validate_myanmar.cpp          | 160 +++++++++++++++
 training/validate_myanmar.h            |  47 +++++
 training/validator.cpp                 | 205 ++++++++++++++++++
 training/validator.h                   | 243 ++++++++++++++++++++++
 15 files changed, 1518 insertions(+), 90 deletions(-)
 create mode 100644 training/validate_grapheme.cpp
 create mode 100644 training/validate_grapheme.h
 create mode 100644 training/validate_indic.cpp
 create mode 100644 training/validate_indic.h
 create mode 100644 training/validate_khmer.cpp
 create mode 100644 training/validate_khmer.h
 create mode 100644 training/validate_myanmar.cpp
 create mode 100644 training/validate_myanmar.h
 create mode 100644 training/validator.cpp
 create mode 100644 training/validator.h
diff --git a/ccutil/unichar.cpp b/ccutil/unichar.cpp
index 255136f3ff..b568e12a28 100644
--- a/ccutil/unichar.cpp
+++ b/ccutil/unichar.cpp
@@ -226,3 +226,19 @@ std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
   return unicodes;
 }
 
+// Converts str32 to UTF-8, or returns "" if any code fails to encode.
+string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
+  string utf8_str;
+  for (char32 ch : str32) {
+    UNICHAR uni_ch(ch);  // Per-code UNICHAR supplying utf8() and utf8_len().
+    int step;            // Number of bytes of uni_ch.utf8() to append.
+    if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
+      utf8_str.append(uni_ch.utf8(), step);
+    } else {
+      return "";  // Discards everything converted so far.
+    }
+  }
+  return utf8_str;
+}
+
+}  // namespace tesseract
diff --git a/training/Makefile.am b/training/Makefile.am
index 654ca3db3e..8d06d9454d 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -21,7 +21,9 @@ noinst_HEADERS = \
     boxchar.h commandlineflags.h commontraining.h degradeimage.h \
       fileio.h icuerrorcode.h ligature_table.h lstmtester.h normstrngs.h \
       mergenf.h pango_font_info.h stringrenderer.h \
-      tessopt.h tlog.h unicharset_training_utils.h util.h
+      tessopt.h tlog.h unicharset_training_utils.h util.h \
+      validate_grapheme.h validate_indic.h validate_khmer.h \
+      validate_myanmar.h validator.h
 
 noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la
 
@@ -32,7 +34,9 @@ libtesseract_training_la_LIBADD = \
 libtesseract_training_la_SOURCES = \
     boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \
       fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \
-      stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp
+      stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp \
+      validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp \
+      validate_myanmar.cpp validator.cpp
 
 libtesseract_tessopt_la_SOURCES = \
     tessopt.cpp
diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp
index 17fe5cf8b7..3334f1dc51 100644
--- a/training/normstrngs.cpp
+++ b/training/normstrngs.cpp
@@ -21,6 +21,10 @@
 #include "normstrngs.h"
 
 #include <assert.h>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
 #include "icuerrorcode.h"
 #include "unichar.h"
 #include "unicode/normalizer2.h"  // From libicu
@@ -34,18 +38,17 @@ namespace tesseract {
 bool is_hyphen_punc(const char32 ch) {
   static const int kNumHyphenPuncUnicodes = 13;
   static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
-    '-',
-    0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,  // hyphen..horizontal bar
-    0x207b,  // superscript minus
-    0x208b,  // subscript minus
-    0x2212,  // minus sign
-    0xfe58,  // small em dash
-    0xfe63,  // small hyphen-minus
-    0xff0d,  // fullwidth hyphen-minus
+      '-',    0x2010, 0x2011, 0x2012,  // hyphen-minus, hyphen..figure dash
+      0x2013, 0x2014, 0x2015,  // en dash, em dash, horizontal bar
+      0x207b,                  // superscript minus
+      0x208b,                  // subscript minus
+      0x2212,                  // minus sign
+      0xfe58,                  // small em dash
+      0xfe63,                  // small hyphen-minus
+      0xff0d,                  // fullwidth hyphen-minus
   };
   for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
-    if (kHyphenPuncUnicodes[i] == ch)
-      return true;
+    if (kHyphenPuncUnicodes[i] == ch) return true;
   }
   return false;
 }
@@ -53,19 +56,17 @@ bool is_hyphen_punc(const char32 ch) {
 bool is_single_quote(const char32 ch) {
   static const int kNumSingleQuoteUnicodes = 8;
   static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
-    '\'',
-    '`',
-    0x2018,  // left single quotation mark (English, others)
-    0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
-             // We may have to introduce a comma set with 0x201a
-    0x201B,  // single high-reveresed-9 quotation mark (PropList.txt)
-    0x2032,  // prime
-    0x300C,  // left corner bracket (East Asian languages)
-    0xFF07,  // fullwidth apostrophe
+      '\'', '`',
+      0x2018,  // left single quotation mark (English, others)
+      0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
+               // We may have to introduce a comma set with 0x201a
+      0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
+      0x2032,  // prime
+      0x300C,  // left corner bracket (East Asian languages)
+      0xFF07,  // fullwidth apostrophe
   };
   for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
-    if (kSingleQuoteUnicodes[i] == ch)
-      return true;
+    if (kSingleQuoteUnicodes[i] == ch) return true;
   }
   return false;
 }
@@ -73,60 +74,130 @@ bool is_single_quote(const char32 ch) {
 bool is_double_quote(const char32 ch) {
   static const int kNumDoubleQuoteUnicodes = 8;
   static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
-    '"',
-    0x201C,  // left double quotation mark (English, others)
-    0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
-    0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
-    0x2033,  // double prime
-    0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
-    0x301E,  // close double prime (East Asian languages written horizontally)
-    0xFF02,  // fullwidth quotation mark
+      '"',
+      0x201C,  // left double quotation mark (English, others)
+      0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
+      0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
+      0x2033,  // double prime
+      0x301D,  // reversed double prime quotation mark (East Asian
+               // languages written horizontally)
+      0x301E,  // close double prime (East Asian languages written horizontally)
+      0xFF02,  // fullwidth quotation mark
   };
   for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
-    if (kDoubleQuoteUnicodes[i] == ch)
-      return true;
+    if (kDoubleQuoteUnicodes[i] == ch) return true;
   }
   return false;
 }
 
-STRING NormalizeUTF8String(bool decompose, const char* str8) {
-  GenericVector<char32> str32, out_str32, norm_str;
-  UTF8ToUTF32(str8, &str32);
-  for (int i = 0; i < str32.length(); ++i) {
-    norm_str.clear();
-    NormalizeChar32(str32[i], decompose, &norm_str);
-    for (int j = 0; j < norm_str.length(); ++j) {
-      out_str32.push_back(norm_str[j]);
-    }
-  }
-  STRING out_str8;
-  UTF32ToUTF8(out_str32, &out_str8);
-  return out_str8;
-}
-
-void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
+// Helper normalizes str8 with u_mode, drops zero-width marks, optionally OCR-
+// normalizes each code, and appends the resulting char32s to *normed32.
+static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                                 const char* str8,
+                                 std::vector<char32>* normed32) {
+  // Convert to ICU string for unicode normalization.
+  icu::UnicodeString uch_str(str8, "UTF-8");
   IcuErrorCode error_code;
-  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
-      nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE,
-      error_code);
+  // ICU picks NFC/NFD/NFKC/NFKD via a data name plus compose/decompose mode.
+  const char* norm_type =
+      u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC
+          ? "nfkc"
+          : "nfc";
+  UNormalization2Mode compose =
+      u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
+          ? UNORM2_COMPOSE
+          : UNORM2_DECOMPOSE;
+  // Pointer to singleton does not require deletion.
+  const icu::Normalizer2* normalizer =
+      icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
   error_code.assertSuccess();
   error_code.reset();
-
-  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
-  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
+  icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
   error_code.assertSuccess();
+  // Convert to char32 for output, one code point per step. OCR-normalize if set.
+  normed32->reserve(norm_str.length());  // An approximation.
+  for (int offset = 0; offset < norm_str.length();
+       offset = norm_str.moveIndex32(offset, 1)) {
+    char32 ch = norm_str.char32At(offset);
+    // Skip all ZWS, RTL and LTR marks.
+    if (Validator::IsZeroWidthMark(ch)) continue;
+    if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
+    normed32->push_back(ch);
+  }
+}
+
+// Helper removes ZWJ/ZWNJ in place from strings that contain no letters.
+static void StripJoiners(std::vector<char32>* str32) {
+  for (char32 ch : *str32) {
+    if (u_isalpha(ch)) return;  // Any letter: leave the string untouched.
+  }
+  int len = 0;  // Number of codes kept so far.
+  for (char32 ch : *str32) {
+    if (ch != Validator::kZeroWidthJoiner &&
+        ch != Validator::kZeroWidthNonJoiner) {
+      (*str32)[len++] = ch;  // Compact the kept codes towards the front.
+    }
+  }
+  str32->resize(len);
+}
+
+// Normalizes a UTF8 string according to the given modes. Returns true on
+// success. On false, *normalized is best effort, and is left untouched if no
+// grapheme was produced. A nullptr normalized is accepted and never written.
+bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                         GraphemeNorm grapheme_normalize, const char* str8,
+                         string* normalized) {
+  std::vector<char32> normed32;
+  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
+  if (grapheme_normalize == GraphemeNorm::kNormalize) {
+    StripJoiners(&normed32);
+    std::vector<std::vector<char32>> graphemes;
+    bool success = Validator::ValidateCleanAndSegment(
+        GraphemeNormMode::kSingleString, false, normed32, &graphemes);
+    if (graphemes.empty() || graphemes[0].empty()) {
+      success = false;  // Nothing usable came back from the validator.
+    } else if (normalized != nullptr) {
+      *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
+    }
+    return success;
+  }
+  if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
+  return true;  // No grapheme validation was requested.
+}
 
-  str->clear();
-  for (int i = 0; i < norm_str.length(); ++i) {
-    // If any spaces were added by NFKC, pretend normalization is a nop.
-    if (norm_str[i] == ' ') {
-      str->clear();
-      str->push_back(ch);
-      break;
-    } else {
-      str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
+// Normalizes a UTF8 string according to the given modes and splits it into
+// graphemes according to g_mode. Returns true on success. If false is returned,
+// some failure or invalidity was present, and *graphemes (always overwritten)
+// holds a "best effort" result.
+bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                                  GraphemeNormMode g_mode, bool report_errors,
+                                  const char* str8,
+                                  std::vector<string>* graphemes) {
+  std::vector<char32> normed32;
+  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
+  StripJoiners(&normed32);
+  std::vector<std::vector<char32>> graphemes32;
+  bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
+                                                    normed32, &graphemes32);
+  if (g_mode != GraphemeNormMode::kSingleString && success) {
+    // If we modified the string to clean it up, the segmentation may not be
+    // correct, so check for changes and do it again.
+    std::vector<char32> cleaned32;  // Concatenation of the output graphemes.
+    for (const auto& g : graphemes32) {
+      cleaned32.insert(cleaned32.end(), g.begin(), g.end());
     }
+    if (cleaned32 != normed32) {
+      graphemes32.clear();
+      success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
+                                                   cleaned32, &graphemes32);
+    }
+  }
+  graphemes->clear();
+  graphemes->reserve(graphemes32.size());
+  for (const auto& grapheme : graphemes32) {
+    graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));
   }
+  return success;
 }
 
 // Apply just the OCR-specific normalizations and return the normalized char.
diff --git a/training/normstrngs.h b/training/normstrngs.h
index 27f36e0981..4934f3f107 100644
--- a/training/normstrngs.h
+++ b/training/normstrngs.h
@@ -21,34 +21,50 @@
 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
 
-#include "genericvector.h"
-#include "strngs.h"
+#include <string>
+#include <vector>
 
-typedef signed int char32;
+#include "validator.h"
 
 namespace tesseract {
 
-// UTF-8 to UTF-32 conversion function.
-void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);
-
-// UTF-32 to UTF-8 convesion function.
-void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
-
-// Normalize a single char32 using NFKC + OCR-specific transformations.
-// NOTE that proper NFKC may require multiple characters as input. The
-// assumption of this function is that the input is already as fully composed
-// as it can be, but may require some compatibility normalizations or just
-// OCR evaluation related normalizations.
-void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str);
-
-// Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
-// can contain multiple UTF32 code points.
-STRING NormalizeUTF8String(bool decompose, const char* str8);
-// Default behavior is to compose, until it is proven that decomposed benefits
-// at least one language.
-inline STRING NormalizeUTF8String(const char* str8) {
-  return NormalizeUTF8String(false, str8);
-}
+// The standard unicode normalization forms.
+enum class UnicodeNormMode {
+  kNFD,   // Canonical decomposition.
+  kNFC,   // Canonical decomposition, then canonical composition.
+  kNFKD,  // Compatibility decomposition.
+  kNFKC,  // Compatibility decomposition, then canonical composition.
+};
+
+// To normalize away differences in punctuation that are ambiguous, like
+// curly quotes and different widths of dash.
+enum class OCRNorm {
+  kNone,       // Leave the codes as they are.
+  kNormalize,  // Pass every code through OCRNormalize.
+};
+
+// To validate and normalize away some subtle differences that can occur in
+// Indic scripts, eg ensuring that an explicit virama is always followed by
+// a zero-width non-joiner.
+enum class GraphemeNorm {
+  kNone,       // Skip grapheme validation.
+  kNormalize,  // Validate and clean the string as a single grapheme string.
+};
+
+// Normalizes the UTF8 string str8 according to the given modes. Returns true
+// on success. If false is returned, some failure or invalidity was present,
+// and *normalized is produced on a "best effort" basis.
+bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                         GraphemeNorm grapheme_normalize, const char* str8,
+                         string* normalized);
+// Normalizes the UTF8 string str8 according to the given modes and splits it
+// into graphemes according to g_mode. Returns true on success. If false is
+// returned, some failure or invalidity was present, and *graphemes is produced
+// on a "best effort" basis.
+bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                                  GraphemeNormMode g_mode, bool report_errors,
+                                  const char* str8,
+                                  std::vector<string>* graphemes);
 
 // Applies just the OCR-specific normalizations and return the normalized char.
 char32 OCRNormalize(char32 ch);
diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp
index d16e919af8..9a72032964 100644
--- a/training/unicharset_training_utils.cpp
+++ b/training/unicharset_training_utils.cpp
@@ -122,8 +122,14 @@ void SetupBasicProperties(bool report_errors, bool decompose,
     }
 
     // Record normalized version of this unichar.
-    string normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str);
-    if (unichar_id != 0 && !normed_str.empty()) {
+    string normed_str;
+    if (unichar_id != 0 &&
+        tesseract::NormalizeUTF8String(
+            decompose ? tesseract::UnicodeNormMode::kNFKD
+                      : tesseract::UnicodeNormMode::kNFKC,
+            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
+            unichar_str, &normed_str) &&
+        !normed_str.empty()) {
       unicharset->set_normed(unichar_id, normed_str.c_str());
     } else {
       unicharset->set_normed(unichar_id, unichar_str);
diff --git a/training/validate_grapheme.cpp b/training/validate_grapheme.cpp
new file mode 100644
index 0000000000..aa2f08fe7b
--- /dev/null
+++ b/training/validate_grapheme.cpp
@@ -0,0 +1,174 @@
+#include "validate_grapheme.h"
+#include "tprintf.h"
+#include "unicode/uchar.h"  // From libicu
+
+namespace tesseract {
+
+bool ValidateGrapheme::ConsumeGraphemeIfValid() {
+  int num_codes = codes_.size();
+  char32 prev_prev_ch = ' ';
+  char32 prev_ch = ' ';
+  CharClass prev_cc = CharClass::kWhitespace;  // As if preceded by a space.
+  int num_codes_in_grapheme = 0;
+  while (codes_used_ < num_codes) {
+    CharClass cc = codes_[codes_used_].first;
+    char32 ch = codes_[codes_used_].second;
+    const bool is_combiner =
+        cc == CharClass::kCombiner || cc == CharClass::kVirama;
+    // Reject easily detected badly formed sequences.
+    if (prev_cc == CharClass::kWhitespace && is_combiner) {
+      if (report_errors_) tprintf("Word started with a combiner:0x%x\n", ch);
+      return false;
+    }
+    if (prev_cc == CharClass::kVirama && cc == CharClass::kVirama) {
+      if (report_errors_)
+        tprintf("Two grapheme links in a row:0x%x 0x%x\n", prev_ch, ch);
+      return false;
+    }
+    if (prev_cc != CharClass::kWhitespace && cc != CharClass::kWhitespace &&
+        IsBadlyFormed(prev_ch, ch)) {
+      return false;
+    }
+    bool prev_is_fwd_combiner =
+        prev_ch == kZeroWidthJoiner || prev_cc == CharClass::kVirama ||
+        (prev_ch == kZeroWidthNonJoiner &&
+         (cc == CharClass::kVirama || prev_prev_ch == kZeroWidthJoiner));
+    if (num_codes_in_grapheme > 0 && !is_combiner && !prev_is_fwd_combiner)
+      break;  // ch does not attach to what precedes it: grapheme is complete.
+    CodeOnlyToOutput();
+    ++num_codes_in_grapheme;
+    prev_prev_ch = prev_ch;
+    prev_ch = ch;
+    prev_cc = cc;
+  }
+  if (num_codes_in_grapheme > 0) MultiCodePart(num_codes_in_grapheme);
+  return true;
+}
+
+Validator::CharClass ValidateGrapheme::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
+  // Grapheme links (viramas) come first. The ZeroWidth[Non]Joiner characters
+  // are mapped to kCombiner below, as they combine with the previous character.
+  if (u_hasBinaryProperty(ch, UCHAR_GRAPHEME_LINK)) return CharClass::kVirama;
+  if (u_isUWhiteSpace(ch)) return CharClass::kWhitespace;
+  int char_type = u_charType(ch);
+  if (char_type == U_NON_SPACING_MARK || char_type == U_ENCLOSING_MARK ||
+      char_type == U_COMBINING_SPACING_MARK || ch == kZeroWidthNonJoiner ||
+      ch == kZeroWidthJoiner)
+    return CharClass::kCombiner;
+  return CharClass::kOther;
+}
+
+// Helper returns true (reporting if report_errors_) if prev_ch,ch is invalid.
+bool ValidateGrapheme::IsBadlyFormed(char32 prev_ch, char32 ch) {
+  // Reject badly formed Indic vowels.
+  if (IsBadlyFormedIndicVowel(prev_ch, ch)) {
+    if (report_errors_)
+      tprintf("Badly formed Indic vowel sequence:0x%x 0x%x\n", prev_ch, ch);
+    return true;
+  }
+  if (IsBadlyFormedThai(prev_ch, ch)) {
+    if (report_errors_) tprintf("Badly formed Thai:0x%x 0x%x\n", prev_ch, ch);
+    return true;
+  }
+  return false;
+}
+
+// Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
+// Some vowels in Indic scripts may be analytically decomposed into atomic pairs
+// of components that are themselves valid unicode symbols. (See Table 12-1 in
+// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
+// for examples in Devanagari). The Unicode standard discourages specifying
+// vowels this way, but they are sometimes encountered in text, probably because
+// some editors still permit it. Renderers however dislike such pairs, and so
+// this function may be used to detect their occurrence for removal.
+// TODO(rays) This function only covers a subset of Indic languages and doesn't
+// include all rules. Add rules as appropriate to support other languages or
+// find a way to generalize these existing rules that makes use of the
+// regularity of the mapping from ISCII to Unicode.
+/* static */
+bool ValidateGrapheme::IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch) {
+  return ((prev_ch == 0x905 && (ch == 0x946 || ch == 0x93E)) ||
+          (prev_ch == 0x909 && ch == 0x941) ||
+          (prev_ch == 0x90F && (ch >= 0x945 && ch <= 0x947)) ||
+          (prev_ch == 0x905 && (ch >= 0x949 && ch <= 0x94C)) ||
+          (prev_ch == 0x906 && (ch >= 0x949 && ch <= 0x94C)) ||
+          // Illegal combinations of two dependent Devanagari vowels.
+          (prev_ch == 0x93E && (ch >= 0x945 && ch <= 0x948)) ||
+          // Dependent Devanagari vowels following a virama.
+          (prev_ch == 0x94D && (ch >= 0x93E && ch <= 0x94C)) ||
+          // Bengali vowels (Table 9-5, pg 313)
+          (prev_ch == 0x985 && ch == 0x9BE) ||
+          // Telugu vowels (Table 9-19, pg 331)
+          (prev_ch == 0xC12 && (ch == 0xC55 || ch == 0xC4C)) ||
+          // Kannada vowels (Table 9-20, pg 332)
+          (prev_ch == 0xC92 && ch == 0xCCC));
+}
+
+// Helper returns true if ch is a Thai consonant (U+0e01..U+0e2e).
+static bool IsThaiConsonant(char32 ch) { return 0xe01 <= ch && ch <= 0xe2e; }
+
+// Helper returns true if ch is a before-consonant vowel (U+0e40..U+0e44).
+static bool IsThaiBeforeConsonantVowel(char32 ch) {
+  return 0xe40 <= ch && ch <= 0xe44;
+}
+
+// Helper returns true if ch is a Thai tone mark (U+0e48..U+0e4b).
+static bool IsThaiToneMark(char32 ch) { return 0xe48 <= ch && ch <= 0xe4b; }
+
+// Helper returns true if ch is a Thai vowel that may be followed by a tone
+// mark (U+0e31 or U+0e34..U+0e39).
+static bool IsThaiTonableVowel(char32 ch) {
+  return (0xe34 <= ch && ch <= 0xe39) || ch == 0xe31;
+}
+
+// Helper returns true if the sequence prev_ch,ch is invalid Thai.
+// These rules come from a native Thai speaker, and are not covered by the
+// Thai section in the unicode book:
+// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
+// Comments below added by Ray interpreting the code ranges.
+/* static */
+bool ValidateGrapheme::IsBadlyFormedThai(char32 prev_ch, char32 ch) {
+  // Tone marks must follow consonants or specific vowels.
+  if (IsThaiToneMark(ch) &&
+      !(IsThaiConsonant(prev_ch) || IsThaiTonableVowel(prev_ch))) {
+    return true;
+  }
+  // Tonable vowels and U+0e47 (maitaikhu) must follow consonants.
+  if ((IsThaiTonableVowel(ch) || ch == 0xe47) && !IsThaiConsonant(prev_ch)) {
+    return true;
+  }
+  // Thanthakhat (U+0e4c) must follow a consonant or the vowels U+0e38/U+0e34.
+  if (ch == 0xe4c &&
+      !(IsThaiConsonant(prev_ch) || prev_ch == 0xe38 || prev_ch == 0xe34)) {
+    return true;
+  }
+  // Nikkhahit must follow a consonant ?or certain markers?.
+  // TODO(rays) confirm this, but there were so many in the ground truth of the
+  // validation set that it seems reasonable to assume it is valid.
+  if (ch == 0xe4d &&
+      !(IsThaiConsonant(prev_ch) || prev_ch == 0xe48 || prev_ch == 0xe49)) {
+    return true;
+  }
+  // The vowels e30, e32, e33 can be used more liberally.
+  if ((ch == 0xe30 || ch == 0xe32 || ch == 0xe33) &&
+      !(IsThaiConsonant(prev_ch) || IsThaiToneMark(prev_ch)) &&
+      !(prev_ch == 0xe32 && ch == 0xe30) &&
+      !(prev_ch == 0xe4d && ch == 0xe32)) {
+    return true;
+  }
+  // Some vowels come before consonants, and therefore cannot follow things
+  // that cannot end a syllable.
+  if (IsThaiBeforeConsonantVowel(ch) &&
+      (IsThaiBeforeConsonantVowel(prev_ch) || prev_ch == 0xe31 ||
+       prev_ch == 0xe37)) {
+    return true;
+  }
+  // Don't allow the standalone vowel U+0e24 to be followed by other vowels.
+  if ((0xe30 <= ch && ch <= 0xe4D) && prev_ch == 0xe24) {
+    return true;
+  }
+  return false;
+}
+
+}  // namespace tesseract
diff --git a/training/validate_grapheme.h b/training/validate_grapheme.h
new file mode 100644
index 0000000000..138ad57075
--- /dev/null
+++ b/training/validate_grapheme.h
@@ -0,0 +1,35 @@
+#ifndef TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
+#define TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Subclass of Validator that validates and segments generic unicode into
+// grapheme clusters (eg Latin with diacritics); rejects bad Indic/Thai pairs.
+class ValidateGrapheme : public Validator {
+ public:
+  ValidateGrapheme(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateGrapheme() {}
+
+ protected:
+  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
+  // parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  bool ConsumeGraphemeIfValid() override;
+  // Returns the CharClass corresponding to the given Unicode ch.
+  CharClass UnicodeToCharClass(char32 ch) const override;
+
+ private:
+  // Helper returns true if the sequence prev_ch,ch is invalid.
+  bool IsBadlyFormed(char32 prev_ch, char32 ch);
+  // Helper returns true if the sequence prev_ch,ch is an invalid Indic vowel.
+  static bool IsBadlyFormedIndicVowel(char32 prev_ch, char32 ch);
+  // Helper returns true if the sequence prev_ch,ch is invalid Thai.
+  static bool IsBadlyFormedThai(char32 prev_ch, char32 ch);
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_GRAPHEME_H_
diff --git a/training/validate_indic.cpp b/training/validate_indic.cpp
new file mode 100644
index 0000000000..0ff769c80f
--- /dev/null
+++ b/training/validate_indic.cpp
@@ -0,0 +1,274 @@
+#include "validate_indic.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Returns whether codes matches the pattern for an Indic Grapheme.
+// The ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf
+// has a BNF for valid syllables (Graphemes) which is modified slightly
+// for Unicode.  Notably the joiners U+200C (ZWNJ) and U+200D (ZWJ) are used
+// before/after the virama to express explicit or soft viramas.
+// Also the unicode v.9 Malayalam entry states that CZHC can be used in several
+// Indic languages to request traditional ligatures, and CzHC is Malayalam-
+// specific for requesting open conjuncts.
+// In the patterns, Z = ZWJ (U+200D), z = ZWNJ (U+200C) and H is the virama.
+//  + vowel Grapheme:  V[D](v)*
+//  + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
+bool ValidateIndic::ConsumeGraphemeIfValid() {
+  switch (codes_[codes_used_].first) {
+    case CharClass::kConsonant:
+      return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
+    case CharClass::kVowel:
+      return ConsumeVowelIfValid();
+    case CharClass::kZeroWidthJoiner:
+    case CharClass::kZeroWidthNonJoiner:
+      // Apart from within an aksara, joiners are silently dropped.
+      if (report_errors_)
+        tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
+      ++codes_used_;
+      return true;
+    case CharClass::kOther:
+      UseMultiCode(1);
+      return true;
+    default:
+      if (report_errors_) {
+        tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
+                codes_[codes_used_].first, codes_[codes_used_].second);
+      }
+      return false;
+  }
+}
+
+Validator::CharClass ValidateIndic::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
+  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
+  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
+  // Offset from the start of the relevant unicode code block aka code page.
+  int base = static_cast<char32>(script_);
+  int off = ch - base;
+  // Anything in another code block is other.
+  if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
+  // Exception for Tamil. The aytham character is considered a letter.
+  if (script_ == ViramaScript::kTamil && off == 0x03) return CharClass::kVowel;
+  if (off < 0x4) return CharClass::kVowelModifier;
+  if (script_ == ViramaScript::kSinhala) {
+    // Sinhala lays out its code page differently from the other scripts.
+    if (off <= 0x19) return CharClass::kVowel;
+    if (off <= 0x49) return CharClass::kConsonant;
+    if (off == 0x4a) return CharClass::kVirama;
+    if (off <= 0x5f) return CharClass::kMatra;
+  } else {
+    if (off <= 0x14 || off == 0x50) return CharClass::kVowel;
+    if (off <= 0x3b || (0x58 <= off && off <= 0x5f))
+      return CharClass::kConsonant;
+    // Nukta and Avagraha, neither of which the Sinhala branch above has.
+    if (off == 0x3c) return CharClass::kNukta;
+    if (off == 0x3d) return CharClass::kVowel;
+    if (off <= 0x4c || (0x51 <= off && off <= 0x54)) return CharClass::kMatra;
+    if (0x55 <= off && off <= 0x57) return CharClass::kMatraPiece;
+    if (off == 0x4d) return CharClass::kVirama;
+  }
+  if (off == 0x60 || off == 0x61) return CharClass::kVowel;
+  if (off == 0x62 || off == 0x63) return CharClass::kMatra;
+  // Danda and digits up to 6f fall through to the final kOther.
+  // 70-7f are script-specific.
+  if (script_ == ViramaScript::kBengali && (off == 0x70 || off == 0x71))
+    return CharClass::kConsonant;
+  if (script_ == ViramaScript::kGurmukhi && (off == 0x72 || off == 0x73))
+    return CharClass::kConsonant;
+  if (script_ == ViramaScript::kSinhala && off == 0x70)
+    return CharClass::kConsonant;
+  if (script_ == ViramaScript::kDevanagari && off == 0x70)
+    return CharClass::kOther;
+  if (0x70 <= off && off <= 0x73) return CharClass::kVowelModifier;
+  // Non Indic, Digits, Measures, danda, etc.
+  return CharClass::kOther;
+}
+
+// Helper consumes/copies a virama and any associated post-virama joiners.
+// joiner is the pre-virama joiner (class kOther if none); post_matra is true
+// when called for a virama in the consonant tail, which must be explicit.
+// A linking virama (with a pre-virama joiner, post-virama ZWJ, or no joiner at
+// all) must be followed by a consonant. A non-linking (explicit) virama has a
+// ZWNJ or a non-consonant after it; for consistency a missing ZWNJ is added.
+// Returns false with an invalid sequence.
+bool ValidateIndic::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
+  int num_codes = codes_.size();
+  if (joiner.first == CharClass::kOther) {
+    CodeOnlyToOutput();
+    if (codes_used_ < num_codes &&
+        codes_[codes_used_].second == kZeroWidthJoiner) {
+      // Post-matra viramas must be explicit, so no joiners allowed here.
+      if (post_matra) {
+        if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
+        return false;
+      }
+      if (codes_used_ + 1 < num_codes &&
+          codes_[codes_used_ - 2].second != kRayana &&
+          (codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
+           codes_[codes_used_ + 1].second == kYayana ||
+           codes_[codes_used_ + 1].second == kRayana)) {
+        // This combination will be picked up later.
+        ASSERT_HOST(!CodeOnlyToOutput());
+      } else {
+        // Half-form with optional Nukta.
+        int len = output_.size() + 1 - output_used_;
+        if (UseMultiCode(len)) return true;
+      }
+      if (codes_used_ < num_codes &&
+          codes_[codes_used_].second == kZeroWidthNonJoiner) {
+        if (output_used_ == output_.size() ||
+            output_[output_used_] != kRayana) {
+          if (report_errors_) {
+            tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
+                    static_cast<int>(script_));
+          }
+          return false;
+        }
+        // Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
+        if (UseMultiCode(4)) return true;
+      }
+    } else if (codes_used_ == num_codes ||
+               codes_[codes_used_].first != CharClass::kConsonant ||
+               post_matra) {
+      if (codes_used_ == num_codes ||
+          codes_[codes_used_].second != kZeroWidthNonJoiner) {
+        // It is valid to have an unterminated virama at the end of a word, but
+        // for consistency, we will always add ZWNJ if not present.
+        output_.push_back(kZeroWidthNonJoiner);
+      } else {
+        CodeOnlyToOutput();
+      }
+      // Explicit virama [H z]
+      MultiCodePart(2);
+    }
+  } else {
+    // Pre-virama joiner [{Z|z} H] requests specific conjunct.
+    if (UseMultiCode(2)) {
+      if (report_errors_)
+        tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
+      return false;
+    }
+    if (codes_[codes_used_].second == kZeroWidthJoiner ||
+        codes_[codes_used_].second == kZeroWidthNonJoiner) {
+      if (report_errors_) {
+        tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
+                codes_[codes_used_].second);
+      }
+      return false;
+    }
+  }
+  // It is good so far as it goes.
+  return true;
+}
+
+// Helper consumes/copies a series of consonants separated by viramas while
+// valid, but no vowel or other modifiers. Returns false if a virama is invalid.
+bool ValidateIndic::ConsumeConsonantHeadIfValid() {
+  const int num_codes = codes_.size();
+  // Consonant aksara: each pass takes a consonant, nukta, joiner and virama.
+  do {
+    CodeOnlyToOutput();
+    // Special Sinhala case of [H Z Yayana/Rayana].
+    int index = output_.size() - 3;
+    if (output_used_ <= index &&
+        (output_.back() == kYayana || output_.back() == kRayana) &&
+        IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
+      MultiCodePart(3);
+    }
+    bool have_nukta = false;
+    if (codes_used_ < num_codes &&
+        codes_[codes_used_].first == CharClass::kNukta) {
+      have_nukta = true;
+      CodeOnlyToOutput();
+    }
+    // Test for subscript conjunct.
+    index = output_.size() - 2 - have_nukta;
+    if (output_used_ <= index && IsSubscriptScript() &&
+        IsVirama(output_[index])) {
+      // Output previous virama, consonant + optional nukta.
+      MultiCodePart(2 + have_nukta);
+    }
+    IndicPair joiner(CharClass::kOther, 0);
+    if (codes_used_ < num_codes &&
+        (codes_[codes_used_].second == kZeroWidthJoiner ||
+         (codes_[codes_used_].second == kZeroWidthNonJoiner &&
+          script_ == ViramaScript::kMalayalam))) {
+      joiner = codes_[codes_used_];
+      if (++codes_used_ == num_codes) {
+        if (report_errors_) {
+          tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
+                  joiner.second);
+        }
+        return true;
+      }
+      if (codes_[codes_used_].first == CharClass::kVirama) {
+        output_.push_back(joiner.second);
+      } else {
+        if (report_errors_) {
+          tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
+                  output_.back(), joiner.second, codes_[codes_used_].second);
+        }
+        joiner = std::make_pair(CharClass::kOther, 0);
+      }
+    }
+    if (codes_used_ < num_codes &&
+        codes_[codes_used_].first == CharClass::kVirama) {
+      if (!ConsumeViramaIfValid(joiner, false)) return false;
+    } else {
+      break;  // No virama, so the run of consonants is over.
+    }
+  } while (codes_used_ < num_codes &&
+           codes_[codes_used_].first == CharClass::kConsonant);
+  if (output_used_ < output_.size()) MultiCodePart(1);
+  return true;
+}
+
+// Helper consumes/copies a tail part of a consonant, comprising optional
+// matra/piece, vowel modifier, vedic mark, terminating virama.
+bool ValidateIndic::ConsumeConsonantTailIfValid() {
+  if (codes_used_ == codes_.size()) return true;
+  // No virama: Finish the grapheme.
+  // Are multiple matras allowed?
+  if (codes_[codes_used_].first == CharClass::kMatra) {
+    if (UseMultiCode(1)) return true;
+    if (codes_[codes_used_].first == CharClass::kMatraPiece) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  while (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) return true;
+    // Only Malayalam allows a repeat, and then only after 0xd02.
+    if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
+  }
+  while (codes_[codes_used_].first == CharClass::kVedicMark) {
+    if (UseMultiCode(1)) return true;
+  }
+  if (codes_[codes_used_].first == CharClass::kVirama) {
+    if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
+      return false;
+    }
+  }
+  // What we have consumed so far is a valid consonant cluster.
+  if (output_used_ < output_.size()) MultiCodePart(1);
+
+  return true;
+}
+
+// Helper consumes/copies a vowel and optional modifiers.
+bool ValidateIndic::ConsumeVowelIfValid() {
+  if (UseMultiCode(1)) return true;
+  // Only Malayalam allows repeated modifiers?
+  const bool repeat_modifiers = script_ == ViramaScript::kMalayalam;
+  do {
+    if (codes_[codes_used_].first != CharClass::kVowelModifier) break;
+    if (UseMultiCode(1)) return true;
+  } while (repeat_modifiers);
+  while (codes_[codes_used_].first == CharClass::kVedicMark) {
+    if (UseMultiCode(1)) return true;
+  }
+  return true;  // Whatever was consumed so far is a valid vowel cluster.
+}
+
+}  // namespace tesseract
diff --git a/training/validate_indic.h b/training/validate_indic.h
new file mode 100644
index 0000000000..62dbcb23d1
--- /dev/null
+++ b/training/validate_indic.h
@@ -0,0 +1,44 @@
+#ifndef TESSERACT_TRAINING_VALIDATE_INDIC_H_
+#define TESSERACT_TRAINING_VALIDATE_INDIC_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Subclass of Validator that validates and segments Indic scripts in the
+// unicode range 0x900-0xdff (Devanagari-Sinhala).
+class ValidateIndic : public Validator {
+ public:
+  ValidateIndic(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateIndic() {}
+
+ protected:
+  // Returns whether codes matches the pattern for an Indic Grapheme.
+  // Consumes the next Grapheme, which starts at codes_[codes_used_], and copies
+  // it to parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  bool ConsumeGraphemeIfValid() override;
+  // Returns the CharClass corresponding to the given Unicode ch.
+  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
+
+ private:
+  // Helper consumes/copies a virama and any associated post-virama joiners.
+  bool ConsumeViramaIfValid(IndicPair joiner, bool post_matra);
+  // Helper consumes/copies a series of consonants separated by viramas while
+  // valid, but not any vowel or other modifiers.
+  bool ConsumeConsonantHeadIfValid();
+  // Helper consumes/copies a tail part of a consonant, comprising optional
+  // matra/piece, vowel modifier, vedic mark, terminating virama.
+  bool ConsumeConsonantTailIfValid();
+  // Helper consumes/copies a vowel and optional modifiers.
+  bool ConsumeVowelIfValid();
+
+  // Some special unicodes used only for Indic processing.
+  static const char32 kYayana = 0xdba;  // Sinhala Ya
+  static const char32 kRayana = 0xdbb;  // Sinhala Ra
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_INDIC_H_
diff --git a/training/validate_khmer.cpp b/training/validate_khmer.cpp
new file mode 100644
index 0000000000..45c8f061de
--- /dev/null
+++ b/training/validate_khmer.cpp
@@ -0,0 +1,106 @@
+#include "validate_khmer.h"
+#include "errcode.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Returns whether codes matches the pattern for a Khmer Grapheme.
+// Taken from unicode standard:
+// http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf.
+// where it gives: B {R | C} {S {R}}* {{Z} V} {O} {S}, using different notation
+// to the ISCII standard http://varamozhi.sourceforge.net/iscii91.pdf.
+// Translated to the codes used by the CharClass enum:
+// C {R | N} {HC {R}}* {{Z|z} M{P}} {D} {HC}
+// Where R is a new symbol (Robat) and N is repurposed as a consonant shifter.
+// Also the Consonant class here includes independent vowels, as they are
+// treated the same anyway.
+// In the split grapheme mode, the only characters that get grouped are the
+// HC and the {Z|z}M. The unicode chapter on Khmer only mentions the joiners in
+// the BNF syntax, so who knows what they do.
+bool ValidateKhmer::ConsumeGraphemeIfValid() {
+  int num_codes = codes_.size();
+  if (codes_used_ == num_codes) return false;
+  if (codes_[codes_used_].first == CharClass::kOther) {
+    UseMultiCode(1);
+    return true;
+  }
+  if (codes_[codes_used_].first != CharClass::kConsonant) {
+    if (report_errors_) {
+      tprintf("Invalid start of Khmer syllable:0x%x\n",
+              codes_[codes_used_].second);
+    }
+    return false;
+  }
+  if (UseMultiCode(1)) return true;
+  if (codes_[codes_used_].first == CharClass::kRobat ||
+      codes_[codes_used_].first == CharClass::kNukta) {
+    if (UseMultiCode(1)) return true;
+  }
+  while (codes_used_ + 1 < num_codes &&
+         codes_[codes_used_].first == CharClass::kVirama &&
+         codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) return true;
+    if (codes_[codes_used_].first == CharClass::kRobat) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  int num_matra_parts = 0;
+  if (codes_[codes_used_].second == kZeroWidthJoiner ||
+      codes_[codes_used_].second == kZeroWidthNonJoiner) {
+    if (CodeOnlyToOutput()) {
+      if (report_errors_) {
+        tprintf("Unterminated joiner: 0x%x\n", output_.back());
+      }
+      return false;
+    }
+    ++num_matra_parts;
+  }
+  // Not quite as shown by the BNF, the matra piece is allowed as a matra on its
+  // own or as an addition to other matras.
+  if (codes_[codes_used_].first == CharClass::kMatra ||
+      codes_[codes_used_].first == CharClass::kMatraPiece) {
+    ++num_matra_parts;
+    if (UseMultiCode(num_matra_parts)) return true;
+  } else if (num_matra_parts) {
+    if (report_errors_) {
+      tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
+              output_.back(), codes_[codes_used_].second);
+    }
+    return false;
+  }
+  if (codes_[codes_used_].first == CharClass::kMatraPiece &&
+      codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
+    if (UseMultiCode(1)) return true;
+  }
+  if (codes_[codes_used_].first == CharClass::kVowelModifier) {
+    if (UseMultiCode(1)) return true;
+  }
+  if (codes_used_ + 1 < num_codes &&
+      codes_[codes_used_].first == CharClass::kVirama &&
+      codes_[codes_used_ + 1].first == CharClass::kConsonant) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(2)) return true;
+  }
+  return true;
+}
+
+Validator::CharClass ValidateKhmer::UnicodeToCharClass(char32 ch) const {
+  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
+  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
+  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
+  // Offset within the script's code page; any other code block is other.
+  const int offset = ch - static_cast<char32>(script_);
+  if (offset < 0 || offset >= kIndicCodePageSize) return CharClass::kOther;
+  switch (offset) {  // The one-off codes first, then the ranges around them.
+    case 0x46: return CharClass::kMatraPiece;
+    case 0x49: case 0x4a: return CharClass::kNukta;
+    case 0x4c: return CharClass::kRobat;
+    case 0x52: return CharClass::kVirama;
+  }
+  if (offset <= 0x33) return CharClass::kConsonant;
+  if (offset <= 0x45) return CharClass::kMatra;
+  return offset <= 0x51 ? CharClass::kVowelModifier : CharClass::kOther;
+}
+
+}  // namespace tesseract
diff --git a/training/validate_khmer.h b/training/validate_khmer.h
new file mode 100644
index 0000000000..a2fe75c962
--- /dev/null
+++ b/training/validate_khmer.h
@@ -0,0 +1,27 @@
+#ifndef TESSERACT_TRAINING_VALIDATE_KHMER_H_
+#define TESSERACT_TRAINING_VALIDATE_KHMER_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Subclass of Validator that validates and segments Khmer.
+class ValidateKhmer : public Validator {
+ public:
+  ValidateKhmer(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateKhmer() {}
+
+ protected:
+  // Returns whether codes matches the pattern for a Khmer Grapheme.
+  // Consumes the next Grapheme, which starts at codes_[codes_used_], and copies
+  // it to parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  bool ConsumeGraphemeIfValid() override;
+  // Returns the CharClass corresponding to the given Unicode ch.
+  CharClass UnicodeToCharClass(char32 ch) const override;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_KHMER_H_
diff --git a/training/validate_myanmar.cpp b/training/validate_myanmar.cpp
new file mode 100644
index 0000000000..4493469023
--- /dev/null
+++ b/training/validate_myanmar.cpp
@@ -0,0 +1,160 @@
+#include "validate_myanmar.h"
+#include "errcode.h"
+#include "icuerrorcode.h"
+#include "tprintf.h"
+#include "unicode/uchar.h"    // From libicu
+#include "unicode/uscript.h"  // From libicu
+
+namespace tesseract {
+
+// Returns whether codes matches the pattern for a Myanmar Grapheme.
+// Taken directly from the unicode table 16-3.
+// See http://www.unicode.org/versions/Unicode9.0.0/ch16.pdf
+bool ValidateMyanmar::ConsumeGraphemeIfValid() {
+  const int num_codes = codes_.size();
+  if (codes_used_ == num_codes) return true;
+  // Other.
+  if (IsMyanmarOther(codes_[codes_used_].second)) {
+    UseMultiCode(1);
+    return true;
+  }
+  // Kinzi: 0x1004, asat and virama, in that order, ahead of the base.
+  const bool has_kinzi = codes_used_ + 2 < num_codes &&
+                         codes_[codes_used_].second == 0x1004 &&
+                         codes_[codes_used_ + 1].second == kMyanmarAsat &&
+                         codes_[codes_used_ + 2].second == kMyanmarVirama;
+  if (has_kinzi) {
+    ASSERT_HOST(!CodeOnlyToOutput());
+    ASSERT_HOST(!CodeOnlyToOutput());
+    if (UseMultiCode(3)) return true;
+  }
+  // Base consonant/vowel. NOTE that since everything in Myanmar appears to be
+  // optional, except the base, this is the only place where invalid input can
+  // be detected and false returned.
+  if (!IsMyanmarLetter(codes_[codes_used_].second)) {
+    if (report_errors_) {
+      tprintf("Invalid start of Myanmar syllable:0x%x\n",
+              codes_[codes_used_].second);
+    }
+    return false;  // One of these is required.
+  }
+  if (UseMultiCode(1)) return true;
+  if (ConsumeSubscriptIfPresent()) return true;
+  ConsumeOptionalSignsIfPresent();
+  return true;  // What we have consumed so far is a valid syllable.
+}
+
+// TODO(rays) Doesn't use intermediate coding like the other scripts, as there
+// is little correspondence between the content of table 16-3 and the char
+// classes of the Indic languages. (Experts may disagree and improve!)
+// In unicode table 16-3 there is basically a long list of optional characters,
+// which can be coded quite easily.
+// Unfortunately, table 16-3 doesn't include even half the Myanmar unicodes!!
+// The table also allows sequences that still result in dotted circles!!
+// So with a lot of guesswork the rest have been added in a reasonable place.
+// Only letters are told apart here: they are kConsonant, all else is kOther.
+Validator::CharClass ValidateMyanmar::UnicodeToCharClass(char32 ch) const {
+  return IsMyanmarLetter(ch) ? CharClass::kConsonant : CharClass::kOther;
+}
+
+// Helper consumes/copies a virama and any subscript consonant.
+// Returns true if the end of input is reached.
+bool ValidateMyanmar::ConsumeSubscriptIfPresent() {
+  // Subscript consonant. It appears there can be only one.
+  const int num_codes = codes_.size();
+  // A subscript needs two more codes: the virama and a letter to follow it.
+  if (codes_used_ + 1 >= num_codes) return false;
+  if (codes_[codes_used_].second != kMyanmarVirama) return false;
+  if (!IsMyanmarLetter(codes_[codes_used_ + 1].second)) return false;
+  // Copy the virama to the output; it is not the last code, as checked above.
+  ASSERT_HOST(!CodeOnlyToOutput());
+  // True here means that the letter was the last code of the input.
+  return UseMultiCode(2);
+}
+
+// Helper consumes/copies a series of optional signs.
+// Returns true if the end of input is reached.
+bool ValidateMyanmar::ConsumeOptionalSignsIfPresent() {
+  // The following characters are allowed, all optional, and in sequence.
+  // An exception is kMyanmarMedialYa, which can include kMyanmarAsat.
+  // Static tables, so that no vector has to be built on every call.
+  static const char32 kMedials[] = {kMyanmarAsat, kMyanmarMedialYa, 0x103c,
+      0x103d, 0x103e, 0x105e, 0x105f, 0x1060, 0x1081, 0x1031};
+  for (char32 medial : kMedials) {
+    if (codes_[codes_used_].second == medial) {
+      if (UseMultiCode(1)) return true;
+      if (medial == kMyanmarMedialYa &&
+          codes_[codes_used_].second == kMyanmarAsat) {
+        if (UseMultiCode(1)) return true;
+      }
+    }
+  }
+  // Vowel sign i, ii, ai.
+  char32 ch = codes_[codes_used_].second;
+  if (ch == 0x102d || ch == 0x102e || ch == 0x1032) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Vowel sign u, uu, and extensions.
+  ch = codes_[codes_used_].second;
+  if (ch == 0x102f || ch == 0x1030 || (0x1056 <= ch && ch <= 0x1059) ||
+      ch == 0x1062 || ch == 0x1067 || ch == 0x1068 ||
+      (0x1071 <= ch && ch <= 0x1074) || (0x1083 <= ch && ch <= 0x1086) ||
+      ch == 0x109c || ch == 0x109d) {
+    if (UseMultiCode(1)) return true;
+  }
+  // Tall aa, aa with optional asat.
+  if (codes_[codes_used_].second == 0x102b ||
+      codes_[codes_used_].second == 0x102c) {
+    if (UseMultiCode(1)) return true;
+    if (codes_[codes_used_].second == kMyanmarAsat) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  // The following characters are allowed, all optional, and in sequence.
+  static const char32 kSigns[] = {0x1036, 0x1037};
+  for (char32 sign : kSigns) {
+    if (codes_[codes_used_].second == sign) {
+      if (UseMultiCode(1)) return true;
+    }
+  }
+  // Tone mark extensions.
+  ch = codes_[codes_used_].second;
+  if (ch == 0x1038 || ch == kMyanmarAsat || ch == 0x1063 || ch == 0x1064 ||
+      (0x1069 <= ch && ch <= 0x106d) || (0x1087 <= ch && ch <= 0x108d) ||
+      ch == 0x108f || ch == 0x109a || ch == 0x109b ||
+      (0xaa7b <= ch && ch <= 0xaa7d)) {
+    if (UseMultiCode(1)) return true;
+  }
+  return false;
+}
+
+// Returns true if ch is a Myanmar "letter": a consonant or independent vowel.
+// Table 16-3 distinguishes some base consonants from vowels, but the extensions
+// do not, so all letters go into a single bucket of inclusive code ranges.
+/* static */
+bool ValidateMyanmar::IsMyanmarLetter(char32 ch) {
+  static const char32 kLetterRanges[][2] = {
+      {0x1000, 0x102a}, {0x103f, 0x103f}, {0x1050, 0x1055}, {0x105a, 0x105d},
+      {0x1061, 0x1061}, {0x1065, 0x1066}, {0x106e, 0x1070}, {0x1075, 0x1080},
+      {0x108e, 0x108e}, {0xa9e0, 0xa9ef}, {0xa9fa, 0xa9ff}, {0xaa60, 0xaa73},
+      {0xaa7a, 0xaa7a}, {0xaa7e, 0xaa7f}};
+  for (const auto& range : kLetterRanges)
+    if (range[0] <= ch && ch <= range[1]) return true;
+  return false;
+}
+
+// Returns true if ch is not Myanmar (joiners excepted) or is a Myanmar digit,
+// punctuation mark or symbol. Vowel signs 0x109c/0x109d are NOT other.
+/* static */
+bool ValidateMyanmar::IsMyanmarOther(char32 ch) {
+  IcuErrorCode err;
+  UScriptCode script_code = uscript_getScript(ch, err);
+  if (script_code != USCRIPT_MYANMAR && ch != Validator::kZeroWidthJoiner &&
+      ch != Validator::kZeroWidthNonJoiner)
+    return true;
+  return (0x1040 <= ch && ch <= 0x104f) || (0x1090 <= ch && ch <= 0x1099) ||
+         (0x109e <= ch && ch <= 0x109f) || (0xa9f0 <= ch && ch <= 0xa9f9) ||
+         (0xaa74 <= ch && ch <= 0xaa79);
+}
+
+}  // namespace tesseract
diff --git a/training/validate_myanmar.h b/training/validate_myanmar.h
new file mode 100644
index 0000000000..d2ada74505
--- /dev/null
+++ b/training/validate_myanmar.h
@@ -0,0 +1,47 @@
+#ifndef TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
+#define TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
+
+#include "validator.h"
+
+namespace tesseract {
+
+// Subclass of Validator that validates and segments Myanmar.
+class ValidateMyanmar : public Validator {
+ public:
+  ValidateMyanmar(ViramaScript script, bool report_errors)
+      : Validator(script, report_errors) {}
+  ~ValidateMyanmar() {}
+
+ protected:
+  // Returns whether codes matches the pattern for a Myanmar Grapheme.
+  // Consumes the next Grapheme, which starts at codes_[codes_used_], and copies
+  // it to parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  bool ConsumeGraphemeIfValid() override;
+  // Returns the CharClass corresponding to the given Unicode ch.
+  Validator::CharClass UnicodeToCharClass(char32 ch) const override;
+
+ private:
+  // Helper consumes/copies a virama and any subscript consonant.
+  // Returns true if the end of input is reached.
+  bool ConsumeSubscriptIfPresent();
+  // Helper consumes/copies a series of optional signs.
+  // Returns true if the end of input is reached.
+  bool ConsumeOptionalSignsIfPresent();
+  // Returns true if the unicode is a Myanmar "letter" including consonants
+  // and independent vowels. Although table 16-3 distinguishes between some
+  // base consonants and vowels, the extensions make no such distinction, so we
+  // put them all into a single bucket.
+  static bool IsMyanmarLetter(char32 ch);
+  // Returns true if ch is a Myanmar digit or other symbol that does not take
+  // part in being a syllable.
+  static bool IsMyanmarOther(char32 ch);
+
+  // Some special unicodes used only for Myanmar processing.
+  static const char32 kMyanmarAsat = 0x103a;
+  static const char32 kMyanmarMedialYa = 0x103b;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATE_MYANMAR_H_
diff --git a/training/validator.cpp b/training/validator.cpp
new file mode 100644
index 0000000000..54ec4500e6
--- /dev/null
+++ b/training/validator.cpp
@@ -0,0 +1,205 @@
+#include "validator.h"
+
+#include <algorithm>
+#include <unordered_map>
+#include <vector>
+
+#include "icuerrorcode.h"
+#include "unicode/uchar.h"    // From libicu
+#include "unicode/uscript.h"  // From libicu
+#include "validate_grapheme.h"
+#include "validate_indic.h"
+#include "validate_khmer.h"
+#include "validate_myanmar.h"
+
+namespace tesseract {
+
+// Some specific but universally useful unicodes.
+const char32 Validator::kZeroWidthSpace = 0x200B;
+const char32 Validator::kZeroWidthNonJoiner = 0x200C;
+const char32 Validator::kZeroWidthJoiner = 0x200D;
+const char32 Validator::kLeftToRightMark = 0x200E;
+const char32 Validator::kRightToLeftMark = 0x200F;
+const char32 Validator::kInvalid = 0xfffd;  // Unicode replacement character.
+
+// Validates and cleans the src vector of unicodes to the *dest, according to
+// g_mode. In the case of kSingleString, a single vector containing the whole
+// result is added to *dest. With kCombined, multiple vectors are added to
+// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
+// added to *dest with a smaller unit representing a glyph in each.
+// In case of validation error, returns false and as much as possible of the
+// input, without discarding invalid text.
+/* static */
+bool Validator::ValidateCleanAndSegment(
+    GraphemeNormMode g_mode, bool report_errors, const std::vector<char32>& src,
+    std::vector<std::vector<char32>>* dest) {
+  ValidateGrapheme g_validator(ViramaScript::kNonVirama, report_errors);
+  const ViramaScript script = MostFrequentViramaScript(src);
+  if (script == ViramaScript::kNonVirama) {
+    // The grapheme segmenter's maximum segmentation is the grapheme unit, so
+    // up the mode by 1 to get the desired effect.
+    if (g_mode == GraphemeNormMode::kGlyphSplit) {
+      g_mode = GraphemeNormMode::kIndividualUnicodes;
+    } else if (g_mode == GraphemeNormMode::kCombined) {
+      g_mode = GraphemeNormMode::kGlyphSplit;
+    }
+    // Just do grapheme segmentation.
+    return g_validator.ValidateCleanAndSegmentInternal(g_mode, src, dest);
+  }
+  // First split src into graphemes, then run the script's validator on each.
+  // A failure in either pass makes the overall result false.
+  std::vector<std::vector<char32>> graphemes;
+  bool all_valid = g_validator.ValidateCleanAndSegmentInternal(
+      GraphemeNormMode::kGlyphSplit, src, &graphemes);
+  std::unique_ptr<Validator> validator(ScriptValidator(script, report_errors));
+  for (const auto& grapheme : graphemes) {
+    if (!validator->ValidateCleanAndSegmentInternal(g_mode, grapheme, dest)) {
+      all_valid = false;
+    }
+  }
+  return all_valid;
+}
+
+// Factory method that understands how to map script to the right subclass.
+// Ownership of the new validator passes to the caller.
+std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
+                                                      bool report_errors) {
+  using ValidatorPtr = std::unique_ptr<Validator>;
+  // Each case hands back a freshly allocated validator of the matching type.
+  switch (script) {
+    case ViramaScript::kNonVirama:
+      return ValidatorPtr(new ValidateGrapheme(script, report_errors));
+    case ViramaScript::kMyanmar:
+      return ValidatorPtr(new ValidateMyanmar(script, report_errors));
+    case ViramaScript::kKhmer:
+      return ValidatorPtr(new ValidateKhmer(script, report_errors));
+    default:
+      // Every remaining script is handled by the Indic validator.
+      return ValidatorPtr(new ValidateIndic(script, report_errors));
+  }
+}
+
+// Internal version of the public static ValidateCleanAndSegment.
+// Validates and cleans src into *dest, according to its type and g_mode.
+// In case of validation error, returns false and returns as much as possible
+// of the input, without discarding invalid text.
+bool Validator::ValidateCleanAndSegmentInternal(
+    GraphemeNormMode g_mode, const std::vector<char32>& src,
+    std::vector<std::vector<char32>>* dest) {
+  Clear();
+  ComputeClassCodes(src);
+  bool all_valid = true;
+  codes_used_ = 0;
+  while (codes_used_ < codes_.size()) {
+    if (ConsumeGraphemeIfValid()) continue;
+    // On failure, step past the current code and carry on with the rest.
+    all_valid = false;
+    ++codes_used_;
+  }
+  MoveResultsToDest(g_mode, dest);
+  return all_valid;
+}
+
+// Appends the results from parts_ or output_ to dest according to g_mode.
+void Validator::MoveResultsToDest(GraphemeNormMode g_mode,
+                                  std::vector<std::vector<char32>>* dest) {
+  if (g_mode == GraphemeNormMode::kIndividualUnicodes) {
+    // Append each element of the combined output_ that we made as a new vector
+    // in dest.
+    dest->reserve(dest->size() + output_.size());
+    for (char32 ch : output_) dest->push_back({ch});
+  } else if (g_mode == GraphemeNormMode::kGlyphSplit) {
+    // Append all the parts_ that we made onto dest.
+    std::move(parts_.begin(), parts_.end(), std::back_inserter(*dest));
+  } else if (g_mode == GraphemeNormMode::kCombined || dest->empty()) {
+    // Append the combined output_ that we made onto dest as one new vector.
+    dest->push_back(std::vector<char32>());
+    output_.swap(dest->back());
+  } else {  // kNone.
+    // Append the combined output_ that we made onto the last existing element
+    // of dest, which is known to exist as dest is not empty here.
+    dest->back().insert(dest->back().end(), output_.begin(), output_.end());
+  }
+}
+
+bool CmpPairSecond(const std::pair<int, int>& p1,
+                   const std::pair<int, int>& p2) {
+  return p2.second > p1.second;  // Strict ordering on the second members only.
+}
+
+// Returns the virama-using script that dominates utf32. Non-common unicodes in
+// the Indic..Khmer range (plus all Myanmar) are counted per codepage; if the
+// winning page is not a virama script, or none was counted, gives kNonVirama.
+/* static */
+ViramaScript Validator::MostFrequentViramaScript(
+    const std::vector<char32>& utf32) {
+  const int kMyanmarPage =
+      static_cast<char32>(ViramaScript::kMyanmar) / kIndicCodePageSize;
+  std::unordered_map<int, int> page_counts;
+  for (char32 ch : utf32) {
+    IcuErrorCode err;
+    const UScriptCode script_code = uscript_getScript(ch, err);
+    if (script_code == USCRIPT_MYANMAR) {
+      // Myanmar is scattered over the code space, so its script id is used to
+      // file all of it under the one Myanmar page.
+      ++page_counts[kMyanmarPage];
+    } else if (kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
+               script_code != USCRIPT_COMMON) {
+      // For Indic scripts and Khmer the page is unicode / page size.
+      ++page_counts[ch / kIndicCodePageSize];
+    }
+  }
+  if (page_counts.empty()) return ViramaScript::kNonVirama;
+  const auto top =
+      std::max_element(page_counts.begin(), page_counts.end(), CmpPairSecond);
+  const char32 codebase = static_cast<char32>(top->first * kIndicCodePageSize);
+  // Only accept page starts that match a ViramaScript value range.
+  const bool is_indic =
+      static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
+      codebase <= static_cast<char32>(ViramaScript::kSinhala);
+  if (is_indic || codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
+      codebase == static_cast<char32>(ViramaScript::kKhmer)) {
+    return static_cast<ViramaScript>(codebase);
+  }
+  return ViramaScript::kNonVirama;
+}
+
+// True for offset 0x4d of an Indic page, or a Sinhala/Myanmar/Khmer virama.
+/* static */
+bool Validator::IsVirama(char32 unicode) {
+  if (unicode == kSinhalaVirama || unicode == kMyanmarVirama) return true;
+  if (unicode == kKhmerVirama) return true;
+  if (unicode < kMinIndicUnicode || unicode > kMaxSinhalaUnicode) return false;
+  return (unicode & 0x7f) == 0x4d;
+}
+
+// True if unicode lies in the vedic accent range [0x1cd0, 0x1d00).
+/* static */
+bool Validator::IsVedicAccent(char32 unicode) {
+  return !(unicode < 0x1cd0 || unicode >= 0x1d00);
+}
+
+// Reports whether script_ is one of those that use subscripts for conjuncts.
+bool Validator::IsSubscriptScript() const {
+  const ViramaScript s = script_;
+  return s == ViramaScript::kKhmer || s == ViramaScript::kMyanmar ||
+         s == ViramaScript::kKannada || s == ViramaScript::kTelugu;
+}
+
+// Appends a (CharClass, unicode) pair to codes_ for every unicode of text.
+void Validator::ComputeClassCodes(const std::vector<char32>& text) {
+  codes_.reserve(text.size());
+  for (const char32 unicode : text)
+    codes_.emplace_back(UnicodeToCharClass(unicode), unicode);
+}
+
+// Forgets all per-call state: the classified input, both result buffers and
+// the two progress counters. script_ and report_errors_ are left alone.
+void Validator::Clear() {
+  codes_used_ = output_used_ = 0;
+  output_.clear();
+  parts_.clear();
+  codes_.clear();
+}
+
+}  // namespace tesseract
diff --git a/training/validator.h b/training/validator.h
new file mode 100644
index 0000000000..6b21daa911
--- /dev/null
+++ b/training/validator.h
@@ -0,0 +1,243 @@
+/**********************************************************************
+ * File:        validator.h
+ * Description: Base class for various text validators. Intended mainly for
+ *              scripts that use a virama character.
+ * Author:      Ray Smith
+ * Created:     Tue May 23 2017
+ *
+ * (C) Copyright 2017, Google Inc.
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ **********************************************************************/
+
+#ifndef TESSERACT_TRAINING_VALIDATOR_H_
+#define TESSERACT_TRAINING_VALIDATOR_H_
+
+#include <memory>
+#include <vector>
+#include "unichar.h"
+
+namespace tesseract {
+
+// Different kinds of grapheme normalization - not just for Indic!
+// A grapheme is a syllable unit in Indic and can be several unicodes.
+// In other scripts, a grapheme is a base character and accent/diacritic
+// combination, as not all accented characters have a single composed form.
+enum class GraphemeNormMode {
+  // Validation result is a single string, even if the input is multi-word.
+  kSingleString,
+  // Standard unicode graphemes are validated and output as grapheme units.
+  kCombined,
+  // Graphemes are validated and sub-divided. For virama-using scripts, units
+  // that correspond to repeatable glyphs are generated. (Mostly single unicodes
+  // but viramas and joiners are paired with the most sensible neighbor.)
+  // For non-virama scripts, this means that base/accent pairs are separated,
+  // i.e. the output is individual unicodes.
+  kGlyphSplit,
+  // The output is always single unicodes, regardless of the script.
+  kIndividualUnicodes,
+};
+
+// An enum representing the scripts that use a virama character. The value of
+// every element except kNonVirama can be cast to a unicode (char32) value that
+// is the start of the unicode range of the corresponding script. kDevanagari
+// to kSinhala are consecutive 0x80-sized pages; kMyanmar and kKhmer are not.
+enum class ViramaScript : char32 {
+  kNonVirama = 0,
+  kDevanagari = 0x900,
+  kBengali = 0x980,
+  kGurmukhi = 0xa00,
+  kGujarati = 0xa80,
+  kOriya = 0xb00,
+  kTamil = 0xb80,
+  kTelugu = 0xc00,
+  kKannada = 0xc80,
+  kMalayalam = 0xd00,
+  kSinhala = 0xd80,
+  kMyanmar = 0x1000,
+  kKhmer = 0x1780,
+};
+
+// Base class offers a validation API and protected methods to allow subclasses
+// to easily build the validated/segmented output.
+class Validator {
+ public:
+  // Validates and cleans the src vector of unicodes to the *dest, according to
+  // g_mode. With kSingleString, a single vector containing the whole result is
+  // added to *dest. With kCombined, multiple vectors are added to *dest with
+  // one grapheme in each. With kGlyphSplit, each added vector is a smaller unit
+  // representing a glyph. With kIndividualUnicodes, each holds one unicode.
+  // In case of validation error, returns false and as much as possible of the
+  // input, without discarding invalid text.
+  static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
+                                      bool report_errors,
+                                      const std::vector<char32>& src,
+                                      std::vector<std::vector<char32>>* dest);
+
+  // Returns true if the unicode ch is a non-printing zero-width mark of no
+  // significance to OCR training or evaluation.
+  static bool IsZeroWidthMark(char32 ch) {
+    return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
+           ch == kRightToLeftMark || ch == kInvalid;
+  }
+  virtual ~Validator() {}
+
+  // Some specific but universally useful unicodes.
+  static const char32 kZeroWidthSpace;
+  static const char32 kZeroWidthNonJoiner;
+  static const char32 kZeroWidthJoiner;
+  static const char32 kLeftToRightMark;
+  static const char32 kRightToLeftMark;
+  static const char32 kInvalid;
+
+ protected:
+  // These are more or less the character class identifiers in the ISCII
+  // standard, section 8.  They have been augmented with the Unicode meta
+  // characters Zero Width Joiner and Zero Width Non Joiner, and the
+  // Unicode Vedic Marks.
+  // The best sources of information on Unicode and Indic scripts are:
+  //   http://varamozhi.sourceforge.net/iscii91.pdf
+  //   http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
+  //   http://unicode.org/faq/indic.html
+  //   http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
+  enum class CharClass {
+    // NOTE: The values of the enum members are meaningless and arbitrary, ie
+    // they are not used for sorting, or any other risky application.
+    // The reason they are what they are is they are a single character
+    // abbreviation that can be used in a regexp/BNF definition of a grammar,
+    // IN A COMMENT, and still not relied upon in the code.
+    kConsonant = 'C',
+    kVowel = 'V',
+    kVirama = 'H',              // (aka Halant)
+    kMatra = 'M',               // (aka Dependent Vowel)
+    kMatraPiece = 'P',          // unicode provides pieces of Matras.
+    kVowelModifier = 'D',       // (candrabindu, anusvara, visarga, other marks)
+    kZeroWidthNonJoiner = 'z',  // Unicode Zero Width Non-Joiner U+200C
+    kZeroWidthJoiner = 'Z',     // Unicode Zero Width Joiner U+200D
+    kVedicMark = 'v',           // Modifiers that can modify any indic syllable.
+    kNukta = 'N',               // Occurs only immediately after consonants.
+    kRobat = 'R',               // Khmer only.
+    kOther = 'O',               // (digits, measures, non-Indic, etc)
+    // Additional classes used only by ValidateGrapheme.
+    kWhitespace = ' ',
+    kCombiner = 'c',  // Combiners other than virama.
+  };
+  typedef std::pair<CharClass, char32> IndicPair;  // (class, unicode).
+
+  Validator(ViramaScript script, bool report_errors)
+      : script_(script),
+        codes_used_(0),
+        output_used_(0),
+        report_errors_(report_errors) {}
+
+  // Factory method that understands how to map script to the right subclass.
+  static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
+                                                    bool report_errors);
+
+  // Internal version of the public static ValidateCleanAndSegment.
+  // Validates and cleans the src vector of unicodes to the *dest, according to
+  // its type and the given g_mode.
+  // In case of validation error, returns false and returns as much as possible
+  // of the input, without discarding invalid text.
+  bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
+                                       const std::vector<char32>& src,
+                                       std::vector<std::vector<char32>>* dest);
+  // Moves the results from parts_ or output_ to dest according to g_mode.
+  void MoveResultsToDest(GraphemeNormMode g_mode,
+                         std::vector<std::vector<char32>>* dest);
+
+  // Computes and returns the ViramaScript corresponding to the most frequent
+  // virama-using script in the input, or kNonVirama if none are present.
+  static ViramaScript MostFrequentViramaScript(
+      const std::vector<char32>& utf32);
+  // Returns true if the given UTF-32 unicode is a "virama" character.
+  static bool IsVirama(char32 unicode);
+  // Returns true if the given UTF-32 unicode is a vedic accent.
+  static bool IsVedicAccent(char32 unicode);
+  // Returns true if the script is one that uses subscripts for conjuncts.
+  bool IsSubscriptScript() const;
+
+  // Helper function appends the next element of codes_ only to output_,
+  // without touching parts_. Requires codes_used_ < codes_.size().
+  // Returns true at the end of codes_.
+  bool CodeOnlyToOutput() {
+    output_.push_back(codes_[codes_used_].second);
+    return ++codes_used_ == codes_.size();
+  }
+
+  // Helper function adds a length-element vector to parts_ from the last length
+  // elements of output_. If there are more than length unused elements in
+  // output_, first adds unicodes as single-element vectors to parts_ to catch
+  // output_used_ up to output_.size() - length. On return output_used_ equals
+  // output_.size(). Requires at least one unused element in output_.
+  void MultiCodePart(int length) {
+    while (output_used_ + length < output_.size()) {
+      parts_.emplace_back(
+          std::initializer_list<char32>{output_[output_used_++]});
+    }
+    parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
+    while (++output_used_ < output_.size()) {
+      parts_.back().push_back(output_[output_used_]);
+    }
+  }
+
+  // Helper function appends the next element of codes_ to output_, and then
+  // calls MultiCodePart to add the appropriate components to parts_.
+  // Returns true at the end of codes_. Requires codes_used_ < codes_.size().
+  bool UseMultiCode(int length) {
+    output_.push_back(codes_[codes_used_].second);
+    MultiCodePart(length);
+    return ++codes_used_ == codes_.size();
+  }
+
+  // Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
+  // parts_ and output_. Returns true if a valid Grapheme was consumed,
+  // otherwise does not increment codes_used_.
+  virtual bool ConsumeGraphemeIfValid() = 0;
+  // Sets codes_ to the class codes for the given unicode text.
+  void ComputeClassCodes(const std::vector<char32>& text);
+  // Returns the CharClass corresponding to the given Unicode ch.
+  virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
+  // Resets to the initial state.
+  void Clear();
+
+  // Number of unicodes in each Indic codepage.
+  static const int kIndicCodePageSize = 128;
+  // Lowest unicode value of any Indic script. (Devanagari).
+  static const char32 kMinIndicUnicode = 0x900;
+  // Highest unicode value of any consistent (ISCII-based) Indic script.
+  static const char32 kMaxSinhalaUnicode = 0xdff;
+  // Highest unicode value of any virama-using script. (Khmer).
+  static const char32 kMaxViramaScriptUnicode = 0x17ff;
+  // Some special unicodes.
+  static const char32 kSinhalaVirama = 0xdca;
+  static const char32 kMyanmarVirama = 0x1039;
+  static const char32 kKhmerVirama = 0x17d2;
+
+  // Script we are operating on.
+  ViramaScript script_;
+  // Input unicodes with assigned CharClass is the data to be validated.
+  std::vector<IndicPair> codes_;
+  // Glyph-like components of the input.
+  std::vector<std::vector<char32>> parts_;
+  // Copied validated unicodes from codes_ that are OK to output.
+  std::vector<char32> output_;
+  // The number of elements of codes_ that have been processed so far.
+  int codes_used_;
+  // The number of elements of output_ that have already been added to parts_.
+  int output_used_;
+  // Log error messages for reasons why text is invalid.
+  bool report_errors_;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_TRAINING_VALIDATOR_H_