Added script-specific validation and normalization for virama-using scripts and updated normalization for others
tesseract-ocr · Jul 14, 2017 · df41eab · df41eab · hoangtocdo90 · Jul 21, 2017
1 parent da03e4e
commit df41eab
Show file tree

Hide file tree

Showing 15 changed files with 1,518 additions and 90 deletions.
diff --git a/ccutil/unichar.cpp b/ccutil/unichar.cpp
@@ -226,3 +226,19 @@ std::vector<char32> UNICHAR::UTF8ToUTF32(const char* utf8_str) {
   return unicodes;
 }
 
+// Returns an empty string if the input contains an invalid Unicode code point.
+string UNICHAR::UTF32ToUTF8(const std::vector<char32>& str32) {
+  string utf8_str;
+  for (char32 ch : str32) {
+    UNICHAR uni_ch(ch);
+    int step;
+    if (uni_ch.utf8_len() > 0 && (step = utf8_step(uni_ch.utf8())) > 0) {
+      utf8_str.append(uni_ch.utf8(), step);
+    } else {
+      return "";
+    }
+  }
+  return utf8_str;
+}
+
+}  // namespace tesseract
diff --git a/training/Makefile.am b/training/Makefile.am
@@ -21,7 +21,9 @@ noinst_HEADERS = \
     boxchar.h commandlineflags.h commontraining.h degradeimage.h \
       fileio.h icuerrorcode.h ligature_table.h lstmtester.h normstrngs.h \
       mergenf.h pango_font_info.h stringrenderer.h \
-      tessopt.h tlog.h unicharset_training_utils.h util.h
+      tessopt.h tlog.h unicharset_training_utils.h util.h \
+      validate_grapheme.h validate_indic.h validate_khmer.h \
+      validate_myanmar.h validator.h
 
 noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la
 
@@ -32,7 +34,9 @@ libtesseract_training_la_LIBADD = \
 libtesseract_training_la_SOURCES = \
     boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \
       fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \
-      stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp
+      stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp \
+      validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp \
+      validate_myanmar.cpp validator.cpp
 
 libtesseract_tessopt_la_SOURCES = \
     tessopt.cpp

diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp
@@ -21,6 +21,10 @@
 #include "normstrngs.h"
 
 #include <assert.h>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
 #include "icuerrorcode.h"
 #include "unichar.h"
 #include "unicode/normalizer2.h"  // From libicu
@@ -34,99 +38,166 @@ namespace tesseract {
 bool is_hyphen_punc(const char32 ch) {
   static const int kNumHyphenPuncUnicodes = 13;
   static const char32 kHyphenPuncUnicodes[kNumHyphenPuncUnicodes] = {
-    '-',
-    0x2010, 0x2011, 0x2012, 0x2013, 0x2014, 0x2015,  // hyphen..horizontal bar
-    0x207b,  // superscript minus
-    0x208b,  // subscript minus
-    0x2212,  // minus sign
-    0xfe58,  // small em dash
-    0xfe63,  // small hyphen-minus
-    0xff0d,  // fullwidth hyphen-minus
+      '-',    0x2010, 0x2011, 0x2012,
+      0x2013, 0x2014, 0x2015,  // hyphen..horizontal bar
+      0x207b,                  // superscript minus
+      0x208b,                  // subscript minus
+      0x2212,                  // minus sign
+      0xfe58,                  // small em dash
+      0xfe63,                  // small hyphen-minus
+      0xff0d,                  // fullwidth hyphen-minus
   };
   for (int i = 0; i < kNumHyphenPuncUnicodes; ++i) {
-    if (kHyphenPuncUnicodes[i] == ch)
-      return true;
+    if (kHyphenPuncUnicodes[i] == ch) return true;
   }
   return false;
 }
 
 bool is_single_quote(const char32 ch) {
   static const int kNumSingleQuoteUnicodes = 8;
   static const char32 kSingleQuoteUnicodes[kNumSingleQuoteUnicodes] = {
-    '\'',
-    '`',
-    0x2018,  // left single quotation mark (English, others)
-    0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
-             // We may have to introduce a comma set with 0x201a
-    0x201B,  // single high-reveresed-9 quotation mark (PropList.txt)
-    0x2032,  // prime
-    0x300C,  // left corner bracket (East Asian languages)
-    0xFF07,  // fullwidth apostrophe
+      '\'', '`',
+      0x2018,  // left single quotation mark (English, others)
+      0x2019,  // right single quotation mark (Danish, Finnish, Swedish, Norw.)
+               // We may have to introduce a comma set with 0x201a
+      0x201B,  // single high-reversed-9 quotation mark (PropList.txt)
+      0x2032,  // prime
+      0x300C,  // left corner bracket (East Asian languages)
+      0xFF07,  // fullwidth apostrophe
   };
   for (int i = 0; i < kNumSingleQuoteUnicodes; ++i) {
-    if (kSingleQuoteUnicodes[i] == ch)
-      return true;
+    if (kSingleQuoteUnicodes[i] == ch) return true;
   }
   return false;
 }
 
 bool is_double_quote(const char32 ch) {
   static const int kNumDoubleQuoteUnicodes = 8;
   static const char32 kDoubleQuoteUnicodes[kNumDoubleQuoteUnicodes] = {
-    '"',
-    0x201C,  // left double quotation mark (English, others)
-    0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
-    0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
-    0x2033,  // double prime
-    0x301D,  // reversed double prime quotation mark (East Asian langs, horiz.)
-    0x301E,  // close double prime (East Asian languages written horizontally)
-    0xFF02,  // fullwidth quotation mark
+      '"',
+      0x201C,  // left double quotation mark (English, others)
+      0x201D,  // right double quotation mark (Danish, Finnish, Swedish, Norw.)
+      0x201F,  // double high-reversed-9 quotation mark (PropList.txt)
+      0x2033,  // double prime
+      0x301D,  // reversed double prime quotation mark (East Asian langs,
+               // horiz.)
+      0x301E,  // close double prime (East Asian languages written horizontally)
+      0xFF02,  // fullwidth quotation mark
   };
   for (int i = 0; i < kNumDoubleQuoteUnicodes; ++i) {
-    if (kDoubleQuoteUnicodes[i] == ch)
-      return true;
+    if (kDoubleQuoteUnicodes[i] == ch) return true;
   }
   return false;
 }
 
-STRING NormalizeUTF8String(bool decompose, const char* str8) {
-  GenericVector<char32> str32, out_str32, norm_str;
-  UTF8ToUTF32(str8, &str32);
-  for (int i = 0; i < str32.length(); ++i) {
-    norm_str.clear();
-    NormalizeChar32(str32[i], decompose, &norm_str);
-    for (int j = 0; j < norm_str.length(); ++j) {
-      out_str32.push_back(norm_str[j]);
-    }
-  }
-  STRING out_str8;
-  UTF32ToUTF8(out_str32, &out_str8);
-  return out_str8;
-}
-
-void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str) {
+// Helper runs a standard unicode normalization, optional OCR normalization,
+// and leaves the result as char32 for subsequent processing.
+static void NormalizeUTF8ToUTF32(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                                 const char* str8,
+                                 std::vector<char32>* normed32) {
+  // Convert to ICU string for unicode normalization.
+  icu::UnicodeString uch_str(str8, "UTF-8");
   IcuErrorCode error_code;
-  const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance(
-      nullptr, "nfkc", decompose ? UNORM2_DECOMPOSE : UNORM2_COMPOSE,
-      error_code);
+  // Map the enum onto ICU's (normalizer name, compose/decompose mode) pair.
+  const char* norm_type =
+      u_mode == UnicodeNormMode::kNFKD || u_mode == UnicodeNormMode::kNFKC
+          ? "nfkc"
+          : "nfc";
+  UNormalization2Mode compose =
+      u_mode == UnicodeNormMode::kNFC || u_mode == UnicodeNormMode::kNFKC
+          ? UNORM2_COMPOSE
+          : UNORM2_DECOMPOSE;
+  // Pointer to singleton does not require deletion.
+  const icu::Normalizer2* normalizer =
+      icu::Normalizer2::getInstance(nullptr, norm_type, compose, error_code);
   error_code.assertSuccess();
   error_code.reset();
-
-  icu::UnicodeString uch_str(static_cast<UChar32>(ch));
-  icu::UnicodeString norm_str = nfkc->normalize(uch_str, error_code);
+  icu::UnicodeString norm_str = normalizer->normalize(uch_str, error_code);
   error_code.assertSuccess();
+  // Convert to char32 for output. OCR normalization if required.
+  normed32->reserve(norm_str.length());  // An approximation.
+  for (int offset = 0; offset < norm_str.length();
+       offset = norm_str.moveIndex32(offset, 1)) {
+    char32 ch = norm_str.char32At(offset);
+    // Skip all ZWS, RTL and LTR marks.
+    if (Validator::IsZeroWidthMark(ch)) continue;
+    if (ocr_normalize == OCRNorm::kNormalize) ch = OCRNormalize(ch);
+    normed32->push_back(ch);
+  }
+}
+
+// Helper removes joiners from strings that contain no letters.
+static void StripJoiners(std::vector<char32>* str32) {
+  for (char32 ch : *str32) {
+    if (u_isalpha(ch)) return;
+  }
+  int len = 0;
+  for (char32 ch : *str32) {
+    if (ch != Validator::kZeroWidthJoiner &&
+        ch != Validator::kZeroWidthNonJoiner) {
+      (*str32)[len++] = ch;
+    }
+  }
+  str32->resize(len);
+}
+
+// Normalizes a UTF8 string according to the given modes. Returns true on
+// success. If false is returned, some failure or invalidity was present, and
+// the result string is produced on a "best effort" basis.
+bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                         GraphemeNorm grapheme_normalize, const char* str8,
+                         string* normalized) {
+  std::vector<char32> normed32;
+  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
+  if (grapheme_normalize == GraphemeNorm::kNormalize) {
+    StripJoiners(&normed32);
+    std::vector<std::vector<char32>> graphemes;
+    bool success = Validator::ValidateCleanAndSegment(
+        GraphemeNormMode::kSingleString, false, normed32, &graphemes);
+    if (graphemes.empty() || graphemes[0].empty()) {
+      success = false;
+    } else if (normalized != nullptr) {
+      *normalized = UNICHAR::UTF32ToUTF8(graphemes[0]);
+    }
+    return success;
+  }
+  if (normalized != nullptr) *normalized = UNICHAR::UTF32ToUTF8(normed32);
+  return true;
+}
 
-  str->clear();
-  for (int i = 0; i < norm_str.length(); ++i) {
-    // If any spaces were added by NFKC, pretend normalization is a nop.
-    if (norm_str[i] == ' ') {
-      str->clear();
-      str->push_back(ch);
-      break;
-    } else {
-      str->push_back(OCRNormalize(static_cast<char32>(norm_str[i])));
+// Normalizes a UTF8 string according to the given modes and splits into
+// graphemes according to g_mode. Returns true on success. If false is returned,
+// some failure or invalidity was present, and the result string is produced on
+// a "best effort" basis.
+bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                                  GraphemeNormMode g_mode, bool report_errors,
+                                  const char* str8,
+                                  std::vector<string>* graphemes) {
+  std::vector<char32> normed32;
+  NormalizeUTF8ToUTF32(u_mode, ocr_normalize, str8, &normed32);
+  StripJoiners(&normed32);
+  std::vector<std::vector<char32>> graphemes32;
+  bool success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
+                                                    normed32, &graphemes32);
+  if (g_mode != GraphemeNormMode::kSingleString && success) {
+    // If we modified the string to clean it up, the segmentation may not be
+    // correct, so check for changes and do it again.
+    std::vector<char32> cleaned32;
+    for (const auto& g : graphemes32) {
+      cleaned32.insert(cleaned32.end(), g.begin(), g.end());
     }
+    if (cleaned32 != normed32) {
+      graphemes32.clear();
+      success = Validator::ValidateCleanAndSegment(g_mode, report_errors,
+                                                   cleaned32, &graphemes32);
+    }
+  }
+  graphemes->clear();
+  graphemes->reserve(graphemes32.size());
+  for (const auto& grapheme : graphemes32) {
+    graphemes->push_back(UNICHAR::UTF32ToUTF8(grapheme));
   }
+  return success;
 }
 
 // Apply just the OCR-specific normalizations and return the normalized char.

diff --git a/training/normstrngs.h b/training/normstrngs.h
@@ -21,34 +21,50 @@
 #ifndef TESSERACT_CCUTIL_NORMSTRNGS_H_
 #define TESSERACT_CCUTIL_NORMSTRNGS_H_
 
-#include "genericvector.h"
-#include "strngs.h"
+#include <string>
+#include <vector>
 
-typedef signed int char32;
+#include "validator.h"
 
 namespace tesseract {
 
-// UTF-8 to UTF-32 conversion function.
-void UTF8ToUTF32(const char* utf8_str, GenericVector<char32>* str32);
-
-// UTF-32 to UTF-8 convesion function.
-void UTF32ToUTF8(const GenericVector<char32>& str32, STRING* utf8_str);
-
-// Normalize a single char32 using NFKC + OCR-specific transformations.
-// NOTE that proper NFKC may require multiple characters as input. The
-// assumption of this function is that the input is already as fully composed
-// as it can be, but may require some compatibility normalizations or just
-// OCR evaluation related normalizations.
-void NormalizeChar32(char32 ch, bool decompose, GenericVector<char32>* str);
-
-// Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that
-// can contain multiple UTF32 code points.
-STRING NormalizeUTF8String(bool decompose, const char* str8);
-// Default behavior is to compose, until it is proven that decomposed benefits
-// at least one language.
-inline STRING NormalizeUTF8String(const char* str8) {
-  return NormalizeUTF8String(false, str8);
-}
+// The standard unicode normalizations.
+enum class UnicodeNormMode {
+  kNFD,
+  kNFC,
+  kNFKD,
+  kNFKC,
+};
+
+// To normalize away differences in punctuation that are ambiguous, like
+// curly quotes and different widths of dash.
+enum class OCRNorm {
+  kNone,
+  kNormalize,
+};
+
+// To validate and normalize away some subtle differences that can occur in
+// Indic scripts, e.g. ensuring that an explicit virama is always followed by
+// a zero-width non-joiner.
+enum class GraphemeNorm {
+  kNone,
+  kNormalize,
+};
+
+// Normalizes a UTF8 string according to the given modes. Returns true on
+// success. If false is returned, some failure or invalidity was present, and
+// the result string is produced on a "best effort" basis.
+bool NormalizeUTF8String(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                         GraphemeNorm grapheme_normalize, const char* str8,
+                         string* normalized);
+// Normalizes a UTF8 string according to the given modes and splits into
+// graphemes according to g_mode. Returns true on success. If false is returned,
+// some failure or invalidity was present, and the result string is produced on
+// a "best effort" basis.
+bool NormalizeCleanAndSegmentUTF8(UnicodeNormMode u_mode, OCRNorm ocr_normalize,
+                                  GraphemeNormMode g_mode, bool report_errors,
+                                  const char* str8,
+                                  std::vector<string>* graphemes);
 
 // Applies just the OCR-specific normalizations and returns the normalized char.
 char32 OCRNormalize(char32 ch);

diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp
@@ -122,8 +122,14 @@ void SetupBasicProperties(bool report_errors, bool decompose,
     }
 
     // Record normalized version of this unichar.
-    string normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str);
-    if (unichar_id != 0 && !normed_str.empty()) {
+    string normed_str;
+    if (unichar_id != 0 &&
+        tesseract::NormalizeUTF8String(
+            decompose ? tesseract::UnicodeNormMode::kNFKD
+                      : tesseract::UnicodeNormMode::kNFKC,
+            tesseract::OCRNorm::kNormalize, tesseract::GraphemeNorm::kNone,
+            unichar_str, &normed_str) &&
+        !normed_str.empty()) {
       unicharset->set_normed(unichar_id, normed_str.c_str());
     } else {
       unicharset->set_normed(unichar_id, unichar_str);