Commit 0e95e2c

Rewrote the recoder to use an encoding based on wubi instead of radical-stroke index, changed from normalized to unnormalized unichar representation
theraysmith committed Jul 25, 2017
1 parent b0ead95 commit 0e95e2c
Showing 2 changed files with 71 additions and 97 deletions.
154 changes: 69 additions & 85 deletions ccutil/unicharcompress.cpp
@@ -20,72 +20,65 @@
///////////////////////////////////////////////////////////////////////

#include "unicharcompress.h"
#include <memory>
#include "tprintf.h"

namespace tesseract {

// String used to represent the null_id in direct_set.
const char* kNullChar = "<nul>";
- // Local struct used only for processing the radical-stroke table.
- struct RadicalStroke {
- RadicalStroke() : num_strokes(0) {}
- RadicalStroke(const STRING& r, int s) : radical(r), num_strokes(s) {}
-
- bool operator==(const RadicalStroke& other) const {
- return radical == other.radical && num_strokes == other.num_strokes;
- }
-
- // The radical is encoded as a string because its format is of an int with
- // an optional ' mark to indicate a simplified shape. To treat these as
- // distinct, we use a string and a UNICHARSET to do the integer mapping.
- STRING radical;
- // The number of strokes we treat as dense and just take the face value from
- // the table.
- int num_strokes;
- };

+ // Radix to make unique values from the stored radical codes.
+ const int kRadicalRadix = 29;
+
+ // "Hash" function for const std::vector<int>: treats the code sequence as a
+ // base-kRadicalRadix number to build a unique value for each sequence that we
+ // can use as the index in a hash map of ints instead of trying to hash the
+ // vectors.
+ static int RadicalPreHash(const std::vector<int>& rs) {
+ size_t result = 0;
+ for (int radical : rs) {
+ result *= kRadicalRadix;
+ result += radical;
+ }
+ return result;
+ }
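For illustration only (not part of the commit): RadicalPreHash folds a radical code sequence into a single integer by treating it as a base-kRadicalRadix number, so two distinct sequences stay distinct as long as every individual code is below the radix; a plain sum would collide far more often. A minimal standalone sketch with made-up radical codes:

#include <cstdio>
#include <vector>

static const int kRadix = 29;  // mirrors kRadicalRadix above

// Same folding scheme as RadicalPreHash: treat the sequence as a base-kRadix
// number so that the order and position of the codes matter.
static long PreHash(const std::vector<int>& codes) {
  long result = 0;
  for (int code : codes) result = result * kRadix + code;
  return result;
}

int main() {
  std::vector<int> a = {3, 7, 1};  // hypothetical radical codes
  std::vector<int> b = {3, 7, 2};  // differs only in the last code
  std::printf("%ld %ld\n", PreHash(a), PreHash(b));  // prints 2727 2728
  return 0;
}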
- // Hash functor for RadicalStroke.
- struct RadicalStrokedHash {
- size_t operator()(const RadicalStroke& rs) const {
- size_t result = rs.num_strokes;
- for (int i = 0; i < rs.radical.length(); ++i) {
- result ^= rs.radical[i] << (6 * i + 8);
- }
- return result;
- }
- };
-
- // A hash map to convert unicodes to radical,stroke pair.
- typedef std::unordered_map<int, RadicalStroke> RSMap;
- // A hash map to count occurrences of each radical,stroke pair.
- typedef std::unordered_map<RadicalStroke, int, RadicalStrokedHash> RSCounts;

+ // A hash map to convert unicodes to radical encoding.
+ typedef std::unordered_map<int, std::unique_ptr<std::vector<int>>> RSMap;
+ // A hash map to count occurrences of each radical encoding.
+ typedef std::unordered_map<int, int> RSCounts;
+
+ // Decodes one line of the radical table: a decimal unicode followed by its
+ // space-separated decimal radical codes. Empty lines and lines starting with
+ // # are skipped. Returns false if the line is malformed.
+ static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
+ if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
+ return true;
+ GenericVector<STRING> entries;
+ radical_data_line->split(' ', &entries);
+ if (entries.size() < 2) return false;
+ char* end = nullptr;
+ int unicode = strtol(&entries[0][0], &end, 10);
+ if (*end != '\0') return false;
+ std::unique_ptr<std::vector<int>> radicals(new std::vector<int>);
+ for (int i = 1; i < entries.size(); ++i) {
+ int radical = strtol(&entries[i][0], &end, 10);
+ if (*end != '\0') return false;
+ radicals->push_back(radical);
+ }
+ (*radical_map)[unicode] = std::move(radicals);
+ return true;
+ }

// Helper function builds the RSMap from the radical-stroke file, which has
// already been read into a STRING. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
- static bool DecodeRadicalStrokeTable(STRING* radical_stroke_table,
- RSMap* radical_map) {
+ static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) {
GenericVector<STRING> lines;
- radical_stroke_table->split('\n', &lines);
+ radical_data->split('\n', &lines);
for (int i = 0; i < lines.size(); ++i) {
- if (lines[i].length() == 0 || lines[i][0] == '#') continue;
- unsigned unicode;
- int radical, strokes;
- STRING str_radical;
- if (sscanf(lines[i].string(), "%x\t%d.%d", &unicode, &radical, &strokes) ==
- 3) {
- str_radical.add_str_int("", radical);
- } else if (sscanf(lines[i].string(), "%x\t%d'.%d", &unicode, &radical,
- &strokes) == 3) {
- str_radical.add_str_int("'", radical);
- } else {
- tprintf("Invalid format in radical stroke table at line %d: %s\n", i,
+ if (!DecodeRadicalLine(&lines[i], radical_map)) {
+ tprintf("Invalid format in radical table at line %d: %s\n", i,
lines[i].string());
return false;
}
- (*radical_map)[unicode] = RadicalStroke(str_radical, strokes);
}
return true;
}
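For context, an editorial sketch (not from the commit) of the line format DecodeRadicalLine expects: one character per line, the character's unicode as a decimal integer first, then its radical codes, also decimal and space-separated, with '#' marking comment lines. The entry below for U+4E2D (20013) uses invented codes; the real wubi-derived codes ship with the training data.

#include <cstdio>
#include <map>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::map<int, std::vector<int>> radical_map;
  std::string line = "20013 2 14 3";  // hypothetical entry for U+4E2D
  if (!line.empty() && line[0] != '#') {
    std::istringstream in(line);
    int unicode;
    if (in >> unicode) {
      std::vector<int> radicals;
      for (int code; in >> code;) radicals.push_back(code);
      if (!radicals.empty()) radical_map[unicode] = radicals;
    }
  }
  // Prints: U+4E2D -> 3 radical codes
  std::printf("U+%04X -> %zu radical codes\n",
              (unsigned)radical_map.begin()->first,
              radical_map.begin()->second.size());
  return 0;
}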
@@ -108,14 +101,13 @@ UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) {
bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
STRING* radical_stroke_table) {
RSMap radical_map;
- if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map))
+ if (radical_stroke_table != nullptr &&
+ !DecodeRadicalTable(radical_stroke_table, &radical_map))
return false;
encoder_.clear();
UNICHARSET direct_set;
- UNICHARSET radicals;
- // To avoid unused codes, clear the special codes from the unicharsets.
+ // To avoid unused codes, clear the special codes from the direct_set.
direct_set.clear();
- radicals.clear();
// Always keep space as 0;
direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
// Null char is next if we have one.
@@ -134,41 +126,31 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
int han_offset = hangul_offset + kTotalJamos;
- int max_num_strokes = -1;
for (int u = 0; u <= unicharset.size(); ++u) {
- bool self_normalized = false;
// We special-case allow null_id to be equal to unicharset.size() in case
// there is no space in unicharset for it.
- if (u == unicharset.size()) {
- if (u == null_id) {
- self_normalized = true;
- } else {
- break; // Finished.
- }
- } else {
- self_normalized = strcmp(unicharset.id_to_unichar(u),
- unicharset.get_normed_unichar(u)) == 0;
- }
+ if (u == unicharset.size() && u != null_id) break; // Finished
RecodedCharID code;
// Convert to unicodes.
std::vector<char32> unicodes;
+ string cleaned;
+ if (u < unicharset.size())
+ cleaned = UNICHARSET::CleanupString(unicharset.id_to_unichar(u));
if (u < unicharset.size() &&
- (unicodes = UNICHAR::UTF8ToUTF32(unicharset.get_normed_unichar(u)))
- .size() == 1) {
+ (unicodes = UNICHAR::UTF8ToUTF32(cleaned.c_str())).size() == 1) {
// Check single unicodes for Hangul/Han and encode if so.
int unicode = unicodes[0];
int leading, vowel, trailing;
auto it = radical_map.find(unicode);
if (it != radical_map.end()) {
- // This is Han. Convert to radical, stroke, index.
- if (!radicals.contains_unichar(it->second.radical.string())) {
- radicals.unichar_insert(it->second.radical.string(),
- OldUncleanUnichars::kTrue);
- }
- int radical = radicals.unichar_to_id(it->second.radical.string());
- int num_strokes = it->second.num_strokes;
- int num_samples = radical_counts[it->second]++;
- if (num_strokes > max_num_strokes) max_num_strokes = num_strokes;
- code.Set3(radical + han_offset, num_strokes + han_offset,
- num_samples + han_offset);
+ // This is Han. Use the radical codes directly.
+ int num_radicals = it->second->size();
+ for (int c = 0; c < num_radicals; ++c) {
+ code.Set(c, han_offset + (*it->second)[c]);
+ }
+ int pre_hash = RadicalPreHash(*it->second);
+ int num_samples = radical_counts[pre_hash]++;
+ if (num_samples > 0)
+ code.Set(num_radicals, han_offset + num_samples + kRadicalRadix);
} else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) {
// This is Hangul. Since we know the exact size of each part at compile
// time, it gets the bottom set of codes.
@@ -190,9 +172,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
for (int i = 0; i < unicodes.size(); ++i) {
int position = code.length();
if (position >= RecodedCharID::kMaxCodeLen) {
tprintf("Unichar %d=%s->%s is too long to encode!!\n", u,
unicharset.id_to_unichar(u),
unicharset.get_normed_unichar(u));
tprintf("Unichar %d=%s is too long to encode!!\n", u,
unicharset.id_to_unichar(u));
return false;
}
int uni = unicodes[i];
@@ -202,29 +183,33 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
direct_set.unichar_insert(utf8);
code.Set(position, direct_set.unichar_to_id(utf8));
delete[] utf8;
- if (direct_set.size() > unicharset.size()) {
+ if (direct_set.size() >
+ unicharset.size() + !unicharset.has_special_codes()) {
// Code space got bigger!
tprintf("Code space expanded from original unicharset!!\n");
return false;
}
}
}
}
- code.set_self_normalized(self_normalized);
encoder_.push_back(code);
}
// Now renumber Han to make all codes unique. We already added han_offset to
// all Han. Now give each code position its own offset so that the values
// used at different positions are disjoint: every position after the first
// is shifted past the largest value used by the positions before it.
- int num_radicals = radicals.size();
- for (int u = 0; u < unicharset.size(); ++u) {
- RecodedCharID* code = &encoder_[u];
- if ((*code)(0) >= han_offset) {
- code->Set(1, (*code)(1) + num_radicals);
- code->Set(2, (*code)(2) + num_radicals + max_num_strokes + 1);
- }
- }
+ int code_offset = 0;
+ for (int i = 0; i < RecodedCharID::kMaxCodeLen; ++i) {
+ int max_offset = 0;
+ for (int u = 0; u < unicharset.size(); ++u) {
+ RecodedCharID* code = &encoder_[u];
+ if (code->length() <= i) continue;
+ max_offset = std::max(max_offset, (*code)(i)-han_offset);
+ code->Set(i, (*code)(i) + code_offset);
+ }
+ if (max_offset == 0) break;
+ code_offset += max_offset + 1;
+ }
DefragmentCodeValues(null_id >= 0 ? 1 : -1);
SetupDecoder();
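To make the two Han-related steps above concrete (an editorial walk-through, not commit content): each Han character first receives its radical codes plus han_offset, with one extra trailing code only when another character already used the same radical sequence; the renumbering loop then shifts every code position into its own value range so no value is reused across positions. A toy sketch of the renumbering, with invented two-element codes and a stand-in han_offset of 100:

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int kHanOffset = 100;  // stand-in for han_offset
  // Each inner vector is one character's code sequence, already offset by
  // kHanOffset as in ComputeEncoding.
  std::vector<std::vector<int>> codes = {{100, 103}, {101, 100}, {102, 101}};
  int code_offset = 0;
  for (size_t i = 0; i < 2; ++i) {  // two code positions in this toy
    int max_offset = 0;
    for (auto& code : codes) {
      if (code.size() <= i) continue;
      max_offset = std::max(max_offset, code[i] - kHanOffset);
      code[i] += code_offset;
    }
    code_offset += max_offset + 1;
  }
  // Position 0 keeps 100..102; position 1 now uses values in 103..106, so the
  // two positions no longer share any code value.
  for (const auto& code : codes) std::printf("%d %d\n", code[0], code[1]);
  return 0;
}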
@@ -395,8 +380,7 @@ void UnicharCompress::SetupDecoder() {
is_valid_start_.init_to_size(code_range_, false);
for (int c = 0; c < encoder_.size(); ++c) {
const RecodedCharID& code = encoder_[c];
- if (code.self_normalized() || decoder_.find(code) == decoder_.end())
- decoder_[code] = c;
+ decoder_[code] = c;
is_valid_start_[code(0)] = true;
RecodedCharID prefix = code;
int len = code.length() - 1;
14 changes: 2 additions & 12 deletions ccutil/unicharcompress.h
@@ -36,7 +36,7 @@ class RecodedCharID {
// The maximum length of a code.
static const int kMaxCodeLen = 9;

- RecodedCharID() : self_normalized_(0), length_(0) {
+ RecodedCharID() : self_normalized_(1), length_(0) {
memset(code_, 0, sizeof(code_));
}
void Truncate(int length) { length_ = length; }
@@ -54,8 +54,6 @@ class RecodedCharID {
code_[2] = code2;
}
// Accessors
- bool self_normalized() const { return self_normalized_ != 0; }
- void set_self_normalized(bool value) { self_normalized_ = value; }
int length() const { return length_; }
int operator()(int index) const { return code_[index]; }

@@ -133,9 +131,6 @@ class RecodedCharID {
// position). For non-CJK, the same code value CAN be used in multiple
// positions, eg the ff ligature is converted to <f> <nullchar> <f>, where <f>
// is the same code as is used for the single f.
- // NOTE that an intended consequence of using the normalized text from the
- // unicharset is that the fancy quotes all map to a single code, so round-trip
- // conversion doesn't work for all unichar-ids.
class UnicharCompress {
public:
UnicharCompress();
@@ -175,12 +170,7 @@ class UnicharCompress {
// invalid input), and the encoding itself in code.
int EncodeUnichar(int unichar_id, RecodedCharID* code) const;
// Decodes code, returning the original unichar-id, or
- // INVALID_UNICHAR_ID if the input is invalid. Note that this is not a perfect
- // inverse of EncodeUnichar, since the unichar-id of U+2019 (curly single
- // quote), for example, will have the same encoding as the unichar-id of
- // U+0027 (ascii '). The foldings are obtained from the input unicharset,
- // which in turn obtains them from NormalizeUTF8String in normstrngs.cpp,
- // and include NFKC normalization plus others like quote and dash folding.
+ // INVALID_UNICHAR_ID if the input is invalid.
int DecodeUnichar(const RecodedCharID& code) const;
// Returns true if the given code is a valid start or single code.
bool IsValidFirstCode(int code) const { return is_valid_start_[code]; }
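The comments deleted above concerned round-trip fidelity: with the old normalized representation several unichar-ids could fold to one code, so DecodeUnichar could not always invert EncodeUnichar. A hedged sketch (not part of the commit) of checking the round-trip under the new unnormalized scheme; it assumes the tesseract headers, an already-loaded UNICHARSET, and the illustrative choices of no radical table (nullptr, now accepted) and no dedicated null character (null_id = -1):

#include "unicharcompress.h"

namespace tesseract {

bool EncodingRoundTrips(const UNICHARSET& unicharset) {
  UnicharCompress compress;
  if (!compress.ComputeEncoding(unicharset, /*null_id=*/-1,
                                /*radical_stroke_table=*/nullptr))
    return false;
  for (int id = 0; id < unicharset.size(); ++id) {
    RecodedCharID code;
    if (compress.EncodeUnichar(id, &code) == 0) return false;  // invalid input
    if (compress.DecodeUnichar(code) != id) return false;      // not inverted
  }
  return true;
}

}  // namespace tesseract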
