Changed the way unicharsets are handled to allow support for the ™ character. Can find the issue where it was requested.
tesseract-ocr · Jul 24, 2017 · b0ead95 · b0ead95
1 parent 4efc539
commit b0ead95
Show file tree

Hide file tree

Showing 9 changed files with 177 additions and 112 deletions.
diff --git a/ccstruct/ratngs.cpp b/ccstruct/ratngs.cpp
@@ -24,6 +24,7 @@
 
 #include "ratngs.h"
 
+#include <string>
 #include "blobs.h"
 #include "callcpp.h"
 #include "genericvector.h"
@@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
     : unicharset_(&unicharset){
   GenericVector<UNICHAR_ID> encoding;
   GenericVector<char> lengths;
-  if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
+  string cleaned = unicharset.CleanupString(src_string);
+  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
+                               NULL)) {
     lengths.push_back('\0');
     STRING src_lengths = &lengths[0];
-    this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
+    this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
   } else {  // There must have been an invalid unichar in the string.
     this->init(8);
     this->make_bad();

diff --git a/ccutil/ambigs.cpp b/ccutil/ambigs.cpp
@@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
   // Insert the corresponding correct ngram into the unicharset.
   // Unicharset code assumes that the "base" ngram is inserted into
   // the unicharset before fragments of this ngram are inserted.
-  unicharset->unichar_insert(replacement_string);
+  unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
   ambig_spec->correct_ngram_id =
     unicharset->unichar_to_id(replacement_string);
   if (replacement_ambig_part_size > 1) {
@@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
     } else {
       STRING frag_str = CHAR_FRAGMENT::to_string(
           replacement_string, i, test_ambig_part_size, false);
-      unicharset->unichar_insert(frag_str.string());
+      unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
       unichar_id = unicharset->unichar_to_id(frag_str.string());
     }
     ambig_spec->correct_fragments[i] = unichar_id;

diff --git a/ccutil/unicharcompress.cpp b/ccutil/unicharcompress.cpp
@@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
   direct_set.clear();
   radicals.clear();
   // Always keep space as 0;
-  direct_set.unichar_insert(" ");
+  direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
   // Null char is next if we have one.
   if (null_id >= 0) {
     direct_set.unichar_insert(kNullChar);
@@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
       if (it != radical_map.end()) {
         // This is Han. Convert to radical, stroke, index.
         if (!radicals.contains_unichar(it->second.radical.string())) {
-          radicals.unichar_insert(it->second.radical.string());
+          radicals.unichar_insert(it->second.radical.string(),
+                                  OldUncleanUnichars::kTrue);
         }
         int radical = radicals.unichar_to_id(it->second.radical.string());
         int num_strokes = it->second.num_strokes;

diff --git a/ccutil/unicharmap.cpp b/ccutil/unicharmap.cpp
@@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
     delete[] nodes;
 }
 
-// Search the given unichar representation in the tree. Each character in the
-// string is interpreted as an index in an array of nodes.
-UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
-  const char* current_char = unichar_repr;
-  UNICHARMAP_NODE* current_nodes = nodes;
-
-  assert(*unichar_repr != '\0');
-
-  do {
-    if (*(current_char + 1) == '\0')
-      return current_nodes[static_cast<unsigned char>(*current_char)].id;
-    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-  } while (true);
-}
-
 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes.
 UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
                                      int length) const {
-  const char* current_char = unichar_repr;
   UNICHARMAP_NODE* current_nodes = nodes;
 
   assert(*unichar_repr != '\0');
   assert(length > 0 && length <= UNICHAR_LEN);
 
+  int index = 0;
+  if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
   do {
-    if (length == 1 || *(current_char + 1) == '\0')
-      return current_nodes[static_cast<unsigned char>(*current_char)].id;
+    if (index + 1 >= length || unichar_repr[index + 1] == '\0')
+      return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
     current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-    --length;
+        current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
+    ++index;
   } while (true);
 }
 
@@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
 // string is interpreted as an index in an array of nodes.
 void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
   const char* current_char = unichar_repr;
+  if (*current_char == '\0') return;
   UNICHARMAP_NODE** current_nodes_pointer = &nodes;
-
-  assert(*unichar_repr != '\0');
-  assert(id >= 0);
-
   do {
     if (*current_nodes_pointer == 0)
       *current_nodes_pointer = new UNICHARMAP_NODE[256];
-    if (*(current_char + 1) == '\0') {
+    if (current_char[1] == '\0') {
       (*current_nodes_pointer)
           [static_cast<unsigned char>(*current_char)].id = id;
       return;
@@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
   } while (true);
 }
 
-// Search the given unichar representation in the tree. Each character in the
-// string is interpreted as an index in an array of nodes. Stop once the tree
-// does not have anymore nodes or once we found the right unichar_repr.
-bool UNICHARMAP::contains(const char* const unichar_repr) const {
-  if (unichar_repr == NULL || *unichar_repr == '\0') return false;
-
-  const char* current_char = unichar_repr;
-  UNICHARMAP_NODE* current_nodes = nodes;
-
-  while (current_nodes != 0 && *(current_char + 1) != '\0') {
-    current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    ++current_char;
-  }
-  return current_nodes != 0 && *(current_char + 1) == '\0' &&
-      current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
-}
-
 // Search the given unichar representation in the tree, using length characters
 // from it maximum. Each character in the string is interpreted as an index in
 // an array of nodes. Stop once the tree does not have anymore nodes or once we
@@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
                           int length) const {
   if (unichar_repr == NULL || *unichar_repr == '\0') return false;
   if (length <= 0 || length > UNICHAR_LEN) return false;
-
-  const char* current_char = unichar_repr;
+  int index = 0;
+  if (index >= length || unichar_repr[index] == '\0') return false;
   UNICHARMAP_NODE* current_nodes = nodes;
 
-  while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
+  while (current_nodes != 0 && index + 1 < length &&
+         unichar_repr[index + 1] != '\0') {
     current_nodes =
-        current_nodes[static_cast<unsigned char>(*current_char)].children;
-    --length;
-    ++current_char;
+        current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
+    ++index;
   }
-  return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
-      current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
+  return current_nodes != 0 &&
+         (index + 1 >= length || unichar_repr[index + 1] == '\0') &&
+         current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
 }
 
 // Return the minimum number of characters that must be used from this string
 // to obtain a match in the UNICHARMAP.
 int UNICHARMAP::minmatch(const char* const unichar_repr) const {
   const char* current_char = unichar_repr;
+  if (*current_char == '\0') return 0;
   UNICHARMAP_NODE* current_nodes = nodes;
 
   while (current_nodes != NULL && *current_char != '\0') {

diff --git a/ccutil/unicharmap.h b/ccutil/unicharmap.h
@@ -36,21 +36,12 @@ class UNICHARMAP {
   // with the given id. The length of the representation MUST be non-zero.
   void insert(const char* const unichar_repr, UNICHAR_ID id);
 
-  // Return the id associated with the given unichar representation,
-  // this representation MUST exist within the UNICHARMAP.
-  // The length of the representation MUST be non-zero.
-  UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;
-
   // Return the id associated with the given unichar representation,
   // this representation MUST exist within the UNICHARMAP. The first
   // length characters (maximum) from unichar_repr are used. The length
   // MUST be non-zero.
   UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;
 
-  // Return true if the given unichar representation is already present in the
-  // UNICHARMAP. The length of the representation MUST be non-zero.
-  bool contains(const char* const unichar_repr) const;
-
   // Return true if the given unichar representation is already present in the
   // UNICHARMAP. The first length characters (maximum) from unichar_repr are
   // used. The length MUST be non-zero.