Skip to content

Commit

Permalink
Changed the way unicharsets are handled to allow support for the ™ ch…
Browse files Browse the repository at this point in the history
…aracter. Can't find the issue where it was requested.
  • Loading branch information
theraysmith committed Jul 24, 2017
1 parent 4efc539 commit b0ead95
Show file tree
Hide file tree
Showing 9 changed files with 177 additions and 112 deletions.
7 changes: 5 additions & 2 deletions ccstruct/ratngs.cpp
Expand Up @@ -24,6 +24,7 @@

#include "ratngs.h"

#include <string>
#include "blobs.h"
#include "callcpp.h"
#include "genericvector.h"
Expand Down Expand Up @@ -200,10 +201,12 @@ WERD_CHOICE::WERD_CHOICE(const char *src_string,
: unicharset_(&unicharset){
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
if (unicharset.encode_string(src_string, true, &encoding, &lengths, NULL)) {
string cleaned = unicharset.CleanupString(src_string);
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
NULL)) {
lengths.push_back('\0');
STRING src_lengths = &lengths[0];
this->init(src_string, src_lengths.string(), 0.0, 0.0, NO_PERM);
this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
} else { // There must have been an invalid unichar in the string.
this->init(8);
this->make_bad();
Expand Down
4 changes: 2 additions & 2 deletions ccutil/ambigs.cpp
Expand Up @@ -357,7 +357,7 @@ bool UnicharAmbigs::InsertIntoTable(
// Insert the corresponding correct ngram into the unicharset.
// Unicharset code assumes that the "base" ngram is inserted into
// the unicharset before fragments of this ngram are inserted.
unicharset->unichar_insert(replacement_string);
unicharset->unichar_insert(replacement_string, OldUncleanUnichars::kTrue);
ambig_spec->correct_ngram_id =
unicharset->unichar_to_id(replacement_string);
if (replacement_ambig_part_size > 1) {
Expand All @@ -372,7 +372,7 @@ bool UnicharAmbigs::InsertIntoTable(
} else {
STRING frag_str = CHAR_FRAGMENT::to_string(
replacement_string, i, test_ambig_part_size, false);
unicharset->unichar_insert(frag_str.string());
unicharset->unichar_insert(frag_str.string(), OldUncleanUnichars::kTrue);
unichar_id = unicharset->unichar_to_id(frag_str.string());
}
ambig_spec->correct_fragments[i] = unichar_id;
Expand Down
5 changes: 3 additions & 2 deletions ccutil/unicharcompress.cpp
Expand Up @@ -117,7 +117,7 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
direct_set.clear();
radicals.clear();
// Always keep space as 0;
direct_set.unichar_insert(" ");
direct_set.unichar_insert(" ", OldUncleanUnichars::kTrue);
// Null char is next if we have one.
if (null_id >= 0) {
direct_set.unichar_insert(kNullChar);
Expand Down Expand Up @@ -160,7 +160,8 @@ bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id,
if (it != radical_map.end()) {
// This is Han. Convert to radical, stroke, index.
if (!radicals.contains_unichar(it->second.radical.string())) {
radicals.unichar_insert(it->second.radical.string());
radicals.unichar_insert(it->second.radical.string(),
OldUncleanUnichars::kTrue);
}
int radical = radicals.unichar_to_id(it->second.radical.string());
int num_strokes = it->second.num_strokes;
Expand Down
72 changes: 18 additions & 54 deletions ccutil/unicharmap.cpp
Expand Up @@ -31,41 +31,24 @@ UNICHARMAP::~UNICHARMAP() {
delete[] nodes;
}

// Search the given unichar representation in the tree. Each character in the
// string is interpreted as an index in an array of nodes.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;

assert(*unichar_repr != '\0');

do {
if (*(current_char + 1) == '\0')
return current_nodes[static_cast<unsigned char>(*current_char)].id;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
} while (true);
}

// Search the given unichar representation in the tree, using at most length
// characters of it. Each character in the string is interpreted as an index in
// an array of nodes.
UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
int length) const {
const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;

assert(*unichar_repr != '\0');
assert(length > 0 && length <= UNICHAR_LEN);

int index = 0;
if (index >= length || unichar_repr[index] == '\0') return INVALID_UNICHAR_ID;
do {
if (length == 1 || *(current_char + 1) == '\0')
return current_nodes[static_cast<unsigned char>(*current_char)].id;
if (index + 1 >= length || unichar_repr[index + 1] == '\0')
return current_nodes[static_cast<unsigned char>(unichar_repr[index])].id;
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
--length;
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
++index;
} while (true);
}

Expand All @@ -75,15 +58,12 @@ UNICHAR_ID UNICHARMAP::unichar_to_id(const char* const unichar_repr,
// string is interpreted as an index in an array of nodes.
void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
const char* current_char = unichar_repr;
if (*current_char == '\0') return;
UNICHARMAP_NODE** current_nodes_pointer = &nodes;

assert(*unichar_repr != '\0');
assert(id >= 0);

do {
if (*current_nodes_pointer == 0)
*current_nodes_pointer = new UNICHARMAP_NODE[256];
if (*(current_char + 1) == '\0') {
if (current_char[1] == '\0') {
(*current_nodes_pointer)
[static_cast<unsigned char>(*current_char)].id = id;
return;
Expand All @@ -95,24 +75,6 @@ void UNICHARMAP::insert(const char* const unichar_repr, UNICHAR_ID id) {
} while (true);
}

// Search the given unichar representation in the tree. Each character in the
// string is interpreted as an index in an array of nodes. Stop once the tree
// does not have any more nodes or once we have found the right unichar_repr.
bool UNICHARMAP::contains(const char* const unichar_repr) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;

const char* current_char = unichar_repr;
UNICHARMAP_NODE* current_nodes = nodes;

while (current_nodes != 0 && *(current_char + 1) != '\0') {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
++current_char;
}
return current_nodes != 0 && *(current_char + 1) == '\0' &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
}

// Search the given unichar representation in the tree, using at most length
// characters of it. Each character in the string is interpreted as an index in
// an array of nodes. Stop once the tree does not have any more nodes or once we
Expand All @@ -121,24 +83,26 @@ bool UNICHARMAP::contains(const char* const unichar_repr,
int length) const {
if (unichar_repr == NULL || *unichar_repr == '\0') return false;
if (length <= 0 || length > UNICHAR_LEN) return false;

const char* current_char = unichar_repr;
int index = 0;
if (index >= length || unichar_repr[index] == '\0') return false;
UNICHARMAP_NODE* current_nodes = nodes;

while (current_nodes != 0 && (length > 1 && *(current_char + 1) != '\0')) {
while (current_nodes != 0 && index + 1 < length &&
unichar_repr[index + 1] != '\0') {
current_nodes =
current_nodes[static_cast<unsigned char>(*current_char)].children;
--length;
++current_char;
current_nodes[static_cast<unsigned char>(unichar_repr[index])].children;
++index;
}
return current_nodes != 0 && (length == 1 || *(current_char + 1) == '\0') &&
current_nodes[static_cast<unsigned char>(*current_char)].id >= 0;
return current_nodes != 0 &&
(index + 1 >= length || unichar_repr[index + 1] == '\0') &&
current_nodes[static_cast<unsigned char>(unichar_repr[index])].id >= 0;
}

// Return the minimum number of characters that must be used from this string
// to obtain a match in the UNICHARMAP.
int UNICHARMAP::minmatch(const char* const unichar_repr) const {
const char* current_char = unichar_repr;
if (*current_char == '\0') return 0;
UNICHARMAP_NODE* current_nodes = nodes;

while (current_nodes != NULL && *current_char != '\0') {
Expand Down
9 changes: 0 additions & 9 deletions ccutil/unicharmap.h
Expand Up @@ -36,21 +36,12 @@ class UNICHARMAP {
// with the given id. The length of the representation MUST be non-zero.
void insert(const char* const unichar_repr, UNICHAR_ID id);

// Return the id associated with the given unichar representation,
// this representation MUST exist within the UNICHARMAP.
// The length of the representation MUST be non-zero.
UNICHAR_ID unichar_to_id(const char* const unichar_repr) const;

// Return the id associated with the given unichar representation,
// this representation MUST exist within the UNICHARMAP. The first
// length characters (maximum) from unichar_repr are used. The length
// MUST be non-zero.
UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const;

// Return true if the given unichar representation is already present in the
// UNICHARMAP. The length of the representation MUST be non-zero.
bool contains(const char* const unichar_repr) const;

// Return true if the given unichar representation is already present in the
// UNICHARMAP. The first length characters (maximum) from unichar_repr are
// used. The length MUST be non-zero.
Expand Down

0 comments on commit b0ead95

Please sign in to comment.