Skip to content

Commit

Permalink
Removed debug messages, forward compatibility of traineddata files, f…
Browse files Browse the repository at this point in the history
…urther bug fix.
  • Loading branch information
theraysmith committed Jul 9, 2015
1 parent a303ab9 commit 4412269
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 53 deletions.
13 changes: 7 additions & 6 deletions ccstruct/pageres.cpp
@@ -1,7 +1,12 @@
/**********************************************************************
* File: pageres.cpp (Formerly page_res.c)
* Description: Results classes used by control.c
* Author: Phil Cheatle
* Description: Hierarchy of results classes from PAGE_RES to WERD_RES
* and an iterator class to iterate over the words.
* Main purposes:
* Easy way to iterate over the words without a 3-nested loop.
* Holds data used during word recognition.
* Holds information about alternative spacing paths.
* Author: Phil Cheatle
* Created: Tue Sep 22 08:42:49 BST 1992
*
* (C) Copyright 1992, Hewlett-Packard Ltd.
Expand Down Expand Up @@ -1478,8 +1483,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
WERD* real_word = word_res->word;
if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) {
real_word->set_flag(W_FUZZY_SP, true);
tprintf("Made word fuzzy at:");
real_word->bounding_box().print();
if (word_res->combination) {
// The next word should be the corresponding part of combo, but we have
// already stepped past it, so find it by search.
Expand All @@ -1493,8 +1496,6 @@ void PAGE_RES_IT::MakeCurrentWordFuzzy() {
ASSERT_HOST(!real_word->flag(W_FUZZY_SP) &&
!real_word->flag(W_FUZZY_NON));
real_word->set_flag(W_FUZZY_SP, true);
tprintf("Made part of combo word fuzzy at:");
real_word->bounding_box().print();
}
}
}
Expand Down
5 changes: 4 additions & 1 deletion ccutil/tessdatamanager.cpp
Expand Up @@ -50,7 +50,10 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) {
ReverseN(&actual_tessdata_num_entries_,
sizeof(actual_tessdata_num_entries_));
}
ASSERT_HOST(actual_tessdata_num_entries_ <= TESSDATA_NUM_ENTRIES);
if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) {
// For forward compatibility, truncate to the number we can handle.
actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES;
}
fread(offset_table_, sizeof(inT64),
actual_tessdata_num_entries_, data_file_);
if (swap_) {
Expand Down
66 changes: 25 additions & 41 deletions ccutil/unicharset.cpp
Expand Up @@ -215,34 +215,6 @@ int UNICHARSET::step(const char* str) const {
if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
return lengths[0];
}
// As step except constraining the search to unichar-ids that are
// self-normalized. Unlike step, does not encode the whole string, therefore
// should be used on short strings (like those obtained from
// get_normed_unichar.)
int UNICHARSET::normed_step(const char* str) const {
// Find the length of the first matching unicharset member.
int length = ids.minmatch(str);
if (length == 0)
return 0; // Empty string or illegal char.

while (length <= UNICHAR_LEN) {
if (ids.contains(str, length)) {
int matched_id = unichar_to_id(str, length);
const GenericVector<UNICHAR_ID>& matched_norms = normed_ids(matched_id);
bool good_start = matched_norms.size() == 1 &&
matched_norms[0] == matched_id;
if (str[length] == '\0') {
return good_start ? length : 0;
}
if (normed_step(str + length) > 0)
return length; // This length works!
} else if (str[length] == '\0') {
return 0; // Ran out of string.
}
++length;
}
return 0;
}

// Return whether the given UTF-8 string is encodable with this UNICHARSET.
// If not encodable, write the first byte offset which cannot be converted
Expand Down Expand Up @@ -375,19 +347,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
// stored in the file, and needs to be set when the UNICHARSET is loaded.
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
unichars[unichar_id].properties.normed_ids.truncate(0);
int length = unichars[unichar_id].properties.normed.length();
const char* normed_str = unichars[unichar_id].properties.normed.string();
int step = 0;
for (int offset = 0; offset < length; offset+= step) {
step = normed_step(normed_str + offset);
if (step == 0) {
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
break;
}
int normed_id = unichar_to_id(normed_str + offset, step);
ASSERT_HOST(normed_id >= 0);
unichars[unichar_id].properties.normed_ids.push_back(normed_id);
if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
} else if (!encode_string(unichars[unichar_id].properties.normed.string(),
true, &unichars[unichar_id].properties.normed_ids,
NULL, NULL)) {
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
}
}

Expand Down Expand Up @@ -1015,6 +981,24 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
}
}

// Returns true if there are any repeated unicodes in the normalized
// text of any unichar-id in the unicharset.
bool UNICHARSET::AnyRepeatedUnicodes() const {
int start_id = 0;
if (has_special_codes()) start_id = SPECIAL_UNICHAR_CODES_COUNT;
for (int id = start_id; id < size_used; ++id) {
// Convert to unicodes.
GenericVector<int> unicodes;
if (UNICHAR::UTF8ToUnicode(get_normed_unichar(id), &unicodes) &&
unicodes.size() > 1) {
for (int u = 1; u < unicodes.size(); ++u) {
if (unicodes[u - 1] == unicodes[u]) return true;
}
}
}
return false;
}

int UNICHARSET::add_script(const char* script) {
for (int i = 0; i < script_table_size_used; ++i) {
if (strcmp(script, script_table[i]) == 0)
Expand Down
10 changes: 5 additions & 5 deletions ccutil/unicharset.h
Expand Up @@ -190,11 +190,6 @@ class UNICHARSET {
// WARNING: this function now encodes the whole string for precision.
// Use encode_string in preference to repeatedly calling step.
int step(const char* str) const;
// As step except constraining the search to unichar-ids that are
// self-normalized. Unlike step, does not encode the whole string, therefore
// should be used on short strings (like those obtained from
// get_normed_unichar.)
int normed_step(const char* str) const;

// Return whether the given UTF-8 string is encodable with this UNICHARSET.
// If not encodable, write the first byte offset which cannot be converted
Expand Down Expand Up @@ -678,6 +673,10 @@ class UNICHARSET {
kSpecialUnicharCodes[UNICHAR_BROKEN]) == 0;
}

// Returns true if there are any repeated unicodes in the normalized
// text of any unichar-id in the unicharset.
bool AnyRepeatedUnicodes() const;

// Return a pointer to the CHAR_FRAGMENT class if the given
// unichar id represents a character fragment.
const CHAR_FRAGMENT *get_fragment(UNICHAR_ID unichar_id) const {
Expand Down Expand Up @@ -775,6 +774,7 @@ class UNICHARSET {

// Returns normalized version of unichar with the given unichar_id.
// UNICHAR_SPACE in a unicharset that has special codes always yields a
// single space instead of the stored normed string.
const char *get_normed_unichar(UNICHAR_ID unichar_id) const {
  const bool is_special_space =
      unichar_id == UNICHAR_SPACE && has_special_codes();
  return is_special_space ? " "
                          : unichars[unichar_id].properties.normed.string();
}
// Returns a vector of UNICHAR_IDs that represent the ids of the normalized
Expand Down

0 comments on commit 4412269

Please sign in to comment.