Skip to content

Commit

Permalink
Added convert to int and directory listing to combine_tessdata
Browse files Browse the repository at this point in the history
  • Loading branch information
theraysmith committed Aug 2, 2017
1 parent 2ef1aea commit 77c44cd
Show file tree
Hide file tree
Showing 3 changed files with 42 additions and 8 deletions.
8 changes: 8 additions & 0 deletions lstm/lstmrecognizer.h
Expand Up @@ -127,6 +127,14 @@ class LSTMRecognizer {
series->ScaleLayerLearningRate(&id[1], factor);
}

// Makes sure the network is in int mode. If the TF_INT_MODE bit is already
// set in training_flags_ this is a no-op; otherwise network_->ConvertToInt()
// is invoked and the bit is set so that later calls return immediately.
void ConvertToInt() {
  const bool already_int = (training_flags_ & TF_INT_MODE) != 0;
  if (already_int) return;
  network_->ConvertToInt();
  training_flags_ |= TF_INT_MODE;
}

// Provides access to the UNICHARSET that this classifier works with.
const UNICHARSET& GetUnicharset() const { return ccutil_.unicharset; }
// Provides access to the UnicharCompress that this classifier works with.
Expand Down
8 changes: 0 additions & 8 deletions lstm/lstmtrainer.h
Expand Up @@ -251,14 +251,6 @@ class LSTMTrainer : public LSTMRecognizer {
const UnicharCompress* recoder, bool simple_text,
int null_char, GenericVector<int>* labels);

// Converts the network to int unless the TF_INT_MODE bit of training_flags_
// indicates that the conversion has already been done.
void ConvertToInt() {
  // Already flagged as int: nothing to do.
  if ((training_flags_ & TF_INT_MODE) != 0) return;
  network_->ConvertToInt();
  // Record the mode so that a repeated call skips the conversion.
  training_flags_ |= TF_INT_MODE;
}

// Performs forward-backward on the given trainingdata.
// Returns the sample that was used or NULL if the next sample was deemed
// unusable. samples_trainer could be this or an alternative trainer that
Expand Down
34 changes: 34 additions & 0 deletions training/combine_tessdata.cpp
Expand Up @@ -18,6 +18,7 @@
//
///////////////////////////////////////////////////////////////////////

#include "lstmrecognizer.h"
#include "tessdatamanager.h"

// Main program to combine/extract/overwrite tessdata components
Expand Down Expand Up @@ -122,6 +123,31 @@ int main(int argc, char **argv) {

// Write the updated traineddata file.
tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
} else if (argc == 3 && strcmp(argv[1], "-c") == 0) {
tm.Init(argv[2]);
tesseract::TFile fp;
if (!tm.GetComponent(tesseract::TESSDATA_LSTM, &fp)) {
tprintf("No LSTM Component found in %s!\n", argv[2]);
exit(1);
}
tesseract::LSTMRecognizer recognizer;
if (!recognizer.DeSerialize(&tm, &fp)) {
tprintf("Failed to deserialize LSTM in %s!\n", argv[2]);
exit(1);
}
recognizer.ConvertToInt();
GenericVector<char> lstm_data;
fp.OpenWrite(&lstm_data);
ASSERT_HOST(recognizer.Serialize(&tm, &fp));
tm.OverwriteEntry(tesseract::TESSDATA_LSTM, &lstm_data[0],
lstm_data.size());
if (!tm.SaveFile(argv[2], nullptr)) {
tprintf("Failed to write modified traineddata:%s!\n", argv[2]);
exit(1);
}
} else if (argc == 3 && strcmp(argv[1], "-d") == 0) {
// Initialize TessdataManager with the data in the given traineddata file.
tm.Init(argv[2]);
} else {
printf("Usage for combining tessdata components:\n"
" %s language_data_path_prefix\n"
Expand All @@ -137,6 +163,14 @@ int main(int argc, char **argv) {
printf("Usage for unpacking all tessdata components:\n"
" %s -u traineddata_file output_path_prefix\n"
" (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
printf(
"Usage for listing directory of components:\n"
" %s -d traineddata_file\n",
argv[0]);
printf(
"Usage for compacting LSTM component to int:\n"
" %s -c traineddata_file\n",
argv[0]);
return 1;
}
tm.Directory();
Expand Down

0 comments on commit 77c44cd

Please sign in to comment.