Skip to content

Commit

Permalink
Limited max height to 48 even in variable height input, enabled neura…
Browse files Browse the repository at this point in the history
…l nets via ocr engine mode
  • Loading branch information
theraysmith committed Nov 8, 2016
1 parent c1c1e42 commit f24ef67
Show file tree
Hide file tree
Showing 8 changed files with 61 additions and 21 deletions.
5 changes: 5 additions & 0 deletions ChangeLog
@@ -1,3 +1,8 @@
2016-11-11 - V4.00.00
* Added new neural network system based on LSTMs, with major accuracy gains.
* Improvements to PDF rendering.
* Fixes to trainingdata rendering.

2016-02-17 - V3.04.01
* Added OSD renderer for psm 0. Works for single page and multi-page images.
* Improve tesstrain.sh script.
Expand Down
36 changes: 30 additions & 6 deletions api/tesseractmain.cpp
Expand Up @@ -90,7 +90,7 @@ void PrintVersionInfo() {
void PrintUsage(const char* program) {
printf(
"Usage:\n"
" %s --help | --help-psm | --version\n"
" %s --help | --help-psm | --help-oem | --version\n"
" %s --list-langs [--tessdata-dir PATH]\n"
" %s --print-parameters [options...] [configfile...]\n"
" %s imagename|stdin outputbase|stdout [options...] [configfile...]\n",
Expand Down Expand Up @@ -120,6 +120,18 @@ void PrintHelpForPSM() {
printf("%s", msg);
}

// Prints the table of OCR engine mode numbers and their meanings to stdout.
void PrintHelpForOEM() {
  static const char kEngineModeHelp[] =
      "OCR Engine modes:\n"
      " 0 Original Tesseract only.\n"
      " 1 Cube only.\n"
      " 2 Tesseract + cube.\n"
      " 3 Default, based on what is available.\n"
      " 4 Neural nets (LSTM) only.\n";

  // The text contains no format directives, so write it out verbatim.
  fputs(kEngineModeHelp, stdout);
}

void PrintHelpMessage(const char* program) {
PrintUsage(program);

Expand All @@ -132,15 +144,18 @@ void PrintHelpMessage(const char* program) {
" -c VAR=VALUE Set value for config variables.\n"
" Multiple -c arguments are allowed.\n"
" -psm NUM Specify page segmentation mode.\n"
" -oem NUM Specify OCR Engine mode.\n"
"NOTE: These options must occur before any configfile.\n";

printf("\n%s\n", ocr_options);
PrintHelpForPSM();
PrintHelpForOEM();

const char* single_options =
"Single options:\n"
" -h, --help Show this help message.\n"
" --help-psm Show page segmentation modes.\n"
" --help-oem Show OCR Engine modes.\n"
" -v, --version Show version information.\n"
" --list-langs List available languages for tesseract engine.\n"
" --print-parameters Print tesseract parameters to stdout.\n";
Expand Down Expand Up @@ -214,7 +229,8 @@ void ParseArgs(const int argc, char** argv, const char** lang,
const char** datapath, bool* list_langs, bool* print_parameters,
GenericVector<STRING>* vars_vec,
GenericVector<STRING>* vars_values, int* arg_i,
tesseract::PageSegMode* pagesegmode) {
tesseract::PageSegMode* pagesegmode,
tesseract::OcrEngineMode* enginemode) {
if (argc == 1) {
PrintHelpMessage(argv[0]);
exit(0);
Expand All @@ -229,6 +245,10 @@ void ParseArgs(const int argc, char** argv, const char** lang,
PrintHelpForPSM();
exit(0);
}
if ((strcmp(argv[1], "--help-oem") == 0)) {
PrintHelpForOEM();
exit(0);
}
if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) {
PrintVersionInfo();
exit(0);
Expand Down Expand Up @@ -258,6 +278,9 @@ void ParseArgs(const int argc, char** argv, const char** lang,
} else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) {
*pagesegmode = static_cast<tesseract::PageSegMode>(atoi(argv[i + 1]));
++i;
} else if (strcmp(argv[i], "-oem") == 0 && i + 1 < argc) {
*enginemode = static_cast<tesseract::OcrEngineMode>(atoi(argv[i + 1]));
++i;
} else if (strcmp(argv[i], "--print-parameters") == 0) {
noocr = true;
*print_parameters = true;
Expand Down Expand Up @@ -355,6 +378,7 @@ int main(int argc, char** argv) {
bool print_parameters = false;
int arg_i = 1;
tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO;
tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT;
/* main() calls functions like ParseArgs which call exit().
* This results in memory leaks if vars_vec and vars_values are
* declared as auto variables (destructor is not called then). */
Expand All @@ -367,7 +391,8 @@ int main(int argc, char** argv) {
#endif /* HAVE_TIFFIO_H && _WIN32 */

ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs,
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode);
&print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode,
&enginemode);

bool banner = false;
if (outputbase != NULL && strcmp(outputbase, "-") &&
Expand All @@ -380,9 +405,8 @@ int main(int argc, char** argv) {

api.SetOutputName(outputbase);

int init_failed =
api.Init(datapath, lang, tesseract::OEM_DEFAULT, &(argv[arg_i]),
argc - arg_i, &vars_vec, &vars_values, false);
int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]),
argc - arg_i, &vars_vec, &vars_values, false);
if (init_failed) {
fprintf(stderr, "Could not initialize tesseract.\n");
exit(1);
Expand Down
6 changes: 5 additions & 1 deletion ccmain/tessedit.cpp
Expand Up @@ -218,7 +218,11 @@ bool Tesseract::init_tesseract_lang_data(
if (tessdata_manager_debug_level)
tprintf("Loaded Cube with combiner\n");
} else if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
if (tessdata_manager.swap()) {
tprintf("Error: LSTM requested on big-endian hardware!!\n");
tprintf("Big-endian not yet supported! Loading tesseract.\n");
tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
} else if (tessdata_manager.SeekToStart(TESSDATA_LSTM)) {
lstm_recognizer_ = new LSTMRecognizer;
TFile fp;
fp.Open(tessdata_manager.GetDataFilePtr(), -1);
Expand Down
10 changes: 7 additions & 3 deletions ccstruct/imagedata.cpp
Expand Up @@ -217,7 +217,7 @@ Pix* ImageData::GetPix() const {
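// The return value is the scaled Pix, which must be pixDestroyed after use,
// and scale_factor (if not NULL) is set to the scale factor that was applied
// to the image to achieve the target_height.
// If target_height is 0, the input image height is used as the target,
// unless it exceeds max_height, in which case max_height is used instead.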
Pix* ImageData::PreScale(int target_height, float* scale_factor,
Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const {
int input_width = 0;
Expand All @@ -226,8 +226,12 @@ Pix* ImageData::PreScale(int target_height, float* scale_factor,
ASSERT_HOST(src_pix != NULL);
input_width = pixGetWidth(src_pix);
input_height = pixGetHeight(src_pix);
if (target_height == 0)
target_height = input_height;
if (target_height == 0) {
if (input_height > max_height)
target_height = max_height;
else
target_height = input_height;
}
float im_factor = static_cast<float>(target_height) / input_height;
if (scaled_width != NULL)
*scaled_width = IntCastRounded(im_factor * input_width);
Expand Down
5 changes: 3 additions & 2 deletions ccstruct/imagedata.h
Expand Up @@ -165,8 +165,9 @@ class ImageData {
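// The return value is the scaled Pix, which must be pixDestroyed after use,
// and scale_factor (if not NULL) is set to the scale factor that was applied
// to the image to achieve the target_height.
// If target_height is 0, the input image height is used as the target,
// unless it exceeds max_height, in which case max_height is used instead.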
Pix* PreScale(int target_height, float* scale_factor, int* scaled_width,
int* scaled_height, GenericVector<TBOX>* boxes) const;
Pix* PreScale(int target_height, int max_height, float* scale_factor,
int* scaled_width, int* scaled_height,
GenericVector<TBOX>* boxes) const;

int MemoryUsed() const;

Expand Down
7 changes: 5 additions & 2 deletions lstm/input.cpp
Expand Up @@ -25,6 +25,9 @@

namespace tesseract {

// Upper bound on the height used for variable-height inputs: when no fixed
// target height is requested, taller images are scaled to this height.
constexpr int kMaxInputHeight = 48;

// Builds an input node: forwards NT_INPUT, the name and ni/no unchanged to
// the Network base class and starts the cached x-scale at 1.
Input::Input(const STRING& name, int ni, int no)
    : Network(NT_INPUT, name, ni, no),
      cached_x_scale_(1) {
}
Input::Input(const STRING& name, const StaticShape& shape)
Expand Down Expand Up @@ -92,8 +95,8 @@ Pix* Input::PrepareLSTMInputs(const ImageData& image_data,
// Note that NumInputs() is defined as input image height.
int target_height = network->NumInputs();
int width, height;
Pix* pix =
image_data.PreScale(target_height, image_scale, &width, &height, nullptr);
Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale,
&width, &height, nullptr);
if (pix == nullptr) {
tprintf("Bad pix from ImageData!\n");
return nullptr;
Expand Down
10 changes: 4 additions & 6 deletions lstm/lstmtrainer.cpp
Expand Up @@ -34,8 +34,6 @@

#include "callcpp.h"

using std::string;

namespace tesseract {

// Min actual error rate increase to constitute divergence.
Expand Down Expand Up @@ -203,7 +201,7 @@ bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index,

// Initializes a trainer from a serialized TFNetworkModel proto.
// Returns the global step of TensorFlow graph or 0 if failed.
int LSTMTrainer::InitTensorFlowNetwork(const string& tf_proto) {
int LSTMTrainer::InitTensorFlowNetwork(const std::string& tf_proto) {
#ifdef INCLUDE_TENSORFLOW
delete network_;
TFNetwork* tf_net = new TFNetwork("TensorFlow");
Expand Down Expand Up @@ -1199,22 +1197,22 @@ double LSTMTrainer::ComputeCharError(const GenericVector<int>& truth_str,
// Computes a very simple bag of words word recall error rate.
// NOTE that this is destructive on both input strings.
double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
typedef TessHashMap<string, int, std::hash<string> > StrMap;
typedef TessHashMap<std::string, int, std::hash<std::string> > StrMap;
GenericVector<STRING> truth_words, ocr_words;
truth_str->split(' ', &truth_words);
if (truth_words.empty()) return 0.0;
ocr_str->split(' ', &ocr_words);
StrMap word_counts;
for (int i = 0; i < truth_words.size(); ++i) {
string truth_word(truth_words[i].string());
std::string truth_word(truth_words[i].string());
StrMap::iterator it = word_counts.find(truth_word);
if (it == word_counts.end())
word_counts.insert(make_pair(truth_word, 1));
else
++it->second;
}
for (int i = 0; i < ocr_words.size(); ++i) {
string ocr_word(ocr_words[i].string());
std::string ocr_word(ocr_words[i].string());
StrMap::iterator it = word_counts.find(ocr_word);
if (it == word_counts.end())
word_counts.insert(make_pair(ocr_word, -1));
Expand Down
3 changes: 2 additions & 1 deletion training/pango_font_info.cpp
Expand Up @@ -127,7 +127,8 @@ string PangoFontInfo::DescriptionName() const {
/* static */
// Initializes fontconfig only when fonts_dir_ is still empty, passing the
// FLAGS_fonts_dir and FLAGS_fontconfig_tmpdir values to HardInitFontConfig.
// NOTE(review): the HardInitFontConfig call below appears twice (once on a
// single line, once wrapped). This looks like the before/after lines of a
// re-wrap shown without diff markers; confirm only one call is intended.
void PangoFontInfo::SoftInitFontConfig() {
if (fonts_dir_.empty()) {
HardInitFontConfig(FLAGS_fonts_dir.c_str(), FLAGS_fontconfig_tmpdir.c_str());
HardInitFontConfig(FLAGS_fonts_dir.c_str(),
FLAGS_fontconfig_tmpdir.c_str());
}
}
}

Expand Down

0 comments on commit f24ef67

Please sign in to comment.