From 1c7e00611b5a6b0e2d2094f0a1326ff3c3e1fd98 Mon Sep 17 00:00:00 2001
From: Stefan Weil
Date: Tue, 5 Mar 2019 15:52:15 +0100
Subject: [PATCH] Add initial support for traineddata files in standard archive formats

This requires libarchive-dev.

Tesseract can now load traineddata files in any of the archive formats
which are supported by libarchive.

Example of a zipped BagIt archive:

    $ unzip -l /usr/local/share/tessdata/zip.traineddata
    Archive:  /usr/local/share/tessdata/zip.traineddata
      Length      Date    Time    Name
    ---------  ---------- -----   ----
           55  2019-03-05 15:27   bagit.txt
            0  2019-03-05 15:25   data/
         1557  2019-03-05 15:28   manifest-sha256.txt
      1082890  2019-03-05 15:25   data/eng.word-dawg
      1487588  2019-03-05 15:25   data/eng.lstm
         7477  2019-03-05 15:25   data/eng.unicharset
        63346  2019-03-05 15:25   data/eng.shapetable
       976552  2019-03-05 15:25   data/eng.inttemp
        13408  2019-03-05 15:25   data/eng.normproto
         4322  2019-03-05 15:25   data/eng.punc-dawg
         4738  2019-03-05 15:25   data/eng.lstm-number-dawg
         1410  2019-03-05 15:25   data/eng.freq-dawg
          844  2019-03-05 15:25   data/eng.pffmtable
         6360  2019-03-05 15:25   data/eng.lstm-unicharset
         1012  2019-03-05 15:25   data/eng.lstm-recoder
         1047  2019-03-05 15:25   data/eng.unicharambigs
         4322  2019-03-05 15:25   data/eng.lstm-punc-dawg
     16109842  2019-03-05 15:25   data/eng.bigram-dawg
           80  2019-03-05 15:25   data/eng.version
         6426  2019-03-05 15:25   data/eng.number-dawg
      3694794  2019-03-05 15:25   data/eng.lstm-word-dawg
    ---------                     -------
     23468070                     21 files

`combine_tessdata -d` and `combine_tessdata -u` also work.

The traineddata files in the new format can be generated with
standard tools like zip or tar.

More work is needed for other training tools and big endian support.

Signed-off-by: Stefan Weil --- .travis.yml | 1 + configure.ac | 6 +++ src/api/Makefile.am | 1 + src/ccutil/Makefile.am | 2 + src/ccutil/tessdatamanager.cpp | 69 +++++++++++++++++++++++++++++----- src/ccutil/tessdatamanager.h | 6 ++- src/training/Makefile.am | 20 ++++++++++ 7 files changed, 95 insertions(+), 10 deletions(-) diff --git a/.travis.yml b/.travis.yml index 02f7c56798..5df8866b5a 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,7 @@ addons: sources: #- ubuntu-toolchain-r-test packages: + - libarchive-dev #- g++-6 #matrix: diff --git a/configure.ac b/configure.ac index 1ba3ff387d..ef5d1b4751 100644 --- a/configure.ac +++ b/configure.ac @@ -422,6 +422,12 @@ else AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.]) fi +PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false]) +AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive]) +if $have_libarchive; then + AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive]) +fi + AM_CONDITIONAL([ENABLE_TRAINING], true) # Check availability of ICU packages. 
diff --git a/src/api/Makefile.am b/src/api/Makefile.am index 2ab9118447..88eee32b81 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -88,6 +88,7 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS) tesseract_LDADD += $(LEPTONICA_LIBS) tesseract_LDADD += $(OPENMP_CXXFLAGS) +tesseract_LDADD += $(libarchive_LIBS) if T_WIN tesseract_LDADD += -ltiff diff --git a/src/ccutil/Makefile.am b/src/ccutil/Makefile.am index 05d80b628a..692fd33095 100644 --- a/src/ccutil/Makefile.am +++ b/src/ccutil/Makefile.am @@ -40,6 +40,8 @@ libtesseract_ccutil_la_SOURCES = \ unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \ params.cpp universalambigs.cpp +AM_CPPFLAGS += $(libarchive_CFLAGS) + if T_WIN AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\" endif diff --git a/src/ccutil/tessdatamanager.cpp b/src/ccutil/tessdatamanager.cpp index 9a7a75c93c..daf57eb673 100644 --- a/src/ccutil/tessdatamanager.cpp +++ b/src/ccutil/tessdatamanager.cpp @@ -2,7 +2,6 @@ // File: tessdatamanager.cpp // Description: Functions to handle loading/combining tesseract data files. // Author: Daria Antonova -// Created: Wed Jun 03 11:26:43 PST 2009 // // (C) Copyright 2009, Google Inc. 
// Licensed under the Apache License, Version 2.0 (the "License");
@@ -24,6 +23,12 @@
 #include "tessdatamanager.h"
 
 #include <cstdio>
+#include <string>
+
+#if defined(HAVE_LIBARCHIVE)
+#include <archive.h>
+#include <archive_entry.h>
+#endif
 
 #include "errcode.h"
 #include "helpers.h"
@@ -52,9 +57,60 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
   data_file_name_ = data_file_name;
 }
 
+#if defined(HAVE_LIBARCHIVE)
+// Loads all recognized tessdata components from an archive file
+// (any container format / compression filter which libarchive detects).
+// Archive members whose file name does not map to a TessdataType, and
+// members without a positive size, are skipped.
+// Returns true if at least one component was read completely. Returns
+// false if the file cannot be opened as an archive or contains no usable
+// component, so that the caller can fall back to the proprietary format.
+bool TessdataManager::LoadArchiveFile(const char *filename) {
+  archive *a = archive_read_new();
+  if (a == nullptr) return false;
+  archive_read_support_filter_all(a);
+  archive_read_support_format_all(a);
+  if (archive_read_open_filename(a, filename, 8192) != ARCHIVE_OK) {
+#if defined(DEBUG)
+    const char *error = archive_error_string(a);
+    tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
+            filename, error != nullptr ? error : "unknown error");
+#endif
+    archive_read_free(a);
+    return false;
+  }
+  // From here on the file is known to be an archive: drop components of a
+  // previously loaded file (like LoadMemBuffer does), so that they are
+  // neither mixed with the new ones nor reported as a successful load.
+  Clear();
+  archive_entry *ae;
+  while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
+    const char *component = archive_entry_pathname(ae);
+    if (component == nullptr) continue;
+    TessdataType type;
+    if (!TessdataTypeFromFileName(component, &type)) continue;
+    const int64_t size = archive_entry_size(ae);
+    if (size <= 0) continue;
+    entries_[type].resize_no_init(size);
+    if (archive_read_data(a, &entries_[type][0], size) == size) {
+      is_loaded_ = true;
+    } else {
+      // A short or failed read leaves uninitialized bytes in the buffer.
+      // Discard them instead of offering them as a valid component.
+      entries_[type].clear();
+    }
+  }
+  archive_read_free(a);
+  return is_loaded_;
+}
+#endif
+
 bool TessdataManager::Init(const char *data_file_name) {
   GenericVector<char> data;
   if (reader_ == nullptr) {
+#if defined(HAVE_LIBARCHIVE)
+    if (LoadArchiveFile(data_file_name)) return true;
+#endif
     if (!LoadDataFromFile(data_file_name, &data)) return false;
   } else {
     if (!(*reader_)(data_file_name, &data)) return false;
@@ -65,6 +121,7 @@ bool TessdataManager::Init(const char *data_file_name) {
 // Loads from the given memory buffer as if a file.
 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
                                     int size) {
+  // TODO: This method supports only the proprietary file format.
Clear(); data_file_name_ = name; TFile fp; @@ -78,10 +124,10 @@ bool TessdataManager::LoadMemBuffer(const char *name, const char *data, GenericVector offset_table; offset_table.resize_no_init(num_entries); if (!fp.DeSerialize(&offset_table[0], num_entries)) return false; - for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { if (offset_table[i] >= 0) { int64_t entry_size = size - offset_table[i]; - int j = i + 1; + unsigned j = i + 1; while (j < num_entries && offset_table[j] == -1) ++j; if (j < num_entries) entry_size = offset_table[j] - offset_table[i]; entries_[i].resize_no_init(entry_size); @@ -106,6 +152,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data, // Saves to the given filename. bool TessdataManager::SaveFile(const STRING &filename, FileWriter writer) const { + // TODO: This method supports only the proprietary file format. ASSERT_HOST(is_loaded_); GenericVector data; Serialize(&data); @@ -117,11 +164,12 @@ bool TessdataManager::SaveFile(const STRING &filename, // Serializes to the given vector. void TessdataManager::Serialize(GenericVector *data) const { + // TODO: This method supports only the proprietary file format. ASSERT_HOST(is_loaded_); // Compute the offset_table and total size. 
int64_t offset_table[TESSDATA_NUM_ENTRIES]; int64_t offset = sizeof(int32_t) + sizeof(offset_table); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (entries_[i].empty()) { offset_table[i] = -1; } else { @@ -135,7 +183,7 @@ void TessdataManager::Serialize(GenericVector *data) const { fp.OpenWrite(data); fp.Serialize(&num_entries); fp.Serialize(&offset_table[0], countof(offset_table)); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (!entries_[i].empty()) { fp.Serialize(&entries_[i][0], entries_[i].size()); } @@ -144,7 +192,7 @@ void TessdataManager::Serialize(GenericVector *data) const { // Resets to the initial state, keeping the reader. void TessdataManager::Clear() { - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { entries_[i].clear(); } is_loaded_ = false; @@ -154,7 +202,7 @@ void TessdataManager::Clear() { void TessdataManager::Directory() const { tprintf("Version string:%s\n", VersionString().c_str()); int offset = TESSDATA_NUM_ENTRIES * sizeof(int64_t); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (!entries_[i].empty()) { tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i], entries_[i].size(), offset); @@ -197,7 +245,7 @@ bool TessdataManager::CombineDataFiles( const char *language_data_path_prefix, const char *output_filename) { // Load individual tessdata components from files. - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { TessdataType type; ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type)); STRING filename = language_data_path_prefix; @@ -229,6 +277,7 @@ bool TessdataManager::OverwriteComponents( char **component_filenames, int num_new_components) { // Open the files with the new components. 
+ // TODO: This method supports only the proprietary file format. for (int i = 0; i < num_new_components; ++i) { TessdataType type; if (TessdataTypeFromFileName(component_filenames[i], &type)) { @@ -253,14 +302,16 @@ bool TessdataManager::ExtractToFile(const char *filename) { bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type) { - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + for (unsigned i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { *type = static_cast(i); return true; } } +#if defined(DEBUG) tprintf("TessdataManager can't determine which tessdata" " component is represented by %s\n", suffix); +#endif return false; } diff --git a/src/ccutil/tessdatamanager.h b/src/ccutil/tessdatamanager.h index f003adb42d..4372f29138 100644 --- a/src/ccutil/tessdatamanager.h +++ b/src/ccutil/tessdatamanager.h @@ -214,6 +214,11 @@ class TessdataManager { */ bool ExtractToFile(const char *filename); + private: + + // Use libarchive. + bool LoadArchiveFile(const char *filename); + /** * Fills type with TessdataType of the tessdata component represented by the * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. @@ -230,7 +235,6 @@ class TessdataManager { static bool TessdataTypeFromFileName(const char *filename, TessdataType *type); - private: // Name of file it came from. STRING data_file_name_; // Function to load the file when we need it. 
diff --git a/src/training/Makefile.am b/src/training/Makefile.am index d86b8d41df..41bf3735d0 100644 --- a/src/training/Makefile.am +++ b/src/training/Makefile.am @@ -280,3 +280,23 @@ set_unicharset_properties_LDADD += $(LEPTONICA_LIBS) text2image_LDADD += $(LEPTONICA_LIBS) unicharset_extractor_LDADD += $(LEPTONICA_LIBS) wordlist2dawg_LDADD += $(LEPTONICA_LIBS) + +extralib = $(libarchive_LIBS) + +if !DISABLED_LEGACY_ENGINE +ambiguous_words_LDADD += $(extralib) +classifier_tester_LDADD += $(extralib) +cntraining_LDADD += $(extralib) +mftraining_LDADD += $(extralib) +shapeclustering_LDADD += $(extralib) +endif +combine_lang_model_LDADD += $(extralib) +combine_tessdata_LDADD += $(extralib) +dawg2wordlist_LDADD += $(extralib) +lstmeval_LDADD += $(extralib) +lstmtraining_LDADD += $(extralib) +merge_unicharsets_LDADD += $(extralib) +set_unicharset_properties_LDADD += $(extralib) +text2image_LDADD += $(extralib) +unicharset_extractor_LDADD += $(extralib) +wordlist2dawg_LDADD += $(extralib)