diff --git a/unittest/Makefile.am b/unittest/Makefile.am index 681b9512aa..c24ded9a6c 100644 --- a/unittest/Makefile.am +++ b/unittest/Makefile.am @@ -1,3 +1,6 @@ +# Absolute path of directory 'langdata'. +LANGDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/langdata + # Absolute path of directory 'tessdata' with traineddata files # (must be on same level as top source directory). TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata @@ -6,6 +9,7 @@ TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata # (using submodule test). TESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing +AM_CPPFLAGS += -DLANGDATA_DIR="\"$(LANGDATA_DIR)\"" AM_CPPFLAGS += -DTESSDATA_DIR="\"$(TESSDATA_DIR)\"" AM_CPPFLAGS += -DTESTING_DIR="\"$(TESTING_DIR)\"" AM_CPPFLAGS += -DPANGO_ENABLE_ENGINE @@ -83,7 +87,6 @@ GMOCK_LIBS = libgmock.la libgmock_main.la TESS_LIBS = $(top_builddir)/src/api/libtesseract.la TRAINING_LIBS = $(top_builddir)/src/training/libtesseract_training.la TRAINING_LIBS += $(top_builddir)/src/training/libtesseract_tessopt.la -TRAINING_LIBS += $(ICU_UC_LIBS) AM_CPPFLAGS += -isystem $(top_srcdir)/googletest/googletest/include \ -isystem $(top_srcdir)/googletest/googlemock/include @@ -101,6 +104,7 @@ check_PROGRAMS = \ indexmapbidi_test \ intfeaturemap_test \ intsimdmatrix_test \ + lang_model_test \ linlsq_test \ loadlang_test \ matrix_test \ @@ -149,7 +153,7 @@ colpartition_test_SOURCES = colpartition_test.cc colpartition_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) commandlineflags_test_SOURCES = commandlineflags_test.cc -commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) +commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS) denorm_test_SOURCES = denorm_test.cc denorm_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) @@ -172,6 +176,9 @@ intfeaturemap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) intsimdmatrix_test_SOURCES = intsimdmatrix_test.cc intsimdmatrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) 
+lang_model_test_SOURCES = lang_model_test.cc +lang_model_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS) + linlsq_test_SOURCES = linlsq_test.cc linlsq_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) @@ -222,7 +229,7 @@ tfile_test_SOURCES = tfile_test.cc tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) validator_test_SOURCES = validator_test.cc -validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) +validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS) # for windows if T_WIN diff --git a/unittest/include_gunit.h b/unittest/include_gunit.h index 80c7b9cb1a..9f93271b3b 100644 --- a/unittest/include_gunit.h +++ b/unittest/include_gunit.h @@ -28,6 +28,15 @@ class file : public tesseract::File { static int Defaults() { return 0; } + + static std::string JoinPath(const std::string& s1, const std::string& s2) { + return tesseract::File::JoinPath(s1, s2); + } + + static std::string JoinPath(const std::string& s1, const std::string& s2, + const std::string& s3) { + return JoinPath(JoinPath(s1, s2), s3); + } }; #if !defined(ABSL_ARRAYSIZE) diff --git a/unittest/lang_model_test.cc b/unittest/lang_model_test.cc index 7a577c3aaa..77b4b85a11 100644 --- a/unittest/lang_model_test.cc +++ b/unittest/lang_model_test.cc @@ -1,13 +1,31 @@ -#include "tesseract/training/lang_model_helpers.h" +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
-#include "tesseract/lstm/lstmtrainer.h" -#include "tesseract/training/unicharset_training_utils.h" +#include <string> // for std::string + +#include "absl/strings/str_cat.h" + +#include "gmock/gmock.h" // for testing::ElementsAreArray + +#include "include_gunit.h" +#include "lang_model_helpers.h" +#include "log.h" // for LOG +#include "lstmtrainer.h" +#include "unicharset_training_utils.h" namespace tesseract { namespace { -string TestDataNameToPath(const string& name) { - return file::JoinPath(FLAGS_test_srcdir, "testdata", name); +std::string TestDataNameToPath(const std::string& name) { + return file::JoinPath(TESTING_DIR, name); } // This is an integration test that verifies that CombineLangModel works to @@ -18,15 +36,15 @@ TEST(LangModelTest, AddACharacter) { constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&"; constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹"; // Setup the arguments. - string script_dir = file::JoinPath(FLAGS_test_srcdir, "langdata"); - string eng_dir = file::JoinPath(script_dir, "eng"); - string unicharset_path = TestDataNameToPath("eng_beam.unicharset"); + std::string script_dir = LANGDATA_DIR; + std::string eng_dir = file::JoinPath(script_dir, "eng"); + std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset"); UNICHARSET unicharset; EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str())); - string version_str = "TestVersion"; - string output_dir = FLAGS_test_tmpdir; + std::string version_str = "TestVersion"; + std::string output_dir = FLAGS_test_tmpdir; LOG(INFO) << "Output dir=" << output_dir; - string lang1 = "eng"; + std::string lang1 = "eng"; bool pass_through_recoder = false; GenericVector<STRING> words, puncs, numbers; // If these reads fail, we get a warning message and an empty list of words. @@ -44,7 +62,7 @@ TEST(LangModelTest, AddACharacter) { lang1, pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr, nullptr)); // Init a trainer with it, and encode a string. 
- string traineddata1 = + std::string traineddata1 = file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata")); LSTMTrainer trainer1; trainer1.InitCharSet(traineddata1); @@ -58,13 +76,13 @@ TEST(LangModelTest, AddACharacter) { &unicharset); EXPECT_EQ(size_before + 1, unicharset.size()); // Generate the traineddata file. - string lang2 = "extended"; + std::string lang2 = "extended"; EXPECT_EQ(EXIT_SUCCESS, CombineLangModel(unicharset, script_dir, version_str, output_dir, lang2, pass_through_recoder, words, puncs, numbers, lang_is_rtl, nullptr, nullptr)); // Init a trainer with it, and encode a string. - string traineddata2 = + std::string traineddata2 = file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata")); LSTMTrainer trainer2; trainer2.InitCharSet(traineddata2); @@ -86,7 +104,7 @@ TEST(LangModelTest, AddACharacter) { } EXPECT_THAT(labels1_v, testing::ElementsAreArray(&labels2[0], labels2.size())); - // To make sure we weren't cheating somehow, we can now encode the Rupee + // To make sure we are not cheating somehow, we can now encode the Rupee // symbol, which we could not do before. EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1)); EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));