Skip to content

Commit

Permalink
unittest: Add lang_model_test (only works partially)
Browse files Browse the repository at this point in the history
The test currently has subtests which fail because of missing files.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Oct 12, 2018
1 parent de6a759 commit f93fb9d
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 18 deletions.
13 changes: 10 additions & 3 deletions unittest/Makefile.am
@@ -1,3 +1,6 @@
# Absolute path of directory 'langdata'.
LANGDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/langdata

# Absolute path of directory 'tessdata' with traineddata files
# (must be on same level as top source directory).
TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
Expand All @@ -6,6 +9,7 @@ TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
# (using submodule test).
TESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing

AM_CPPFLAGS += -DLANGDATA_DIR="\"$(LANGDATA_DIR)\""
AM_CPPFLAGS += -DTESSDATA_DIR="\"$(TESSDATA_DIR)\""
AM_CPPFLAGS += -DTESTING_DIR="\"$(TESTING_DIR)\""
AM_CPPFLAGS += -DPANGO_ENABLE_ENGINE
Expand Down Expand Up @@ -83,7 +87,6 @@ GMOCK_LIBS = libgmock.la libgmock_main.la
TESS_LIBS = $(top_builddir)/src/api/libtesseract.la
TRAINING_LIBS = $(top_builddir)/src/training/libtesseract_training.la
TRAINING_LIBS += $(top_builddir)/src/training/libtesseract_tessopt.la
TRAINING_LIBS += $(ICU_UC_LIBS)
AM_CPPFLAGS += -isystem $(top_srcdir)/googletest/googletest/include \
-isystem $(top_srcdir)/googletest/googlemock/include

Expand All @@ -101,6 +104,7 @@ check_PROGRAMS = \
indexmapbidi_test \
intfeaturemap_test \
intsimdmatrix_test \
lang_model_test \
linlsq_test \
loadlang_test \
matrix_test \
Expand Down Expand Up @@ -149,7 +153,7 @@ colpartition_test_SOURCES = colpartition_test.cc
colpartition_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

commandlineflags_test_SOURCES = commandlineflags_test.cc
commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

denorm_test_SOURCES = denorm_test.cc
denorm_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
Expand All @@ -172,6 +176,9 @@ intfeaturemap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
intsimdmatrix_test_SOURCES = intsimdmatrix_test.cc
intsimdmatrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

lang_model_test_SOURCES = lang_model_test.cc
lang_model_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)

linlsq_test_SOURCES = linlsq_test.cc
linlsq_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

Expand Down Expand Up @@ -222,7 +229,7 @@ tfile_test_SOURCES = tfile_test.cc
tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

validator_test_SOURCES = validator_test.cc
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

# for windows
if T_WIN
Expand Down
9 changes: 9 additions & 0 deletions unittest/include_gunit.h
Expand Up @@ -28,6 +28,15 @@ class file : public tesseract::File {
static int Defaults() {
return 0;
}

// Two-component path join: forwards both components to
// tesseract::File::JoinPath and hands back its result as a std::string.
// The call is fully qualified so it names the tesseract::File version
// explicitly rather than one of the JoinPath overloads declared here.
static std::string JoinPath(const std::string& s1, const std::string& s2) {
  std::string joined = tesseract::File::JoinPath(s1, s2);
  return joined;
}

// Three-component path join, built from two successive two-component joins:
// first s1 with s2, then that intermediate result with s3.
static std::string JoinPath(const std::string& s1, const std::string& s2,
                            const std::string& s3) {
  const std::string head = JoinPath(s1, s2);
  return JoinPath(head, s3);
}
};

#if !defined(ABSL_ARRAYSIZE)
Expand Down
48 changes: 33 additions & 15 deletions unittest/lang_model_test.cc
@@ -1,13 +1,31 @@
#include "tesseract/training/lang_model_helpers.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tesseract/lstm/lstmtrainer.h"
#include "tesseract/training/unicharset_training_utils.h"
#include <string> // for std::string

#include "absl/strings/str_cat.h"

#include "gmock/gmock.h" // for testing::ElementsAreArray

#include "include_gunit.h"
#include "lang_model_helpers.h"
#include "log.h" // for LOG
#include "lstmtrainer.h"
#include "unicharset_training_utils.h"

namespace tesseract {
namespace {

string TestDataNameToPath(const string& name) {
return file::JoinPath(FLAGS_test_srcdir, "testdata", name);
std::string TestDataNameToPath(const std::string& name) {
return file::JoinPath(TESTING_DIR, name);
}

// This is an integration test that verifies that CombineLangModel works to
Expand All @@ -18,15 +36,15 @@ TEST(LangModelTest, AddACharacter) {
constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
// Setup the arguments.
string script_dir = file::JoinPath(FLAGS_test_srcdir, "langdata");
string eng_dir = file::JoinPath(script_dir, "eng");
string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
std::string script_dir = LANGDATA_DIR;
std::string eng_dir = file::JoinPath(script_dir, "eng");
std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
UNICHARSET unicharset;
EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
string version_str = "TestVersion";
string output_dir = FLAGS_test_tmpdir;
std::string version_str = "TestVersion";
std::string output_dir = FLAGS_test_tmpdir;
LOG(INFO) << "Output dir=" << output_dir;
string lang1 = "eng";
std::string lang1 = "eng";
bool pass_through_recoder = false;
GenericVector<STRING> words, puncs, numbers;
// If these reads fail, we get a warning message and an empty list of words.
Expand All @@ -44,7 +62,7 @@ TEST(LangModelTest, AddACharacter) {
lang1, pass_through_recoder, words, puncs,
numbers, lang_is_rtl, nullptr, nullptr));
// Init a trainer with it, and encode a string.
string traineddata1 =
std::string traineddata1 =
file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
LSTMTrainer trainer1;
trainer1.InitCharSet(traineddata1);
Expand All @@ -58,13 +76,13 @@ TEST(LangModelTest, AddACharacter) {
&unicharset);
EXPECT_EQ(size_before + 1, unicharset.size());
// Generate the traineddata file.
string lang2 = "extended";
std::string lang2 = "extended";
EXPECT_EQ(EXIT_SUCCESS,
CombineLangModel(unicharset, script_dir, version_str, output_dir,
lang2, pass_through_recoder, words, puncs, numbers,
lang_is_rtl, nullptr, nullptr));
// Init a trainer with it, and encode a string.
string traineddata2 =
std::string traineddata2 =
file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
LSTMTrainer trainer2;
trainer2.InitCharSet(traineddata2);
Expand All @@ -86,7 +104,7 @@ TEST(LangModelTest, AddACharacter) {
}
EXPECT_THAT(labels1_v,
testing::ElementsAreArray(&labels2[0], labels2.size()));
// To make sure we weren't cheating somehow, we can now encode the Rupee
// To make sure we are not cheating somehow, we can now encode the Rupee
// symbol, which we could not do before.
EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
Expand Down

0 comments on commit f93fb9d

Please sign in to comment.