Skip to content

Commit

Permalink
Merge pull request #1982 from stweil/unittest
Browse files Browse the repository at this point in the history
Add more unit tests
  • Loading branch information
zdenop committed Oct 12, 2018
2 parents 30081c5 + 2916dc8 commit c5042ab
Show file tree
Hide file tree
Showing 7 changed files with 536 additions and 412 deletions.
4 changes: 3 additions & 1 deletion src/ccmain/paragraphs_internal.h
Expand Up @@ -2,7 +2,6 @@
* File: paragraphs_internal.h
* Description: Paragraph Detection internal data structures.
* Author: David Eger
* Created: 11 March 2011
*
* (C) Copyright 2011, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -21,10 +20,12 @@
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

#include "paragraphs.h"
#include "publictypes.h" // for ParagraphJustification

// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.

class UNICHARSET;
class WERD_CHOICE;

namespace tesseract {
Expand Down Expand Up @@ -299,4 +300,5 @@ void CanonicalizeDetectionResults(
PARA_LIST *paragraphs);

} // namespace

#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
38 changes: 35 additions & 3 deletions unittest/Makefile.am
@@ -1,3 +1,6 @@
# Absolute path of directory 'langdata'.
LANGDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/langdata

# Absolute path of directory 'tessdata' with traineddata files
# (must be on same level as top source directory).
TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
Expand All @@ -6,6 +9,7 @@ TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
# (using submodule test).
TESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing

AM_CPPFLAGS += -DLANGDATA_DIR="\"$(LANGDATA_DIR)\""
AM_CPPFLAGS += -DTESSDATA_DIR="\"$(TESSDATA_DIR)\""
AM_CPPFLAGS += -DTESTING_DIR="\"$(TESTING_DIR)\""
AM_CPPFLAGS += -DPANGO_ENABLE_ENGINE
Expand Down Expand Up @@ -47,7 +51,17 @@ libabseil_la_SOURCES += ../abseil/absl/base/internal/throw_delegate.cc
libabseil_la_SOURCES += ../abseil/absl/base/internal/unscaledcycleclock.cc
libabseil_la_SOURCES += ../abseil/absl/numeric/int128.cc
libabseil_la_SOURCES += ../abseil/absl/strings/ascii.cc
libabseil_la_SOURCES += ../abseil/absl/strings/charconv.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/charconv_bigint.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/charconv_parse.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/memutil.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/arg.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/bind.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/extension.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/float_conversion.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/output.cc
libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/parser.cc
libabseil_la_SOURCES += ../abseil/absl/strings/numbers.cc
libabseil_la_SOURCES += ../abseil/absl/strings/str_cat.cc
libabseil_la_SOURCES += ../abseil/absl/strings/str_split.cc
libabseil_la_SOURCES += ../abseil/absl/strings/string_view.cc
Expand All @@ -73,7 +87,6 @@ GMOCK_LIBS = libgmock.la libgmock_main.la
TESS_LIBS = $(top_builddir)/src/api/libtesseract.la
TRAINING_LIBS = $(top_builddir)/src/training/libtesseract_training.la
TRAINING_LIBS += $(top_builddir)/src/training/libtesseract_tessopt.la
TRAINING_LIBS += $(ICU_UC_LIBS)
AM_CPPFLAGS += -isystem $(top_srcdir)/googletest/googletest/include \
-isystem $(top_srcdir)/googletest/googlemock/include

Expand All @@ -87,14 +100,18 @@ check_PROGRAMS = \
denorm_test \
fileio_test \
heap_test \
imagedata_test \
indexmapbidi_test \
intfeaturemap_test \
intsimdmatrix_test \
lang_model_test \
linlsq_test \
loadlang_test \
mastertrainer_test \
matrix_test \
nthitem_test \
osd_test \
paragraphs_test \
progress_test \
qrsequence_test \
rect_test \
Expand Down Expand Up @@ -137,7 +154,7 @@ colpartition_test_SOURCES = colpartition_test.cc
colpartition_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

commandlineflags_test_SOURCES = commandlineflags_test.cc
commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

denorm_test_SOURCES = denorm_test.cc
denorm_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
Expand All @@ -148,6 +165,9 @@ fileio_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
heap_test_SOURCES = heap_test.cc
heap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

imagedata_test_SOURCES = imagedata_test.cc
imagedata_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)

indexmapbidi_test_SOURCES = indexmapbidi_test.cc
indexmapbidi_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

Expand All @@ -157,18 +177,30 @@ intfeaturemap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
intsimdmatrix_test_SOURCES = intsimdmatrix_test.cc
intsimdmatrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

lang_model_test_SOURCES = lang_model_test.cc
lang_model_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)

linlsq_test_SOURCES = linlsq_test.cc
linlsq_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

loadlang_test_SOURCES = loadlang_test.cc
loadlang_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)

mastertrainer_test_SOURCES = mastertrainer_test.cc
mastertrainer_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)

matrix_test_SOURCES = matrix_test.cc
matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

nthitem_test_SOURCES = nthitem_test.cc
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

pango_font_info_test_SOURCES = pango_font_info_test.cc
pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

paragraphs_test_SOURCES = paragraphs_test.cc
paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)

osd_test_SOURCES = osd_test.cc
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)

Expand Down Expand Up @@ -201,7 +233,7 @@ tfile_test_SOURCES = tfile_test.cc
tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

validator_test_SOURCES = validator_test.cc
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

# for windows
if T_WIN
Expand Down
40 changes: 29 additions & 11 deletions unittest/imagedata_test.cc
@@ -1,7 +1,24 @@
#include "tesseract/ccstruct/imagedata.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include <vector>

#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"

#include "imagedata.h"
#include "include_gunit.h"
#include "log.h"

using tesseract::DocumentCache;
using tesseract::DocumentData;
using tesseract::ImageData;
Expand All @@ -15,8 +32,8 @@ class ImagedataTest : public ::testing::Test {
ImagedataTest() {}

// Creates a fake DocumentData, writes it to a file, and returns the filename.
string MakeFakeDoc(int num_pages, int doc_id,
std::vector<string>* page_texts) {
std::string MakeFakeDoc(int num_pages, int doc_id,
std::vector<std::string>* page_texts) {
// The size of the fake images that we will use.
const int kImageSize = 1048576;
// Not using a real image here - just an array of zeros! We are just testing
Expand All @@ -26,7 +43,7 @@ class ImagedataTest : public ::testing::Test {
for (int p = 0; p < num_pages; ++p) {
// Make some fake text that is different for each page and save it.
page_texts->push_back(
StringPrintf("Page %d of %d in doc %d", p, num_pages, doc_id));
absl::StrFormat("Page %d of %d in doc %d", p, num_pages, doc_id));
// Make an imagedata and put it in the document.
ImageData* imagedata =
ImageData::Build("noname", p, "eng", fake_image.data(),
Expand All @@ -35,7 +52,7 @@ class ImagedataTest : public ::testing::Test {
write_doc.AddPageToDocument(imagedata);
}
// Write it to a file.
string filename = file::JoinPath(
std::string filename = file::JoinPath(
FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf"));
EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr));
return filename;
Expand All @@ -52,8 +69,8 @@ TEST_F(ImagedataTest, CachesProperly) {
// Order in which to read the pages, with some sequential and some seeks.
const int kPageReadOrder[] = {0, 1, 2, 3, 8, 4, 5, 6, 7, 11, 10, 9, -1};

std::vector<string> page_texts;
string filename = MakeFakeDoc(kNumPages, 0, &page_texts);
std::vector<std::string> page_texts;
std::string filename = MakeFakeDoc(kNumPages, 0, &page_texts);
// Now try getting it back with different memory allowances and check that
// the pages can still be read.
for (int m = 0; kMemoryAllowances[m] > 0; ++m) {
Expand All @@ -65,7 +82,8 @@ TEST_F(ImagedataTest, CachesProperly) {
for (int p = 0; kPageReadOrder[p] >= 0; ++p) {
int page = kPageReadOrder[p];
const ImageData* imagedata = read_doc.GetPage(page);
EXPECT_NE(reinterpret_cast<const ImageData*>(nullptr), imagedata);
EXPECT_NE(nullptr, imagedata);
//EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata);
// Check that this is the right page.
EXPECT_STREQ(page_texts[page].c_str(),
imagedata->transcription().string());
Expand All @@ -78,11 +96,11 @@ TEST_F(ImagedataTest, CachesMultiDocs) {
// and the two caching strategies read images in the right order.
// Number of pages in each document.
const std::vector<int> kNumPages = {6, 5, 7};
std::vector<std::vector<string>> page_texts;
std::vector<std::vector<std::string>> page_texts;
GenericVector<STRING> filenames;
for (int d = 0; d < kNumPages.size(); ++d) {
page_texts.emplace_back(std::vector<string>());
string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back());
page_texts.emplace_back(std::vector<std::string>());
std::string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back());
filenames.push_back(STRING(filename.c_str()));
}
// Now try getting them back with different cache strategies and check that
Expand Down
9 changes: 9 additions & 0 deletions unittest/include_gunit.h
Expand Up @@ -28,6 +28,15 @@ class file : public tesseract::File {
static int Defaults() {
return 0;
}

// Joins two path components by forwarding to tesseract::File::JoinPath.
// Declaring any JoinPath overload in this derived class hides the inherited
// one, so the two-argument form is re-exposed explicitly here.
static std::string JoinPath(const std::string& s1, const std::string& s2) {
  std::string joined = tesseract::File::JoinPath(s1, s2);
  return joined;
}

// Joins three path components left to right: s1 and s2 are joined first,
// then s3 is appended to that result using the two-argument overload.
static std::string JoinPath(const std::string& s1, const std::string& s2,
                            const std::string& s3) {
  const std::string head = JoinPath(s1, s2);
  return JoinPath(head, s3);
}
};

#if !defined(ABSL_ARRAYSIZE)
Expand Down
48 changes: 33 additions & 15 deletions unittest/lang_model_test.cc
@@ -1,13 +1,31 @@
#include "tesseract/training/lang_model_helpers.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tesseract/lstm/lstmtrainer.h"
#include "tesseract/training/unicharset_training_utils.h"
#include <string> // for std::string

#include "absl/strings/str_cat.h"

#include "gmock/gmock.h" // for testing::ElementsAreArray

#include "include_gunit.h"
#include "lang_model_helpers.h"
#include "log.h" // for LOG
#include "lstmtrainer.h"
#include "unicharset_training_utils.h"

namespace tesseract {
namespace {

string TestDataNameToPath(const string& name) {
return file::JoinPath(FLAGS_test_srcdir, "testdata", name);
// Returns the path of the test data file `name`, formed by joining it onto
// the TESTING_DIR directory supplied at compile time.
std::string TestDataNameToPath(const std::string& name) {
  const std::string testing_root = TESTING_DIR;
  return file::JoinPath(testing_root, name);
}

// This is an integration test that verifies that CombineLangModel works to
Expand All @@ -18,15 +36,15 @@ TEST(LangModelTest, AddACharacter) {
constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
// Setup the arguments.
string script_dir = file::JoinPath(FLAGS_test_srcdir, "langdata");
string eng_dir = file::JoinPath(script_dir, "eng");
string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
std::string script_dir = LANGDATA_DIR;
std::string eng_dir = file::JoinPath(script_dir, "eng");
std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
UNICHARSET unicharset;
EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
string version_str = "TestVersion";
string output_dir = FLAGS_test_tmpdir;
std::string version_str = "TestVersion";
std::string output_dir = FLAGS_test_tmpdir;
LOG(INFO) << "Output dir=" << output_dir;
string lang1 = "eng";
std::string lang1 = "eng";
bool pass_through_recoder = false;
GenericVector<STRING> words, puncs, numbers;
// If these reads fail, we get a warning message and an empty list of words.
Expand All @@ -44,7 +62,7 @@ TEST(LangModelTest, AddACharacter) {
lang1, pass_through_recoder, words, puncs,
numbers, lang_is_rtl, nullptr, nullptr));
// Init a trainer with it, and encode a string.
string traineddata1 =
std::string traineddata1 =
file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
LSTMTrainer trainer1;
trainer1.InitCharSet(traineddata1);
Expand All @@ -58,13 +76,13 @@ TEST(LangModelTest, AddACharacter) {
&unicharset);
EXPECT_EQ(size_before + 1, unicharset.size());
// Generate the traineddata file.
string lang2 = "extended";
std::string lang2 = "extended";
EXPECT_EQ(EXIT_SUCCESS,
CombineLangModel(unicharset, script_dir, version_str, output_dir,
lang2, pass_through_recoder, words, puncs, numbers,
lang_is_rtl, nullptr, nullptr));
// Init a trainer with it, and encode a string.
string traineddata2 =
std::string traineddata2 =
file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
LSTMTrainer trainer2;
trainer2.InitCharSet(traineddata2);
Expand All @@ -86,7 +104,7 @@ TEST(LangModelTest, AddACharacter) {
}
EXPECT_THAT(labels1_v,
testing::ElementsAreArray(&labels2[0], labels2.size()));
// To make sure we weren't cheating somehow, we can now encode the Rupee
// To make sure we are not cheating somehow, we can now encode the Rupee
// symbol, which we could not do before.
EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));
Expand Down

0 comments on commit c5042ab

Please sign in to comment.