Merge pull request #1982 from stweil/unittest

Add more unit tests
tesseract-ocr · Oct 12, 2018 · c5042ab · c5042ab
2 parents 30081c5 + 2916dc8
commit c5042ab
Show file tree

Hide file tree

Showing 7 changed files with 536 additions and 412 deletions.
diff --git a/src/ccmain/paragraphs_internal.h b/src/ccmain/paragraphs_internal.h
@@ -2,7 +2,6 @@
  * File:        paragraphs_internal.h
  * Description: Paragraph Detection internal data structures.
  * Author:      David Eger
- * Created:     11 March 2011
  *
  * (C) Copyright 2011, Google Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
@@ -21,10 +20,12 @@
 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
 
 #include "paragraphs.h"
+#include "publictypes.h"        // for ParagraphJustification
 
 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
 
+class UNICHARSET;
 class WERD_CHOICE;
 
 namespace tesseract {
@@ -299,4 +300,5 @@ void CanonicalizeDetectionResults(
     PARA_LIST *paragraphs);
 
 }  // namespace
+
 #endif  // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
diff --git a/unittest/Makefile.am b/unittest/Makefile.am
@@ -1,3 +1,6 @@
+# Absolute path of directory 'langdata' (must be on same level as top source directory).
+LANGDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/langdata
+
 # Absolute path of directory 'tessdata' with traineddata files
 # (must be on same level as top source directory).
 TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
@@ -6,6 +9,7 @@ TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
 # (using submodule test).
 TESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing
 
+AM_CPPFLAGS += -DLANGDATA_DIR="\"$(LANGDATA_DIR)\""
 AM_CPPFLAGS += -DTESSDATA_DIR="\"$(TESSDATA_DIR)\""
 AM_CPPFLAGS += -DTESTING_DIR="\"$(TESTING_DIR)\""
 AM_CPPFLAGS += -DPANGO_ENABLE_ENGINE
@@ -47,7 +51,17 @@ libabseil_la_SOURCES += ../abseil/absl/base/internal/throw_delegate.cc
 libabseil_la_SOURCES += ../abseil/absl/base/internal/unscaledcycleclock.cc
 libabseil_la_SOURCES += ../abseil/absl/numeric/int128.cc
 libabseil_la_SOURCES += ../abseil/absl/strings/ascii.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/charconv.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/charconv_bigint.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/charconv_parse.cc
 libabseil_la_SOURCES += ../abseil/absl/strings/internal/memutil.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/arg.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/bind.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/extension.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/float_conversion.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/output.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/internal/str_format/parser.cc
+libabseil_la_SOURCES += ../abseil/absl/strings/numbers.cc
 libabseil_la_SOURCES += ../abseil/absl/strings/str_cat.cc
 libabseil_la_SOURCES += ../abseil/absl/strings/str_split.cc
 libabseil_la_SOURCES += ../abseil/absl/strings/string_view.cc
@@ -73,7 +87,6 @@ GMOCK_LIBS =  libgmock.la libgmock_main.la
 TESS_LIBS = $(top_builddir)/src/api/libtesseract.la
 TRAINING_LIBS = $(top_builddir)/src/training/libtesseract_training.la
 TRAINING_LIBS += $(top_builddir)/src/training/libtesseract_tessopt.la
-TRAINING_LIBS += $(ICU_UC_LIBS)
 AM_CPPFLAGS +=   -isystem $(top_srcdir)/googletest/googletest/include \
                  -isystem $(top_srcdir)/googletest/googlemock/include
 
@@ -87,14 +100,18 @@ check_PROGRAMS = \
   denorm_test \
   fileio_test \
   heap_test \
+  imagedata_test \
   indexmapbidi_test \
   intfeaturemap_test \
   intsimdmatrix_test \
+  lang_model_test \
   linlsq_test \
   loadlang_test \
+  mastertrainer_test \
   matrix_test \
   nthitem_test \
   osd_test \
+  paragraphs_test \
   progress_test \
   qrsequence_test \
   rect_test \
@@ -137,7 +154,7 @@ colpartition_test_SOURCES = colpartition_test.cc
 colpartition_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
 commandlineflags_test_SOURCES = commandlineflags_test.cc
-commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
+commandlineflags_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
 
 denorm_test_SOURCES = denorm_test.cc
 denorm_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
@@ -148,6 +165,9 @@ fileio_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
 heap_test_SOURCES = heap_test.cc
 heap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
+imagedata_test_SOURCES = imagedata_test.cc
+imagedata_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
+
 indexmapbidi_test_SOURCES = indexmapbidi_test.cc
 indexmapbidi_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
@@ -157,18 +177,30 @@ intfeaturemap_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 intsimdmatrix_test_SOURCES = intsimdmatrix_test.cc
 intsimdmatrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
+lang_model_test_SOURCES = lang_model_test.cc
+lang_model_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)
+
 linlsq_test_SOURCES = linlsq_test.cc
 linlsq_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
 loadlang_test_SOURCES = loadlang_test.cc
 loadlang_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
 
+mastertrainer_test_SOURCES = mastertrainer_test.cc
+mastertrainer_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
+
 matrix_test_SOURCES = matrix_test.cc
 matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
 nthitem_test_SOURCES = nthitem_test.cc
 nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
+pango_font_info_test_SOURCES = pango_font_info_test.cc
+pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
+
+paragraphs_test_SOURCES = paragraphs_test.cc
+paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
+
 osd_test_SOURCES = osd_test.cc
 osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
 
@@ -201,7 +233,7 @@ tfile_test_SOURCES = tfile_test.cc
 tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
 
 validator_test_SOURCES = validator_test.cc
-validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS)
+validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)
 
 # for windows
 if T_WIN

diff --git a/unittest/imagedata_test.cc b/unittest/imagedata_test.cc
@@ -1,7 +1,24 @@
-#include "tesseract/ccstruct/imagedata.h"
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 #include <string>
 #include <vector>
 
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+
+#include "imagedata.h"
+#include "include_gunit.h"
+#include "log.h"
+
 using tesseract::DocumentCache;
 using tesseract::DocumentData;
 using tesseract::ImageData;
@@ -15,8 +32,8 @@ class ImagedataTest : public ::testing::Test {
   ImagedataTest() {}
 
   // Creates a fake DocumentData, writes it to a file, and returns the filename.
-  string MakeFakeDoc(int num_pages, int doc_id,
-                     std::vector<string>* page_texts) {
+  std::string MakeFakeDoc(int num_pages, int doc_id,
+                     std::vector<std::string>* page_texts) {
     // The size of the fake images that we will use.
     const int kImageSize = 1048576;
     // Not using a real image here - just an array of zeros! We are just testing
@@ -26,7 +43,7 @@ class ImagedataTest : public ::testing::Test {
     for (int p = 0; p < num_pages; ++p) {
       // Make some fake text that is different for each page and save it.
       page_texts->push_back(
-          StringPrintf("Page %d of %d in doc %d", p, num_pages, doc_id));
+          absl::StrFormat("Page %d of %d in doc %d", p, num_pages, doc_id));
       // Make an imagedata and put it in the document.
       ImageData* imagedata =
           ImageData::Build("noname", p, "eng", fake_image.data(),
@@ -35,7 +52,7 @@ class ImagedataTest : public ::testing::Test {
       write_doc.AddPageToDocument(imagedata);
     }
     // Write it to a file.
-    string filename = file::JoinPath(
+    std::string filename = file::JoinPath(
         FLAGS_test_tmpdir, absl::StrCat("documentdata", doc_id, ".lstmf"));
     EXPECT_TRUE(write_doc.SaveDocument(filename.c_str(), nullptr));
     return filename;
@@ -52,8 +69,8 @@ TEST_F(ImagedataTest, CachesProperly) {
   // Order in which to read the pages, with some sequential and some seeks.
   const int kPageReadOrder[] = {0, 1, 2, 3, 8, 4, 5, 6, 7, 11, 10, 9, -1};
 
-  std::vector<string> page_texts;
-  string filename = MakeFakeDoc(kNumPages, 0, &page_texts);
+  std::vector<std::string> page_texts;
+  std::string filename = MakeFakeDoc(kNumPages, 0, &page_texts);
   // Now try getting it back with different memory allowances and check that
   // the pages can still be read.
   for (int m = 0; kMemoryAllowances[m] > 0; ++m) {
@@ -65,7 +82,8 @@ TEST_F(ImagedataTest, CachesProperly) {
     for (int p = 0; kPageReadOrder[p] >= 0; ++p) {
       int page = kPageReadOrder[p];
       const ImageData* imagedata = read_doc.GetPage(page);
-      EXPECT_NE(reinterpret_cast<const ImageData*>(nullptr), imagedata);
+      EXPECT_NE(nullptr, imagedata);
+      //EXPECT_NE(reinterpret_cast<ImageData*>(nullptr), imagedata);
       // Check that this is the right page.
       EXPECT_STREQ(page_texts[page].c_str(),
                    imagedata->transcription().string());
@@ -78,11 +96,11 @@ TEST_F(ImagedataTest, CachesMultiDocs) {
   // and the two caching strategies read images in the right order.
   // Number of pages in each document.
   const std::vector<int> kNumPages = {6, 5, 7};
-  std::vector<std::vector<string>> page_texts;
+  std::vector<std::vector<std::string>> page_texts;
   GenericVector<STRING> filenames;
   for (int d = 0; d < kNumPages.size(); ++d) {
-    page_texts.emplace_back(std::vector<string>());
-    string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back());
+    page_texts.emplace_back(std::vector<std::string>());
+    std::string filename = MakeFakeDoc(kNumPages[d], d, &page_texts.back());
     filenames.push_back(STRING(filename.c_str()));
   }
   // Now try getting them back with different cache strategies and check that

diff --git a/unittest/include_gunit.h b/unittest/include_gunit.h
@@ -28,6 +28,15 @@ class file : public tesseract::File {
   static int Defaults() {
     return 0;
   }
+
+  static std::string JoinPath(const std::string& s1, const std::string& s2) {
+    return tesseract::File::JoinPath(s1, s2);
+  }
+
+  static std::string JoinPath(const std::string& s1, const std::string& s2,
+                              const std::string& s3) {
+    return JoinPath(JoinPath(s1, s2), s3);
+  }
 };
 
 #if !defined(ABSL_ARRAYSIZE)

diff --git a/unittest/lang_model_test.cc b/unittest/lang_model_test.cc
@@ -1,13 +1,31 @@
-#include "tesseract/training/lang_model_helpers.h"
+// (C) Copyright 2017, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
-#include "tesseract/lstm/lstmtrainer.h"
-#include "tesseract/training/unicharset_training_utils.h"
+#include <string>                       // for std::string
+
+#include "absl/strings/str_cat.h"
+
+#include "gmock/gmock.h"                // for testing::ElementsAreArray
+
+#include "include_gunit.h"
+#include "lang_model_helpers.h"
+#include "log.h"                        // for LOG
+#include "lstmtrainer.h"
+#include "unicharset_training_utils.h"
 
 namespace tesseract {
 namespace {
 
-string TestDataNameToPath(const string& name) {
-  return file::JoinPath(FLAGS_test_srcdir, "testdata", name);
+std::string TestDataNameToPath(const std::string& name) {
+  return file::JoinPath(TESTING_DIR, name);
 }
 
 // This is an integration test that verifies that CombineLangModel works to
@@ -18,15 +36,15 @@ TEST(LangModelTest, AddACharacter) {
   constexpr char kTestString[] = "Simple ASCII string to encode !@#$%&";
   constexpr char kTestStringRupees[] = "ASCII string with Rupee symbol ₹";
   // Setup the arguments.
-  string script_dir = file::JoinPath(FLAGS_test_srcdir, "langdata");
-  string eng_dir = file::JoinPath(script_dir, "eng");
-  string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
+  std::string script_dir = LANGDATA_DIR;
+  std::string eng_dir = file::JoinPath(script_dir, "eng");
+  std::string unicharset_path = TestDataNameToPath("eng_beam.unicharset");
   UNICHARSET unicharset;
   EXPECT_TRUE(unicharset.load_from_file(unicharset_path.c_str()));
-  string version_str = "TestVersion";
-  string output_dir = FLAGS_test_tmpdir;
+  std::string version_str = "TestVersion";
+  std::string output_dir = FLAGS_test_tmpdir;
   LOG(INFO) << "Output dir=" << output_dir;
-  string lang1 = "eng";
+  std::string lang1 = "eng";
   bool pass_through_recoder = false;
   GenericVector<STRING> words, puncs, numbers;
   // If these reads fail, we get a warning message and an empty list of words.
@@ -44,7 +62,7 @@ TEST(LangModelTest, AddACharacter) {
                                 lang1, pass_through_recoder, words, puncs,
                                 numbers, lang_is_rtl, nullptr, nullptr));
   // Init a trainer with it, and encode a string.
-  string traineddata1 =
+  std::string traineddata1 =
       file::JoinPath(output_dir, lang1, absl::StrCat(lang1, ".traineddata"));
   LSTMTrainer trainer1;
   trainer1.InitCharSet(traineddata1);
@@ -58,13 +76,13 @@ TEST(LangModelTest, AddACharacter) {
                        &unicharset);
   EXPECT_EQ(size_before + 1, unicharset.size());
   // Generate the traineddata file.
-  string lang2 = "extended";
+  std::string lang2 = "extended";
   EXPECT_EQ(EXIT_SUCCESS,
             CombineLangModel(unicharset, script_dir, version_str, output_dir,
                              lang2, pass_through_recoder, words, puncs, numbers,
                              lang_is_rtl, nullptr, nullptr));
   // Init a trainer with it, and encode a string.
-  string traineddata2 =
+  std::string traineddata2 =
       file::JoinPath(output_dir, lang2, absl::StrCat(lang2, ".traineddata"));
   LSTMTrainer trainer2;
   trainer2.InitCharSet(traineddata2);
@@ -86,7 +104,7 @@ TEST(LangModelTest, AddACharacter) {
   }
   EXPECT_THAT(labels1_v,
               testing::ElementsAreArray(&labels2[0], labels2.size()));
-  // To make sure we weren't cheating somehow, we can now encode the Rupee
+  // To make sure we are not cheating somehow, we can now encode the Rupee
   // symbol, which we could not do before.
   EXPECT_FALSE(trainer1.EncodeString(kTestStringRupees, &labels1));
   EXPECT_TRUE(trainer2.EncodeString(kTestStringRupees, &labels2));