Skip to content

Commit

Permalink
Fix unicharset_test
Browse files Browse the repository at this point in the history
  • Loading branch information
Shreeshrii committed Jan 18, 2019
1 parent db3ed5d commit 20ed60b
Show file tree
Hide file tree
Showing 3 changed files with 48 additions and 11 deletions.
10 changes: 10 additions & 0 deletions unittest/Makefile.am
Expand Up @@ -8,10 +8,12 @@ TESSDATA_DIR=$(shell cd $(top_srcdir) && cd .. && pwd)/tessdata
# Absolute path of directory 'testing' with test images and ground truth texts
# (using submodule test).
TESTING_DIR=$(shell cd $(top_srcdir) && pwd)/test/testing
TESTDATA_DIR=$(shell cd $(top_srcdir) && pwd)/test/testdata

AM_CPPFLAGS += -DLANGDATA_DIR="\"$(LANGDATA_DIR)\""
AM_CPPFLAGS += -DTESSDATA_DIR="\"$(TESSDATA_DIR)\""
AM_CPPFLAGS += -DTESTING_DIR="\"$(TESTING_DIR)\""
AM_CPPFLAGS += -DTESTDATA_DIR="\"$(TESTDATA_DIR)\""
AM_CPPFLAGS += -DPANGO_ENABLE_ENGINE
AM_CPPFLAGS += -I$(top_builddir)/src/api
AM_CPPFLAGS += -I$(top_srcdir)/src/api
Expand Down Expand Up @@ -123,6 +125,8 @@ check_PROGRAMS = \

if ENABLE_TRAINING
check_PROGRAMS += commandlineflags_test
check_PROGRAMS += unichar_test
check_PROGRAMS += unicharset_test
check_PROGRAMS += validator_test
endif

Expand Down Expand Up @@ -235,6 +239,12 @@ tabvector_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
tfile_test_SOURCES = tfile_test.cc
tfile_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)

unichar_test_SOURCES = unichar_test.cc
unichar_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

unicharset_test_SOURCES = unicharset_test.cc
unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

validator_test_SOURCES = validator_test.cc
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

Expand Down
18 changes: 15 additions & 3 deletions unittest/unichar_test.cc
@@ -1,4 +1,16 @@
#include "tesseract/ccutil/unichar.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "unichar.h"
#include "include_gunit.h"

using tesseract::UNICHAR;

Expand All @@ -13,7 +25,7 @@ TEST(UnicharTest, Conversion) {
// Check for round-trip conversion.
std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kUTF8Src);
EXPECT_THAT(utf32, testing::ElementsAreArray(kUTF32Src));
string utf8 = UNICHAR::UTF32ToUTF8(utf32);
std::string utf8 = UNICHAR::UTF32ToUTF8(utf32);
EXPECT_STREQ(kUTF8Src, utf8.c_str());
}

Expand All @@ -25,7 +37,7 @@ TEST(UnicharTest, InvalidText) {
std::vector<char32> utf32 = UNICHAR::UTF8ToUTF32(kInvalidUTF8);
EXPECT_TRUE(utf32.empty());
// Invalid utf32 produces an empty string.
string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32);
std::string utf8 = UNICHAR::UTF32ToUTF8(kInvalidUTF32);
EXPECT_TRUE(utf8.empty());
}

Expand Down
31 changes: 23 additions & 8 deletions unittest/unicharset_test.cc
@@ -1,4 +1,19 @@
#include "tesseract/ccutil/unicharset.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <string>
#include "log.h" // for LOG
#include "unicharset.h"
#include "gmock/gmock.h" // for testing::ElementsAreArray
#include "include_gunit.h"

using testing::ElementsAreArray;

Expand Down Expand Up @@ -32,7 +47,7 @@ TEST(UnicharsetTest, Basics) {
std::vector<int> v(&labels[0], &labels[0] + labels.size());
EXPECT_THAT(v, ElementsAreArray({3, 4, 4, 5, 7, 6}));
// With the fi ligature, encoding fails without a pre-cleanup.
string lig_str = "af\ufb01ne";
std::string lig_str = "af\ufb01ne";
EXPECT_FALSE(
u.encode_string(lig_str.c_str(), true, &labels, nullptr, nullptr));
lig_str = u.CleanupString(lig_str.c_str());
Expand Down Expand Up @@ -62,7 +77,7 @@ TEST(UnicharsetTest, Multibyte) {
EXPECT_EQ(u.size(), 9);
EXPECT_EQ(u.unichar_to_id("\u0627"), 3);
EXPECT_EQ(u.unichar_to_id("\u062c"), 4);
// The first two bytes of this string is \u0627, which matches id 3;
// The first two bytes of this string are \u0627, which matches id 3;
EXPECT_EQ(u.unichar_to_id("\u0627\u062c", 2), 3);
EXPECT_EQ(u.unichar_to_id("\u062f"), 5);
// Individual f and i are not present, but they are there as a pair.
Expand All @@ -79,13 +94,13 @@ TEST(UnicharsetTest, Multibyte) {
// With the fi ligature the fi is picked out.
GenericVector<char> lengths;
int encoded_length;
string src_str = "\u0627\u062c\ufb01\u0635\u062b";
std::string src_str = "\u0627\u062c\ufb01\u0635\u062b";
// src_str has to be pre-cleaned for lengths to be correct.
string cleaned = u.CleanupString(src_str.c_str());
std::string cleaned = u.CleanupString(src_str.c_str());
EXPECT_TRUE(u.encode_string(cleaned.c_str(), true, &labels, &lengths,
&encoded_length));
EXPECT_EQ(encoded_length, cleaned.size());
string len_str(&lengths[0], lengths.size());
std::string len_str(&lengths[0], lengths.size());
EXPECT_STREQ(len_str.c_str(), "\002\002\002\002\002");
v = std::vector<int>(&labels[0], &labels[0] + labels.size());
EXPECT_THAT(v, ElementsAreArray({3, 4, 6, 8, 7}));
Expand Down Expand Up @@ -128,8 +143,8 @@ TEST(UnicharsetTest, MultibyteBigrams) {
TEST(UnicharsetTest, OldStyle) {
// This test verifies an old unicharset that contains fi/fl ligatures loads
// and keeps all the entries.
string filename =
file::JoinPath(FLAGS_test_srcdir, "testdata", "eng.unicharset");
std::string filename =
file::JoinPath(TESTDATA_DIR, "eng.unicharset");
UNICHARSET u;
LOG(INFO) << "Filename=" << filename;
EXPECT_TRUE(u.load_from_file(filename.c_str()));
Expand Down

0 comments on commit 20ed60b

Please sign in to comment.