From 611d5e635897d8ac97a0a5a9bdc4697bd343eb04 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Tue, 22 Jan 2019 15:17:54 +0100 Subject: [PATCH] unittest: Fix and enable validate_indic_test Signed-off-by: Stefan Weil --- unittest/Makefile.am | 4 ++ unittest/normstrngs_test.h | 41 +++++++++----- unittest/validate_indic_test.cc | 97 ++++++++++++++++++--------------- 3 files changed, 85 insertions(+), 57 deletions(-) diff --git a/unittest/Makefile.am b/unittest/Makefile.am index e93c5dd72f..ce014b9c08 100644 --- a/unittest/Makefile.am +++ b/unittest/Makefile.am @@ -136,6 +136,7 @@ check_PROGRAMS += unichar_test check_PROGRAMS += unicharset_test check_PROGRAMS += unicharcompress_test check_PROGRAMS += validate_grapheme_test +check_PROGRAMS += validate_indic_test check_PROGRAMS += validator_test endif @@ -266,6 +267,9 @@ unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIB validate_grapheme_test_SOURCES = validate_grapheme_test.cc validate_grapheme_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS) +validate_indic_test_SOURCES = validate_indic_test.cc +validate_indic_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS) + validator_test_SOURCES = validator_test.cc validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS) diff --git a/unittest/normstrngs_test.h b/unittest/normstrngs_test.h index e19238b047..2961bf613a 100644 --- a/unittest/normstrngs_test.h +++ b/unittest/normstrngs_test.h @@ -1,42 +1,55 @@ +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #ifndef TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ #define TESSERACT_UNITTEST_NORMSTRNGS_TEST_H_ +#include // for std::stringstream #include #include -#include "base/stringprintf.h" #include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" -#include "tesseract/ccutil/unichar.h" +#include "unichar.h" namespace tesseract { -inline string CodepointList(const std::vector& str32) { - string result; +inline std::string CodepointList(const std::vector& str32) { + std::stringstream result; int total_chars = str32.size(); + result << std::hex; for (int i = 0; i < total_chars; ++i) { - StringAppendF(&result, "[%x]", str32[i]); + result << "[" << str32[i] << "]"; } - return result; + return result.str(); } -inline string PrintString32WithUnicodes(const string& str) { +inline std::string PrintString32WithUnicodes(const std::string& str) { std::vector str32 = UNICHAR::UTF8ToUTF32(str.c_str()); return absl::StrCat("\"", str, "\" ", CodepointList(str32)); } -inline string PrintStringVectorWithUnicodes(const std::vector& glyphs) { - string result; +inline std::string PrintStringVectorWithUnicodes(const std::vector& glyphs) { + std::string result; for (const auto& s : glyphs) { - absl::StrAppend(&result, "Glyph:", PrintString32WithUnicodes(s), "\n"); + result += "Glyph:"; + result += PrintString32WithUnicodes(s) + "\n"; } return result; } -inline void ExpectGraphemeModeResults(const string& str, UnicodeNormMode u_mode, +inline void ExpectGraphemeModeResults(const std::string& str, UnicodeNormMode u_mode, int unicode_count, int glyph_count, int grapheme_count, - const string& target_str) { - std::vector glyphs; + const std::string& target_str) { + std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( u_mode, OCRNorm::kNone, GraphemeNormMode::kIndividualUnicodes, true, str.c_str(), &glyphs)); @@ -60,7 +73,7 @@ inline void ExpectGraphemeModeResults(const string& str, UnicodeNormMode u_mode, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 1) << PrintStringVectorWithUnicodes(glyphs); EXPECT_EQ(target_str, glyphs[0]); - string result; + std::string result; EXPECT_TRUE(NormalizeUTF8String( u_mode, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result)); EXPECT_EQ(target_str, result); diff --git a/unittest/validate_indic_test.cc b/unittest/validate_indic_test.cc index e87635fd00..42f78a2249 100644 --- a/unittest/validate_indic_test.cc +++ b/unittest/validate_indic_test.cc @@ -1,6 +1,17 @@ -#include "tesseract/training/normstrngs.h" - -#include "tesseract/unittest/normstrngs_test.h" +// (C) Copyright 2017, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "include_gunit.h" +#include "normstrngs.h" +#include "normstrngs_test.h" namespace tesseract { namespace { @@ -16,8 +27,8 @@ namespace { // normalizer always puts a termninating ZWNJ on the end if not present, // and accepts the string as valid. TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) { - string str = "\u0c15\u0c4d"; // KA - virama - string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ + std::string str = "\u0c15\u0c4d"; // KA - virama + std::string target_str = "\u0c15\u0c4d\u200c"; // KA - virama - ZWNJ ExpectGraphemeModeResults(str, UnicodeNormMode::kNFC, 3, 2, 1, target_str); // Same result if we started with the normalized string. ExpectGraphemeModeResults(target_str, UnicodeNormMode::kNFC, 3, 2, 1, @@ -26,8 +37,8 @@ TEST(ValidateIndicTest, AddsJoinerToTerminalVirama) { // Only one dependent vowel is allowed. TEST(ValidateIndicTest, OnlyOneDependentVowel) { - string str = "\u0d15\u0d3e\u0d42"; // KA AA UU - string dest; + std::string str = "\u0d15\u0d3e\u0d42"; // KA AA UU + std::string dest; EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &dest)) @@ -43,13 +54,13 @@ TEST(ValidateIndicTest, OnlyOneDependentVowel) { // References: // http://www.omniglot.com/writing/telugu.htm TEST(ValidateIndicTest, OnlyOneVowelModifier) { - string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu - string result; + std::string str = "\u0c26\u0c4d\u0c01"; // DA virama candrabindu + std::string result; EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &result)); // It made 1 grapheme of 4 chars, by terminating the explicit virama. - EXPECT_EQ(string("\u0c26\u0c4d\u200c\u0c01"), result); + EXPECT_EQ(std::string("\u0c26\u0c4d\u200c\u0c01"), result); str = "\u0995\u0983\u0981"; // KA visarga candrabindu EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, @@ -73,8 +84,8 @@ TEST(ValidateIndicTest, OnlyOneVowelModifier) { // and the Microsoft page // http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx TEST(ValidateIndicTest, VowelModifierMustBeLast) { - string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I - string dest; + std::string str = "\u0c28\u0c02\u0c3f"; // NA Sunna I + std::string dest; EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &dest)) @@ -96,8 +107,8 @@ TEST(ValidateIndicTest, VowelModifierMustBeLast) { // Principles of the Devanagari Script: Dependent Vowel Signs (Matras). // + http://varamozhi.sourceforge.net/iscii91.pdf TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) { - string str = "\u0c05\u0c47"; // A EE - string dest; + std::string str = "\u0c05\u0c47"; // A EE + std::string dest; EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &dest)) @@ -111,8 +122,8 @@ TEST(ValidateIndicTest, MatrasFollowConsonantsNotVowels) { // Sub-graphemes are allowed if GraphemeNorm is turned off. TEST(ValidateIndicTest, SubGraphemes) { - string str = "\u0d3e"; // AA - string dest; + std::string str = "\u0d3e"; // AA + std::string dest; EXPECT_FALSE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &dest)) @@ -124,32 +135,32 @@ TEST(ValidateIndicTest, SubGraphemes) { } TEST(ValidateIndicTest, Nukta) { - string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA - std::vector glyphs; + std::string str = "\u0c95\u0cbc\u0ccd\u0cb9"; // KA Nukta Virama HA + std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 3); - EXPECT_EQ(glyphs[2], string("\u0ccd\u0cb9")); + EXPECT_EQ(glyphs[2], std::string("\u0ccd\u0cb9")); // Swapped Nukta and Virama are not allowed, but NFC normalization fixes it. - string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA + std::string str2 = "\u0c95\u0ccd\u0cbc\u0cb9"; // KA Virama Nukta HA ExpectGraphemeModeResults(str2, UnicodeNormMode::kNFC, 4, 3, 1, str); } // Sinhala has some of its own specific rules. See www.macciato.com/sinhala TEST(ValidateIndicTest, SinhalaRakaransaya) { - string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna - string dest; + std::string str = "\u0d9a\u0dca\u200d\u0dbb"; // KA Virama ZWJ Rayanna + std::string dest; EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); - std::vector glyphs; + std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 2); - EXPECT_EQ(glyphs[1], string("\u0dca\u200d\u0dbb")); + EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dbb")); // Can be followed by a dependent vowel. str += "\u0dd9"; // E EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, @@ -159,8 +170,8 @@ TEST(ValidateIndicTest, SinhalaRakaransaya) { } TEST(ValidateIndicTest, SinhalaYansaya) { - string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna - string dest; + std::string str = "\u0d9a\u0dca\u200d\u0dba"; // KA Virama ZWJ Yayanna + std::string dest; EXPECT_TRUE(NormalizeUTF8String(UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNorm::kNormalize, str.c_str(), &dest)) << PrintString32WithUnicodes(str); @@ -171,51 +182,51 @@ TEST(ValidateIndicTest, SinhalaYansaya) { GraphemeNorm::kNormalize, str.c_str(), &dest)) << PrintString32WithUnicodes(str); EXPECT_EQ(dest, str); - std::vector glyphs; + std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 3); - EXPECT_EQ(glyphs[1], string("\u0dca\u200d\u0dba")); + EXPECT_EQ(glyphs[1], std::string("\u0dca\u200d\u0dba")); } TEST(ValidateIndicTest, SinhalaRepaya) { - string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA - std::vector glyphs; + std::string str = "\u0d9a\u0dbb\u0dca\u200d\u0db8"; // KA Rayanna Virama ZWJ MA + std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 2); - EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d\u0db8")); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d\u0db8")); EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 3); - EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); } TEST(ValidateIndicTest, SinhalaSpecials) { // Sinhala has some exceptions from the usual rules. - string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d"; - std::vector glyphs; + std::string str = "\u0dc0\u0d9c\u0dca\u200d\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d"; + std::vector glyphs; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 5) << PrintStringVectorWithUnicodes(glyphs); - EXPECT_EQ(glyphs[0], string("\u0dc0")); - EXPECT_EQ(glyphs[1], string("\u0d9c")); - EXPECT_EQ(glyphs[2], string("\u0dca\u200d\u0dbb")); - EXPECT_EQ(glyphs[3], string("\u0dca\u200d")); - EXPECT_EQ(glyphs[4], string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[0], std::string("\u0dc0")); + EXPECT_EQ(glyphs[1], std::string("\u0d9c")); + EXPECT_EQ(glyphs[2], std::string("\u0dca\u200d\u0dbb")); + EXPECT_EQ(glyphs[3], std::string("\u0dca\u200d")); + EXPECT_EQ(glyphs[4], std::string("\u0dbb\u0dca\u200d")); str = "\u0dc3\u0dbb\u0dca\u200d\u0dbb\u0dca\u200d\u0dcf"; EXPECT_TRUE(NormalizeCleanAndSegmentUTF8( UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit, true, str.c_str(), &glyphs)); EXPECT_EQ(glyphs.size(), 4) << PrintStringVectorWithUnicodes(glyphs); - EXPECT_EQ(glyphs[0], string("\u0dc3")); - EXPECT_EQ(glyphs[1], string("\u0dbb\u0dca\u200d")); - EXPECT_EQ(glyphs[2], string("\u0dbb\u0dca\u200d")); - EXPECT_EQ(glyphs[3], string("\u0dcf")); + EXPECT_EQ(glyphs[0], std::string("\u0dc3")); + EXPECT_EQ(glyphs[1], std::string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[2], std::string("\u0dbb\u0dca\u200d")); + EXPECT_EQ(glyphs[3], std::string("\u0dcf")); } } // namespace