/
validator.h
246 lines (225 loc) · 10.7 KB
/
validator.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
/**********************************************************************
* File: validator.h
* Description: Base class for various text validators. Intended mainly for
* scripts that use a virama character.
* Author: Ray Smith
*
* (C) Copyright 2017, Google Inc.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#ifndef TESSERACT_TRAINING_VALIDATOR_H_
#define TESSERACT_TRAINING_VALIDATOR_H_
#include <memory>
#include <vector>
#include "unichar.h"
namespace tesseract {
// Selects how validated text is grouped into output units. This is not
// specific to Indic scripts.
// In Indic scripts a grapheme is a syllable unit and may span several
// unicodes. In other scripts a grapheme is a base character together with its
// accents/diacritics, because not every accented character has a single
// composed form.
enum class GraphemeNormMode {
  // The validated result is one single string, even for multi-word input.
  kSingleString = 0,
  // Standard unicode graphemes are validated and each one is output as a
  // grapheme unit.
  kCombined = 1,
  // Graphemes are validated and then sub-divided. For virama-using scripts
  // the units correspond to repeatable glyphs (mostly single unicodes, but
  // viramas and joiners are paired with the most sensible neighbor). For
  // non-virama scripts base/accent pairs are separated, ie the output is
  // individual unicodes.
  kGlyphSplit = 2,
  // Every output unit is a single unicode, regardless of the script.
  kIndividualUnicodes = 3,
};
// Identifies the scripts that use a virama character.
// Every enumerator other than kNonVirama is guaranteed to be castable to the
// unicode (char32) value at which the unicode range of the corresponding
// script starts. The values are written below in U+XXXX style.
enum class ViramaScript : char32 {
  // Sentinel for text that belongs to no virama-using script.
  kNonVirama = 0,
  kDevanagari = 0x0900,
  kBengali = 0x0980,
  kGurmukhi = 0x0A00,
  kGujarati = 0x0A80,
  kOriya = 0x0B00,
  kTamil = 0x0B80,
  kTelugu = 0x0C00,
  kKannada = 0x0C80,
  kMalayalam = 0x0D00,
  kSinhala = 0x0D80,
  kMyanmar = 0x1000,
  kKhmer = 0x1780,
  kJavanese = 0xA980,
};
// Base class offers a validation API and protected methods to allow subclasses
// to easily build the validated/segmented output.
class Validator {
public:
// Validates and cleans the src vector of unicodes to the *dest, according to
// g_mode. In the case of kSingleString, a single vector containing the whole
// result is added to *dest. With kCombined, multiple vectors are added to
// *dest with one grapheme in each. With kGlyphSplit, multiple vectors are
// added to *dest with a smaller unit representing a glyph in each.
// In case of validation error, returns false and as much as possible of the
// input, without discarding invalid text.
static bool ValidateCleanAndSegment(GraphemeNormMode g_mode,
bool report_errors,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Returns true if the unicode ch is a non-printing zero-width mark of no
// significance to OCR training or evaluation.
static bool IsZeroWidthMark(char32 ch) {
return ch == kZeroWidthSpace || ch == kLeftToRightMark ||
ch == kRightToLeftMark || ch == kInvalid;
}
virtual ~Validator();
// Some specific but universally useful unicodes.
static const char32 kZeroWidthSpace;
static const char32 kZeroWidthNonJoiner;
static const char32 kZeroWidthJoiner;
static const char32 kLeftToRightMark;
static const char32 kRightToLeftMark;
static const char32 kInvalid;
protected:
// These are more or less the character class identifiers in the ISCII
// standard, section 8. They have been augmented with the Unicode meta
// characters Zero Width Joiner and Zero Width Non Joiner, and the
// Unicode Vedic Marks.
// The best sources of information on Unicode and Indic scripts are:
// http://varamozhi.sourceforge.net/iscii91.pdf
// http://www.unicode.org/versions/Unicode9.0.0/ch12.pdf
// http://unicode.org/faq/indic.html
// http://www.microsoft.com/typography/otfntdev/teluguot/shaping.aspx
enum class CharClass {
// NOTE: The values of the enum members are meaningless and arbitrary, ie
// they are not used for sorting, or any other risky application.
// The reason they are what they are is they are a single character
// abbreviation that can be used in a regexp/BNF definition of a grammar,
// IN A COMMENT, and still not relied upon in the code.
kConsonant = 'C',
kVowel = 'V',
kVirama = 'H', // (aka Halant)
kMatra = 'M', // (aka Dependent Vowel)
kMatraPiece = 'P', // unicode provides pieces of Matras.
kVowelModifier = 'D', // (candrabindu, anusvara, visarga, other marks)
kZeroWidthNonJoiner = 'z', // Unicode Zero Width Non-Joiner U+200C
kZeroWidthJoiner = 'Z', // Unicode Zero Width Joiner U+200D
kVedicMark = 'v', // Modifiers can come modify any indic syllable.
kNukta = 'N', // Occurs only immediately after consonants.
kRobat = 'R', // Khmer only.
kOther = 'O', // (digits, measures, non-Indic, etc)
// Additional classes used only by ValidateGrapheme.
kWhitespace = ' ',
kCombiner = 'c', // Combiners other than virama.
};
using IndicPair = std::pair<CharClass, char32>;
Validator(ViramaScript script, bool report_errors)
: script_(script),
codes_used_(0),
output_used_(0),
report_errors_(report_errors) {}
// Factory method that understands how to map script to the right subclass.
static std::unique_ptr<Validator> ScriptValidator(ViramaScript script,
bool report_errors);
// Internal version of the public static ValidateCleanAndSegment.
// Validates and cleans the src vector of unicodes to the *dest, according to
// its type and the given g_mode.
// In case of validation error, returns false and returns as much as possible
// of the input, without discarding invalid text.
bool ValidateCleanAndSegmentInternal(GraphemeNormMode g_mode,
const std::vector<char32>& src,
std::vector<std::vector<char32>>* dest);
// Moves the results from parts_ or output_ to dest according to g_mode.
void MoveResultsToDest(GraphemeNormMode g_mode,
std::vector<std::vector<char32>>* dest);
// Computes and returns the ViramaScript corresponding to the most frequent
// virama-using script in the input, or kNonVirama if none are present.
static ViramaScript MostFrequentViramaScript(
const std::vector<char32>& utf32);
// Returns true if the given UTF-32 unicode is a "virama" character.
static bool IsVirama(char32 unicode);
// Returns true if the given UTF-32 unicode is a vedic accent.
static bool IsVedicAccent(char32 unicode);
// Returns true if the script is one that uses subscripts for conjuncts.
bool IsSubscriptScript() const;
// Helper function appends the next element of codes_ only to output_,
// without touching parts_
// Returns true at the end of codes_.
bool CodeOnlyToOutput() {
output_.push_back(codes_[codes_used_].second);
return ++codes_used_ == codes_.size();
}
// Helper function adds a length-element vector to parts_ from the last length
// elements of output_. If there are more than length unused elements in
// output_, adds unicodes as single-element vectors to parts_ to catch
// output_used_ up to output->size() - length before adding the length-element
// vector.
void MultiCodePart(unsigned length) {
while (output_used_ + length < output_.size()) {
parts_.emplace_back(
std::initializer_list<char32>{output_[output_used_++]});
}
parts_.emplace_back(std::initializer_list<char32>{output_[output_used_]});
while (++output_used_ < output_.size()) {
parts_.back().push_back(output_[output_used_]);
}
}
// Helper function appends the next element of codes_ to output_, and then
// calls MultiCodePart to add the appropriate components to parts_.
// Returns true at the end of codes_.
bool UseMultiCode(unsigned length) {
output_.push_back(codes_[codes_used_].second);
MultiCodePart(length);
return ++codes_used_ == codes_.size();
}
// Consumes the next Grapheme in codes_[codes_used_++...] and copies it to
// parts_ and output_. Returns true if a valid Grapheme was consumed,
// otherwise does not increment codes_used_.
virtual bool ConsumeGraphemeIfValid() = 0;
// Sets codes_ to the class codes for the given unicode text.
void ComputeClassCodes(const std::vector<char32>& text);
// Returns the CharClass corresponding to the given Unicode ch.
virtual CharClass UnicodeToCharClass(char32 ch) const = 0;
// Resets to the initial state.
void Clear();
// Number of unicodes in each Indic codepage.
static const int kIndicCodePageSize = 128;
// Lowest unicode value of any Indic script. (Devanagari).
static const char32 kMinIndicUnicode = 0x900;
// Highest unicode value of any consistent (ISCII-based) Indic script.
static const char32 kMaxSinhalaUnicode = 0xdff;
// Highest unicode value of any virama-using script. (Khmer).
static const char32 kMaxViramaScriptUnicode = 0x17ff;
// Some special unicodes.
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Javanese Script - aksarajawa
static const char32 kJavaneseVirama = 0xa9c0;
static const char32 kMaxJavaneseUnicode = 0xa9df;
// Script we are operating on.
ViramaScript script_;
// Input unicodes with assigned CharClass is the data to be validated.
std::vector<IndicPair> codes_;
// Glyph-like components of the input.
std::vector<std::vector<char32>> parts_;
// Copied validated unicodes from codes_ that are OK to output.
std::vector<char32> output_;
// The number of elements of codes_ that have been processed so far.
unsigned codes_used_;
// The number of elements of output_ that have already been added to parts_.
unsigned output_used_;
// Log error messages for reasons why text is invalid.
bool report_errors_;
};
} // namespace tesseract
#endif // TESSERACT_TRAINING_VALIDATOR_H_