/
validate_javanese.cpp
275 lines (263 loc) · 10.8 KB
/
validate_javanese.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
/**********************************************************************
* File: validate_javanese.cpp
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/
#include "validate_javanese.h"
#include "errcode.h"
#include "tprintf.h"
namespace tesseract {
// Returns whether codes matches the pattern for a Javanese Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// The Consonant class here includes independent vowels.
// The order of components in an orthographic syllable as expressed in BNF is:
// {C F} C {{R}Y} {V{A}} {Z}
// Translated to the codes used by the CharClass enum:
// [(V|C[N])(H)] (V|C[N]) [[N]N] [M[D]] [v]
// Also see https://r12a.github.io/scripts/javanese/ for detailed notes.
// Validation rules copied from validate_indic.cpp and modified for Javanese.
// Indic - for reference
// + vowel Grapheme: V[D](v)*
// + consonant Grapheme: (C[N](H|HZ|Hz|ZH)?)*C[N](H|Hz)?[M[P]][D](v)*
bool ValidateJavanese::ConsumeGraphemeIfValid() {
switch (codes_[codes_used_].first) {
case CharClass::kConsonant:
return ConsumeConsonantHeadIfValid() && ConsumeConsonantTailIfValid();
case CharClass::kVowel:
case CharClass::kVedicMark:
return ConsumeVowelIfValid();
case CharClass::kZeroWidthJoiner:
case CharClass::kZeroWidthNonJoiner:
// Apart from within an aksara, joiners are silently dropped.
if (report_errors_)
tprintf("Dropping isolated joiner: 0x%x\n", codes_[codes_used_].second);
++codes_used_;
return true;
case CharClass::kOther:
UseMultiCode(1);
return true;
default:
if (report_errors_) {
tprintf("Invalid start of grapheme sequence:%c=0x%x\n",
codes_[codes_used_].first, codes_[codes_used_].second);
}
return false;
}
}
// Helper consumes/copies a virama and any associated post-virama joiners.
// A linking virama (with either type of pre-virama joiner, post-virama ZWJ, or
// no joiner at all) must be followed by a consonant.
// A non-linking (explicit) virama is indicated by a ZWNJ after it, or a non
// consonant, space, or character from a different script. We clean up the
// representation to make it consistent by adding a ZWNJ if missing from a
// non-linking virama. Returns false with an invalid sequence.
bool ValidateJavanese::ConsumeViramaIfValid(IndicPair joiner, bool post_matra) {
const unsigned num_codes = codes_.size();
if (joiner.first == CharClass::kOther) {
CodeOnlyToOutput();
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthJoiner) {
// Post-matra viramas must be explicit, so no joiners allowed here.
if (post_matra) {
if (report_errors_) tprintf("ZWJ after a post-matra virama!!\n");
return false;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_ - 2].second != kCakra &&
(codes_[codes_used_ + 1].second == kZeroWidthNonJoiner ||
codes_[codes_used_ + 1].second == kPengkal ||
codes_[codes_used_ + 1].second == kCakra)) {
// This combination will be picked up later.
ASSERT_HOST(!CodeOnlyToOutput());
} else {
// Half-form with optional Nukta.
unsigned len = output_.size() + 1 - output_used_;
if (UseMultiCode(len)) return true;
}
if (codes_used_ < num_codes &&
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (output_used_ == output_.size() ||
output_[output_used_] != kCakra) {
if (report_errors_) {
tprintf("Virama ZWJ ZWNJ in non-Sinhala: base=0x%x!\n",
static_cast<int>(script_));
}
return false;
}
// Special Sinhala case of Stand-alone Repaya. ['RA' H Z z]
if (UseMultiCode(4)) return true;
}
} else if (codes_used_ == num_codes ||
codes_[codes_used_].first != CharClass::kConsonant ||
post_matra) {
if (codes_used_ == num_codes ||
codes_[codes_used_].second != kZeroWidthNonJoiner) {
// It is valid to have an unterminated virama at the end of a word, but
// for consistency, we will always add ZWNJ if not present.
CodeOnlyToOutput();
} else {
CodeOnlyToOutput();
}
// Explicit virama [H z]
MultiCodePart(2);
}
} else {
// Pre-virama joiner [{Z|z} H] requests specific conjunct.
if (UseMultiCode(2)) {
if (report_errors_)
tprintf("Invalid pre-virama joiner with no 2nd consonant!!\n");
return false;
}
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (report_errors_) {
tprintf("JHJ!!: 0x%x 0x%x 0x%x\n", joiner.second, output_.back(),
codes_[codes_used_].second);
}
return false;
}
}
// It is good so far as it goes.
return true;
}
// Helper consumes/copies a series of consonants separated by viramas while
// valid, but not any vowel or other modifiers.
bool ValidateJavanese::ConsumeConsonantHeadIfValid() {
const unsigned num_codes = codes_.size();
// Consonant aksara
do {
CodeOnlyToOutput();
// Special Sinhala case of [H Z Yayana/Rayana].
int index = output_.size() - 3;
if (output_used_ + 3 <= output_.size() &&
(output_.back() == kPengkal || output_.back() == kCakra) &&
IsVirama(output_[index]) && output_[index + 1] == kZeroWidthJoiner) {
MultiCodePart(3);
}
bool have_nukta = false;
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kNukta) {
have_nukta = true;
CodeOnlyToOutput();
}
// Test for subscript conjunct.
index = output_.size() - 2 - have_nukta;
if (output_used_ + 2 + have_nukta <= output_.size() && IsSubscriptScript() &&
IsVirama(output_[index])) {
// Output previous virama, consonant + optional nukta.
MultiCodePart(2 + have_nukta);
}
IndicPair joiner(CharClass::kOther, 0);
if (codes_used_ < num_codes &&
(codes_[codes_used_].second == kZeroWidthJoiner ||
(codes_[codes_used_].second == kZeroWidthNonJoiner &&
script_ == ViramaScript::kMalayalam))) {
joiner = codes_[codes_used_];
if (++codes_used_ == num_codes) {
if (report_errors_) {
tprintf("Skipping ending joiner: 0x%x 0x%x\n", output_.back(),
joiner.second);
}
return true;
}
if (codes_[codes_used_].first == CharClass::kVirama) {
output_.push_back(joiner.second);
} else {
if (report_errors_) {
tprintf("Skipping unnecessary joiner: 0x%x 0x%x 0x%x\n",
output_.back(), joiner.second, codes_[codes_used_].second);
}
joiner = std::make_pair(CharClass::kOther, 0);
}
}
if (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(joiner, false)) return false;
} else {
break; // No virama, so the run of consonants is over.
}
} while (codes_used_ < num_codes &&
codes_[codes_used_].first == CharClass::kConsonant);
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}
// Helper consumes/copies a tail part of a consonant, comprising optional
// matra/piece, vowel modifier, vedic mark, terminating virama.
bool ValidateJavanese::ConsumeConsonantTailIfValid() {
if (codes_used_ == codes_.size()) return true;
// No virama: Finish the grapheme.
// Are multiple matras allowed?
if (codes_[codes_used_].first == CharClass::kMatra) {
if (UseMultiCode(1)) return true;
if (codes_[codes_used_].first == CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
}
// Tarung also used for long versions of u and o vowels and vocalic r
// Taling + Tarung is valid eg. ꦏ + ◌ꦺ + ◌ꦴ
while (codes_[codes_used_].first == CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
// Only Malayalam allows only repeated 0xd02.
if (script_ != ViramaScript::kMalayalam || output_.back() != 0xd02) break;
}
while (codes_[codes_used_].first == CharClass::kVedicMark) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVirama) {
if (!ConsumeViramaIfValid(IndicPair(CharClass::kOther, 0), true)) {
return false;
}
}
// What we have consumed so far is a valid consonant cluster.
if (output_used_ < output_.size()) MultiCodePart(1);
return true;
}
// Helper consumes/copies a vowel and optional modifiers.
bool ValidateJavanese::ConsumeVowelIfValid() {
if (UseMultiCode(1)) return true;
while (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
// Only Malayalam allows repeated modifiers?
if (script_ != ViramaScript::kMalayalam) break;
}
while (codes_[codes_used_].first == CharClass::kVedicMark) {
if (UseMultiCode(1)) return true;
}
// What we have consumed so far is a valid vowel cluster.
return true;
}
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;
// Offset from the start of the relevant unicode code block aka code page.
int off = ch - static_cast<char32>(script_);
// Anything in another code block is other.
if (off < 0 || off >= kIndicCodePageSize) return CharClass::kOther;
if (off < 0x4) return CharClass::kVowelModifier;
if (off <= 0x32) return CharClass::kConsonant; // includes independent vowels
if (off == 0x33) return CharClass::kNukta; // A9B3 CECAK TELU
if (off == 0x34) return CharClass::kMatraPiece; // A9B4 TARUNG two part vowels
if (off <= 0x39) return CharClass::kMatra;
if (off <= 0x3a) return CharClass::kConsonant; // A9BA TALING - pre base vowel
if (off <= 0x3d) return CharClass::kMatra;
if (off <= 0x3f) return CharClass::kNukta; // A9BE-A9BF PENGKAL-CAKRA medial consonants
if (off == 0x40) return CharClass::kVirama; // A9C0 PANGKON
return CharClass::kOther;
}
} // namespace tesseract