Skip to content

Commit

Permalink
Initial commit to add Aksara Jawa - Javanese script
Browse files Browse the repository at this point in the history
  • Loading branch information
Shreeshrii committed Aug 3, 2018
1 parent e9b4e21 commit 0eb7be1
Show file tree
Hide file tree
Showing 7 changed files with 185 additions and 8 deletions.
4 changes: 2 additions & 2 deletions src/training/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,9 @@ set(unicharset_training_src
unicharset_training_utils.h

validate_grapheme.h validate_indic.h validate_khmer.h
validate_myanmar.h validator.h
validate_javanese.h validate_myanmar.h validator.h
validate_grapheme.cpp validate_indic.cpp validate_khmer.cpp
validate_myanmar.cpp validator.cpp
validate_javanese.cpp validate_myanmar.cpp validator.cpp

)
add_library (unicharset_training ${unicharset_training_src})
Expand Down
2 changes: 2 additions & 0 deletions src/training/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ noinst_HEADERS = \
util.h \
validate_grapheme.h \
validate_indic.h \
validate_javanese.h \
validate_khmer.h \
validate_myanmar.h \
validator.h
Expand Down Expand Up @@ -76,6 +77,7 @@ libtesseract_training_la_SOURCES = \
unicharset_training_utils.cpp \
validate_grapheme.cpp \
validate_indic.cpp \
validate_javanese.cpp \
validate_khmer.cpp \
validate_myanmar.cpp \
validator.cpp
Expand Down
5 changes: 3 additions & 2 deletions src/training/language-specific.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat
ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo
ell eng enm epo est eus fas fil fin fra frk frm gle glg
grc guj hat heb hin hrv hun hye iku ind isl ita ita_old
grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old
jav jpn kan kat kat_old kaz khm kir kor kur lao lat
lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori
pan pol por pus ron rus san sin slk slv snd spa spa_old
Expand Down Expand Up @@ -961,6 +961,7 @@ set_lang_specific_parameters() {
glg ) ;;
hat ) ;;
hrv ) ;;
iast ) ;;
ind ) ;;
isl ) ;;
ita ) ;;
Expand Down Expand Up @@ -1171,7 +1172,7 @@ set_lang_specific_parameters() {
LANG_IS_RTL="1"
NORM_MODE="2" ;;
asm | ben | bih | hin | mar | nep | guj | kan | mal | tam | tel | pan | \
dzo | sin | san | bod | ori | khm | mya | tha | lao )
dzo | sin | san | bod | ori | khm | mya | tha | lao | jav )
LANG_IS_RTL="0"
NORM_MODE="2" ;;
* )
Expand Down
116 changes: 116 additions & 0 deletions src/training/validate_javanese.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
/**********************************************************************
* File: validate_javanese.cpp
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/

#include "validate_javanese.h"
#include "errcode.h"
#include "tprintf.h"

namespace tesseract {

// Returns whether codes matches the pattern for a Javanese Grapheme.
// Taken from unicode standard:
// http://www.unicode.org/charts/PDF/UA980.pdf
// http://www.unicode.org/versions/Unicode11.0.0/ch17.pdf
// Also the Consonant class here includes independent vowels, as they are
// treated the same anyway.

bool ValidateJavanese::ConsumeGraphemeIfValid() {
int num_codes = codes_.size();
if (codes_used_ == num_codes) return false;
if (codes_[codes_used_].first == CharClass::kOther) {
UseMultiCode(1);
return true;
}
if (codes_[codes_used_].first != CharClass::kConsonant) {
if (report_errors_) {
tprintf("Invalid start of Javanese syllable:0x%x\n",
codes_[codes_used_].second);
}
return false;
}
if (UseMultiCode(1)) return true;
if ( codes_[codes_used_].first == CharClass::kNukta) {
if (UseMultiCode(1)) return true;
}
while (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
if (codes_[codes_used_].first == CharClass::kRobat) {
if (UseMultiCode(1)) return true;
}
}
int num_matra_parts = 0;
if (codes_[codes_used_].second == kZeroWidthJoiner ||
codes_[codes_used_].second == kZeroWidthNonJoiner) {
if (CodeOnlyToOutput()) {
if (report_errors_) {
tprintf("Unterminated joiner: 0x%x\n", output_.back());
}
return false;
}
++num_matra_parts;
}
// Not quite as shown by the BNF, the matra piece is allowed as a matra on its
// own or as an addition to other matras.
if (codes_[codes_used_].first == CharClass::kMatra) {
++num_matra_parts;
if (UseMultiCode(num_matra_parts)) return true;
} else if (num_matra_parts) {
if (report_errors_) {
tprintf("Joiner with non-dependent vowel after it!:0x%x 0x%x\n",
output_.back(), codes_[codes_used_].second);
}
return false;
}
if (codes_[codes_used_].first == CharClass::kMatraPiece &&
codes_[codes_used_ - 1].first != CharClass::kMatraPiece) {
if (UseMultiCode(1)) return true;
}
if (codes_[codes_used_].first == CharClass::kVowelModifier) {
if (UseMultiCode(1)) return true;
}
if (codes_used_ + 1 < num_codes &&
codes_[codes_used_].first == CharClass::kVirama &&
codes_[codes_used_ + 1].first == CharClass::kConsonant) {
ASSERT_HOST(!CodeOnlyToOutput());
if (UseMultiCode(2)) return true;
}
return true;
}

// Classifies the Unicode code point ch for the Javanese validator.
// Vedic accents and the zero-width (non-)joiners are recognised first.
// Everything else is classified by its offset from the start of the script's
// code block (script_), and code points outside that block are kOther.
// The code points named in the comments below assume script_ is 0xa980.
Validator::CharClass ValidateJavanese::UnicodeToCharClass(char32 ch) const {
  if (IsVedicAccent(ch)) return CharClass::kVedicMark;
  if (ch == kZeroWidthNonJoiner) return CharClass::kZeroWidthNonJoiner;
  if (ch == kZeroWidthJoiner) return CharClass::kZeroWidthJoiner;

  // Position of ch within the code block (aka code page) of the script.
  const int offset = ch - static_cast<char32>(script_);
  if (offset < 0 || offset >= kIndicCodePageSize) {
    return CharClass::kOther;  // Belongs to some other code block.
  }

  CharClass char_class = CharClass::kOther;
  if (offset < 0x4) {
    char_class = CharClass::kVowelModifier;
  } else if (offset <= 0x32) {
    char_class = CharClass::kConsonant;  // Independent vowels included.
  } else if (offset == 0x33) {
    char_class = CharClass::kNukta;  // A9B3 CECAK TELU
  } else if (offset == 0x34) {
    char_class = CharClass::kVowelModifier;  // A9B4 TARUNG
  } else if (offset <= 0x3d) {
    char_class = CharClass::kMatra;
  } else if (offset <= 0x3f) {
    char_class = CharClass::kVowelModifier;  // A9BE-A9BF PENGKAL-CAKRA
  } else if (offset == 0x40) {
    char_class = CharClass::kVirama;  // A9C0 PANGKON
  }
  return char_class;
}

} // namespace tesseract
45 changes: 45 additions & 0 deletions src/training/validate_javanese.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
/**********************************************************************
* File: validate_javanese.h
* Description: Text validator for Javanese Script - aksara jawa.
* Author: Shree Devi Kumar
* Created: August 03, 2018
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
**********************************************************************/

#ifndef TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
#define TESSERACT_TRAINING_VALIDATE_JAVANESE_H_

#include "validator.h"

namespace tesseract {

// Validator specialisation for the Javanese script (aksara jawa): validates
// Javanese text and segments it into graphemes.
class ValidateJavanese : public Validator {
 public:
  // Passes the script and the error-reporting flag straight to Validator.
  ValidateJavanese(ViramaScript script, bool report_errors)
      : Validator(script, report_errors) {}
  ~ValidateJavanese() = default;

 protected:
  // Checks whether the codes at codes_[codes_used_...] match the pattern for
  // a Javanese grapheme, and if so consumes that grapheme, copying it to
  // parts_ and output_. Returns true if a valid grapheme was consumed.
  bool ConsumeGraphemeIfValid() override;
  // Maps the Unicode code point ch to its CharClass.
  CharClass UnicodeToCharClass(char32 ch) const override;
};

} // namespace tesseract

#endif // TESSERACT_TRAINING_VALIDATE_JAVANESE_H_
17 changes: 13 additions & 4 deletions src/training/validator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include "unicode/uscript.h" // From libicu
#include "validate_grapheme.h"
#include "validate_indic.h"
#include "validate_javanese.h"
#include "validate_khmer.h"
#include "validate_myanmar.h"

Expand Down Expand Up @@ -68,6 +69,9 @@ std::unique_ptr<Validator> Validator::ScriptValidator(ViramaScript script,
case ViramaScript::kNonVirama:
return std::unique_ptr<Validator>(
new ValidateGrapheme(script, report_errors));
case ViramaScript::kJavanese:
return std::unique_ptr<Validator>(
new ValidateJavanese(script, report_errors));
case ViramaScript::kMyanmar:
return std::unique_ptr<Validator>(
new ValidateMyanmar(script, report_errors));
Expand Down Expand Up @@ -135,13 +139,13 @@ ViramaScript Validator::MostFrequentViramaScript(
const std::vector<char32>& utf32) {
std::unordered_map<int, int> histogram;
for (char32 ch : utf32) {
// Determine the codepage base. For the Indic scripts, and Khmer, it is
// Determine the codepage base. For the Indic scripts, Khmer and Javanese, it is
// sufficient to divide by kIndicCodePageSize but Myanmar is all over the
// unicode code space, so use its script id.
int base = ch / kIndicCodePageSize;
IcuErrorCode err;
UScriptCode script_code = uscript_getScript(ch, err);
if ((kMinIndicUnicode <= ch && ch <= kMaxViramaScriptUnicode &&
if ((kMinIndicUnicode <= ch && ch <= kMaxJavaneseUnicode &&
script_code != USCRIPT_COMMON) ||
script_code == USCRIPT_MYANMAR) {
if (script_code == USCRIPT_MYANMAR)
Expand All @@ -156,6 +160,7 @@ ViramaScript Validator::MostFrequentViramaScript(
char32 codebase = static_cast<char32>(base * kIndicCodePageSize);
// Check for validity.
if (codebase == static_cast<char32>(ViramaScript::kMyanmar) ||
codebase == static_cast<char32>(ViramaScript::kJavanese) ||
codebase == static_cast<char32>(ViramaScript::kKhmer) ||
(static_cast<char32>(ViramaScript::kDevanagari) <= codebase &&
codebase <= static_cast<char32>(ViramaScript::kSinhala))) {
Expand All @@ -170,7 +175,9 @@ ViramaScript Validator::MostFrequentViramaScript(
bool Validator::IsVirama(char32 unicode) {
return (kMinIndicUnicode <= unicode && unicode <= kMaxSinhalaUnicode &&
(unicode & 0x7f) == 0x4d) ||
unicode == kSinhalaVirama || unicode == kMyanmarVirama ||
unicode == kSinhalaVirama ||
unicode == kJavaneseVirama ||
unicode == kMyanmarVirama ||
unicode == kKhmerVirama;
}

Expand All @@ -186,7 +193,9 @@ bool Validator::IsVedicAccent(char32 unicode) {
bool Validator::IsSubscriptScript() const {
return script_ == ViramaScript::kTelugu ||
script_ == ViramaScript::kKannada ||
script_ == ViramaScript::kMyanmar || script_ == ViramaScript::kKhmer;
script_ == ViramaScript::kJavanese ||
script_ == ViramaScript::kMyanmar ||
script_ == ViramaScript::kKhmer;
}

void Validator::ComputeClassCodes(const std::vector<char32>& text) {
Expand Down
4 changes: 4 additions & 0 deletions src/training/validator.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ enum class ViramaScript : char32 {
kSinhala = 0xd80,
kMyanmar = 0x1000,
kKhmer = 0x1780,
kJavanese = 0xa980,
};

// Base class offers a validation API and protected methods to allow subclasses
Expand Down Expand Up @@ -221,6 +222,9 @@ class Validator {
static const char32 kSinhalaVirama = 0xdca;
static const char32 kMyanmarVirama = 0x1039;
static const char32 kKhmerVirama = 0x17d2;
// Javanese Script - aksarajawa
static const char32 kJavaneseVirama = 0xa9c0;
static const char32 kMaxJavaneseUnicode = 0xa9df;

// Script we are operating on.
ViramaScript script_;
Expand Down

0 comments on commit 0eb7be1

Please sign in to comment.