forked from mozilla/pdf.js
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpdf_find_utils.js
115 lines (101 loc) · 2.99 KB
/
pdf_find_utils.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
/* Copyright 2018 Mozilla Foundation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
 * Character categories returned by `getCharacterType`.
 *
 * The object is frozen so that importers cannot accidentally reassign,
 * add, or remove a category at runtime.
 *
 * @enum {number}
 */
const CharacterType = Object.freeze({
  // ASCII space, tab, CR, LF, and the no-break space (0xa0).
  SPACE: 0,
  // ASCII letters, digits, underscore, and any code not classified otherwise.
  ALPHA_LETTER: 1,
  // Every remaining ASCII code.
  PUNCT: 2,
  HAN_LETTER: 3,
  KATAKANA_LETTER: 4,
  HIRAGANA_LETTER: 5,
  HALFWIDTH_KATAKANA_LETTER: 6,
  THAI_LETTER: 7,
});
/**
 * Reports whether a code lies below 0x2e80, the threshold under which
 * every code is treated as belonging to an alphabetical script.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when `charCode` is strictly below 0x2e80.
 */
function isAlphabeticalScript(charCode) {
  const firstNonAlphabeticalCode = 0x2e80;
  return charCode < firstNonAlphabeticalCode;
}
/**
 * Reports whether none of the bits selected by the mask 0xff80 are set.
 * For a 16-bit code unit this means the value is in the range 0x00-0x7f.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when the masked bits are all zero.
 */
function isAscii(charCode) {
  const nonAsciiBits = charCode & 0xff80;
  return nonAsciiBits === 0;
}
/**
 * Reports whether a code is an ASCII letter, upper- or lower-case.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` for 0x41-0x5a (A-Z) and 0x61-0x7a (a-z).
 */
function isAsciiAlpha(charCode) {
  // Upper-case range: A (0x41) through Z (0x5a).
  if (charCode >= 0x41 && charCode <= 0x5a) {
    return true;
  }
  // Lower-case range: a (0x61) through z (0x7a).
  return charCode >= 0x61 && charCode <= 0x7a;
}
/**
 * Reports whether a code is an ASCII decimal digit.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` for 0x30 ("0") through 0x39 ("9").
 */
function isAsciiDigit(charCode) {
  const digitZero = 0x30;
  const digitNine = 0x39;
  return digitZero <= charCode && charCode <= digitNine;
}
/**
 * Reports whether a code is one of the four ASCII whitespace characters
 * recognised here: space, tab, carriage return, or line feed.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` for 0x20, 0x09, 0x0d, or 0x0a.
 */
function isAsciiSpace(charCode) {
  switch (charCode) {
    case 0x20: // SPACE
    case 0x09: // TAB
    case 0x0d: // CR
    case 0x0a: // LF
      return true;
    default:
      return false;
  }
}
/**
 * Reports whether a code falls in one of the two ranges classified as Han:
 * 0x3400-0x9fff or 0xf900-0xfaff (both inclusive).
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when `charCode` is inside either range.
 */
function isHan(charCode) {
  const inMainRange = charCode >= 0x3400 && charCode <= 0x9fff;
  const inCompatibilityRange = charCode >= 0xf900 && charCode <= 0xfaff;
  return inMainRange || inCompatibilityRange;
}
/**
 * Reports whether a code falls in the range 0x30a0-0x30ff (inclusive),
 * which is classified as Katakana.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when `charCode` is inside the range.
 */
function isKatakana(charCode) {
  const rangeStart = 0x30a0;
  const rangeEnd = 0x30ff;
  return rangeStart <= charCode && charCode <= rangeEnd;
}
/**
 * Reports whether a code falls in the range 0x3040-0x309f (inclusive),
 * which is classified as Hiragana.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when `charCode` is inside the range.
 */
function isHiragana(charCode) {
  // Anything below the start of the range is rejected immediately.
  if (charCode < 0x3040) {
    return false;
  }
  return charCode <= 0x309f;
}
/**
 * Reports whether a code falls in the range 0xff60-0xff9f (inclusive),
 * which is classified as halfwidth Katakana.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when `charCode` is inside the range.
 */
function isHalfwidthKatakana(charCode) {
  const atOrAboveStart = charCode >= 0xff60;
  return atOrAboveStart && charCode <= 0xff9f;
}
/**
 * Reports whether the bits selected by the mask 0xff80 equal 0x0e00.
 * For a 16-bit code unit this means the value is in the range
 * 0x0e00-0x0e7f, which is classified as Thai.
 *
 * @param {number} charCode - Code to test.
 * @returns {boolean} `true` when the masked bits equal 0x0e00.
 */
function isThai(charCode) {
  const blockBits = charCode & 0xff80;
  return blockBits === 0x0e00;
}
/**
 * Classifies a character code into one of the `CharacterType` categories.
 *
 * This function is based on the word-break detection implemented in:
 * https://hg.mozilla.org/mozilla-central/file/tip/intl/lwbrk/WordBreaker.cpp
 *
 * @param {number} charCode - Code to classify.
 * @returns {number} The matching `CharacterType` value; codes that match no
 *   specific category are reported as `CharacterType.ALPHA_LETTER`.
 */
function getCharacterType(charCode) {
  // Codes outside the alphabetical range: check the CJK categories.
  if (!isAlphabeticalScript(charCode)) {
    if (isHan(charCode)) {
      return CharacterType.HAN_LETTER;
    }
    if (isKatakana(charCode)) {
      return CharacterType.KATAKANA_LETTER;
    }
    if (isHiragana(charCode)) {
      return CharacterType.HIRAGANA_LETTER;
    }
    if (isHalfwidthKatakana(charCode)) {
      return CharacterType.HALFWIDTH_KATAKANA_LETTER;
    }
    return CharacterType.ALPHA_LETTER;
  }

  // Alphabetical but not ASCII: only Thai and NBSP are special-cased.
  if (!isAscii(charCode)) {
    if (isThai(charCode)) {
      return CharacterType.THAI_LETTER;
    }
    return charCode === /* NBSP = */ 0xa0
      ? CharacterType.SPACE
      : CharacterType.ALPHA_LETTER;
  }

  // ASCII: whitespace first, then word characters, everything else is
  // punctuation.
  if (isAsciiSpace(charCode)) {
    return CharacterType.SPACE;
  }
  const isWordCharacter =
    isAsciiAlpha(charCode) ||
    isAsciiDigit(charCode) ||
    charCode === /* UNDERSCORE = */ 0x5f;
  return isWordCharacter ? CharacterType.ALPHA_LETTER : CharacterType.PUNCT;
}
export { CharacterType, getCharacterType };