-
Notifications
You must be signed in to change notification settings - Fork 61
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Introducing a new SQLite extension function: GET_PHONEBOOK_INDEX
This function will produce a normalized upper case first letter from a given string. Bug: 2407129 Change-Id: Idfafca04342d43ef43cfdff0e431e0a6a8cf5c68
- Loading branch information
1 parent
aae12b8
commit 3a74962
Showing
4 changed files
with
264 additions
and
12 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
/* | ||
* Copyright 2010, The Android Open Source Project | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <ctype.h> | ||
#include <string.h> | ||
|
||
#include <unicode/ucol.h> | ||
#include <unicode/uiter.h> | ||
#include <unicode/ustring.h> | ||
#include <unicode/utypes.h> | ||
|
||
#include "PhonebookIndex.h" | ||
#include "PhoneticStringUtils.h" | ||
|
||
#define SMALL_BUFFER_SIZE 10 | ||
|
||
namespace android { | ||
|
||
// IMPORTANT! Keep the codes below SORTED. We are doing a binary search on the array | ||
static UChar DEFAULT_CHAR_MAP[] = { | ||
0x00C6, 'A', // AE | ||
0x00DF, 'S', // Etzett | ||
0x1100, 0x3131, // HANGUL LETTER KIYEOK | ||
0x1101, 0x3132, // HANGUL LETTER SSANGKIYEOK | ||
0x1102, 0x3134, // HANGUL LETTER NIEUN | ||
0x1103, 0x3137, // HANGUL LETTER TIKEUT | ||
0x1104, 0x3138, // HANGUL LETTER SSANGTIKEUT | ||
0x1105, 0x3139, // HANGUL LETTER RIEUL | ||
0x1106, 0x3141, // HANGUL LETTER MIEUM | ||
0x1107, 0x3142, // HANGUL LETTER PIEUP | ||
0x1108, 0x3143, // HANGUL LETTER SSANGPIEUP | ||
0x1109, 0x3145, // HANGUL LETTER SIOS | ||
0x110A, 0x3146, // HANGUL LETTER SSANGSIOS | ||
0x110B, 0x3147, // HANGUL LETTER IEUNG | ||
0x110C, 0x3148, // HANGUL LETTER CIEUC | ||
0x110D, 0x3149, // HANGUL LETTER SSANGCIEUC | ||
0x110E, 0x314A, // HANGUL LETTER CHIEUCH | ||
0x110F, 0x314B, // HANGUL LETTER KHIEUKH | ||
0x1110, 0x314C, // HANGUL LETTER THIEUTH | ||
0x1111, 0x314D, // HANGUL LETTER PHIEUPH | ||
0x1112, 0x314E, // HANGUL LETTER HIEUH | ||
0x111A, 0x3140, // HANGUL LETTER RIEUL-HIEUH | ||
0x1121, 0x3144, // HANGUL LETTER PIEUP-SIOS | ||
0x1161, 0x314F, // HANGUL LETTER A | ||
0x1162, 0x3150, // HANGUL LETTER AE | ||
0x1163, 0x3151, // HANGUL LETTER YA | ||
0x1164, 0x3152, // HANGUL LETTER YAE | ||
0x1165, 0x3153, // HANGUL LETTER EO | ||
0x1166, 0x3154, // HANGUL LETTER E | ||
0x1167, 0x3155, // HANGUL LETTER YEO | ||
0x1168, 0x3156, // HANGUL LETTER YE | ||
0x1169, 0x3157, // HANGUL LETTER O | ||
0x116A, 0x3158, // HANGUL LETTER WA | ||
0x116B, 0x3159, // HANGUL LETTER WAE | ||
0x116C, 0x315A, // HANGUL LETTER OE | ||
0x116D, 0x315B, // HANGUL LETTER YO | ||
0x116E, 0x315C, // HANGUL LETTER U | ||
0x116F, 0x315D, // HANGUL LETTER WEO | ||
0x1170, 0x315E, // HANGUL LETTER WE | ||
0x1171, 0x315F, // HANGUL LETTER WI | ||
0x1172, 0x3160, // HANGUL LETTER YU | ||
0x1173, 0x3161, // HANGUL LETTER EU | ||
0x1174, 0x3162, // HANGUL LETTER YI | ||
0x1175, 0x3163, // HANGUL LETTER I | ||
0x11AA, 0x3133, // HANGUL LETTER KIYEOK-SIOS | ||
0x11AC, 0x3135, // HANGUL LETTER NIEUN-CIEUC | ||
0x11AD, 0x3136, // HANGUL LETTER NIEUN-HIEUH | ||
0x11B0, 0x313A, // HANGUL LETTER RIEUL-KIYEOK | ||
0x11B1, 0x313B, // HANGUL LETTER RIEUL-MIEUM | ||
0x11B3, 0x313D, // HANGUL LETTER RIEUL-SIOS | ||
0x11B4, 0x313E, // HANGUL LETTER RIEUL-THIEUTH | ||
0x11B5, 0x313F, // HANGUL LETTER RIEUL-PHIEUPH | ||
}; | ||
|
||
/** | ||
* Binary search to map an individual character to the corresponding phone book index. | ||
*/ | ||
static UChar map_character(UChar c, UChar * char_map, int32_t length) { | ||
int from = 0, to = length; | ||
while (from < to) { | ||
int m = ((to + from) >> 1) & ~0x1; // Only consider even positions | ||
UChar cm = char_map[m]; | ||
if (cm == c) { | ||
return char_map[m + 1]; | ||
} else if (cm < c) { | ||
from = m + 2; | ||
} else { | ||
to = m; | ||
} | ||
} | ||
return 0; | ||
} | ||
|
||
/** | ||
* Returns TRUE if the character belongs to a Hanzi unicode block | ||
*/ | ||
static bool is_CJK(UChar c) { | ||
return | ||
(0x4e00 <= c && c <= 0x9fff) // CJK_UNIFIED_IDEOGRAPHS | ||
|| (0x3400 <= c && c <= 0x4dbf) // CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A | ||
|| (0x3000 <= c && c <= 0x303f) // CJK_SYMBOLS_AND_PUNCTUATION | ||
|| (0x2e80 <= c && c <= 0x2eff) // CJK_RADICALS_SUPPLEMENT | ||
|| (0x3300 <= c && c <= 0x33ff) // CJK_COMPATIBILITY | ||
|| (0xfe30 <= c && c <= 0xfe4f) // CJK_COMPATIBILITY_FORMS | ||
|| (0xf900 <= c && c <= 0xfaff); // CJK_COMPATIBILITY_IDEOGRAPHS | ||
} | ||
|
||
UChar GetPhonebookIndex(UCharIterator * iter, const char * locale) { | ||
UChar dest[SMALL_BUFFER_SIZE]; | ||
|
||
// Normalize the first character to remove accents using the NFD normalization | ||
UErrorCode errorCode = U_ZERO_ERROR; | ||
int32_t len = unorm_next(iter, dest, SMALL_BUFFER_SIZE * sizeof(UChar), UNORM_NFD, | ||
0 /* options */, TRUE /* normalize */, NULL, &errorCode); | ||
if (U_FAILURE(errorCode) || len == 0) { | ||
return 0; | ||
} | ||
|
||
UChar c = dest[0]; | ||
|
||
// We are only interested in letters | ||
if (!u_isalpha(c)) { | ||
return 0; | ||
} | ||
|
||
c = u_toupper(c); | ||
|
||
// Check for explicitly mapped characters | ||
UChar c_mapped = map_character(c, DEFAULT_CHAR_MAP, sizeof(DEFAULT_CHAR_MAP) / sizeof(UChar)); | ||
if (c_mapped != 0) { | ||
return c_mapped; | ||
} | ||
|
||
// Convert Kanas to Hiragana | ||
UChar next = len > 2 ? dest[1] : 0; | ||
c = android::GetNormalizedCodePoint(c, next, NULL); | ||
|
||
if (is_CJK(c)) { | ||
if (strncmp(locale, "ja", 2) == 0) { | ||
return 0x8A18; // Kanji character used as a heading in letters, notices and other documents | ||
} else { | ||
return 0; | ||
} | ||
} | ||
|
||
return c; | ||
} | ||
|
||
} // namespace android |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
/* | ||
** | ||
** Copyright 2010, The Android Open Source Project | ||
** | ||
** Licensed under the Apache License, Version 2.0 (the "License"); | ||
** you may not use this file except in compliance with the License. | ||
** You may obtain a copy of the License at | ||
** | ||
** http://www.apache.org/licenses/LICENSE-2.0 | ||
** | ||
** Unless required by applicable law or agreed to in writing, software | ||
** distributed under the License is distributed on an "AS IS" BASIS, | ||
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
** See the License for the specific language governing permissions and | ||
** limitations under the License. | ||
*/ | ||
|
||
#ifndef _ANDROID_PHONEBOOK_INDEX_H | ||
#define _ANDROID_PHONEBOOK_INDEX_H | ||
|
||
#include <unicode/uiter.h> | ||
#include <unicode/utypes.h> | ||
|
||
namespace android { | ||
|
||
/** | ||
* A character converter that takes a UNICODE character and produces the | ||
* phonebook index for it in the specified locale. For example, "a" becomes "A" | ||
* and so does A with accents. Conversion rules differ from locale | ||
* locale, which is why this function takes locale as an argument. | ||
*/ | ||
UChar GetPhonebookIndex(UCharIterator * iter, const char * locale); | ||
|
||
} // namespace android | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters