Skip to content
This repository has been archived by the owner on Sep 29, 2021. It is now read-only.

Commit

Permalink
8206120: Add test cases for lenient Japanese era parsing
Browse files Browse the repository at this point in the history
8211398: Square character support for the Japanese new era
8218915: Change isJavaIdentifierStart and isJavaIdentifierPart to handle new code points
Reviewed-by: coffeys, naoto
Contributed-by: deepak.kejriwal@oracle.com
  • Loading branch information
rpatil committed Feb 22, 2019
1 parent 41cc5c8 commit 686855c
Show file tree
Hide file tree
Showing 9 changed files with 395 additions and 3 deletions.
10 changes: 10 additions & 0 deletions make/data/characterdata/CharacterData00.java.template
Expand Up @@ -105,11 +105,21 @@ class CharacterData00 extends CharacterData {
}

boolean isJavaIdentifierStart(int ch) {
    // This method strictly follows the code points assigned in
    // Unicode 10.0. Code point 0x32FF is not part of Unicode 10.0,
    // so it is never reported as an identifier start.
    if (ch == 0x32FF) {
        return false;
    }
    // Otherwise decide from the identifier-info bits of the property word.
    return (getProperties(ch) & $$maskIdentifierInfo) >= $$lowJavaStart;
}

boolean isJavaIdentifierPart(int ch) {
    // This method strictly follows the code points assigned in
    // Unicode 10.0. Code point 0x32FF is not part of Unicode 10.0,
    // so it is never reported as an identifier part.
    if (ch == 0x32FF) {
        return false;
    }
    // Otherwise any set "Java part" bit in the property word qualifies.
    return (getProperties(ch) & $$nonzeroJavaPart) != 0;
}
Expand Down
1 change: 1 addition & 0 deletions make/data/unicodedata/UnicodeData.txt
Expand Up @@ -11729,6 +11729,7 @@
32FC;CIRCLED KATAKANA WI;So;0;L;<circle> 30F0;;;;N;;;;;
32FD;CIRCLED KATAKANA WE;So;0;L;<circle> 30F1;;;;N;;;;;
32FE;CIRCLED KATAKANA WO;So;0;L;<circle> 30F2;;;;N;;;;;
32FF;SQUARE ERA NAME NEWERA;So;0;L;<square> 5143 53F7;;;;N;SQUARED TWO IDEOGRAPHS ERA NAME NEWERA;;;;
3300;SQUARE APAATO;So;0;L;<square> 30A2 30D1 30FC 30C8;;;;N;SQUARED APAATO;;;;
3301;SQUARE ARUHUA;So;0;L;<square> 30A2 30EB 30D5 30A1;;;;N;SQUARED ARUHUA;;;;
3302;SQUARE ANPEA;So;0;L;<square> 30A2 30F3 30DA 30A2;;;;N;SQUARED ANPEA;;;;
Expand Down
4 changes: 2 additions & 2 deletions src/java.base/share/classes/java/lang/Character.java
Expand Up @@ -5396,7 +5396,7 @@ public static enum UnicodeScript {
0x3260, // 3260..327E; HANGUL
0x327F, // 327F..32CF; COMMON
0x32D0, // 32D0..32FE; KATAKANA
0x32FF, // 32FF ; UNKNOWN
0x32FF, // 32FF ; COMMON
0x3300, // 3300..3357; KATAKANA
0x3358, // 3358..33FF; COMMON
0x3400, // 3400..4DB5; HAN
Expand Down Expand Up @@ -6913,7 +6913,7 @@ public static enum UnicodeScript {
HANGUL, // 3260..327E
COMMON, // 327F..32CF
KATAKANA, // 32D0..32FE
UNKNOWN, // 32FF
COMMON, // 32FF
KATAKANA, // 3300..3357
COMMON, // 3358..33FF
HAN, // 3400..4DB5
Expand Down
1 change: 1 addition & 0 deletions test/jdk/java/lang/Character/Scripts.txt
Expand Up @@ -402,6 +402,7 @@
328A..32B0 ; Common # So [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
32B1..32BF ; Common # No [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
32C0..32CF ; Common # So [16] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..LIMITED LIABILITY SIGN
32FF ; Common # So SQUARE ERA NAME NEWERA
3358..33FF ; Common # So [168] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..SQUARE GAL
4DC0..4DFF ; Common # So [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION
A700..A716 ; Common # Sk [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR
Expand Down
290 changes: 290 additions & 0 deletions test/jdk/java/lang/Character/TestIsJavaIdentifierMethods.java
@@ -0,0 +1,290 @@
/*
* Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/**
* @test
* @summary Test behavior of isJavaIdentifierXX, isJavaLetter, and
* isJavaLetterOrDigit methods for all code points.
* @bug 8218915
*/

public class TestIsJavaIdentifierMethods {

    // Japanese Era Square character code point not present in Unicode 10.0
    private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;

    /**
     * Runs every check in this class; the first mismatch aborts the run with
     * a {@link RuntimeException}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        testIsJavaIdentifierPart_int();
        testIsJavaIdentifierPart_char();
        testIsJavaIdentifierStart_int();
        testIsJavaIdentifierStart_char();
        testIsJavaLetter();
        testIsJavaLetterOrDigit();
    }

    /**
     * Computes whether a code point is expected to be permitted as part of a
     * Java identifier, i.e. whether any of the following are true:
     * <ul>
     * <li>it is a letter</li>
     * <li>it is a currency symbol (such as <code>'$'</code>)</li>
     * <li>it is a connecting punctuation character (such as <code>'_'</code>)
     * </li>
     * <li>it is a digit</li>
     * <li>it is a numeric letter (such as a Roman numeral character)</li>
     * <li>it is a combining mark</li>
     * <li>it is a non-spacing mark</li>
     * <li><code>isIdentifierIgnorable</code> returns <code>true</code> for the
     * character</li>
     * </ul>
     * The methods under test strictly conform to character information from
     * version 10.0 of the Unicode Standard, so the "Japanese Era Square
     * character code point" is always expected to yield {@code false}.
     *
     * <p>The {@code int} overloads of the {@code Character} queries are used
     * for every caller; for BMP values they are expected to agree with the
     * {@code char} overloads.
     *
     * @param cp the code point to classify
     * @return the expected result of the "identifier part" style methods
     */
    private static boolean expectedIdentifierPart(int cp) {
        if (cp == JAPANESE_ERA_CODEPOINT) {
            return false;
        }
        int type = Character.getType(cp);
        return Character.isLetter(cp)
                || type == Character.CURRENCY_SYMBOL
                || type == Character.CONNECTOR_PUNCTUATION
                || Character.isDigit(cp)
                || type == Character.LETTER_NUMBER
                || type == Character.COMBINING_SPACING_MARK
                || type == Character.NON_SPACING_MARK
                || Character.isIdentifierIgnorable(cp);
    }

    /**
     * Computes whether a code point is expected to be permitted as the first
     * character of a Java identifier, i.e. whether it is one of the following:
     * <ul>
     * <li>it is a letter</li>
     * <li>getType returns LETTER_NUMBER</li>
     * <li>it is a currency symbol (such as "$")</li>
     * <li>it is a connecting punctuation character (such as "_")</li>
     * </ul>
     * The methods under test strictly conform to character information from
     * version 10.0 of the Unicode Standard, so the "Japanese Era Square
     * character code point" is always expected to yield {@code false}.
     *
     * @param cp the code point to classify
     * @return the expected result of the "identifier start" style methods
     */
    private static boolean expectedIdentifierStart(int cp) {
        if (cp == JAPANESE_ERA_CODEPOINT) {
            return false;
        }
        int type = Character.getType(cp);
        return Character.isLetter(cp)
                || type == Character.LETTER_NUMBER
                || type == Character.CURRENCY_SYMBOL
                || type == Character.CONNECTOR_PUNCTUATION;
    }

    /**
     * Fails the test when the value returned by the method under test differs
     * from the expected value.
     *
     * @param method   display name of the method under test, used as the
     *                 prefix of the failure message
     * @param cp       the code point that was checked
     * @param actual   value returned by the method under test
     * @param expected value computed by the oracle
     * @throws RuntimeException if {@code actual != expected}
     */
    private static void verify(String method, int cp, boolean actual,
            boolean expected) {
        if (actual != expected) {
            throw new RuntimeException(
                    method + " failed for codepoint "
                    + Integer.toHexString(cp));
        }
    }

    /**
     * Assertion testing for public static boolean isJavaIdentifierPart(int
     * codePoint); see {@link #expectedIdentifierPart(int)} for the rule.
     * All code points from (0x0000..0x10FFFF) are tested.
     */
    public static void testIsJavaIdentifierPart_int() {
        for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
            verify("Character.isJavaIdentifierPart(int)", cp,
                    Character.isJavaIdentifierPart(cp),
                    expectedIdentifierPart(cp));
        }
    }

    /**
     * Assertion testing for public static boolean isJavaIdentifierPart(char
     * ch); see {@link #expectedIdentifierPart(int)} for the rule.
     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
     */
    public static void testIsJavaIdentifierPart_char() {
        for (int i = 0; i <= Character.MAX_VALUE; ++i) {
            char ch = (char) i;
            verify("Character.isJavaIdentifierPart(char)", i,
                    Character.isJavaIdentifierPart(ch),
                    expectedIdentifierPart(i));
        }
    }

    /**
     * Assertion testing for public static boolean isJavaIdentifierStart(int
     * codePoint); see {@link #expectedIdentifierStart(int)} for the rule.
     * All code points from (0x0000..0x10FFFF) are tested.
     */
    public static void testIsJavaIdentifierStart_int() {
        for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
            verify("Character.isJavaIdentifierStart(int)", cp,
                    Character.isJavaIdentifierStart(cp),
                    expectedIdentifierStart(cp));
        }
    }

    /**
     * Assertion testing for public static boolean isJavaIdentifierStart(char);
     * see {@link #expectedIdentifierStart(int)} for the rule.
     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
     */
    public static void testIsJavaIdentifierStart_char() {
        for (int i = 0; i <= Character.MAX_VALUE; i++) {
            char ch = (char) i;
            verify("Character.isJavaIdentifierStart(char)", i,
                    Character.isJavaIdentifierStart(ch),
                    expectedIdentifierStart(i));
        }
    }

    /**
     * Assertion testing for public static boolean isJavaLetter(char ch); the
     * expected value follows the same rule as for identifier starts, see
     * {@link #expectedIdentifierStart(int)}.
     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
     */
    @SuppressWarnings("deprecation") // the deprecated method is the one under test
    public static void testIsJavaLetter() {
        for (int i = 0; i <= Character.MAX_VALUE; ++i) {
            char ch = (char) i;
            verify("Character.isJavaLetter(ch)", i,
                    Character.isJavaLetter(ch),
                    expectedIdentifierStart(i));
        }
    }

    /**
     * Assertion testing for public static boolean isJavaLetterOrDigit(char
     * ch); the expected value follows the same rule as for identifier parts,
     * see {@link #expectedIdentifierPart(int)}.
     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
     */
    @SuppressWarnings("deprecation") // the deprecated method is the one under test
    public static void testIsJavaLetterOrDigit() {
        for (int i = 0; i <= Character.MAX_VALUE; ++i) {
            char ch = (char) i;
            verify("Character.isJavaLetterOrDigit(ch)", i,
                    Character.isJavaLetterOrDigit(ch),
                    expectedIdentifierPart(i));
        }
    }
}
1 change: 1 addition & 0 deletions test/jdk/java/lang/Character/UnicodeData.txt
Expand Up @@ -11729,6 +11729,7 @@
32FC;CIRCLED KATAKANA WI;So;0;L;<circle> 30F0;;;;N;;;;;
32FD;CIRCLED KATAKANA WE;So;0;L;<circle> 30F1;;;;N;;;;;
32FE;CIRCLED KATAKANA WO;So;0;L;<circle> 30F2;;;;N;;;;;
32FF;SQUARE ERA NAME NEWERA;So;0;L;<square> 5143 53F7;;;;N;SQUARED TWO IDEOGRAPHS ERA NAME NEWERA;;;;
3300;SQUARE APAATO;So;0;L;<square> 30A2 30D1 30FC 30C8;;;;N;SQUARED APAATO;;;;
3301;SQUARE ARUHUA;So;0;L;<square> 30A2 30EB 30D5 30A1;;;;N;SQUARED ARUHUA;;;;
3302;SQUARE ANPEA;So;0;L;<square> 30A2 30F3 30DA 30A2;;;;N;SQUARED ANPEA;;;;
Expand Down
Binary file modified test/jdk/java/lang/Character/charprop00.bin
Binary file not shown.

0 comments on commit 686855c

Please sign in to comment.