8206120: Add test cases for lenient Japanese era parsing

8211398: Square character support for the Japanese new era
8218915: Change isJavaIdentifierStart and isJavaIdentifierPart to handle new code points
Reviewed-by: coffeys, naoto
Contributed-by: deepak.kejriwal@oracle.com
AdoptOpenJDK · Feb 22, 2019 · 686855c · 686855c
1 parent 41cc5c8
commit 686855c
Show file tree

Hide file tree

Showing 9 changed files with 395 additions and 3 deletions.
diff --git a/make/data/characterdata/CharacterData00.java.template b/make/data/characterdata/CharacterData00.java.template
@@ -105,11 +105,21 @@ class CharacterData00 extends CharacterData {
     }
 
     boolean isJavaIdentifierStart(int ch) {
+        // isJavaIdentifierStart strictly conforms to code points assigned
+        // in Unicode 10.0. U+32FF (SQUARE ERA NAME NEWERA) is not a Unicode
+        // 10.0 code point, so it is never reported as an identifier start.
+        if(ch == 0x32FF)
+            return false;
         int props = getProperties(ch);
         return ((props & $$maskIdentifierInfo) >= $$lowJavaStart);
     }
 
     boolean isJavaIdentifierPart(int ch) {
+        // isJavaIdentifierPart strictly conforms to code points assigned
+        // in Unicode 10.0. U+32FF (SQUARE ERA NAME NEWERA) is not a Unicode
+        // 10.0 code point, so it is never reported as an identifier part.
+        if(ch == 0x32FF)
+            return false;
         int props = getProperties(ch);
         return ((props & $$nonzeroJavaPart) != 0);
     }

diff --git a/make/data/unicodedata/UnicodeData.txt b/make/data/unicodedata/UnicodeData.txt
@@ -11729,6 +11729,7 @@
 32FC;CIRCLED KATAKANA WI;So;0;L;<circle> 30F0;;;;N;;;;;
 32FD;CIRCLED KATAKANA WE;So;0;L;<circle> 30F1;;;;N;;;;;
 32FE;CIRCLED KATAKANA WO;So;0;L;<circle> 30F2;;;;N;;;;;
+32FF;SQUARE ERA NAME NEWERA;So;0;L;<square> 5143 53F7;;;;N;SQUARED TWO IDEOGRAPHS ERA NAME NEWERA;;;;
 3300;SQUARE APAATO;So;0;L;<square> 30A2 30D1 30FC 30C8;;;;N;SQUARED APAATO;;;;
 3301;SQUARE ARUHUA;So;0;L;<square> 30A2 30EB 30D5 30A1;;;;N;SQUARED ARUHUA;;;;
 3302;SQUARE ANPEA;So;0;L;<square> 30A2 30F3 30DA 30A2;;;;N;SQUARED ANPEA;;;;

diff --git a/src/java.base/share/classes/java/lang/Character.java b/src/java.base/share/classes/java/lang/Character.java
@@ -5396,7 +5396,7 @@ public static enum UnicodeScript {
             0x3260,   // 3260..327E; HANGUL
             0x327F,   // 327F..32CF; COMMON
             0x32D0,   // 32D0..32FE; KATAKANA
-            0x32FF,   // 32FF      ; UNKNOWN
+            0x32FF,   // 32FF      ; COMMON
             0x3300,   // 3300..3357; KATAKANA
             0x3358,   // 3358..33FF; COMMON
             0x3400,   // 3400..4DB5; HAN
@@ -6913,7 +6913,7 @@ public static enum UnicodeScript {
             HANGUL,                   // 3260..327E
             COMMON,                   // 327F..32CF
             KATAKANA,                 // 32D0..32FE
-            UNKNOWN,                  // 32FF
+            COMMON,                   // 32FF
             KATAKANA,                 // 3300..3357
             COMMON,                   // 3358..33FF
             HAN,                      // 3400..4DB5

diff --git a/test/jdk/java/lang/Character/Scripts.txt b/test/jdk/java/lang/Character/Scripts.txt
@@ -402,6 +402,7 @@
 328A..32B0    ; Common # So  [39] CIRCLED IDEOGRAPH MOON..CIRCLED IDEOGRAPH NIGHT
 32B1..32BF    ; Common # No  [15] CIRCLED NUMBER THIRTY SIX..CIRCLED NUMBER FIFTY
 32C0..32CF    ; Common # So  [16] IDEOGRAPHIC TELEGRAPH SYMBOL FOR JANUARY..LIMITED LIABILITY SIGN
+32FF          ; Common # So       SQUARE ERA NAME NEWERA
 3358..33FF    ; Common # So [168] IDEOGRAPHIC TELEGRAPH SYMBOL FOR HOUR ZERO..SQUARE GAL
 4DC0..4DFF    ; Common # So  [64] HEXAGRAM FOR THE CREATIVE HEAVEN..HEXAGRAM FOR BEFORE COMPLETION
 A700..A716    ; Common # Sk  [23] MODIFIER LETTER CHINESE TONE YIN PING..MODIFIER LETTER EXTRA-LOW LEFT-STEM TONE BAR

diff --git a/test/jdk/java/lang/Character/TestIsJavaIdentifierMethods.java b/test/jdk/java/lang/Character/TestIsJavaIdentifierMethods.java
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2019, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+
+/**
+ * @test
+ * @summary Test behavior of isJavaIdentifierXX, isJavaLetter, and
+ *  isJavaLetterOrDigit methods for all code points.
+ * @bug 8218915
+ */
+
+public class TestIsJavaIdentifierMethods {
+
+    // U+32FF, the Japanese era square character; not assigned in Unicode 10.0
+    private static final int JAPANESE_ERA_CODEPOINT = 0x32FF;
+
+    public static void main(String[] args) { // each check throws on its first mismatch
+        testIsJavaIdentifierPart_int();
+        testIsJavaIdentifierPart_char();
+        testIsJavaIdentifierStart_int();
+        testIsJavaIdentifierStart_char();
+        testIsJavaLetter();
+        testIsJavaLetterOrDigit();
+    }
+
+    /**
+     * Verifies Character.isJavaIdentifierPart(int codePoint), throwing a
+     * RuntimeException on the first mismatch. A character may be part of a
+     * Java identifier if any of the following are true:
+     * <ul>
+     * <li>it is a letter</li>
+     * <li>it is a currency symbol (such as <code>'$'</code>)</li>
+     * <li>it is a connecting punctuation character (such as <code>'_'</code>)
+     * </li>
+     * <li>it is a digit</li>
+     * <li>it is a numeric letter (such as a Roman numeral character)</li>
+     * <li>it is a combining mark</li>
+     * <li>it is a non-spacing mark</li>
+     * <li><code>isIdentifierIgnorable</code> returns <code>true</code> for the
+     * character</li>
+     * </ul>
+     * All code points from (0x0000..0x10FFFF) are tested.
+     */
+    public static void testIsJavaIdentifierPart_int() {
+        for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
+            boolean expected = false;
+            // Character.isJavaIdentifierPart(int) strictly conforms to
+            // character information from version 10.0 of the Unicode Standard,
+            // which does not contain the Japanese era square character.
+            // For JAPANESE_ERA_CODEPOINT "expected" therefore stays false;
+            // for any other code point it is computed from the rules above.
+            if (cp != JAPANESE_ERA_CODEPOINT) {
+                byte type = (byte) Character.getType(cp);
+                expected = Character.isLetter(cp)
+                        || type == Character.CURRENCY_SYMBOL
+                        || type == Character.CONNECTOR_PUNCTUATION
+                        || Character.isDigit(cp)
+                        || type == Character.LETTER_NUMBER
+                        || type == Character.COMBINING_SPACING_MARK
+                        || type == Character.NON_SPACING_MARK
+                        || Character.isIdentifierIgnorable(cp);
+            }
+
+            if (Character.isJavaIdentifierPart(cp) != expected) {
+                throw new RuntimeException(
+                   "Character.isJavaIdentifierPart(int) failed for codepoint "
+                                + Integer.toHexString(cp));
+            }
+        }
+    }
+
+    /**
+     * Verifies Character.isJavaIdentifierPart(char ch), throwing a
+     * RuntimeException on the first mismatch. A character may be part of a
+     * Java identifier if any of the following are true:
+     * <ul>
+     * <li>it is a letter;
+     * <li>it is a currency symbol (such as "$");
+     * <li>it is a connecting punctuation character (such as "_");
+     * <li>it is a digit;
+     * <li>it is a numeric letter (such as a Roman numeral character);
+     * <li>it is a combining mark;
+     * <li>it is a non-spacing mark;
+     * <li>isIdentifierIgnorable returns true for the character.
+     * </ul>
+     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
+     */
+    public static void testIsJavaIdentifierPart_char() {
+        for (int i = 0; i <= Character.MAX_VALUE; ++i) {
+            char ch = (char) i;
+            boolean expected = false;
+            // Character.isJavaIdentifierPart(char) strictly conforms to
+            // character information from version 10.0 of the Unicode Standard,
+            // which does not contain the Japanese era square character.
+            // For JAPANESE_ERA_CODEPOINT "expected" therefore stays false;
+            // for any other character it is computed from the rules above.
+            if (i != JAPANESE_ERA_CODEPOINT) {
+                byte type = (byte) Character.getType(ch);
+                expected = Character.isLetter(ch)
+                        || type == Character.CURRENCY_SYMBOL
+                        || type == Character.CONNECTOR_PUNCTUATION
+                        || Character.isDigit(ch)
+                        || type == Character.LETTER_NUMBER
+                        || type == Character.COMBINING_SPACING_MARK
+                        || type == Character.NON_SPACING_MARK
+                        || Character.isIdentifierIgnorable(ch);
+            }
+
+            if (Character.isJavaIdentifierPart((char) i) != expected) { // (char) i equals ch
+                throw new RuntimeException(
+                    "Character.isJavaIdentifierPart(char) failed for codepoint "
+                                + Integer.toHexString(i));
+            }
+        }
+    }
+
+    /**
+     * Verifies Character.isJavaIdentifierStart(int codePoint), throwing a
+     * RuntimeException on the first mismatch. A character may start a Java
+     * identifier if and only if it is one of the following:
+     * <ul>
+     * <li>it is a letter;</li>
+     * <li>getType(ch) returns LETTER_NUMBER;</li>
+     * <li>it is a currency symbol (such as "$");</li>
+     * <li>it is a connecting punctuation character (such as "_");</li>
+     * </ul>
+     * All code points from (0x0000..0x10FFFF) are tested.
+     */
+    public static void testIsJavaIdentifierStart_int() {
+        for (int cp = 0; cp <= Character.MAX_CODE_POINT; cp++) {
+            boolean expected = false;
+            // Character.isJavaIdentifierStart(int) strictly conforms to
+            // character information from version 10.0 of the Unicode Standard,
+            // which does not contain the Japanese era square character.
+            // For JAPANESE_ERA_CODEPOINT "expected" therefore stays false;
+            // for any other code point it is computed from the rules above.
+            if (cp != JAPANESE_ERA_CODEPOINT) {
+                byte type = (byte) Character.getType(cp);
+                expected = Character.isLetter(cp)
+                        || type == Character.LETTER_NUMBER
+                        || type == Character.CURRENCY_SYMBOL
+                        || type == Character.CONNECTOR_PUNCTUATION;
+            }
+
+            if (Character.isJavaIdentifierStart(cp) != expected) {
+                throw new RuntimeException(
+                        "Character.isJavaIdentifierStart(int) failed for codepoint "
+                                + Integer.toHexString(cp));
+            }
+        }
+    }
+
+    /**
+     * Verifies Character.isJavaIdentifierStart(char ch), throwing a
+     * RuntimeException on the first mismatch. A character may start a Java
+     * identifier if and only if it is one of the following:
+     * <ul>
+     * <li>it is a letter;</li>
+     * <li>getType(ch) returns LETTER_NUMBER;</li>
+     * <li>it is a currency symbol (such as "$");</li>
+     * <li>it is a connecting punctuation character (such as "_");</li>
+     * </ul>
+     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
+     */
+    public static void testIsJavaIdentifierStart_char() {
+        for (int i = 0; i <= Character.MAX_VALUE; i++) {
+            char ch = (char) i;
+            boolean expected = false;
+            // Character.isJavaIdentifierStart(char) strictly conforms to
+            // character information from version 10.0 of the Unicode Standard,
+            // which does not contain the Japanese era square character.
+            // For JAPANESE_ERA_CODEPOINT "expected" therefore stays false;
+            // for any other character it is computed from the rules above.
+            if (i != JAPANESE_ERA_CODEPOINT) {
+                byte type = (byte) Character.getType(ch);
+                expected = Character.isLetter(ch)
+                        || type == Character.LETTER_NUMBER
+                        || type == Character.CURRENCY_SYMBOL
+                        || type == Character.CONNECTOR_PUNCTUATION;
+            }
+
+            if (Character.isJavaIdentifierStart(ch) != expected) {
+                throw new RuntimeException(
+                        "Character.isJavaIdentifierStart(char) failed for codepoint "
+                                + Integer.toHexString(i));
+            }
+        }
+    }
+
+    /**
+     * Verifies Character.isJavaLetter(char ch), throwing a RuntimeException
+     * on the first mismatch. A character may start a Java identifier if and
+     * only if one of the following is true:
+     * <ul>
+     * <li>isLetter(ch) returns true
+     * <li>getType(ch) returns LETTER_NUMBER
+     * <li>ch is a currency symbol (such as "$")
+     * <li>ch is a connecting punctuation character (such as "_").
+     * </ul>
+     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
+     */
+    public static void testIsJavaLetter() {
+        for (int i = 0; i <= Character.MAX_VALUE; ++i) {
+            char ch = (char) i;
+            boolean expected = false;
+            // Character.isJavaLetter(char) strictly conforms to
+            // character information from version 10.0 of the Unicode Standard,
+            // which does not contain the Japanese era square character.
+            // For JAPANESE_ERA_CODEPOINT "expected" therefore stays false;
+            // for any other character it is computed from the rules above.
+            if (i != JAPANESE_ERA_CODEPOINT) {
+                byte type = (byte) Character.getType(ch);
+                expected = Character.isLetter(ch)
+                        || type == Character.LETTER_NUMBER
+                        || type == Character.CURRENCY_SYMBOL
+                        || type == Character.CONNECTOR_PUNCTUATION;
+            }
+
+            if (Character.isJavaLetter(ch) != expected) {
+                throw new RuntimeException(
+                        "Character.isJavaLetter(ch) failed for codepoint "
+                                + Integer.toHexString(i));
+            }
+        }
+    }
+
+    /**
+     * Verifies Character.isJavaLetterOrDigit(char ch), throwing a
+     * RuntimeException on the first mismatch. A character may be part of a
+     * Java identifier if and only if any of the following are true:
+     * <ul>
+     * <li>it is a letter
+     * <li>it is a currency symbol (such as '$')
+     * <li>it is a connecting punctuation character (such as '_')
+     * <li>it is a digit
+     * <li>it is a numeric letter (such as a Roman numeral character)
+     * <li>it is a combining mark
+     * <li>it is a non-spacing mark
+     * <li>isIdentifierIgnorable returns true for the character.
+     * </ul>
+     * All Unicode code points in the BMP (0x0000..0xFFFF) are tested.
+     */
+    public static void testIsJavaLetterOrDigit() {
+        for (int i = 0; i <= Character.MAX_VALUE; ++i) {
+            char ch = (char) i;
+            boolean expected = false;
+            // Character.isJavaLetterOrDigit(char) strictly conforms to
+            // character information from version 10.0 of the Unicode Standard,
+            // which does not contain the Japanese era square character.
+            // For JAPANESE_ERA_CODEPOINT "expected" therefore stays false;
+            // for any other character it is computed from the rules above.
+            if (i != JAPANESE_ERA_CODEPOINT) {
+                byte type = (byte) Character.getType(ch);
+                expected = Character.isLetter(ch)
+                        || type == Character.CURRENCY_SYMBOL
+                        || type == Character.CONNECTOR_PUNCTUATION
+                        || Character.isDigit(ch)
+                        || type == Character.LETTER_NUMBER
+                        || type == Character.COMBINING_SPACING_MARK
+                        || type == Character.NON_SPACING_MARK
+                        || Character.isIdentifierIgnorable(ch);
+            }
+
+            if (Character.isJavaLetterOrDigit(ch) != expected) {
+                throw new RuntimeException(
+                        "Character.isJavaLetterOrDigit(ch) failed for codepoint "
+                                + Integer.toHexString(i));
+            }
+        }
+    }
+}
diff --git a/test/jdk/java/lang/Character/UnicodeData.txt b/test/jdk/java/lang/Character/UnicodeData.txt
@@ -11729,6 +11729,7 @@
 32FC;CIRCLED KATAKANA WI;So;0;L;<circle> 30F0;;;;N;;;;;
 32FD;CIRCLED KATAKANA WE;So;0;L;<circle> 30F1;;;;N;;;;;
 32FE;CIRCLED KATAKANA WO;So;0;L;<circle> 30F2;;;;N;;;;;
+32FF;SQUARE ERA NAME NEWERA;So;0;L;<square> 5143 53F7;;;;N;SQUARED TWO IDEOGRAPHS ERA NAME NEWERA;;;;
 3300;SQUARE APAATO;So;0;L;<square> 30A2 30D1 30FC 30C8;;;;N;SQUARED APAATO;;;;
 3301;SQUARE ARUHUA;So;0;L;<square> 30A2 30EB 30D5 30A1;;;;N;SQUARED ARUHUA;;;;
 3302;SQUARE ANPEA;So;0;L;<square> 30A2 30F3 30DA 30A2;;;;N;SQUARED ANPEA;;;;

diff --git a/test/jdk/java/lang/Character/charprop00.bin b/test/jdk/java/lang/Character/charprop00.bin